Python FileIngester Examples

Example #1

0

Show file

File: Navigator.py Project: DGI-UPITT/Pittsburgh-Book-Ingester

def processFolder(fedora, config):
    """
    Create a bunch of fedora objects (1 for each folder in @config.inDir)

    Every immediate sub-folder of @config.inDir is handed to
    FileIngester.createObjectFromFiles as one 'islandora:bookCModel' object.
    Its metadata files (MODS, DC, MARC, METS) are passed as datastreams and
    an optional OCR zip / chapter file under the 'ocr' / 'chapters' keys.
    When several files match a pattern the alphabetically first one is used,
    so repeated runs always pick the same file.

    @param fedora: passed through unchanged to addCollectionToFedora and
                   FileIngester.createObjectFromFiles
    @param config: settings object; this function reads inDir,
                   hostCollectionName/Pid/Icon and myCollectionName/Pid/Icon
    @return: False if @config.inDir is not a directory, otherwise the number
             of sub-folders for which createObjectFromFiles returned a true
             value
    """

    folder = config.inDir

    # first make sure @folder is a valid folder
    if not os.path.isdir(folder):
        return False

    def firstMatch(subFolder, pattern):
        # glob gives no ordering guarantee, so sort before taking the first
        # hit - otherwise the chosen file depends on the file system
        matches = sorted(glob.glob(os.path.join(folder, subFolder, pattern)))
        if matches:
            return matches[0]
        return None

    # the collection overhead
    # the host collection (topmost root)
    addCollectionToFedora(fedora, config.hostCollectionName, myPid=config.hostCollectionPid, tnUrl=config.hostCollectionIcon)
    # the aggregate (contains the books)
    addCollectionToFedora(fedora, config.myCollectionName, myPid=config.myCollectionPid, parentPid=config.hostCollectionPid, tnUrl=config.myCollectionIcon)

    # settings shared by every object created below
    baseFileDict = { 'parentPid' : config.myCollectionPid, 'contentModel' : 'islandora:bookCModel' }
    # datastream label -> file name pattern inside the book folder
    datastreamPatterns = (
        ("MODS", "*.mods.xml"),
        ("DC", "*.dc.xml"),
        ("MARC", "*.marcxml.xml"),
        ("METS", "*.mets.xml"),
    )
    # optional extras stored directly on the file dict: key -> pattern
    extraPatterns = (
        ("ocr", "*.ocr.zip"),
        ("chapters", "*.body.xml"),
    )

    completeFiles = 0
    for subFolder in os.listdir(folder):
        # only directories are books; loose files in @folder are ignored
        if not os.path.isdir(os.path.join(folder, subFolder)):
            continue

        print("Scan Folder %s" % subFolder)
        # the single item trap - very helpful for testing
        #if subFolder != "00awa1617m":
        #    continue

        fileDict = { 'label': subFolder, 'datastreams' : { } }

        # a datastream is only registered when a matching file exists
        for label, pattern in datastreamPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict['datastreams'][label] = match

        for key, pattern in extraPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict[key] = match

        fileDict.update(baseFileDict)
        if FileIngester.createObjectFromFiles(fedora, config, fileDict):
            print("Object (%s) ingested successfully" % subFolder)
            completeFiles += 1

    return completeFiles

Example #2

0

Show file

File: Navigator.py Project: DGI-UPITT/Pittsburgh-Kogyo-Ingester

def processFolder(fedora, config):
    """
    Create a bunch of fedora objects (1 for each folder in @config.inDir)

    Every immediate sub-folder of @config.inDir that does not itself contain
    tif files is handed to FileIngester.createObjectFromFiles as one
    'islandora:bookCModel' object, with its metadata files (MODS, DC, MARC,
    METS, VRA) as datastreams. Folders that do contain "*.tif*" files are
    taken to be page folders and skipped. When several files match a
    pattern the alphabetically first one is used, so repeated runs always
    pick the same file.

    @param fedora: passed through unchanged to addCollectionToFedora and
                   FileIngester.createObjectFromFiles
    @param config: settings object; this function reads inDir,
                   hostCollectionName/Pid/Icon and myCollectionName/Pid/Icon
    @return: False if @config.inDir is not a directory, otherwise the number
             of sub-folders for which createObjectFromFiles returned a true
             value
    """

    folder = config.inDir

    # first make sure @folder is a valid folder
    if not os.path.isdir(folder):
        return False

    def firstMatch(subFolder, pattern):
        # glob gives no ordering guarantee, so sort before taking the first
        # hit - otherwise the chosen file depends on the file system
        matches = sorted(glob.glob(os.path.join(folder, subFolder, pattern)))
        if matches:
            return matches[0]
        return None

    # the collection overhead
    # the host collection (topmost root)
    addCollectionToFedora(fedora, config.hostCollectionName, myPid=config.hostCollectionPid, tnUrl=config.hostCollectionIcon)
    # the aggregate (contains the books)
    # if you set hostcollectionpid and mycollectionpid to the same value then
    # addCollectionToFedora won't add the second one, it will simply return the
    # first one saying "oh, you must have meant this one"
    addCollectionToFedora(fedora, config.myCollectionName, myPid=config.myCollectionPid, parentPid=config.hostCollectionPid, tnUrl=config.myCollectionIcon)

    # settings shared by every object created below
    baseFileDict = { 'parentPid' : config.myCollectionPid, 'contentModel' : 'islandora:bookCModel' }
    # datastream label -> file name pattern inside the book folder
    datastreamPatterns = (
        ("MODS", "*.mods.xml"),
        ("DC", "*.dc.xml"),
        ("MARC", "*.marcxml.xml"),
        ("METS", "*.mets.xml"),
        ("VRA", "*.vra.xml"),
    )

    completeFiles = 0
    for subFolder in os.listdir(folder):
        if not os.path.isdir(os.path.join(folder, subFolder)):
            continue
        # found a page folder, skip it
        if glob.glob(os.path.join(folder, subFolder, "*.tif*")):
            continue

        # found a book folder
        print("Scan Folder %s" % subFolder)
        fileDict = { 'label': subFolder, 'datastreams' : { } }

        # a datastream is only registered when a matching file exists
        for label, pattern in datastreamPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict['datastreams'][label] = match

        fileDict.update(baseFileDict)
        if FileIngester.createObjectFromFiles(fedora, config, fileDict):
            print("Object (%s) ingested successfully" % subFolder)
            completeFiles += 1

    return completeFiles

Example #3

0

Show file

File: Navigator.py Project: DGI-UPITT/Pittsburgh-Map-Ingester

def processFolder(fedora, config):
    """
    Create a bunch of fedora objects (1 for each folder in @config.inDir)

    Every immediate sub-folder of @config.inDir that holds a base tif is
    handed to FileIngester.createObjectFromFiles as one
    'islandora:sp_large_image_cmodel' object. After a successful ingest,
    each directory returned by get_immediate_subdirectories for that folder
    is ingested as an 'islandora:mapCModel' georef clip carrying an
    isGeorefClipOf relationship to "<config.fedoraNS>:<subFolder>".
    When several files match a pattern the alphabetically first one is used,
    so repeated runs always pick the same file.

    @param fedora: passed through unchanged to addCollectionToFedora and
                   FileIngester.createObjectFromFiles
    @param config: settings object; this function reads inDir, fedoraNS,
                   hostCollectionName/Pid/Icon and myCollectionName/Pid/Icon
    @return: False if @config.inDir is not a directory, otherwise the number
             of map folders for which createObjectFromFiles returned a true
             value (georef clips are not counted)
    """

    folder = config.inDir

    # first make sure @folder is a valid folder
    if not os.path.isdir(folder):
        return False

    def firstMatch(subFolder, pattern):
        # glob gives no ordering guarantee, so sort before taking the first
        # hit - otherwise the chosen file depends on the file system
        matches = sorted(glob.glob(os.path.join(folder, subFolder, pattern)))
        if matches:
            return matches[0]
        return None

    def findTiff(subFolder, prefix):
        # prefer "*.tif", fall back to "*.tiff"; None when neither exists
        return (firstMatch(subFolder, prefix + "*.tif")
                or firstMatch(subFolder, prefix + "*.tiff"))

    # the collection overhead
    # the host collection (topmost root)
    addCollectionToFedora(fedora, config.hostCollectionName, myPid=config.hostCollectionPid, tnUrl=config.hostCollectionIcon)
    # the aggregate (contains the books)
    addCollectionToFedora(fedora, config.myCollectionName, myPid=config.myCollectionPid, parentPid=config.hostCollectionPid, tnUrl=config.myCollectionIcon)

    # settings shared by every map object / every georef clip
    baseFileDict = { 'parentPid' : config.myCollectionPid, 'contentModel' : 'islandora:sp_large_image_cmodel' }
    georefBaseFileDict = { 'parentPid' : config.myCollectionPid, 'contentModel' : 'islandora:mapCModel' }

    # datastream label -> pattern, relative to the map folder
    mapPatterns = (
        ("MODS", "*.mods.xml"),
        ("DC", "*.dc.xml"),
    )
    # datastream label -> pattern template, relative to the map folder;
    # %s is replaced by the georef clip directory
    clipPatterns = (
        # NOTE(review): FGDC is the only pattern without a "/", i.e. it is
        # looked up next to the clip directory, not inside it - confirm
        ("FGDC", "%s.fgdc.xml"),
        ("CNTRLP", "%s/*.controlpts.txt"),
        ("CNTRLPXML", "%s/*.controlpts.txt.xml"),
        ("TFWX", "%s/*.tfwx"),
        ("AUX", "%s/*.aux"),
        ("RRD", "%s/*.rrd"),
        ("TIFXML", "%s/*.tif.xml"),
        # the mebp - these extensions look like shapefile parts, found one
        # directory further down inside the clip
        ("DBF", "%s/*/*.dbf"),
        ("PRJ", "%s/*/*.prj"),
        ("SBN", "%s/*/*.sbn"),
        ("SBX", "%s/*/*.sbx"),
        ("SHP", "%s/*/*.shp"),
        ("SHX", "%s/*/*.shx"),
    )

    completeFiles = 0
    for subFolder in os.listdir(folder):
        if not os.path.isdir(os.path.join(folder, subFolder)):
            continue

        print("Scan Folder %s" % subFolder)

        tiff = findTiff(subFolder, "")
        if not tiff:
            # failed
            print("Could not find base tif file - skipping directory %s" % subFolder)
            continue # next subFolder

        fileDict = { 'label': subFolder, 'datastreams' : { 'TIFF': tiff } }
        for label, pattern in mapPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict['datastreams'][label] = match

        fileDict.update(baseFileDict)
        if not FileIngester.createObjectFromFiles(fedora, config, fileDict):
            # clips are only ingested for maps that made it into fedora
            continue
        print("Object (%s) ingested successfully" % subFolder)
        completeFiles += 1

        # now ingest the georef clips
        for georefclip in get_immediate_subdirectories(os.path.join(folder, subFolder)):
            clipTiff = findTiff(subFolder, "%s/" % georefclip)
            if not clipTiff:
                # failed
                print("Could not find base tif file - skipping georefclip directory %s" % georefclip)
                continue # next georef

            # each clip gets its own dict, so nothing from the parent map
            # or from an earlier clip leaks into it
            clipDict = { 'label': georefclip, 'datastreams' : { 'TIFF': clipTiff } }
            for label, template in clipPatterns:
                match = firstMatch(subFolder, template % georefclip)
                if match is not None:
                    clipDict['datastreams'][label] = match

            clipDict.update(georefBaseFileDict)
            extraNamespaces = { 'mapNS' : 'info:islandora/islandora-system:def/mapinfo#' }
            # ties the clip back to its parent map object
            extraRelationships = { fedora_relationships.rels_predicate('mapNS', 'isGeorefClipOf') : "%s:%s" %(config.fedoraNS, subFolder) }
            if FileIngester.createObjectFromFiles(fedora, config, clipDict, extraNamespaces=extraNamespaces, extraRelationships=extraRelationships):
                print("  georef clip (%s) ingested successfully" % georefclip)

    return completeFiles

Example #4

0

Show file

File: Navigator.py Project: DGI-UPITT/Pittsburgh-Image-Ingester

def processFolder(fedora, config):
    """
    Create a bunch of fedora objects (1 for each folder in @config.inDir
    that holds a base tif)

    Every immediate sub-folder of @config.inDir containing a "*.tif" (or,
    failing that, "*.tiff") file is handed to
    FileIngester.createObjectFromFiles as one
    'islandora:sp_large_image_cmodel' object, together with whichever of the
    MODS, TN, DC and KML files exist. When several files match a pattern
    the alphabetically first one is used, so repeated runs always pick the
    same file.

    @param fedora: passed through unchanged to addCollectionToFedora and
                   FileIngester.createObjectFromFiles
    @param config: settings object; this function reads inDir, message,
                   hostCollectionName/Pid/Icon and myCollectionName/Pid/Icon
    @return: False if @config.inDir is not a directory, otherwise True; the
             ingested/attempted counts are reported through
             config.message.addLine rather than the return value
    """

    folder = config.inDir

    # first make sure folder is a valid folder
    if not os.path.isdir(folder):
        return False

    def firstMatch(subFolder, pattern):
        # glob gives no ordering guarantee, so sort before taking the first
        # hit - otherwise the chosen file depends on the file system
        matches = sorted(glob.glob(os.path.join(folder, subFolder, pattern)))
        if matches:
            return matches[0]
        return None

    # the collection overhead
    # the host collection (topmost root)
    addCollectionToFedora(fedora, config.hostCollectionName, myPid=config.hostCollectionPid, tnUrl=config.hostCollectionIcon)
    # the aggregate (image root)
    addCollectionToFedora(fedora, config.myCollectionName, myPid=config.myCollectionPid, parentPid=config.hostCollectionPid, tnUrl=config.myCollectionIcon)

    # settings shared by every object created below
    baseFileDict = { 'parentPid' : config.myCollectionPid, 'contentModel' : 'islandora:sp_large_image_cmodel' }
    # optional datastreams: label -> file name pattern inside the folder
    optionalPatterns = (
        ("MODS", "*.mods.xml"),
        ("TN", "*.thumb.jpg"),
        ("DC", "*.dc.xml"),
        ("KML", "*.kml.xml"),
    )

    totalFiles = 0
    completeFiles = 0
    for subFolder in os.listdir(folder):
        if not os.path.isdir(os.path.join(folder, subFolder)):
            continue

        print("Scan Folder %s" % subFolder)

        # prefer "*.tif", fall back to "*.tiff"
        tiff = firstMatch(subFolder, "*.tif") or firstMatch(subFolder, "*.tiff")
        if not tiff:
            # failed - folders without a tif are not counted as attempted
            print("Could not find base tif file - skipping directory")
            continue # next subFolder

        fileDict = { 'label': subFolder, 'datastreams' : { 'TIFF': tiff } }
        for label, pattern in optionalPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict['datastreams'][label] = match

        fileDict.update(baseFileDict)
        totalFiles += 1
        if FileIngester.createObjectFromFiles(fedora, config, fileDict):
            print("Object (%s) ingested successfully" % subFolder)
            completeFiles += 1

    config.message.addLine("Ingested %d/%d objects" % (completeFiles, totalFiles))

    return True

Example #5

0

Show file

def processFolder(fedora, config):
    """
    Create a bunch of fedora objects (1 for each folder in @config.inDir
    that holds a base tif)

    Every immediate sub-folder of @config.inDir containing a "*.tif" (or,
    failing that, "*.tiff") file is handed to
    FileIngester.createObjectFromFiles as one
    'islandora:sp_large_image_cmodel' object, together with whichever of the
    MODS, TN, DC and KML files exist. When several files match a pattern
    the alphabetically first one is used, so repeated runs always pick the
    same file.

    @param fedora: passed through unchanged to addCollectionToFedora and
                   FileIngester.createObjectFromFiles
    @param config: settings object; this function reads inDir, message,
                   hostCollectionName/Pid/Icon and myCollectionName/Pid/Icon
    @return: False if @config.inDir is not a directory, otherwise True; the
             ingested/attempted counts are reported through
             config.message.addLine rather than the return value
    """

    folder = config.inDir

    # first make sure folder is a valid folder
    if not os.path.isdir(folder):
        return False

    def firstMatch(subFolder, pattern):
        # glob gives no ordering guarantee, so sort before taking the first
        # hit - otherwise the chosen file depends on the file system
        matches = sorted(
            glob.glob(os.path.join(folder, subFolder, pattern)))
        if matches:
            return matches[0]
        return None

    # the collection overhead
    # the host collection (topmost root)
    addCollectionToFedora(fedora,
                          config.hostCollectionName,
                          myPid=config.hostCollectionPid,
                          tnUrl=config.hostCollectionIcon)
    # the aggregate (image root)
    addCollectionToFedora(fedora,
                          config.myCollectionName,
                          myPid=config.myCollectionPid,
                          parentPid=config.hostCollectionPid,
                          tnUrl=config.myCollectionIcon)

    # settings shared by every object created below
    baseFileDict = {
        'parentPid': config.myCollectionPid,
        'contentModel': 'islandora:sp_large_image_cmodel'
    }
    # optional datastreams: label -> file name pattern inside the folder
    optionalPatterns = (
        ("MODS", "*.mods.xml"),
        ("TN", "*.thumb.jpg"),
        ("DC", "*.dc.xml"),
        ("KML", "*.kml.xml"),
    )

    totalFiles = 0
    completeFiles = 0
    for subFolder in os.listdir(folder):
        if not os.path.isdir(os.path.join(folder, subFolder)):
            continue

        print("Scan Folder %s" % subFolder)

        # prefer "*.tif", fall back to "*.tiff"
        tiff = (firstMatch(subFolder, "*.tif")
                or firstMatch(subFolder, "*.tiff"))
        if not tiff:
            # failed - folders without a tif are not counted as attempted
            print("Could not find base tif file - skipping directory")
            continue  # next subFolder

        fileDict = {'label': subFolder, 'datastreams': {'TIFF': tiff}}
        for label, pattern in optionalPatterns:
            match = firstMatch(subFolder, pattern)
            if match is not None:
                fileDict['datastreams'][label] = match

        fileDict.update(baseFileDict)
        totalFiles += 1
        if FileIngester.createObjectFromFiles(fedora, config, fileDict):
            print("Object (%s) ingested successfully" % subFolder)
            completeFiles += 1

    config.message.addLine("Ingested %d/%d objects" %
                           (completeFiles, totalFiles))

    return True