def _retryWhileFedoraLocked(action, lockWarning, failureError):
    '''
    Run action(), repeating it for as long as Fedora rejects it with a
    "currently being modified by another thread" error.

    @param action: zero-argument callable that performs the Fedora request
    @param lockWarning: message logged every time the object is found locked
    @param failureError: message logged when the request fails for any other
        FedoraConnectionException
    @return bool: True once action() succeeds, False after a non-lock failure
    '''
    while True:
        try:
            action()
            return True
        except FedoraConnectionException as fedoraEX:
            if 'is currently being modified by another thread' not in str(fedoraEX.body):
                logging.error(failureError)
                return False
            # NOTE(review): the retry is immediate and unbounded, exactly as
            # the hand-written loops it replaces - consider a delay/limit.
            logging.warning(lockWarning)


def addBookToFedora():
    '''
    Helper function that handles creating the book collection obj in fedora.

    Takes no arguments: the MODS file is located through the module-level
    name modsFilePath, and the new object's pid is published in the global
    bookPid. The MODS file on disk is rewritten with an extra
    <identifier type="pid"> element before it is uploaded, and a thumbnail
    '<label>_TN.jpg' is generated next to it from '0001_a_front_cover.tif'.

    @return bool: always True; FedoraConnectionException failures are logged
        instead of being reported to the caller
    '''
    global bookPid  # global for write to file and use
    bookPid = fedora.getNextPID(u'uofm')
    bookDir = os.path.dirname(modsFilePath)
    myLabel = unicode(os.path.basename(bookDir))
    obj = fedora.createObject(bookPid, label=myLabel)

    # add the book pid to modsFile
    parser = etree.XMLParser(remove_blank_text=True)
    xmlFile = etree.parse(modsFilePath, parser)
    modsElem = etree.Element("{http://www.loc.gov/mods/v3}identifier", type="pid")
    modsElem.text = bookPid
    xmlFile.getroot().append(modsElem)
    xmlFile.write(modsFilePath)

    # add mods datastream (inline XML, so the content itself is passed in)
    with open(modsFilePath) as modsFile:
        modsContents = modsFile.read()
    try:
        obj.addDataStream(u'MODS', unicode(modsContents), label=u'MODS',
                          mimeType=u'text/xml', controlGroup=u'X',
                          logMessage=u'Added basic mods meta data.')
        logging.info('Added MODS datastream to:' + bookPid)
    except FedoraConnectionException:
        logging.error('Error in adding MODS datastream to:' + bookPid + '\n')

    # add a TN datastream to the object after creating it from the book cover
    tnPath = os.path.join(bookDir, myLabel + '_TN.jpg')
    converter.tif_to_jpg(os.path.join(bookDir, '0001_a_front_cover.tif'),
                         tnPath, 'TN')
    # binary mode: the thumbnail is a JPEG; the with-block closes the handle
    with open(tnPath, 'rb') as tnFile:

        def addThumbnail():
            # a failed earlier attempt may have consumed part of the file
            tnFile.seek(0)
            obj.addDataStream(u'TN', u'aTmpStr', label=u'TN',
                              mimeType=u'image/jpeg', controlGroup=u'M',
                              logMessage=u'Added a jpeg thumbnail.')
            logging.info('Added TN datastream to:' + bookPid)
            obj['TN'].setContent(tnFile)

        _retryWhileFedoraLocked(
            addThumbnail,
            'Trouble (thread lock) adding TN datastream to: ' + bookPid + ' retrying.',
            'Error in adding TN datastream to:' + bookPid + '\n')

    # configure rels ext
    objRelsExt = fedora_relationships.rels_ext(
        obj,
        fedora_relationships.rels_namespace(
            'fedora-model', 'info:fedora/fedora-system:def/model#'))
    objRelsExt.addRelationship('isMemberOf', 'islandora:top')
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
        'archiveorg:bookCModel')
    # trying to handle a bug/feature of locking fedora items
    _retryWhileFedoraLocked(
        objRelsExt.update,
        'Trouble (thread lock) updating obj RELS-EXT: ' + bookPid + ' retrying.',
        'Error updating obj RELS-EXT: ' + bookPid)

    # index the book in solr
    sendSolr()
    return True
def _retryWhileFedoraLocked(action, lockWarning, failureError):
    '''
    Run action(), repeating it for as long as Fedora rejects it with a
    "currently being modified by another thread" error.

    @param action: zero-argument callable that performs the Fedora request
    @param lockWarning: message logged every time the object is found locked
    @param failureError: message logged when the request fails for any other
        FedoraConnectionException
    @return bool: True once action() succeeds, False after a non-lock failure
    '''
    while True:
        try:
            action()
            return True
        except FedoraConnectionException as fedoraEX:
            if 'is currently being modified by another thread' not in str(fedoraEX.body):
                logging.error(failureError)
                return False
            # NOTE(review): the retry is immediate and unbounded, exactly as
            # the hand-written loops it replaces - consider a delay/limit.
            logging.warning(lockWarning)


def addBookToFedora():
    '''
    Helper function that handles creating the book collection obj in fedora.

    Takes no arguments: the MODS file is located through the module-level
    name modsFilePath, and the new object's pid is published in the global
    bookPid. The MODS file on disk is rewritten with an extra
    <identifier type="pid"> element before it is uploaded, and a thumbnail
    '<label>_TN.jpg' is generated next to it from '0001_a_front_cover.tif'.

    @return bool: always True; FedoraConnectionException failures are logged
        instead of being reported to the caller
    '''
    global bookPid  # global for write to file and use
    bookPid = fedora.getNextPID(u'uofm')
    bookDir = os.path.dirname(modsFilePath)
    myLabel = unicode(os.path.basename(bookDir))
    obj = fedora.createObject(bookPid, label=myLabel)

    # add the book pid to modsFile
    parser = etree.XMLParser(remove_blank_text=True)
    xmlFile = etree.parse(modsFilePath, parser)
    modsElem = etree.Element("{http://www.loc.gov/mods/v3}identifier", type="pid")
    modsElem.text = bookPid
    xmlFile.getroot().append(modsElem)
    xmlFile.write(modsFilePath)

    # add mods datastream (inline XML, so the content itself is passed in)
    with open(modsFilePath) as modsFile:
        modsContents = modsFile.read()
    try:
        obj.addDataStream(u'MODS', unicode(modsContents), label=u'MODS',
                          mimeType=u'text/xml', controlGroup=u'X',
                          logMessage=u'Added basic mods meta data.')
        logging.info('Added MODS datastream to:' + bookPid)
    except FedoraConnectionException:
        logging.error('Error in adding MODS datastream to:' + bookPid + '\n')

    # add a TN datastream to the object after creating it from the book cover
    tnPath = os.path.join(bookDir, myLabel + '_TN.jpg')
    converter.tif_to_jpg(os.path.join(bookDir, '0001_a_front_cover.tif'),
                         tnPath, 'TN')
    # binary mode: the thumbnail is a JPEG; the with-block closes the handle
    with open(tnPath, 'rb') as tnFile:

        def addThumbnail():
            # a failed earlier attempt may have consumed part of the file
            tnFile.seek(0)
            obj.addDataStream(u'TN', u'aTmpStr', label=u'TN',
                              mimeType=u'image/jpeg', controlGroup=u'M',
                              logMessage=u'Added a jpeg thumbnail.')
            logging.info('Added TN datastream to:' + bookPid)
            obj['TN'].setContent(tnFile)

        _retryWhileFedoraLocked(
            addThumbnail,
            'Trouble (thread lock) adding TN datastream to: ' + bookPid + ' retrying.',
            'Error in adding TN datastream to:' + bookPid + '\n')

    # configure rels ext
    objRelsExt = fedora_relationships.rels_ext(
        obj,
        fedora_relationships.rels_namespace(
            'fedora-model', 'info:fedora/fedora-system:def/model#'))
    objRelsExt.addRelationship('isMemberOf', 'islandora:top')
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
        'archiveorg:bookCModel')
    # trying to handle a bug/feature of locking fedora items
    _retryWhileFedoraLocked(
        objRelsExt.update,
        'Trouble (thread lock) updating obj RELS-EXT: ' + bookPid + ' retrying.',
        'Error updating obj RELS-EXT: ' + bookPid)

    # index the book in solr
    sendSolr()
    return True
def _bookPageNumberFor(tiffName, leafNumber, tiffDir):
    '''
    Map a page tiff's file name onto its page number within the book.

    @param tiffName: base name of the tiff file
    @param leafNumber: integer parsed from the part of the name before the
        first '_'
    @param tiffDir: directory that is listed to count tiffs for a back cover
    @return: the page number, or None when the name matches no known pattern
    '''
    if tiffName.count('front_cover') == 1:
        return 1
    if tiffName.count('inner_cover') == 1:
        return 2
    if tiffName.count('inner_leaf') == 1:
        return 3
    if tiffName.count('back_cover') == 1:
        # the back cover is numbered with the count of tiffs in the directory
        return len([name for name in os.listdir(tiffDir)
                    if name.endswith(('.tif', '.tiff'))])
    # standard a [left side]; leaf 1 gives 4, which is why no special case
    # is needed for it
    if tiffName.count('a') == 1:
        return leafNumber * 2 + 2
    # standard b [right side]; leaf 1 gives 5
    if tiffName.count('b') == 1:
        return leafNumber * 2 + 3
    return None


def _addPageDatastream(obj, dsId, mimeType, logMessage, content,
                       successInfo, failureError):
    '''
    Create one managed ('M') datastream on obj and upload content into it.

    @param obj: fedora object receiving the datastream
    @param dsId: datastream id, also used as its label
    @param mimeType: unicode mime type of the content
    @param logMessage: unicode audit message passed to fedora
    @param content: open file handle handed to setContent
    @param successInfo: message logged once the datastream has been created
    @param failureError: message logged (with traceback) on a
        FedoraConnectionException
    '''
    try:
        # the datastream is created with throw-away content; the real payload
        # is uploaded by setContent right after
        obj.addDataStream(unicode(dsId), u'smelly', label=unicode(dsId),
                          mimeType=mimeType, controlGroup=u'M',
                          logMessage=logMessage)
        logging.info(successInfo)
        obj[dsId].setContent(content)
    except FedoraConnectionException:
        logging.exception(failureError)


def addBookPageToFedora(inputTiff, tmpDir):
    '''
    Helper function that handles adding and configuring a fedora object for a
    book page based on the input image.

    Runs the jp2/OCR/exif conversions into tmpDir, creates the page object,
    uploads the TIFF, JP2, PDF, OCR and EXIF datastreams, relates the page to
    the book identified by the global bookPid, and finally uploads every other
    file in the tiff's directory that shares the tiff's base name as an extra
    ("dynamic") datastream named after its extension. The page's pdf path is
    recorded in the global pagesDict under the page number.

    @param inputTiff: the archival data source
    @param tmpDir: file directory where non-archival stuff gets put
    @return bool: True once the page has been processed (fedora connection
        errors on individual datastreams are only logged), False when the
        tiff file name cannot be interpreted
    '''
    global pagesDict  # this is used for creating the book pdf later

    # run conversions
    converter.tif_to_jp2(inputTiff, tmpDir, 'default', 'default')
    converter.tif_OCR(inputTiff, tmpDir, {'PDF': 'default', 'Text': 'default'})

    # determine page number: used for naming
    fullTiffDir = os.path.dirname(inputTiff)
    tifDir = os.path.basename(fullTiffDir)
    tiffName = os.path.basename(inputTiff)
    try:
        leafNumber = int(tiffName[:tiffName.index('_')])
    except ValueError:
        # no '_' in the name, or a non-numeric prefix in front of it
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' has no numeric page prefix\n')
        return False
    pageNumber = _bookPageNumberFor(tiffName, leafNumber, fullTiffDir)
    if pageNumber is None:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' giving fileNumber: ' + str(leafNumber) + '\n')
        return False

    # reject unknown extensions before a fedora object is created for them
    tiffNameNoExt, tifExt = os.path.splitext(tiffName)
    if tifExt not in ('.tif', '.tiff'):
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' does not end in .tif or .tiff\n')
        return False

    logging.info('Working on ingest of page: ' + str(pageNumber) +
                 ' with source file: ' + inputTiff)

    # create the fedora book page object
    pagePid = fedora.getNextPID(u'uofm')
    myLabel = unicode(tifDir + '_Page' + str(pageNumber))
    obj = fedora.createObject(pagePid, label=myLabel)

    baseInUrl = os.path.join(fullTiffDir, tiffNameNoExt)
    baseOutUrl = os.path.join(tmpDir, tiffNameNoExt)

    openHandles = []

    def openForIngest(path):
        # binary mode keeps image/pdf payloads intact on every platform
        handle = open(path, 'rb')
        openHandles.append(handle)
        return handle

    try:
        # every source is opened up front so that a missing file aborts the
        # page before any datastream has been added
        tiffUrl = openForIngest(baseInUrl + tifExt)
        jp2Url = openForIngest(baseOutUrl + '.jp2')
        pdfUrl = openForIngest(baseOutUrl + '.pdf')
        ocrUrl = openForIngest(baseOutUrl + '.txt')
        # this gets the metadata for the page from the tif
        exifPath = baseOutUrl + '.xml'
        converter.exif_to_xml(inputTiff, exifPath)
        exifUrl = openForIngest(exifPath)

        pagesDict[pageNumber] = baseOutUrl + '.pdf'

        pageStreams = (
            ('TIFF', u'image/tiff', u'Added the archival tiff file.', tiffUrl,
             'Error in adding TIFF datastream to:'),
            ('JP2', u'image/jp2', u'Added jp2 image file.', jp2Url,
             'Error in adding JP2 datastream to:'),
            ('PDF', u'application/pdf', u'Added pdf with OCR.', pdfUrl,
             'Error in adding PDF datastream to:'),
            ('OCR', u'text/plain', u'Added basic text of OCR.', ocrUrl,
             'Error in adding OCR Datastream to:'),
            ('EXIF', u'text/xml', u'Added the archival EXIF file.', exifUrl,
             'Error in adding EXIF datastream to:'),
        )
        for dsId, mimeType, logMessage, content, errorPrefix in pageStreams:
            _addPageDatastream(obj, dsId, mimeType, logMessage, content,
                               'Added ' + dsId + ' datastream to:' + pagePid,
                               errorPrefix + pagePid + '\n')

        objRelsExt = fedora_relationships.rels_ext(obj, [
            fedora_relationships.rels_namespace(
                'pageNS', 'info:islandora/islandora-system:def/pageinfo#'),
            fedora_relationships.rels_namespace(
                'fedora-model', 'info:fedora/fedora-system:def/model#'),
        ])
        objRelsExt.addRelationship('isMemberOf', bookPid)
        objRelsExt.addRelationship(
            fedora_relationships.rels_predicate('pageNS', 'isPageNumber'),
            fedora_relationships.rels_object(
                str(pageNumber), fedora_relationships.rels_object.LITERAL))
        objRelsExt.addRelationship(
            fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
            'archiveorg:pageCModel')
        objRelsExt.update()

        # Dynamic Datastreams
        # grab all files that share a name with the tiff and do not use the
        # already used extensions; name and extension split at the FIRST dot
        usedExtensions = ('tif', 'tiff', 'pdf', 'jp2', 'txt', 'xml')
        for dynamicDSFile in os.listdir(fullTiffDir):
            stem, dot, dynamicDSFileEXT = dynamicDSFile.partition('.')
            # names without any dot have no extension to name a datastream by
            if not dot or stem != tiffNameNoExt or dynamicDSFileEXT in usedExtensions:
                continue
            dynamicDSFileMimeType = misc.getMimeType(dynamicDSFileEXT)
            with open(os.path.join(fullTiffDir, dynamicDSFile), 'rb') as dynamicDSFileHandle:
                _addPageDatastream(
                    obj, dynamicDSFileEXT, unicode(dynamicDSFileMimeType),
                    unicode('Added the datastream:' + dynamicDSFileEXT),
                    dynamicDSFileHandle,
                    'Added the datastream: ' + dynamicDSFileEXT + ' to: ' + pagePid,
                    'Error in adding ' + dynamicDSFileEXT + ' datastream to:' + pagePid + '\n')
    finally:
        for handle in openHandles:
            handle.close()

    return True
def _bookPageNumberFor(tiffName, leafNumber, tiffDir):
    '''
    Map a page tiff's file name onto its page number within the book.

    @param tiffName: base name of the tiff file
    @param leafNumber: integer parsed from the part of the name before the
        first '_'
    @param tiffDir: directory that is listed to count tiffs for a back cover
    @return: the page number, or None when the name matches no known pattern
    '''
    if tiffName.count('front_cover') == 1:
        return 1
    if tiffName.count('inner_cover') == 1:
        return 2
    if tiffName.count('inner_leaf') == 1:
        return 3
    if tiffName.count('back_cover') == 1:
        # the back cover is numbered with the count of tiffs in the directory
        return len([name for name in os.listdir(tiffDir)
                    if name.endswith(('.tif', '.tiff'))])
    # standard a [left side]; leaf 1 gives 4, which is why no special case
    # is needed for it
    if tiffName.count('a') == 1:
        return leafNumber * 2 + 2
    # standard b [right side]; leaf 1 gives 5
    if tiffName.count('b') == 1:
        return leafNumber * 2 + 3
    return None


def _addPageDatastream(obj, dsId, mimeType, logMessage, content,
                       successInfo, failureError):
    '''
    Create one managed ('M') datastream on obj and upload content into it.

    @param obj: fedora object receiving the datastream
    @param dsId: datastream id, also used as its label
    @param mimeType: unicode mime type of the content
    @param logMessage: unicode audit message passed to fedora
    @param content: open file handle handed to setContent
    @param successInfo: message logged once the datastream has been created
    @param failureError: message logged (with traceback) on a
        FedoraConnectionException
    '''
    try:
        # the datastream is created with throw-away content; the real payload
        # is uploaded by setContent right after
        obj.addDataStream(unicode(dsId), u'smelly', label=unicode(dsId),
                          mimeType=mimeType, controlGroup=u'M',
                          logMessage=logMessage)
        logging.info(successInfo)
        obj[dsId].setContent(content)
    except FedoraConnectionException:
        logging.exception(failureError)


def addBookPageToFedora(inputTiff, tmpDir):
    '''
    Helper function that handles adding and configuring a fedora object for a
    book page based on the input image.

    Runs the jp2/OCR/exif conversions into tmpDir, creates the page object,
    uploads the TIFF, JP2, PDF, OCR and EXIF datastreams, relates the page to
    the book identified by the global bookPid, and finally uploads every other
    file in the tiff's directory that shares the tiff's base name as an extra
    ("dynamic") datastream named after its extension. The page's pdf path is
    recorded in the global pagesDict under the page number.

    @param inputTiff: the archival data source
    @param tmpDir: file directory where non-archival stuff gets put
    @return bool: True once the page has been processed (fedora connection
        errors on individual datastreams are only logged), False when the
        tiff file name cannot be interpreted
    '''
    global pagesDict  # this is used for creating the book pdf later

    # run conversions
    converter.tif_to_jp2(inputTiff, tmpDir, 'default', 'default')
    converter.tif_OCR(inputTiff, tmpDir, {'PDF': 'default', 'Text': 'default'})

    # determine page number: used for naming
    fullTiffDir = os.path.dirname(inputTiff)
    tifDir = os.path.basename(fullTiffDir)
    tiffName = os.path.basename(inputTiff)
    try:
        leafNumber = int(tiffName[:tiffName.index('_')])
    except ValueError:
        # no '_' in the name, or a non-numeric prefix in front of it
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' has no numeric page prefix\n')
        return False
    pageNumber = _bookPageNumberFor(tiffName, leafNumber, fullTiffDir)
    if pageNumber is None:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' giving fileNumber: ' + str(leafNumber) + '\n')
        return False

    # reject unknown extensions before a fedora object is created for them
    tiffNameNoExt, tifExt = os.path.splitext(tiffName)
    if tifExt not in ('.tif', '.tiff'):
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' does not end in .tif or .tiff\n')
        return False

    logging.info('Working on ingest of page: ' + str(pageNumber) +
                 ' with source file: ' + inputTiff)

    # create the fedora book page object
    pagePid = fedora.getNextPID(u'uofm')
    myLabel = unicode(tifDir + '_Page' + str(pageNumber))
    obj = fedora.createObject(pagePid, label=myLabel)

    baseInUrl = os.path.join(fullTiffDir, tiffNameNoExt)
    baseOutUrl = os.path.join(tmpDir, tiffNameNoExt)

    openHandles = []

    def openForIngest(path):
        # binary mode keeps image/pdf payloads intact on every platform
        handle = open(path, 'rb')
        openHandles.append(handle)
        return handle

    try:
        # every source is opened up front so that a missing file aborts the
        # page before any datastream has been added
        tiffUrl = openForIngest(baseInUrl + tifExt)
        jp2Url = openForIngest(baseOutUrl + '.jp2')
        pdfUrl = openForIngest(baseOutUrl + '.pdf')
        ocrUrl = openForIngest(baseOutUrl + '.txt')
        # this gets the metadata for the page from the tif
        exifPath = baseOutUrl + '.xml'
        converter.exif_to_xml(inputTiff, exifPath)
        exifUrl = openForIngest(exifPath)

        pagesDict[pageNumber] = baseOutUrl + '.pdf'

        pageStreams = (
            ('TIFF', u'image/tiff', u'Added the archival tiff file.', tiffUrl,
             'Error in adding TIFF datastream to:'),
            ('JP2', u'image/jp2', u'Added jp2 image file.', jp2Url,
             'Error in adding JP2 datastream to:'),
            ('PDF', u'application/pdf', u'Added pdf with OCR.', pdfUrl,
             'Error in adding PDF datastream to:'),
            ('OCR', u'text/plain', u'Added basic text of OCR.', ocrUrl,
             'Error in adding OCR Datastream to:'),
            ('EXIF', u'text/xml', u'Added the archival EXIF file.', exifUrl,
             'Error in adding EXIF datastream to:'),
        )
        for dsId, mimeType, logMessage, content, errorPrefix in pageStreams:
            _addPageDatastream(obj, dsId, mimeType, logMessage, content,
                               'Added ' + dsId + ' datastream to:' + pagePid,
                               errorPrefix + pagePid + '\n')

        objRelsExt = fedora_relationships.rels_ext(obj, [
            fedora_relationships.rels_namespace(
                'pageNS', 'info:islandora/islandora-system:def/pageinfo#'),
            fedora_relationships.rels_namespace(
                'fedora-model', 'info:fedora/fedora-system:def/model#'),
        ])
        objRelsExt.addRelationship('isMemberOf', bookPid)
        objRelsExt.addRelationship(
            fedora_relationships.rels_predicate('pageNS', 'isPageNumber'),
            fedora_relationships.rels_object(
                str(pageNumber), fedora_relationships.rels_object.LITERAL))
        objRelsExt.addRelationship(
            fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
            'archiveorg:pageCModel')
        objRelsExt.update()

        # Dynamic Datastreams
        # grab all files that share a name with the tiff and do not use the
        # already used extensions; name and extension split at the FIRST dot
        usedExtensions = ('tif', 'tiff', 'pdf', 'jp2', 'txt', 'xml')
        for dynamicDSFile in os.listdir(fullTiffDir):
            stem, dot, dynamicDSFileEXT = dynamicDSFile.partition('.')
            # names without any dot have no extension to name a datastream by
            if not dot or stem != tiffNameNoExt or dynamicDSFileEXT in usedExtensions:
                continue
            dynamicDSFileMimeType = misc.getMimeType(dynamicDSFileEXT)
            with open(os.path.join(fullTiffDir, dynamicDSFile), 'rb') as dynamicDSFileHandle:
                _addPageDatastream(
                    obj, dynamicDSFileEXT, unicode(dynamicDSFileMimeType),
                    unicode('Added the datastream:' + dynamicDSFileEXT),
                    dynamicDSFileHandle,
                    'Added the datastream: ' + dynamicDSFileEXT + ' to: ' + pagePid,
                    'Error in adding ' + dynamicDSFileEXT + ' datastream to:' + pagePid + '\n')
    finally:
        for handle in openHandles:
            handle.close()

    return True