Ejemplo n.º 1
0
def createBookPDF(bookPath):
    '''
    Build the pdf of an entire book and ingest it into fedora as the 'PDF' datastream.

    The combined pdf lives at <bookPath>/<basename of bookPath>.pdf. Every page
    pdf listed in pagesDict (keys 1..len(pagesDict)) is handed, in page order,
    to fileManipulator.appendPDFwithPDF together with that path; the resulting
    file is then set as the content of the 'PDF' datastream of the book object.

    NOTE(review): pagesDict and bookPid are read by this function but are not
    parameters of it -- they have to be resolvable from the enclosing (module)
    scope, otherwise the call fails with NameError. Confirm against the callers.

    @param bookPath: directory of the book; the combined pdf is written inside it
    @return bool: True if the datastream was added, False if a
                  FedoraConnectionException was raised while adding it
    '''
    # the combined pdf is named after the book directory itself
    bookPath = os.path.join(bookPath, os.path.basename(bookPath) + '.pdf')

    # page numbers are 1-based keys of pagesDict
    for pageNum in range(1, len(pagesDict) + 1):
        fileManipulator.appendPDFwithPDF(bookPath, pagesDict[pageNum])

    # create and add pdf datastream
    obj = fedora.getObject(bookPid)
    # the datastream is created with throw-away content first; the real pdf
    # is uploaded afterwards through setContent
    placeholder = 'smelly'
    # 'with' guarantees the pdf file handle is closed on success and on failure
    with open(bookPath, 'rb') as bookFile:
        try:
            obj.addDataStream(u'PDF',
                              placeholder,
                              label=u'PDF',
                              mimeType=u'application/pdf',
                              controlGroup=u'M',
                              logMessage=u'Added pdf with OCR.')
            logging.info('Added PDF datastream to:' + bookPid)
            ds = obj['PDF']
            ds.setContent(bookFile)
        except FedoraConnectionException:
            logging.exception('Error in adding PDF datastream to:' + bookPid +
                              '\n')
            return False
    return True
def createBookPDF(bookPath):
    '''
    Build the pdf of an entire book and ingest it into fedora as the 'PDF' datastream.

    The combined pdf lives at <bookPath>/<basename of bookPath>.pdf. Every page
    pdf listed in pagesDict (keys 1..len(pagesDict)) is handed, in page order,
    to fileManipulator.appendPDFwithPDF together with that path; the resulting
    file is then set as the content of the 'PDF' datastream of the book object.

    NOTE(review): pagesDict and bookPid are read by this function but are not
    parameters of it -- they have to be resolvable from the enclosing (module)
    scope, otherwise the call fails with NameError. Confirm against the callers.

    @param bookPath: directory of the book; the combined pdf is written inside it
    @return bool: True if the datastream was added, False if a
                  FedoraConnectionException was raised while adding it
    '''
    # the combined pdf is named after the book directory itself
    bookPath = os.path.join(bookPath, os.path.basename(bookPath) + '.pdf')

    # page numbers are 1-based keys of pagesDict
    for pageNum in range(1, len(pagesDict) + 1):
        fileManipulator.appendPDFwithPDF(bookPath, pagesDict[pageNum])

    # create and add pdf datastream
    obj = fedora.getObject(bookPid)
    # the datastream is created with throw-away content first; the real pdf
    # is uploaded afterwards through setContent
    placeholder = 'smelly'
    # 'with' guarantees the pdf file handle is closed on success and on failure
    with open(bookPath, 'rb') as bookFile:
        try:
            obj.addDataStream(u'PDF', placeholder, label=u'PDF',
                              mimeType=u'application/pdf', controlGroup=u'M',
                              logMessage=u'Added pdf with OCR.')
            logging.info('Added PDF datastream to:' + bookPid)
            ds = obj['PDF']
            ds.setContent(bookFile)
        except FedoraConnectionException:
            logging.exception('Error in adding PDF datastream to:' + bookPid + '\n')
            return False
    return True
            os.remove(jp2File) # finished with that

            # create DC, MODS, VRA datastreams
            for dsid in ['DC', 'MODS', 'VRA']:
                dsfile = os.path.join(bookFolder, "%s.%s.xml" % (os.path.splitext(page)[0], dsid.lower()))
                dspage = os.path.basename(dsfile)
                fedoraLib.update_datastream(obj, unicode(dsid), dsfile, label=unicode(dspage), mimeType=misc.getMimeType("xml"), controlGroup='X')

            pdfFile = os.path.join(config.tempDir, "%s.pdf" % basePage)
            converter.tif_to_pdf(tifFile, pdfFile, 'default')
            #fedoraLib.update_datastream(obj, u'PDF', pdfFile, label=unicode("%s.pdf" % basePage), mimeType=misc.getMimeType("pdf"))
            # for the first page, rename its PDF so that it becomes the full PDF
            if idx == 0:
                os.rename(pdfFile, fullPDF)
            # for every other page (>1), append it to fullPDF and delete the original
            else:
                manipulator.appendPDFwithPDF(fullPDF, pdfFile)
                os.remove(pdfFile)

        sys.stdout.flush()
        sys.stderr.flush()

    # ingest the full PDF on the master book object
    # and delete it
    if not config.dryrun:
        print("Ingesting full PDF document")
        fedoraLib.update_datastream(bookObj, u"PDF", fullPDF, label=os.path.basename(fullPDF), mimeType=misc.getMimeType("pdf"))
        os.remove(fullPDF)

    return True