Ejemplo n.º 1
0
def getPDFIntrospection(aPDF):
    """Return the text of the first page of a PDF, whitespace-normalised.

    Args:
        aPDF: Path of the PDF file to read.

    Returns:
        The text extracted from page 0 with non-breaking spaces turned into
        regular spaces and every run of whitespace collapsed to a single
        space (no leading/trailing whitespace).

    Raises:
        Whatever ``open`` or the PDF reader raise for an unreadable file;
        errors are propagated to the caller unchanged.
    """
    # The context manager guarantees the handle is released even when the
    # reader raises; extraction must happen while the file is still open.
    with open(aPDF, 'rb') as pdf:
        reader = PdfFileReader(pdf)
        if reader.isEncrypted:
            # Try the empty user password so that PDFs which are only
            # permission-restricted can still be read.
            reader._override_encryption = True
            reader.decrypt('')
        raw_text = reader.getPage(0).extractText()
    # split() without arguments already drops leading/trailing whitespace,
    # so no separate strip() is needed before re-joining.
    return " ".join(raw_text.replace(u"\xa0", " ").split())
Ejemplo n.º 2
0
def getPubDate(aPDF):
    """Return the creation year recorded in a PDF's document information.

    The value stored under ``/CreationDate`` is sliced at characters 2-5,
    i.e. the four characters that follow a two-character prefix such as
    ``D:``.

    Args:
        aPDF: Path of the PDF file to inspect.

    Returns:
        The four-character slice of the creation date, or ``"No_Year"`` when
        the document has no information dictionary, no ``/CreationDate``
        entry, an entry that is not a text string, or a blank value.

    Raises:
        Whatever ``open`` or the PDF reader raise for an unreadable file;
        errors are propagated to the caller unchanged.
    """
    # Function-scope import so the block does not depend on the module's
    # import section exposing this name.
    from PyPDF2.generic import TextStringObject

    # The context manager closes the handle even if the reader raises.
    with open(aPDF, "rb") as pdf:
        reader = PdfFileReader(pdf)
        if reader.isEncrypted:
            # Try the empty user password so that PDFs which are only
            # permission-restricted can still be read.
            reader._override_encryption = True
            reader.decrypt('')
        pdf_info = reader.getDocumentInfo()
        # getDocumentInfo() yields None for a PDF without an info dictionary.
        if pdf_info is not None:
            for key, val in pdf_info.items():
                if key != '/CreationDate':
                    continue
                # Only text strings can be sliced into a year; other value
                # types (e.g. undecodable byte strings) are treated as missing.
                if isinstance(val, TextStringObject):
                    year = val[2:6]
                    if year.strip():
                        return year
                break
    return "No_Year"