def getPDFIntrospection(aPDF):
    """Return the whitespace-normalized text of the first page of a PDF.

    The file at ``aPDF`` is opened in binary mode and parsed with
    ``PdfFileReader``. If the document reports itself as encrypted, a
    decryption with the empty password is attempted before reading.

    Args:
        aPDF: Path of the PDF file to read.

    Returns:
        The text extracted from page 0, with non-breaking spaces replaced
        by ordinary spaces and every run of whitespace collapsed to a
        single space (no leading or trailing whitespace).

    Raises:
        Whatever ``open``, ``PdfFileReader``, ``decrypt``, ``getPage`` or
        ``extractText`` raise; errors are not handled here.
    """
    # The context manager guarantees the handle is released even when
    # parsing or text extraction fails part-way through.
    with open(aPDF, 'rb') as pdf:
        reader = PdfFileReader(pdf)
        if reader.isEncrypted:
            reader._override_encryption = True
            reader.decrypt('')
        first_page_text = reader.getPage(0).extractText()

    # split() with no arguments drops leading/trailing whitespace and
    # collapses internal runs, so no explicit strip() is needed.
    return " ".join(first_page_text.replace(u"\xa0", " ").split())
def getPubDate(aPDF):
    """Return the year taken from a PDF's ``/CreationDate`` metadata entry.

    The file at ``aPDF`` is opened in binary mode and parsed with
    ``PdfFileReader``. If the document reports itself as encrypted, a
    decryption with the empty password is attempted before the document
    information is read.

    Args:
        aPDF: Path of the PDF file to inspect.

    Returns:
        Characters 2 through 5 of the ``/CreationDate`` value (the year,
        assuming the usual ``D:YYYYMMDD...`` PDF date layout), or the
        string ``"No_Year"`` when the document has no information
        dictionary, no ``/CreationDate`` entry, an entry that is not a
        PyPDF2 ``TextStringObject``, or an entry too short to hold a year.

    Raises:
        Whatever ``open``, ``PdfFileReader`` or ``decrypt`` raise; errors
        are not handled here.
    """
    no_year = "No_Year"

    # Keep the file open only while the reader needs it, and make sure the
    # handle is closed even if parsing fails.
    with open(aPDF, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        if reader.isEncrypted:
            reader._override_encryption = True
            reader.decrypt('')
        pdf_info = reader.getDocumentInfo()
        # getDocumentInfo() yields None for documents without metadata.
        creation = None if pdf_info is None else pdf_info.get('/CreationDate')

    if creation is None:
        return no_year

    # Accept only PyPDF2 text strings. Comparing the class name and module
    # prefix avoids slicing str(type(...)) and still matches when the class
    # lives in a PyPDF2.generic submodule.
    value_type = type(creation)
    is_text_string = (
        value_type.__name__ == "TextStringObject"
        and value_type.__module__.startswith("PyPDF2.generic")
    )
    if not is_text_string:
        return no_year

    # Skip the two leading characters (presumably the "D:" prefix) and keep
    # the next four.
    year = creation[2:6]
    if not year.strip():
        # Blank or truncated date: report it like a missing one.
        return no_year
    return year