def getPdfMetadata(self, path=None):
    """Populate ``self.book`` with metadata read from the PDF at *path*.

    Extraction is best-effort: a failure while reading one field is logged
    and the remaining fields are still processed.  The fields written on
    ``self.book`` are ``numberOfPages``, ``tag``, ``bookName``,
    ``publisher``, ``publishedOn``, ``createdOn``, ``fileSize`` and
    ``authors``.

    Args:
        path: Location of the PDF file.  When falsy, nothing is read and
            ``self.book`` is left untouched.

    Returns:
        None.  All results are stored on ``self.book``.

    Raises:
        OSError: If the size of *path* cannot be determined (for example
            because the file does not exist); this lookup is deliberately
            not guarded, matching the previous behaviour.
    """
    logger.debug('getPdfMetadata path: %s', path)
    if not path:
        return

    author_name = 'Unknown'
    try:
        # The reader keeps using the stream after construction, so every
        # PDF access happens inside the ``with`` block.  The file is opened
        # once and is always closed afterwards.
        with open(path, "rb") as pdf_file:
            pdf_toread = PdfFileReader(pdf_file)
            encrypted = pdf_toread.getIsEncrypted()
            logger.debug('getIsEncrypted : %s ', encrypted)
            if encrypted:
                try:
                    # Try the empty password; failure is logged, not fatal.
                    pdf_toread.decrypt('')
                except Exception as e:
                    logger.exception(e)

            pdf_info = None
            try:
                pdf_info = pdf_toread.getDocumentInfo()
                page_count = pdf_toread.getNumPages()
                logger.debug('NumPages:%s', page_count)
                self.book.numberOfPages = page_count
            except Exception as e:
                logger.exception(e)

            if pdf_info is not None:
                try:
                    subject = pdf_info.subject
                    # isinstance() rather than an exact type comparison so
                    # that str subclasses returned by the PDF library are
                    # accepted as well.
                    if subject and isinstance(subject, str):
                        # The subject is appended to an existing tag
                        # without any separator (unchanged behaviour).
                        self.book.tag = (self.book.tag or '') + str(subject)
                except Exception as e:
                    logger.exception(e)

                try:
                    title = pdf_info.title
                    if title is not None and title.strip() != '':
                        self.book.bookName = str(title)
                except Exception as e:
                    logger.exception(e)

                try:
                    creator = pdf_info.creator
                    if creator:
                        # Store the text itself; str() of encoded bytes
                        # would yield the "b'...'" representation.
                        self.book.publisher = str(creator)
                except Exception as e:
                    logger.exception(e)

                try:
                    pdf_author = pdf_info.author
                    if pdf_author is not None and pdf_author.strip() != '':
                        author_name = str(pdf_author)
                except Exception as e:
                    logger.exception(e)

            # Runs even without an info dictionary so that a missing
            # creation date is still reported in the log.
            try:
                # Characters 2..9 are taken as YYYYMMDD, i.e. the value is
                # expected to start with a two-character prefix such as
                # "D:".
                creation_date = str(pdf_info['/CreationDate'])[2:10]
                self.book.publishedOn = datetime.strptime(
                    creation_date, '%Y%m%d')
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.error('CreationDate not found')
    except Exception as e:
        logger.exception(e)

    self.book.createdOn = datetime.now()

    file_size = Util().convert_bytes(os.path.getsize(path))
    logger.debug(file_size)
    self.book.fileSize = file_size

    author = Author()
    author.authorName = author_name
    self.book.authors = [author]
def processFile(self, curr_file):
    """Read the metadata of one PDF and append a summary row to
    ``self.container``.

    The row has the form
    ``[" | " + file name, created, author, producer, modified, last_saved]``
    where every value that could not be determined is ``'-'``.  Files whose
    name does not contain ``".pdf"`` are ignored.  The method is interactive:
    it pauses on ``raw_input`` prompts and prints progress to stdout (it
    targets Python 2, where ``raw_input`` exists).

    Args:
        curr_file: Path of the file to inspect.

    Returns:
        None.  On success a row is appended to ``self.container``; parsing
        errors are reported on stdout and no row is appended.
    """
    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    if ".pdf" in curr_file:
        try:
            raw_input("Processing " + str(curr_file) + ".\nPress Enter to continue...")
            # The reader needs the stream while metadata is accessed, so
            # all PDF work happens inside the ``with`` block, which also
            # guarantees the handle is closed.
            with open(curr_file, 'rb') as pdf_stream:
                pdfFile = PdfFileReader(pdf_stream)
                if pdfFile.getIsEncrypted():
                    print("File is encrypted (maybe, this sometimes has false positives). Trying to decrypt.")
                    pdfFile.decrypt('')
                    print("Success! File decrypted.")
                docInfo = pdfFile.getDocumentInfo()
                if not docInfo:
                    return

                print(" The RAW document information")
                for section in docInfo:
                    print(docInfo[section])
                raw_input("Press Enter to continue...")

                if "/CreationDate" in docInfo:
                    print(" Processing CREATION DATE information")
                    print(" Creation Date RAW: " + docInfo["/CreationDate"])
                    # Drop the leading "D:" marker and trailing quote.
                    data = docInfo["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    print(" The value of 'created_time' is: " + str(created_time))
                    print(" The data type is: " + str(type(created_time)))
                    print(" Expecting H:M format")
                    raw_input(" Does it match?")
                    # Convert 24-hour HH:MM into 12-hour "HH:MM AM/PM".
                    created_time = time.strftime(
                        "%I:%M %p", time.strptime(created_time, "%H:%M"))
                    created = date + "/" + year + " " + created_time

                if "/Author" in docInfo:
                    print(" Processing AUTHOR information")
                    author = docInfo["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"

                if "/Producer" in docInfo:
                    print(" Processing PRODUCER information")
                    # Remove the literal "(Windows)" marker.  str.strip()
                    # would treat its argument as a set of characters and
                    # eat letters from unrelated producer names.
                    producer = docInfo["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # Collapse runs of spaces in a single pass.
                    producer = re.sub(r' {2,}', ' ', producer)

                if "/ModDate" in docInfo:
                    print(" Processing MODIFIED DATE information")
                    data = docInfo["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime(
                        "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                    modded = date + "/" + year + " " + modded_time

            # Keep only the file name when a directory is included.
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")

            # Shorten over-long values for display.
            if len(curr_file) > 15:
                curr_file = curr_file[:15] + "..." + curr_file[-13:]
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            self.container.append(
                [" | " + curr_file, created, author, producer, modded, last_saved])
            print("Parsing Completed.")
        except NotImplementedError:
            print("Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually.")
        except Exception as err:
            print("Parsing failed somewhere in the TRY statement.")
            # Surface the cause instead of discarding it.
            print(" Reason: " + repr(err))
def getPdfMetadata(self, path=None):
    """Populate ``self.book`` with metadata read from the PDF at *path*.

    Extraction is best-effort: a failure while reading one field prints a
    traceback and the remaining fields are still processed.  Progress is
    printed to stdout.  The fields written on ``self.book`` are
    ``numberOfPages``, ``tag``, ``bookName``, ``publisher``,
    ``publishedOn``, ``createdOn``, ``fileSize`` and ``authors``.

    This variant targets Python 2: it relies on the ``unicode`` builtin and
    stores the author name as a UTF-8 encoded byte string.

    Args:
        path: Location of the PDF file.  When falsy, nothing is read and
            ``self.book`` is left untouched.

    Returns:
        None.  All results are stored on ``self.book``.

    Raises:
        OSError: If the size of *path* cannot be determined; this lookup is
            deliberately not guarded, matching the previous behaviour.
    """
    print(path)
    if not path:
        return

    author_name = 'Unknown'
    try:
        # The reader keeps using the stream after construction, so every
        # PDF access happens inside the ``with`` block.  The file is opened
        # once and is always closed afterwards.
        with open(path, "rb") as pdf_file:
            pdf_toread = PdfFileReader(pdf_file)
            encrypted = pdf_toread.getIsEncrypted()
            print('getPdfMetadata %s' % (encrypted,))
            if encrypted:
                try:
                    # Try the empty password; failure is reported, not fatal.
                    pdf_toread.decrypt('')
                except Exception:
                    traceback.print_exc()

            pdf_info = None
            try:
                pdf_info = pdf_toread.getDocumentInfo()
                page_count = pdf_toread.getNumPages()
                print('Pages: %s' % (page_count,))
                self.book.numberOfPages = page_count
            except Exception:
                traceback.print_exc()

            if pdf_info is not None:
                try:
                    subject = pdf_info.subject
                    # Skip a missing/empty subject; converting None would
                    # otherwise store the text "None" as the tag.
                    if subject:
                        if isinstance(subject, str):
                            # Byte string: decode, ignoring invalid UTF-8.
                            value = unicode(subject, "utf-8", errors="ignore")
                        else:
                            value = unicode(subject)
                        # The subject is appended to an existing tag
                        # without any separator (unchanged behaviour).
                        self.book.tag = (self.book.tag or '') + value
                except Exception:
                    traceback.print_exc()

                try:
                    title = pdf_info.title
                    if title is not None and title.strip() != '':
                        self.book.bookName = str(title)
                except Exception:
                    print('unable to set bookName')
                    traceback.print_exc()

                try:
                    creator = pdf_info.creator
                    if creator:
                        self.book.publisher = str(creator.encode('utf-8'))
                except Exception:
                    traceback.print_exc()

                try:
                    pdf_author = pdf_info.author
                    if pdf_author is not None and pdf_author.strip() != '':
                        author_name = pdf_author.encode("utf8", "ignore")
                except Exception:
                    traceback.print_exc()

            # Runs even without an info dictionary so that a missing
            # creation date is still reported.
            try:
                # Characters 2..9 are taken as YYYYMMDD, i.e. the value is
                # expected to start with a two-character prefix such as
                # "D:".
                creation_date = str(pdf_info['/CreationDate'])[2:10]
                print(creation_date)
                self.book.publishedOn = datetime.strptime(
                    creation_date, '%Y%m%d')
            except Exception:
                print('CreationDate not found')
    except Exception:
        traceback.print_exc()

    self.book.createdOn = datetime.now()

    print(path)
    file_size = Util().convert_bytes(os.path.getsize(path))
    print(file_size)
    self.book.fileSize = file_size

    author = Author()
    author.authorName = author_name
    self.book.authors = [author]