searchText = pdfText.lower() for folder,strings in self.filer.folder_targets.items(): for s in strings: logging.debug("Checking string %s" % s) if s in searchText: logging.info("Matched keyword '%s'" % s) return folder # No match found, so return return None def file_original (self, original_filename): return self.filer.file_original(original_filename) def move_to_matching_folder(self, filename): pdftotext = subprocess.Popen(['pdftotext', filename, '-'], stdout=subprocess.PIPE) stdout,stderr = pdftotext.communicate() text = stdout.replace('\n', ' ') tgt_folder = self._get_matching_folder(text) if not tgt_folder and self.file_using_filename: tgt_folder = self._get_matching_folder(filename) tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) return tgt_file if __name__ == '__main__': p = PyPdfFiler(PyFilerDirs()) for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): print (page_text)
def _setup_filing(self):
    """
        Instance the proper PyFiler object (either
        :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or
        :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`)

        Reads ``self.config``, normalises the configured folders to absolute
        paths, creates the filer objects and registers every configured
        folder together with its (lower-cased) keywords.

        TODO: Make this more generic to allow third-party plugin filing objects

        :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated
        :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading
        :returns: Nothing
    """
    # --------------------------------------------------
    # Some sanity checks
    # --------------------------------------------------
    # Internal precondition: only called when filing is enabled and a
    # config has been loaded.
    assert (self.config and self.enable_filing)

    for required in ['target_folder', 'default_folder']:
        if required not in self.config:
            error("%s must be specified in config file" % required)
        else:
            # Make sure these required folders are in abspath format
            self.config[required] = os.path.abspath(self.config[required])

    original_move_folder = None
    if 'original_move_folder' in self.config:
        # User wants to move the original after filing
        orig = 'original_move_folder'
        self.config[orig] = os.path.abspath(self.config[orig])
        if not os.path.exists(self.config[orig]):
            os.makedirs(self.config[orig])
        original_move_folder = self.config[orig]

    # --------------------------------------------------
    # Start the filing object
    # --------------------------------------------------
    if self.enable_evernote:
        self.filer = PyFilerEvernote(self.config['evernote_developer_token'])
    else:
        self.filer = PyFilerDirs()

    self.filer.target_folder = self.config['target_folder']
    self.filer.default_folder = self.config['default_folder']
    self.filer.original_move_folder = original_move_folder

    self.pdf_filer = PyPdfFiler(self.filer)
    if self.match_using_filename:
        print("Matching using filename as a fallback to pdf contents")
        self.pdf_filer.file_using_filename = True

    # ------------------------------
    # Add all the folder names with associated keywords
    # to the filer object
    # ------------------------------
    keyword_count = 0
    folder_count = 0
    if 'folders' in self.config:
        # A 'folders' key with no value (None) is treated as "no folders"
        # instead of crashing on .items().
        folders = self.config['folders'] or {}
        for folder, keywords in folders.items():
            if keywords is None:
                # Folder listed without any keywords
                keywords = []
            elif isinstance(keywords, (str, int, float)):
                # A lone scalar is one keyword; iterating a bare string
                # would otherwise register every single character.
                keywords = [keywords]

            folder_count += 1
            keyword_count += len(keywords)
            # Make sure keywords are lower-cased before adding
            keywords = [str(x).lower() for x in keywords]
            self.filer.add_folder_target(folder, keywords)

    print("Filing of PDFs is enabled")
    print(" - %d target filing folders" % (folder_count))
    print(" - %d keywords" % (keyword_count))