Exemple #1
0
        searchText = pdfText.lower()
        for folder,strings in self.filer.folder_targets.items():
            for s in strings:
                logging.debug("Checking string %s" % s)
                if s in searchText:
                    logging.info("Matched keyword '%s'" % s)
                    return folder
        # No match found, so return None
        return None

    def file_original(self, original_filename):
        """
            Delegate handling of the original file to ``self.filer``.

            :param original_filename: Passed through unchanged to the filer's
                                      ``file_original`` method
            :returns: Whatever ``self.filer.file_original`` returns
        """
        filer_result = self.filer.file_original(original_filename)
        return filer_result

    def move_to_matching_folder(self, filename):
        """
            Extract the text of a PDF with the external ``pdftotext`` program,
            look up a target folder for that text, and ask the filer to move
            the file there.

            If the PDF text yields no folder and ``self.file_using_filename``
            is set, the filename itself is searched as a fallback.

            :param filename: Path of the PDF to file
            :returns: Whatever ``self.filer.move_to_matching_folder`` returns
        """
        # The trailing '-' asks pdftotext to write the text to stdout
        pdftotext = subprocess.Popen(['pdftotext', filename, '-'], stdout=subprocess.PIPE)
        # stderr is not piped, so the second element is always None
        stdout, _ = pdftotext.communicate()
        if pdftotext.returncode != 0:
            # Not fatal: carry on with whatever output was captured
            logging.warning("pdftotext exited with status %s for %s" % (pdftotext.returncode, filename))

        # On Python 3 the pipe yields bytes, and bytes.replace() rejects str
        # arguments. On Python 2 the output is already str and is left as-is.
        # NOTE(review): assumes pdftotext emits UTF-8 (its documented default).
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf-8', 'replace')
        text = stdout.replace('\n', ' ')
        tgt_folder = self._get_matching_folder(text)

        if not tgt_folder and self.file_using_filename:
            tgt_folder = self._get_matching_folder(filename)

        tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
        return tgt_file
        
if __name__ == '__main__':
    # When run as a script: print the text of each page yielded for scan_ocr.pdf
    pdf_filer = PyPdfFiler(PyFilerDirs())
    page_texts = pdf_filer.iter_pdf_page_text("scan_ocr.pdf")
    for page_text in page_texts:
        print(page_text)

Exemple #2
0
    def _setup_filing(self):
        """
            Instance the proper PyFiler object (either
            :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or
            :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`),
            point it at the configured folders, and register every
            folder/keyword pair from the config with it.

            Reads ``self.config``, ``self.enable_filing``,
            ``self.enable_evernote`` and ``self.match_using_filename``.

            TODO: Make this more generic to allow third-party plugin filing objects

            :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated
            :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading
            :returns: Nothing

        """
        # --------------------------------------------------
        # Some sanity checks
        # --------------------------------------------------
        # Filing must be enabled and a config must be present before setup
        assert (self.config and self.enable_filing)
        for required in ['target_folder', 'default_folder']:
            if required not in self.config:
                error("%s must be specified in config file" % required)
            else:
                # Make sure these required folders are in abspath format
                self.config[required] = os.path.abspath(self.config[required])
        if 'original_move_folder' in self.config:
            # User wants to move the original after filing
            orig = 'original_move_folder'
            self.config[orig] = os.path.abspath(self.config[orig])
            # Create the destination for originals if it is not there yet
            if not os.path.exists(self.config[orig]):
                os.makedirs(self.config[orig])
            original_move_folder = self.config[orig]
        else:
            original_move_folder = None
        # --------------------------------------------------
        # Start the filing object
        # --------------------------------------------------
        if self.enable_evernote:
            self.filer = PyFilerEvernote(
                self.config['evernote_developer_token'])
        else:
            self.filer = PyFilerDirs()

        self.filer.target_folder = self.config['target_folder']
        self.filer.default_folder = self.config['default_folder']
        self.filer.original_move_folder = original_move_folder

        self.pdf_filer = PyPdfFiler(self.filer)
        if self.match_using_filename:
            print("Matching using filename as a fallback to pdf contents")
            self.pdf_filer.file_using_filename = True

        # ------------------------------
        # Add all the folder names with associated keywords
        # to the filer object
        # ------------------------------
        keyword_count = 0
        folder_count = 0
        # A 'folders' key that is present but empty is treated like a missing one
        if 'folders' in self.config and self.config['folders']:
            for folder, keywords in self.config['folders'].items():
                folder_count += 1
                if keywords is None:
                    # Folder listed without any keywords: register it with none
                    keywords = []
                elif isinstance(keywords, str):
                    # A lone string is one keyword, not a sequence of characters
                    keywords = [keywords]
                # Make sure keywords are lower-cased before adding
                keywords = [str(x).lower() for x in keywords]
                keyword_count += len(keywords)
                self.filer.add_folder_target(folder, keywords)

        print("Filing of PDFs is enabled")
        print(" - %d target filing folders" % (folder_count))
        print(" - %d keywords" % (keyword_count))