def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import every PDF file found under the specified location.

    Each child returned by MultiplePdfImporter.__get_all_children() whose
    basename ends with ".pdf" (case-insensitive) and that Poppler manages
    to open is imported as a new PdfDoc created in `config.workdir`. All
    the pages of each imported document are passed to
    `docsearch.index_page()`.

    Arguments:
        file_uri --- location to scan (parsed with Gio.File.parse_name())
        config --- provides `workdir`; also forwarded to
            PdfDoc.import_pdf()
        docsearch --- object whose index_page() receives every page
        current_doc --- not used by this importer

    Returns:
        A tuple (doc, page): the last document imported and its first
        page.

    Raises:
        AssertionError --- if no PDF at all could be imported
    """
    logger.info("Importing PDF from '%s'" % (file_uri))
    parent = Gio.File.parse_name(file_uri)
    doc = None

    for child in MultiplePdfImporter.__get_all_children(parent):
        if not child.get_basename().lower().endswith(".pdf"):
            continue
        try:
            # make sure we can import it
            Poppler.Document.new_from_file(child.get_uri(), password=None)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole
            # import, but leave a trace so the skip can be diagnosed.
            logger.warning("Skipping '%s': cannot be opened as a PDF (%s)",
                           child.get_uri(), exc)
            continue
        doc = PdfDoc(config.workdir)
        doc.import_pdf(config, child.get_uri())
        for page in doc.pages:
            docsearch.index_page(page)

    if doc is None:
        # Raised explicitly (instead of using `assert`) so the check is
        # still performed when Python runs with -O. The exception type is
        # kept for callers relying on it.
        raise AssertionError(
            "No PDF could be imported from '%s'" % (file_uri)
        )
    return (doc, doc.pages[0])
def import_doc(file_uri, docsearch, current_doc=None):
    """
    Import every PDF file found under the specified location.

    Each child returned by MultiplePdfImporter.__get_all_children() whose
    basename ends with ".pdf" (case-insensitive) is considered. A file is
    skipped when its hash (PdfDoc.hash_file()) is already known by
    `docsearch`, or when Poppler cannot open it. Every other file is
    imported as a new PdfDoc created in `docsearch.rootdir`.

    Arguments:
        file_uri --- location to scan (parsed with Gio.File.parse_name())
        docsearch --- provides `rootdir` and is_hash_in_index()
        current_doc --- not used by this importer

    Returns:
        (docs, None, True) where `docs` is the list of imported documents,
        or (None, None, False) if nothing was imported.
    """
    logger.info("Importing PDF from '%s'" % (file_uri))
    parent = Gio.File.parse_name(file_uri)
    docs = []

    for child in MultiplePdfImporter.__get_all_children(parent):
        if not child.get_basename().lower().endswith(".pdf"):
            continue
        if docsearch.is_hash_in_index(PdfDoc.hash_file(child.get_path())):
            logger.info("Document %s already found in the index. Skipped"
                        % (child.get_path()))
            continue
        try:
            # make sure we can import it
            Poppler.Document.new_from_file(child.get_uri(), password=None)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole
            # import, but leave a trace so the skip can be diagnosed.
            logger.warning("Skipping '%s': cannot be opened as a PDF (%s)",
                           child.get_uri(), exc)
            continue
        doc = PdfDoc(docsearch.rootdir)
        doc.import_pdf(child.get_uri())
        docs.append(doc)

    if not docs:
        return (None, None, False)
    return (docs, None, True)
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import every PDF file found under the specified location.

    Each child returned by self.__get_all_children() whose basename ends
    with ".pdf" (case-insensitive) and that Poppler manages to open is
    imported as a new PdfDoc created in `config.workdir`. A "_NN" suffix
    (NN being the number of documents imported so far) is appended to
    the path and the docid of each new document before the import. All
    the pages of each imported document are passed to
    `docsearch.index_page()`.

    Arguments:
        file_uri --- location to scan (parsed with Gio.File.parse_name())
        config --- provides `workdir`; also forwarded to
            PdfDoc.import_pdf()
        docsearch --- object whose index_page() receives every page
        current_doc --- not used by this importer

    Returns:
        A tuple (doc, page): the last document imported and its first
        page.

    Raises:
        AssertionError --- if no PDF at all could be imported
    """
    print("Importing doc '%s'" % (file_uri))
    parent = Gio.File.parse_name(file_uri)
    doc = None

    idx = 0

    for child in self.__get_all_children(parent):
        if not child.get_basename().lower().endswith(".pdf"):
            continue
        try:
            # make sure we can import it
            Poppler.Document.new_from_file(child.get_uri(), password=None)
        except Exception:
            # deliberately best effort: files Poppler cannot open are
            # ignored
            continue
        doc = PdfDoc(config.workdir)
        # idx only counts the documents actually imported
        doc.path += ("_%02d" % idx)
        doc.docid += ("_%02d" % idx)
        doc.import_pdf(config, child.get_uri())
        for page in doc.pages:
            docsearch.index_page(page)
        idx += 1

    # Identity test: `doc != None` would go through any custom comparison
    # operator defined by the document class. Raised explicitly (instead
    # of using `assert`) so the check is still performed with -O.
    if doc is None:
        raise AssertionError(
            "No PDF could be imported from '%s'" % (file_uri)
        )
    return (doc, doc.pages[0])
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import every PDF file found under the specified location.

    Each child returned by MultiplePdfImporter.__get_all_children() whose
    basename ends with ".pdf" (case-insensitive) and that Poppler manages
    to open is imported as a new PdfDoc created in `config.workdir`. All
    the pages of each imported document are passed to
    `docsearch.index_page()`.

    Arguments:
        file_uri --- location to scan (parsed with Gio.File.parse_name())
        config --- provides `workdir`; also forwarded to
            PdfDoc.import_pdf()
        docsearch --- object whose index_page() receives every page
        current_doc --- not used by this importer

    Returns:
        A tuple (doc, page): the last document imported and its first
        page.

    Raises:
        AssertionError --- if no PDF at all could be imported
    """
    logger.info("Importing PDF from '%s'" % (file_uri))
    parent = Gio.File.parse_name(file_uri)
    doc = None

    for child in MultiplePdfImporter.__get_all_children(parent):
        if not child.get_basename().lower().endswith(".pdf"):
            continue
        try:
            # make sure we can import it
            Poppler.Document.new_from_file(child.get_uri(), password=None)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole
            # import, but leave a trace so the skip can be diagnosed.
            logger.warning("Skipping '%s': cannot be opened as a PDF (%s)",
                           child.get_uri(), exc)
            continue
        doc = PdfDoc(config.workdir)
        doc.import_pdf(config, child.get_uri())
        for page in doc.pages:
            docsearch.index_page(page)

    if doc is None:
        # Raised explicitly (instead of using `assert`) so the check is
        # still performed when Python runs with -O. The exception type is
        # kept for callers relying on it.
        raise AssertionError(
            "No PDF could be imported from '%s'" % (file_uri)
        )
    return (doc, doc.pages[0])
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import every PDF file found under the specified location.

    Each child returned by self.__get_all_children() whose basename ends
    with ".pdf" (case-insensitive) and that Poppler manages to open is
    imported as a new PdfDoc created in `config.workdir`. A "_NN" suffix
    (NN being the number of documents imported so far) is appended to
    the path and the docid of each new document before the import. All
    the pages of each imported document are passed to
    `docsearch.index_page()`.

    Arguments:
        file_uri --- location to scan (parsed with Gio.File.parse_name())
        config --- provides `workdir`; also forwarded to
            PdfDoc.import_pdf()
        docsearch --- object whose index_page() receives every page
        current_doc --- not used by this importer

    Returns:
        A tuple (doc, page): the last document imported and its first
        page.

    Raises:
        AssertionError --- if no PDF at all could be imported
    """
    print("Importing doc '%s'" % (file_uri))
    parent = Gio.File.parse_name(file_uri)
    doc = None

    idx = 0

    for child in self.__get_all_children(parent):
        if not child.get_basename().lower().endswith(".pdf"):
            continue
        try:
            # make sure we can import it
            Poppler.Document.new_from_file(child.get_uri(), password=None)
        except Exception:
            # deliberately best effort: files Poppler cannot open are
            # ignored
            continue
        doc = PdfDoc(config.workdir)
        # idx only counts the documents actually imported
        doc.path += ("_%02d" % idx)
        doc.docid += ("_%02d" % idx)
        doc.import_pdf(config, child.get_uri())
        for page in doc.pages:
            docsearch.index_page(page)
        idx += 1

    # Identity test: `doc != None` would go through any custom comparison
    # operator defined by the document class. Raised explicitly (instead
    # of using `assert`) so the check is still performed with -O.
    if doc is None:
        raise AssertionError(
            "No PDF could be imported from '%s'" % (file_uri)
        )
    return (doc, doc.pages[0])
def import_doc(file_uri, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document.

    A PdfDoc is created in `docsearch.rootdir` and the file designated by
    `file_uri` is imported into it. `current_doc` is not used.

    Returns:
        ([doc], None, True) where `doc` is the new document.
    """
    new_doc = PdfDoc(docsearch.rootdir)

    message = "Importing doc '%s' ..." % file_uri
    logger.info(message)

    new_doc.import_pdf(file_uri)

    imported_docs = [new_doc]
    return (imported_docs, None, True)
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document.

    A PdfDoc is created in the directory given by the 'workdir' setting
    of `config`, and the file designated by `file_uri` is imported into
    it. `docsearch` and `current_doc` are not used.

    Returns:
        ([doc], None, True) where `doc` is the new document.
    """
    work_dir = config.settings['workdir'].value
    new_doc = PdfDoc(work_dir)

    message = "Importing doc '%s' ..." % file_uri
    logger.info(message)

    new_doc.import_pdf(config, file_uri)

    imported_docs = [new_doc]
    return (imported_docs, None, True)
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document and index its pages.

    A PdfDoc is created in `config.workdir`, the file designated by
    `file_uri` is imported into it, and each of its pages is passed to
    `docsearch.index_page()`. Progress is reported on the standard
    output. `current_doc` is not used.

    Returns:
        A tuple (doc, page): the new document and its first page.
    """
    imported = PdfDoc(config.workdir)

    print("Importing doc '%s' ..." % file_uri)
    imported.import_pdf(config, file_uri)

    for pdf_page in imported.pages:
        progress = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
        print(progress)
        docsearch.index_page(pdf_page)

    first_page = imported.pages[0]
    return (imported, first_page)
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document and index its pages.

    A PdfDoc is created in `config.workdir`, the file designated by
    `file_uri` is imported into it, and each of its pages is passed to
    `docsearch.index_page()`. Progress is reported on the standard
    output. `current_doc` is not used.

    Returns:
        A tuple (doc, page): the new document and its first page.
    """
    imported = PdfDoc(config.workdir)

    print("Importing doc '%s' ..." % file_uri)
    imported.import_pdf(config, file_uri)

    for pdf_page in imported.pages:
        progress = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
        print(progress)
        docsearch.index_page(pdf_page)

    first_page = imported.pages[0]
    return (imported, first_page)
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document and index its pages.

    A PdfDoc is created in `config.workdir`, the file designated by
    `file_uri` is imported into it, and each of its pages is passed to
    `docsearch.index_page()`. Progress is reported through the module
    logger. `current_doc` is not used.

    Returns:
        A tuple (doc, page): the new document and its first page.
    """
    imported = PdfDoc(config.workdir)

    start_message = "Importing doc '%s' ..." % file_uri
    logger.info(start_message)
    imported.import_pdf(config, file_uri)

    for pdf_page in imported.pages:
        progress = "Indexing page %s:p%d ..." % (file_uri, pdf_page.page_nb)
        logger.info(progress)
        docsearch.index_page(pdf_page)

    first_page = imported.pages[0]
    return (imported, first_page)
def split_pages(self, pages): """ Split the document at these page. """ # You can't leave empty documents if 0 in pages: pages.remove(0) if not pages: return logger.info("Splitting %s at %s", self.docid, pages) # Poppler can't work with individual pages, thus we use pdfrw. from paperwork.backend.pdf.doc import PDF_FILENAME, PdfDoc from paperwork.backend.docimport import SinglePdfImporter import pdfrw doc_pages = self.pages[:] pdir = os.path.abspath(os.path.join(self.path, os.path.pardir)) new_docs = [] pdf_r_name = os.path.join(self.path, PDF_FILENAME) pdf_a_name = os.path.join(self.path, PDF_FILENAME + '.new') pdf_r = pdfrw.PdfReader(pdf_r_name) dest = pdfrw.PdfWriter() dest_path = pdf_a_name offset = 0 for pdf_page, page in zip(pdf_r.pages, doc_pages): if page.page_nb in pages: dest.write(dest_path) new_doc = PdfDoc(pdir, label_store=self.label_store) os.mkdir(new_doc.path) new_doc.labels = self.labels.copy() dest = pdf_b = pdfrw.PdfWriter() dest_path = os.path.join(new_doc.path, PDF_FILENAME) new_docs.append(new_doc) offset = page.page_nb dest.addpage(pdf_page) if offset: offset += 1 page.move_index(new_doc, offset) dest.write(dest_path) self.drop_cache() os.rename(pdf_a_name, pdf_r_name) return new_docs
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import a single PDF file as a new document and index its pages.

    A PdfDoc is created in `config.workdir`, the file designated by
    `file_uri` is imported into it, and each of its pages is passed to
    `docsearch.index_page()`. `current_doc` is not used.

    Returns:
        A tuple (doc, page): the new document and its first page.
    """
    imported = PdfDoc(config.workdir)
    imported.import_pdf(config, file_uri)

    for pdf_page in imported.pages:
        docsearch.index_page(pdf_page)

    first_page = imported.pages[0]
    return (imported, first_page)