def extract_body(self, fp, basefile):
    """Read ``fp`` into a fresh ``StreamingPDFReader`` and return it.

    The reader is told to use the ``"ocr"`` parser when ``self.config.ocr``
    is truthy and the ``"xml"`` parser otherwise. Every page the reader
    yields afterwards gets its ``src`` attribute set to ``"index.pdf"``.

    :param fp: the object handed to ``StreamingPDFReader.read``
    :param basefile: accepted for interface compatibility; not used here
    :returns: the populated reader
    """
    pdf = StreamingPDFReader()
    if self.config.ocr:
        parser_name = "ocr"
    else:
        parser_name = "xml"
    pdf.read(fp, parser=parser_name)
    for pdf_page in pdf:
        pdf_page.src = "index.pdf"  # FIXME: don't hardcode the filename
    return pdf
def extract_body(self, fp, basefile):
    """Read the main document and all downloaded PDF attachments.

    ``fp`` is read into a ``StreamingPDFReader`` first. Then every
    downloaded attachment of ``basefile`` whose name ends in ``.pdf`` is
    handled in sorted order: if its intermediate file already exists on
    disk it is opened through ``self.store.open_intermediate``, otherwise
    it is produced by ``self.convert_pdf``. The result is read with the
    same parser as the main document and appended to the reader.

    The intermediate suffix is ``.hocr`` when ``self.config.ocr`` is truthy
    and ``.xml`` otherwise, with ``"." + self.config.compress`` appended
    when compression is configured.

    :param fp: the object handed to ``StreamingPDFReader.read`` for the
        main document
    :param basefile: identifier used to look up attachments and paths in
        ``self.store``
    :returns: the reader containing everything that was read
    """
    pdf_suffix = ".pdf"
    use_ocr = self.config.ocr
    reader = StreamingPDFReader()
    parser = "ocr" if use_ocr else "xml"
    intermediate_suffix = ".hocr" if use_ocr else ".xml"
    if self.config.compress:
        intermediate_suffix += "." + self.config.compress
    reader.read(fp, parser=parser)

    for attachment in sorted(self.store.list_attachments(basefile, "downloaded")):
        if not attachment.endswith(pdf_suffix):
            continue
        downloaded_path = self.store.downloaded_path(
            basefile, attachment=attachment)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".pdf" occurring earlier in the attachment name.
        iattachment = attachment[:-len(pdf_suffix)] + intermediate_suffix
        intermediate_path = self.store.intermediate_path(
            basefile, attachment=iattachment)
        # Use a dedicated name so the caller's ``fp`` is not shadowed.
        if os.path.exists(intermediate_path):
            attachment_fp = self.store.open_intermediate(
                basefile, attachment=iattachment)
        else:
            attachment_fp = self.convert_pdf(downloaded_path, intermediate_path)
        # The attachment's intermediate file has the same format as the
        # main document's, so it must be read with the same parser.
        reader += StreamingPDFReader().read(attachment_fp, parser=parser)

    for page in reader:
        page.src = "index.pdf"  # FIXME: don't hardcode the filename
    return reader
def extract_body(self, fp, basefile):
    """Build a single reader from the main document plus PDF attachments.

    The main document ``fp`` is read first. After that, each downloaded
    attachment of ``basefile`` with a ``.pdf`` extension (in sorted order)
    is converted with ``self.convert_pdf`` unless its intermediate file is
    already present on disk, in which case that file is opened via
    ``self.store.open_intermediate``. Each attachment is read with the
    same parser as the main document and added to the reader.

    ``self.config.ocr`` selects the ``"ocr"`` parser and ``.hocr`` suffix
    (otherwise ``"xml"`` and ``.xml``); a truthy ``self.config.compress``
    is appended to the suffix after a dot.

    :param fp: the object handed to ``StreamingPDFReader.read`` for the
        main document
    :param basefile: identifier used to look up attachments and paths in
        ``self.store``
    :returns: the reader containing everything that was read
    """
    extension = ".pdf"
    if self.config.ocr:
        parser, intermediate_suffix = "ocr", ".hocr"
    else:
        parser, intermediate_suffix = "xml", ".xml"
    if self.config.compress:
        intermediate_suffix += "." + self.config.compress

    reader = StreamingPDFReader()
    reader.read(fp, parser=parser)

    pdf_attachments = [
        name
        for name in sorted(self.store.list_attachments(basefile, "downloaded"))
        if name.endswith(extension)
    ]
    for attachment in pdf_attachments:
        downloaded_path = self.store.downloaded_path(
            basefile, attachment=attachment)
        # Only the final extension is exchanged; the previous str.replace
        # call would have rewritten every ".pdf" substring in the name.
        iattachment = attachment[:-len(extension)] + intermediate_suffix
        intermediate_path = self.store.intermediate_path(
            basefile, attachment=iattachment)
        # Kept separate from the ``fp`` parameter to avoid rebinding it.
        if not os.path.exists(intermediate_path):
            attachment_fp = self.convert_pdf(downloaded_path, intermediate_path)
        else:
            attachment_fp = self.store.open_intermediate(
                basefile, attachment=iattachment)
        # Attachments share the main document's intermediate format, so
        # the parser has to be passed explicitly here as well.
        reader += StreamingPDFReader().read(attachment_fp, parser=parser)

    for page in reader:
        page.src = "index.pdf"  # FIXME: don't hardcode the filename
    return reader