def test_right_to_left(self):
    """Compare text extracted from a right-to-left PDF with a reference file.

    Each page's lines are joined with a space-newline separator and
    stripped; the per-page strings are concatenated without a separator
    and the result must equal the stored ``.txt`` fixture exactly.
    """
    doc = Document("tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.pdf")
    text = "".join(' \n'.join(page.lines).strip() for page in doc)
    # Decode the fixture explicitly instead of relying on the locale's
    # default encoding, which may fail on (or mis-decode) non-ASCII text.
    # NOTE(review): assumes the fixture is stored as UTF-8 - confirm.
    with open(
        "tests/resources/Fairy-Circles-Truly-a-Fairy-Tale-R-FKB-Kids-Stories_FA.txt",
        "r",
        encoding="utf-8",
    ) as f:
        correct = f.read()
    assert correct == text
def ingest(self, file_path, entity):
    """Open the PDF at ``file_path`` and run the extraction steps on it.

    The path is converted with ``bytes()`` before being handed to
    ``Document``. On success the document is passed to
    ``extract_metadata``, ``extract_xmp_metadata`` and ``pdf_extract``
    together with ``entity``.

    Raises:
        ProcessingException: if constructing the ``Document`` raises any
            ``Exception``; the original error is chained as the cause.
    """
    try:
        document = Document(bytes(file_path))
    except Exception as ex:
        message = "Could not extract PDF file: %r" % ex
        raise ProcessingException(message) from ex  # noqa
    else:
        self.extract_metadata(document, entity)
        self.extract_xmp_metadata(document, entity)
        self.pdf_extract(entity, document)
def pdf_alternative_extract(self, entity, pdf_path):
    """Store the PDF at ``pdf_path``, tag ``entity`` with it and extract it.

    The value returned by ``self.manager.store(pdf_path)`` is set on
    ``entity`` under the ``'pdfHash'`` property, then the file is opened
    as a ``Document`` and handed to ``pdf_extract``.
    """
    content_hash = self.manager.store(pdf_path)
    entity.set('pdfHash', content_hash)
    document = Document(bytes(pdf_path))
    self.pdf_extract(entity, document)
# Command-line tool: for every input PDF, print on one line the 1-based
# numbers of the pages whose total character count (summed over the page's
# lines) exceeds --threshold. With --absence, print the remaining pages
# instead.
parser = argparse.ArgumentParser(
    description="checks for presence or absence of text on images")
parser.add_argument("input_files", type=str, nargs="+", help="path to a PDF")
parser.add_argument(
    "--threshold",
    type=int,
    default=0,
    help="maximum number of chars to consider a page empty",
)
parser.add_argument("--absence", action="store_true",
                    help="returns pages without text")
args = parser.parse_args()

for input_file in args.input_files:
    doc = Document(input_file)
    pages_with_text = []
    num_pages = 0
    for page_number, page in enumerate(doc, start=1):  # 1-based for PDFs
        num_pages = page_number
        num_chars = sum(map(len, page.lines))
        if num_chars > args.threshold:
            pages_with_text.append(page_number)
    if args.absence:
        # Walk the page range in order rather than converting a set
        # difference to a list: set iteration order is not guaranteed, and
        # the printed page numbers should be ascending.
        with_text = set(pages_with_text)
        output = [p for p in range(1, num_pages + 1) if p not in with_text]
    else:
        output = pages_with_text
    print(" ".join(map(str, output)))
def test_empty_pdf(self):
    """Constructing a ``Document`` from the empty PDF fixture raises ``IOError``."""
    empty_pdf = "tests/resources/empty.pdf"
    with pytest.raises(IOError):
        Document(empty_pdf)
def test_non_pdf_file(self):
    """Constructing a ``Document`` from a plain-text fixture raises ``IOError``."""
    text_file = "tests/resources/not-pdf.txt"
    with pytest.raises(IOError):
        Document(text_file)
def test_directory_path(self):
    """Constructing a ``Document`` from a directory path raises ``IOError``."""
    with pytest.raises(IOError):
        # Use the fixtures directory that the other tests read from
        # ("tests/resources/"). The previous "test/resources/" path does
        # not match it, so the test would exercise a missing path rather
        # than an actual directory.
        Document("tests/resources/")
def test_non_existent_file(self):
    """Constructing a ``Document`` from a missing file path raises ``IOError``."""
    missing_pdf = "tests/resources/not-exists.pdf"
    with pytest.raises(IOError):
        Document(missing_pdf)
def test_extract_metadata(self):
    """Both ``metadata`` and ``xmp_metadata`` are truthy for the FAC fixture."""
    pdf = Document("tests/resources/FAC.pdf")
    assert pdf.metadata
    assert pdf.xmp_metadata
def test_extract_text(self):
    """The text extracted from the prop fixture contains "Milestones".

    Each page's lines are joined with a space-newline separator and
    stripped; the per-page strings are then concatenated directly.
    """
    pdf = Document("tests/resources/prop.pdf")
    page_texts = [' \n'.join(page.lines).strip() for page in pdf]
    extracted = "".join(page_texts)
    assert "Milestones" in extracted
def test_bytes_paths(self):
    """``Document`` and ``extract_images`` accept ``bytes`` paths.

    After extraction the output directory must exist and hold four PNGs.
    """
    self._clean_images()
    pdf = Document(b"tests/resources/FAC.pdf")
    pdf.extract_images(path=b"tests/images", prefix="img")
    assert os.path.exists("tests/images")
    png_pattern = os.path.join("tests/images", "*.png")
    extracted = glob.glob(png_pattern)
    assert len(extracted) == 4
def test_extract_images(self, path, no_imgs):
    """Extracting images from ``path`` writes exactly ``no_imgs`` PNG files.

    Images are written to ``tests/images`` with the ``img`` prefix; the
    directory must exist afterwards.
    """
    self._clean_images()
    pdf = Document(path)
    pdf.extract_images(path="tests/images", prefix="img")
    assert os.path.exists("tests/images")
    png_pattern = os.path.join("tests/images", "*.png")
    extracted = glob.glob(png_pattern)
    assert len(extracted) == no_imgs
def pdf_alternative_extract(self, pdf_path):
    """Emit ``pdf_path`` as the PDF alternative, then extract its content.

    ``pdf_path`` is first passed to ``self.result.emit_pdf_alternative``;
    it is then UTF-8 encoded, opened as a ``Document`` and handed to
    ``pdf_extract``.
    """
    self.result.emit_pdf_alternative(pdf_path)
    encoded_path = pdf_path.encode('utf-8')
    document = Document(encoded_path)
    self.pdf_extract(document)
def pdf_alternative_extract(self, pdf_path):
    """Open the PDF at ``pdf_path`` and extract its content.

    The path is UTF-8 encoded before being given to ``Document``; the
    resulting document is handed to ``pdf_extract``.
    """
    encoded_path = pdf_path.encode('utf-8')
    document = Document(encoded_path)
    self.pdf_extract(document)