def test_extract_extracts_right_amount_of_files():
    """Check that a Tesseract batch run on ``test.pdf`` writes 8 text files and a log."""
    with TemporaryDirectory() as output_dir:
        out_path = Path(output_dir)
        pdf_files: List[Path] = get_filenames(CONFIG.test_files_dir / 'test.pdf')

        # NOTE(review): dpi=1 presumably keeps rendering cheap for the test - confirm.
        tesseract: ITextExtractor = TesseractExtractor(dpi=1, fmt='png')
        tesseract.batch_extract(pdf_files, output_dir)

        text_files = list(out_path.glob('*.txt'))
        assert len(text_files) == 8
        assert (out_path / 'extract.log').exists()
def test_extract_generates_expected_output():
    """Check PDFPlumber output for the ``pdf`` fixtures against the stored expectations.

    Verifies the number of generated ``*.txt`` files, that no file shared with
    ``expected/pdfplumber`` differs, and that ``extract.log`` was written.
    """
    expected_dir = CONFIG.test_files_dir / 'expected/pdfplumber'

    with TemporaryDirectory() as output_dir:
        out_path = Path(output_dir)
        pdf_files: List[Path] = get_filenames(CONFIG.test_files_dir / 'pdf')

        plumber: ITextExtractor = PDFPlumberExtractor()
        plumber.batch_extract(pdf_files, output_dir)

        text_files = list(out_path.glob('*.txt'))
        assert len(text_files) == 5

        # diff_files only lists files present in BOTH directories whose contents
        # differ; files that exist on one side only are not reported here.
        comparison = filecmp.dircmp(output_dir, expected_dir)
        assert comparison.diff_files == []

        assert (out_path / 'extract.log').exists()
def test_extract_returns_correct_number_of_pages(first_page, last_page, expected):
    """Check that a page range limits how many text files PDFPlumber writes.

    Args:
        first_page: Forwarded to ``batch_extract`` as ``first_page``.
        last_page: Forwarded to ``batch_extract`` as ``last_page``.
        expected: Number of ``*.txt`` files that must exist afterwards.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible in this view - confirm.
    """
    with TemporaryDirectory() as output_dir:
        out_path = Path(output_dir)
        pdf_files: List[Path] = get_filenames(CONFIG.test_files_dir / 'pdf')

        plumber: ITextExtractor = PDFPlumberExtractor()
        plumber.batch_extract(
            pdf_files,
            output_dir,
            first_page=first_page,
            last_page=last_page,
        )

        written = list(out_path.glob('*.txt'))
        assert len(written) == expected
        assert (out_path / 'extract.log').exists()
def test_extract_generates_expected_output():
    """Check PDFMiner output for ``test.pdf`` against the stored expected text.

    Eight ``*.txt`` files must be produced. Each one is paired, in sorted
    order, with the entry at the same position in ``expected/pdfminer`` and
    the two texts must be almost identical according to ``SequenceMatcher``.
    """
    with TemporaryDirectory() as output_dir:
        files: List[Path] = get_filenames(CONFIG.test_files_dir / 'test.pdf')
        extractor: ITextExtractor = PDFMinerExtractor()
        extractor.batch_extract(files, output_dir)

        # List and sort each directory once instead of on every loop iteration.
        extracted_files = sorted(Path(output_dir).glob('*.txt'))
        assert len(extracted_files) == 8

        expected_files = sorted(Path(CONFIG.test_files_dir / 'expected/pdfminer').iterdir())
        # zip() stops at the shorter sequence, so make sure a missing expected
        # file fails the test instead of silently skipping a comparison.
        assert len(expected_files) >= len(extracted_files)

        for extracted_file, expected_file in zip(extracted_files, expected_files):
            # read_text() opens and closes the file, so no handle is leaked.
            matcher = SequenceMatcher(None, extracted_file.read_text(), expected_file.read_text())
            # TODO: quick_ratio() is only an upper bound on ratio(), so this
            # check is lenient; tighten it if an exact comparison becomes viable.
            assert matcher.quick_ratio() > 0.99
def test_get_filenames_returns_only_files_with_expected_extension(tmp_path):
    """Check how ``get_filenames`` filters by extension for directories and single files.

    Args:
        tmp_path: Temporary directory in which one ``.pdf`` and one ``.txt``
            file are created.
    """
    pdf_file, txt_file = tmp_path / 'test.pdf', tmp_path / 'test.txt'
    for fixture in (pdf_file, txt_file):
        fixture.touch()

    # Directory lookup without an explicit extension: only the PDF is expected.
    assert pdf_file in get_filenames(tmp_path)
    assert txt_file not in get_filenames(tmp_path)

    # Directory lookup with an explicit 'txt' extension.
    assert txt_file in get_filenames(tmp_path, 'txt')

    # A single file whose extension does not match yields an empty list.
    assert get_filenames(txt_file) == []

    # A matching single file and its parent directory give the same result.
    from_file = get_filenames(pdf_file)
    from_directory = get_filenames(tmp_path)
    assert from_file == from_directory
    assert from_directory == [pdf_file]
def extract(
    input_path: Union[str, os.PathLike],
    output_folder: Union[str, os.PathLike],
    first_page: int = 1,
    last_page: Optional[int] = None,
    extractor: str = 'PDFBox',
) -> None:
    """Run a batch text extraction for ``input_path`` into ``output_folder``.

    Args:
        input_path: Path handed to ``get_filenames`` to collect the input files.
        output_folder: Destination folder; created (with parents) if missing.
        first_page: Forwarded unchanged to ``batch_extract``.
        last_page: Forwarded to ``batch_extract``; coerced with ``int()``
            unless it is ``None``.
        extractor: Name passed to ``get_extractor`` to obtain the extractor.
    """
    Path(output_folder).mkdir(exist_ok=True, parents=True)
    input_files: List[Path] = get_filenames(input_path)

    final_page: Optional[int] = None if last_page is None else int(last_page)

    backend: ITextExtractor = get_extractor(extractor)
    backend.batch_extract(
        input_files,
        output_folder,
        first_page=first_page,
        last_page=final_page,
    )