コード例 #1
0
def test_extract_extracts_right_amount_of_files():
    """Check that a Tesseract batch run on ``test.pdf`` writes 8 text files.

    The extraction is directed into a temporary directory; afterwards the
    directory must hold exactly eight ``*.txt`` files and an ``extract.log``.
    """
    with TemporaryDirectory() as output_dir:
        out_root = Path(output_dir)

        pdf_inputs: List[Path] = get_filenames(CONFIG.test_files_dir / 'test.pdf')
        tesseract: ITextExtractor = TesseractExtractor(dpi=1, fmt='png')
        tesseract.batch_extract(pdf_inputs, output_dir)

        # Only the count matters here, so the listing order is irrelevant.
        text_outputs = list(out_root.glob('*.txt'))
        assert len(text_outputs) == 8

        log_file = out_root / 'extract.log'
        assert log_file.exists()
コード例 #2
0
def test_extract_generates_expected_output():
    """Check PDFPlumber output against the stored ``expected/pdfplumber`` files.

    Runs a batch extraction of the ``pdf`` test directory into a temporary
    directory, then verifies the number of text files, that no file shared
    with the expected directory differs, and that a log file was written.
    """
    with TemporaryDirectory() as output_dir:
        out_root = Path(output_dir)

        pdf_inputs: List[Path] = get_filenames(CONFIG.test_files_dir / 'pdf')
        plumber: ITextExtractor = PDFPlumberExtractor()
        plumber.batch_extract(pdf_inputs, output_dir)

        text_outputs = list(out_root.glob('*.txt'))
        assert len(text_outputs) == 5

        # ``diff_files`` only reports files that exist in both directories
        # and compare unequal; files present on one side only are not listed.
        expected_dir = CONFIG.test_files_dir / 'expected/pdfplumber'
        comparison = filecmp.dircmp(output_dir, expected_dir)
        assert comparison.diff_files == []

        log_file = out_root / 'extract.log'
        assert log_file.exists()
コード例 #3
0
def test_extract_returns_correct_number_of_pages(first_page, last_page,
                                                 expected):
    """Check that a page range limits how many text files are produced.

    Args:
        first_page: Value forwarded as ``first_page`` to ``batch_extract``.
        last_page: Value forwarded as ``last_page`` to ``batch_extract``.
        expected: Number of ``*.txt`` files that must exist afterwards.

    The arguments are supplied from outside this function (presumably a
    pytest parametrisation that is not visible here).
    """
    page_range = {'first_page': first_page, 'last_page': last_page}

    with TemporaryDirectory() as output_dir:
        out_root = Path(output_dir)

        pdf_inputs: List[Path] = get_filenames(CONFIG.test_files_dir / 'pdf')
        plumber: ITextExtractor = PDFPlumberExtractor()
        plumber.batch_extract(pdf_inputs, output_dir, **page_range)

        produced = list(out_root.glob('*.txt'))
        assert len(produced) == expected

        log_file = out_root / 'extract.log'
        assert log_file.exists()
コード例 #4
0
def test_extract_generates_expected_output():
    """Check PDFMiner output against the stored ``expected/pdfminer`` files.

    Runs a batch extraction of ``test.pdf`` into a temporary directory and
    requires eight text files. Each extracted file is then compared with the
    expected file at the same position in sorted order; the pair must be more
    than 99% similar according to ``SequenceMatcher.quick_ratio``.
    """
    with TemporaryDirectory() as output_dir:
        files: List[Path] = get_filenames(CONFIG.test_files_dir / 'test.pdf')
        extractor: ITextExtractor = PDFMinerExtractor()
        extractor.batch_extract(files, output_dir)

        # Sort each directory listing once instead of on every iteration.
        extracted_files = sorted(Path(output_dir).glob('*.txt'))
        assert len(extracted_files) == 8

        expected_files = sorted(
            Path(CONFIG.test_files_dir / 'expected/pdfminer').iterdir())
        # zip() would silently stop at the shorter list, so fail loudly when
        # there are not enough expected files to pair with.
        assert len(expected_files) >= len(extracted_files)

        # Files are paired purely by sorted position, not by name.
        for extracted_file, expected_file in zip(extracted_files,
                                                 expected_files):
            # read_text() closes the file handle, unlike a bare open().read().
            extracted_text = extracted_file.read_text()
            expected_text = expected_file.read_text()
            matcher = SequenceMatcher(None, extracted_text, expected_text)
            assert matcher.quick_ratio() > 0.99, (
                f'{extracted_file.name} differs too much from '
                f'{expected_file.name}')
コード例 #5
0
def test_get_filenames_returns_only_files_with_expected_extension(tmp_path):
    """Check that ``get_filenames`` filters by file extension.

    With one empty ``.pdf`` and one empty ``.txt`` file in ``tmp_path``:
    the default call on the directory returns only the PDF, passing ``'txt'``
    includes the text file, a path to the text file alone yields an empty
    list, and a path to the PDF alone yields the same list as the directory.
    """
    pdf_file = tmp_path / 'test.pdf'
    txt_file = tmp_path / 'test.txt'
    for empty_file in (pdf_file, txt_file):
        empty_file.touch()

    # Directory input: default extension versus an explicit one.
    assert pdf_file in get_filenames(tmp_path)
    assert txt_file not in get_filenames(tmp_path)
    assert txt_file in get_filenames(tmp_path, 'txt')

    # Single-file input.
    assert get_filenames(txt_file) == []
    from_file = get_filenames(pdf_file)
    from_dir = get_filenames(tmp_path)
    assert from_file == from_dir
    assert from_dir == [pdf_file]
コード例 #6
0
ファイル: cli.py プロジェクト: inidun/unesco_data_collection
def extract(
    input_path: Union[str, os.PathLike],
    output_folder: Union[str, os.PathLike],
    first_page: int = 1,
    last_page: Optional[int] = None,
    extractor: str = 'PDFBox',
) -> None:
    """Extract text from the files found at ``input_path``.

    Creates ``output_folder`` (including missing parents) if needed, collects
    the input files with ``get_filenames`` and hands them to the extractor
    selected by name for a batch extraction.

    Args:
        input_path: Location passed to ``get_filenames`` to collect inputs.
        output_folder: Directory that receives the extraction output.
        first_page: Forwarded unchanged to ``batch_extract``.
        last_page: Forwarded to ``batch_extract``; converted with ``int()``
            when it is not ``None``.
        extractor: Name passed to ``get_extractor`` to obtain the extractor.
    """
    Path(output_folder).mkdir(exist_ok=True, parents=True)
    files: List[Path] = get_filenames(input_path)

    # NOTE(review): presumably guards against a caller (e.g. a CLI layer)
    # passing the value as a string; ``first_page`` is not coerced the same
    # way, so confirm whether that is intentional.
    if last_page is not None:
        last_page = int(last_page)

    # Use a separate local so the ``extractor: str`` parameter is not rebound
    # under a conflicting type annotation.
    text_extractor: ITextExtractor = get_extractor(extractor)
    text_extractor.batch_extract(files,
                                 output_folder,
                                 first_page=first_page,
                                 last_page=last_page)