Ejemplo n.º 1
0
    def test_append_result(self, tmp_path, complete_df):
        result_path = tmp_path / 'result.parquet.gzip'

        extract(SAMPLES_DIR, result_path, saving_interval=1, features='all')

        # Small 'chunk_df_size' to append to result multiple times
        df = pd.read_parquet(result_path, engine=PARQUET_ENGINE)

        check_and_compare(df, complete_df)
Ejemplo n.º 2
0
    def test_extraction_big(self, tmp_path, is_ocr, complete_df):
        result_path = tmp_path / 'result.parquet.gzip'

        extract(SAMPLES_DIR,
                result_path,
                ocr_lang='eng',
                ocr=is_ocr,
                features='all')

        df = pd.read_parquet(result_path, engine=PARQUET_ENGINE)

        if is_ocr:
            df['text'] = df['text'].str.strip()

        check_and_compare(df, complete_df, is_ocr=is_ocr)
Ejemplo n.º 3
0
    def test_extraction_small(self, is_ocr, complete_df):
        df = extract(SAMPLES_DIR, small=True, ocr_lang='eng', ocr=is_ocr)

        if is_ocr:
            df['text'] = df['text'].str.strip()

        check_and_compare(df, complete_df, is_ocr=is_ocr)
Ejemplo n.º 4
0
    def test_image_format(self, format_):
        df = extract(SAMPLES_DIR,
                     image_format=format_,
                     small=True,
                     features='image')

        img_bytes = df['image'].dropna().iloc[0]
        img = Image.open(BytesIO(img_bytes))

        assert img.format.upper() == format_.upper()
Ejemplo n.º 5
0
    def test_exclude_feature(self, excluded, complete_df):
        features = PdfExtractTask.list_features()
        features.remove(excluded)

        df = extract(SAMPLES_DIR, small=True, features=features)

        columns = list(complete_df.columns)
        columns.remove(excluded)

        check_and_compare(df, complete_df[columns])
Ejemplo n.º 6
0
    def test_passing_paths_list(self, tmp_path, complete_df):
        result_path = tmp_path / 'result.parquet.gzip'
        files_list = Path(SAMPLES_DIR).rglob('*.pdf')

        # Test the support for paths as strings
        files_list = [str(f) for f in files_list]

        df = extract(files_list, result_path, small=True)

        # Paths will be relative to pwd, so adapting complete_df
        complete_df['path'] = complete_df['path'].apply(
            lambda p: str(SAMPLES_DIR / p))
        check_and_compare(df, complete_df)
Ejemplo n.º 7
0
    def test_image_resize(self, size):
        df = extract(SAMPLES_DIR,
                     image_size=size,
                     small=True,
                     features='image')

        img_bytes = df['image'].dropna().iloc[0]
        img = Image.open(BytesIO(img_bytes))

        size = size.lower()
        width, height = map(int, size.split('x'))

        assert img.size == (width, height)
Ejemplo n.º 8
0
    def test_filter_processed(self, tmp_path):
        with open(SAMPLES_DIR / 'single_page1.pdf', 'rb') as f:
            single = f.read()

        with open(SAMPLES_DIR / 'multi_page1.pdf', 'rb') as f:
            multi = f.read()

        with open(SAMPLES_DIR / 'invalid1.pdf', 'rb') as f:
            invalid = f.read()

        total_tasks = [
            ('single1.pdf', single),
            ('multi1.pdf', multi, 2),
            ('hey/multi2.pdf', multi, 1),
            ('multi1.pdf', multi, 1),
            ('my_dir/single3.pdf', single),
            ('/opt/invalid.pdf', invalid),
            ('multi1.pdf', multi, 3),
            ('single2.pdf', single),
            ('/tmp/single3.pdf', single),
            ('/tmp/invalid.pdf', invalid),
        ]

        result_path = tmp_path / 'result.parquet.gzip'

        for counter, task in enumerate(total_tasks):

            extraction = ExtractionFromMemory(total_tasks,
                                              out_file=result_path,
                                              features='text')

            tasks = extraction.gen_tasks()
            tasks = extraction.filter_processed_tasks(tasks)

            assert len(tasks) == len(total_tasks) - counter

            extract([task], result_path, features='text')
Ejemplo n.º 9
0
    def test_return_list(self):
        def sort(doc):
            try:
                first_page_idx, text_idx = 0, 1

                return len(doc[first_page_idx][text_idx])
            except TypeError:
                return -1  # For None values

        def hash_images(doc):
            for page_idx, page in enumerate(doc):
                image, text = page
                image = md5(image).hexdigest() if image is not None else None

                doc[page_idx] = [image, text]

            return doc

        list_ = extract(SAMPLES_DIR, return_list=True)
        list_ = [hash_images(doc) for doc in list_]

        # Expected structure:
        #   expected: list[doc],
        #   doc: list[page],
        #   page: list[feature]
        #
        # list[feature] is sorted by feature name (not value!)
        expected = [
            # invalid1.pdf
            [[None, None]],

            # single_page.pdf
            [['975f5049aac2d0b0e85e0083657182fd', 'My beautiful sample!']],

            # sub2/copy_single_page.pdf
            [['975f5049aac2d0b0e85e0083657182fd', 'My beautiful sample!']],

            # multi_page.pdf
            [['5f005131323536c524e0fffa7ab42d0f', 'First page'],
             ['fce06f79de9ca575212152873cb161ea', 'Second page'],
             ['6dacd3629627df0d99ebc5da97b310b2', 'Third page']],

            # sub1/copy_multi_page.pdf
            [['5f005131323536c524e0fffa7ab42d0f', 'First page'],
             ['fce06f79de9ca575212152873cb161ea', 'Second page'],
             ['6dacd3629627df0d99ebc5da97b310b2', 'Third page']],
        ]

        assert sorted(list_, key=sort) == sorted(expected, key=sort)
Ejemplo n.º 10
0
def test_error_recorded():
    df = extract('tests/samples', small=True, task_class=MyCustomTask)
    error_feature, error_msg = 'wrong', 'There was a problem'

    assert error_msg in df.iloc[0].error
    assert f'{error_feature}:' in df.iloc[0].error
Ejemplo n.º 11
0
def test_columns_present():
    df = extract('tests/samples', small=True, task_class=MyCustomTask)
    assert set(MyCustomTask.list_features()) < set(df.columns)
Ejemplo n.º 12
0
def test_saving_to_disk(tmp_path):
    out_file = tmp_path / 'my_df.parquet.gzip'
    extract(SAMPLES_DIR, out_file, task_class=MyCustomTask)

    assert Path(out_file).exists()
Ejemplo n.º 13
0
    def test_empty_feature(self, complete_df):
        df = extract(SAMPLES_DIR, small=True, features='')

        columns = list(PdfExtractTask.fixed_featues) + ['error_bool']
        check_and_compare(df, complete_df[columns])
Ejemplo n.º 14
0
 def test_features_as_list(self, complete_df):
     df = extract(SAMPLES_DIR, small=True, features=['text', 'image'])
     check_and_compare(df, complete_df)