コード例 #1
0
    def assert_func(rfile, headers):
        """Validate the finished extraction request delivered via ``rfile``.

        Parses the request status from ``rfile``, asserts on its status
        flags, and then pulls the plain text, the PDF and the text structure
        through ``client`` to assert on their contents.  ``headers`` is
        accepted but not used.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        # Request-level status flags.
        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf
        assert status.tables_extracted is False
        assert status.plain_text_extracted
        assert status.text_structure_extracted
        assert status.additional_info == 'hello world'

        request_id = status.request_id

        # The plain text must contain the marker of pages 1..21.
        plain_text = client.get_plain_text(request_id)
        for page_no in range(1, 22):
            marker = f'This is page {page_no}'
            assert marker in plain_text

        # The PDF stored for the request must have 22 pages.
        with client.get_pdf_as_local_file(request_id) as pdf_path, \
                pikepdf.open(pdf_path) as pdf:
            assert len(pdf.pages) == 22

        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(request_id)
        assert structure.language == 'en'
        assert len(structure.pages) == 22
        assert len(structure.paragraphs) == 1
        assert len(structure.sentences) > 2

        log.info('Text extraction results look good. All assertions passed.')
コード例 #2
0
 def assert_func(rfile, headers):
     """Validate the request status delivered via ``rfile``.

     Asserts that the request is ``DONE``, that the reported original file
     name matches the base name of ``fn``, and that the ``pdf_pages_ocred``
     and ``searchable_pdf_created`` flags are truthy.  ``headers`` is
     accepted but not used.
     """
     log.info('Text extraction results are ready...')
     status: RequestStatus = RequestStatus.from_json(rfile)

     assert status.status == 'DONE'
     expected_name = os.path.basename(fn)
     assert expected_name == status.original_file_name
     assert status.pdf_pages_ocred
     assert status.searchable_pdf_created

     log.info('Text extraction results look good. All assertions passed.')
コード例 #3
0
    def assert_func(rfile, headers):
        """Validate the request status delivered via ``rfile``.

        Asserts that the request is ``DONE``, that the reported original file
        name matches the base name of ``fn``, and checks the individual
        processing flags of the status.  ``headers`` is accepted but not
        used.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name

        # Processing flags reported for the request.
        assert status.converted_cleaned_pdf is False
        assert status.searchable_pdf_created
        assert not status.tables_extracted
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        log.info('Text extraction results look good. All assertions passed.')
コード例 #4
0
    def assert_func(rfile, headers):
        """Validate the request status and the extracted tables.

        Parses the request status from ``rfile``, asserts on its status
        flags, then fetches the extracted tables through ``client`` and
        asserts that exactly six tables were returned.  ``headers`` is
        accepted but not used.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name

        # Processing flags reported for the request.
        assert status.converted_cleaned_pdf is False
        assert status.searchable_pdf_created
        assert status.tables_extracted
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        request_id = status.request_id
        extracted: TableList = client.get_extracted_tables_as_msgpack(request_id)
        table_count = len(extracted.tables)
        assert table_count == 6

        log.info('Text extraction results look good. All assertions passed.')
コード例 #5
0
    def assert_func(rfile, headers):
        """Validate the request status and the text of a four-page document.

        Parses the request status from ``rfile``, asserts on its status
        flags, then fetches the plain text and the text structure through
        ``client`` and asserts that the structure has four pages and that one
        known snippet per page occurs in the text.  ``headers`` is accepted
        but not used.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf
        assert status.plain_text_extracted
        assert status.text_structure_extracted
        assert status.additional_info == 'hello world'

        plain_text = client.get_plain_text(status.request_id)
        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(status.request_id)
        assert len(structure.pages) == 4

        # One snippet per page, listed in page order (pages 1 through 4).
        snippets_by_page = (
            'REPRODUCTION, AND DISTRIBUTION',
            'subsequently incorporated',
            'conditions stated in this License. ',
            'See the License for the specific language governing',
        )
        for snippet in snippets_by_page:
            assert snippet in plain_text

        log.info('Text extraction results look good. All assertions passed.')
    def assert_func(rfile, headers):
        """Validate a single-page document detected as English or Russian.

        Parses the request status from ``rfile``, asserts on its status
        flags, then fetches the plain text, the PDF and the text structure
        through ``client``.  The detected document language must be ``en`` or
        ``ru``; the text is expected to contain only the snippets of the
        detected language, and every paragraph and sentence must carry that
        same language.  ``headers`` is accepted but not used.
        """
        log.info('Text extraction results are ready...')
        status: RequestStatus = RequestStatus.from_json(rfile)

        assert status.status == 'DONE'
        expected_name = os.path.basename(fn)
        assert expected_name == status.original_file_name
        assert status.converted_cleaned_pdf is False
        assert status.tables_extracted is False
        assert status.plain_text_extracted
        assert status.text_structure_extracted

        plain_text = client.get_plain_text(status.request_id)

        with client.get_pdf_as_local_file(status.request_id) as pdf_path, \
                pikepdf.open(pdf_path) as pdf:
            assert len(pdf.pages) == 1

        structure: PlainTextStructure = \
            client.get_extracted_text_structure_as_msgpack(status.request_id)
        assert structure.language in ('en', 'ru')

        # Only the snippets of the detected language may be present.
        if structure.language == 'en':
            assert 'This is top secret' in plain_text
            assert 'Top.' in plain_text
            assert 'являлся Тор.' not in plain_text
        elif structure.language == 'ru':
            assert 'This is top secret' not in plain_text
            assert 'Top.' not in plain_text
            assert 'являлся Тор.' in plain_text

        assert len(structure.pages) == 1

        assert len(structure.paragraphs) == 1
        for paragraph in structure.paragraphs:
            assert paragraph.language == structure.language

        assert len(structure.sentences) == 3
        for sentence in structure.sentences:
            assert sentence.language == structure.language

        log.info('Text extraction results look good. All assertions passed.')
コード例 #7
0
 def assert_func(rfile, headers):
     """Assert that the request status delivered via ``rfile`` is ``FAILURE``.

     ``headers`` is accepted but not used.
     """
     log.info('Text extraction results are ready...')
     status: RequestStatus = RequestStatus.from_json(rfile)
     reported_state = status.status
     assert reported_state == 'FAILURE'
     log.info('Text extraction results look good. All assertions passed.')
コード例 #8
0
 def get_data_extraction_task_status(self, request_id: str) -> RequestStatus:
     """Fetch the status of a data extraction task.

     Issues a GET request to the ``status.json`` endpoint of the task
     identified by ``request_id`` under ``self.base_url``, hands the response
     to ``self.raise_for_status`` and parses the response body.

     :param request_id: identifier of the data extraction task.
     :return: the status parsed from the response content.
     """
     status_url = f'{self.base_url}/api/v1/data_extraction_tasks/{request_id}/status.json'
     response = requests.get(status_url)
     self.raise_for_status(response)
     payload = response.content
     return RequestStatus.from_json(payload)