def assert_func(rfile, headers):
    """Check the status and outputs of a finished 22-page extraction request.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn``, ``client`` and ``log`` are resolved from the
    surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    # Overall request state and the per-stage processing flags.
    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf
    assert status.tables_extracted is False
    assert status.plain_text_extracted
    assert status.text_structure_extracted
    assert status.additional_info == 'hello world'

    # The plain text must contain the marker line of pages 1 through 21.
    # NOTE(review): the loop stops at 21 although 22 pages are asserted
    # below -- confirm this is intentional.
    plain_text = client.get_plain_text(status.request_id)
    for page_no in range(1, 22):
        assert f'This is page {page_no}' in plain_text

    # The resulting PDF must keep all 22 pages.
    with client.get_pdf_as_local_file(status.request_id) as pdf_path, \
            pikepdf.open(pdf_path) as pdf_doc:
        assert len(pdf_doc.pages) == 22

    # Structure-level expectations: language, pages, paragraphs, sentences.
    structure: PlainTextStructure = \
        client.get_extracted_text_structure_as_msgpack(status.request_id)
    assert structure.language == 'en'
    assert len(structure.pages) == 22
    assert len(structure.paragraphs) == 1
    assert len(structure.sentences) > 2
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check that a finished request reports OCR and a searchable PDF.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn`` and ``log`` are resolved from the surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name

    # Both OCR-related flags must be set.
    assert status.pdf_pages_ocred
    assert status.searchable_pdf_created
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check a finished request that produced text but no tables.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn`` and ``log`` are resolved from the surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name

    # Expected processing flags: searchable PDF and text outputs are present,
    # the cleaned-PDF conversion is explicitly False, tables are absent.
    assert status.converted_cleaned_pdf is False
    assert status.searchable_pdf_created
    assert not status.tables_extracted
    assert status.plain_text_extracted
    assert status.text_structure_extracted
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check a finished request that is expected to yield six tables.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn``, ``client`` and ``log`` are resolved from the
    surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name

    # Expected processing flags for this document.
    assert status.converted_cleaned_pdf is False
    assert status.searchable_pdf_created
    assert status.tables_extracted
    assert status.plain_text_extracted
    assert status.text_structure_extracted

    # Fetch the extracted tables and verify how many were found.
    extracted: TableList = \
        client.get_extracted_tables_as_msgpack(status.request_id)
    assert len(extracted.tables) == 6
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check a finished four-page request and a text snippet from each page.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn``, ``client`` and ``log`` are resolved from the
    surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf
    assert status.plain_text_extracted
    assert status.text_structure_extracted
    assert status.additional_info == 'hello world'

    plain_text = client.get_plain_text(status.request_id)
    structure: PlainTextStructure = \
        client.get_extracted_text_structure_as_msgpack(status.request_id)
    assert len(structure.pages) == 4

    # One characteristic snippet per page of the document.
    assert 'REPRODUCTION, AND DISTRIBUTION' in plain_text  # page 1
    assert 'subsequently incorporated' in plain_text  # page 2
    assert 'conditions stated in this License. ' in plain_text  # page 3
    assert 'See the License for the specific language governing' in plain_text  # page 4
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check a finished single-page request detected as either 'en' or 'ru'.

    The text expectations depend on the detected language; paragraphs and
    sentences must all carry that same language.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``fn``, ``client`` and ``log`` are resolved from the
    surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf is False
    assert status.tables_extracted is False
    assert status.plain_text_extracted
    assert status.text_structure_extracted

    plain_text = client.get_plain_text(status.request_id)

    # The resulting PDF must consist of exactly one page.
    with client.get_pdf_as_local_file(status.request_id) as pdf_path, \
            pikepdf.open(pdf_path) as pdf_doc:
        assert len(pdf_doc.pages) == 1

    structure: PlainTextStructure = \
        client.get_extracted_text_structure_as_msgpack(status.request_id)
    assert structure.language in ('en', 'ru')

    # Only the text matching the detected language may be present.
    if structure.language == 'en':
        assert 'This is top secret' in plain_text
        assert 'Top.' in plain_text
        assert 'являлся Тор.' not in plain_text
    elif structure.language == 'ru':
        assert 'This is top secret' not in plain_text
        assert 'Top.' not in plain_text
        assert 'являлся Тор.' in plain_text

    assert len(structure.pages) == 1

    assert len(structure.paragraphs) == 1
    for paragraph in structure.paragraphs:
        assert paragraph.language == structure.language

    assert len(structure.sentences) == 3
    for sentence in structure.sentences:
        assert sentence.language == structure.language
    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Check that the request finished with the 'FAILURE' status.

    ``rfile`` carries the JSON-encoded request status; ``headers`` is accepted
    but not used. ``log`` is resolved from the surrounding scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    # This scenario expects the request to be reported as failed.
    assert status.status == 'FAILURE'
    log.info('Text extraction results look good. All assertions passed.')
def get_data_extraction_task_status(self, request_id: str) -> RequestStatus:
    """Fetch the status of a data extraction task.

    Issues a GET request to the task's ``status.json`` endpoint under
    ``self.base_url``, hands the response to ``self.raise_for_status`` and
    then parses the response body with ``RequestStatus.from_json``.

    :param request_id: identifier of the task, interpolated into the URL.
    :return: the parsed ``RequestStatus``.
    """
    status_url = (f'{self.base_url}/api/v1/data_extraction_tasks/'
                  f'{request_id}/status.json')
    response = requests.get(status_url)
    # Let the client's own helper handle the response before parsing it.
    self.raise_for_status(response)
    return RequestStatus.from_json(response.content)