def assert_func(rfile, headers):
    """Verify the callback payload and artifacts for the 22-page document.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``fn``, ``client``, ``log`` and ``pikepdf`` are
    not defined here and are expected to come from the enclosing scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    # Flags reported in the callback payload itself.
    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf
    assert status.tables_extracted is False
    assert status.plain_text_extracted
    assert status.text_structure_extracted
    assert status.additional_info == 'hello world'

    # Plain text must contain the per-page marker strings.
    # NOTE(review): range(1, 22) covers pages 1..21 only, although 22 pages
    # are asserted below - confirm that skipping page 22 is intentional.
    plain_text = client.get_plain_text(status.request_id)
    for page_no in range(1, 22):
        assert f'This is page {page_no}' in plain_text

    # The produced PDF must keep all pages.
    with client.get_pdf_as_local_file(status.request_id) as pdf_path, \
            pikepdf.open(pdf_path) as pdf:
        assert len(pdf.pages) == 22

    # Structure extracted from the text.
    structure: PlainTextStructure = client.get_extracted_text_structure_as_msgpack(
        status.request_id)
    assert structure.language == 'en'
    assert len(structure.pages) == 22
    assert len(structure.paragraphs) == 1
    assert len(structure.sentences) > 2

    log.info('Text extraction results look good. All assertions passed.')
def deliver_error(request_id: str,
                  request_callback_info: RequestCallbackInfo,
                  problem: Optional[str] = None,
                  exc: Optional[Exception] = None):
    """Mark a request as failed and deliver the failure status to the caller.

    The stored request metadata is loaded, switched to ``STATUS_FAILURE``,
    extended with ``problem`` / ``exc`` (when either is given) and saved.
    Afterwards a ``RequestStatus`` with the failure status is passed to
    ``deliver_results``.

    If the metadata does not exist, a warning is logged and nothing is
    delivered. If loading or saving the metadata raises, the error is logged
    and the failure status is still delivered.

    :param request_id: id of the failed request.
    :param request_callback_info: callback settings and original file name.
    :param problem: optional problem description appended to the metadata.
    :param exc: optional exception appended to the metadata.
    """
    req: Optional[RequestMetadata] = None
    try:
        req = load_request_metadata(request_id)
        if not req:
            log.warning(
                f'{request_callback_info.original_file_name} | Not delivering error '
                f'because the request files do not exist in storage: '
                f'(#{request_id})\n'
                f'This usually means the request is canceled.')
            return
        req.status = STATUS_FAILURE
        if problem or exc:
            req.append_error(problem, exc)
        save_request_metadata(req)
    except Exception as req_upd_err:
        log.error(
            f'{request_callback_info.original_file_name} | Unable to store failed status into '
            f'metadata of request #{request_id}',
            exc_info=req_upd_err)

    # ``req`` is still None when load_request_metadata() itself raised;
    # reading ``req.output_format`` unconditionally would raise
    # AttributeError here and the failure would never be delivered.
    # NOTE(review): None is used as the fallback output format - confirm
    # that RequestStatus accepts it.
    output_format = req.output_format if req is not None else None

    req_status = RequestStatus(
        request_id=request_id,
        original_file_name=request_callback_info.original_file_name,
        status=STATUS_FAILURE,
        additional_info=request_callback_info.call_back_additional_info,
        output_format=output_format)
    deliver_results(request_callback_info, req_status)
def assert_func(rfile, headers):
    """Verify that the callback reports OCRed pages and a searchable PDF.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``fn`` and ``log`` are expected to come from the
    enclosing scope.
    """
    log.info('Text extraction results are ready...')
    result: RequestStatus = RequestStatus.from_json(rfile)

    assert result.status == 'DONE'
    expected_name = os.path.basename(fn)
    assert expected_name == result.original_file_name
    assert result.pdf_pages_ocred
    assert result.searchable_pdf_created

    log.info('Text extraction results look good. All assertions passed.')
def deliver_results(req: RequestCallbackInfo, req_status: RequestStatus):
    """Send the request status to every callback channel configured in ``req``.

    Two independent, best-effort channels are supported:

    * an HTTP POST of ``req_status.to_dict()`` to ``req.call_back_url``;
    * a celery task sent through ``send_task`` to ``req.call_back_celery_broker``.

    A failure in either channel is logged and does not prevent the other
    channel from being tried. A summary line is logged at the end.

    :param req: callback settings of the request.
    :param req_status: status object to deliver.
    """
    if req.call_back_url:
        try:
            log.info(
                f'{req.original_file_name} | POSTing the extraction results to {req.call_back_url}...'
            )
            response = requests.post(req.call_back_url, json=req_status.to_dict())
            # requests does not raise on 4xx/5xx by itself; without this check
            # a rejected callback would go completely unnoticed.
            response.raise_for_status()
        except Exception as err:
            log.error(
                f'{req.original_file_name} | Unable to POST the extraction results to {req.call_back_url}',
                exc_info=err)

    if req.call_back_celery_broker:
        try:
            log.info(
                f'{req.original_file_name} | Sending the extraction results as a celery task:\n'
                f'broker: {req.call_back_celery_broker}\n'
                f'queue: {req.call_back_celery_queue}\n'
                f'task_name: {req.call_back_celery_task_name}\n')
            send_task(broker_url=req.call_back_celery_broker,
                      queue=req.call_back_celery_queue,
                      task_name=req.call_back_celery_task_name,
                      task_kwargs=req_status.to_dict(),
                      task_id=req.call_back_celery_task_id,
                      parent_task_id=req.call_back_celery_parent_task_id,
                      root_task_id=req.call_back_celery_root_task_id,
                      celery_version=req.call_back_celery_version)
        except Exception as err:
            log.error(
                f'{req.original_file_name} | Unable to send the extraction results as a celery task:\n'
                f'broker: {req.call_back_celery_broker}\n'
                f'queue: {req.call_back_celery_queue}\n'
                f'task_name: {req.call_back_celery_task_name}\n',
                exc_info=err)

    # Join only the labels whose flag is set; joining empty placeholders
    # produced stray separators such as ", , pages OCRed".
    labelled_flags = (
        ('plain text', req_status.plain_text_extracted),
        ('coords extracted', req_status.pdf_coordinates_extracted),
        ('pages OCRed', req_status.pdf_pages_ocred),
    )
    status_extra = ', '.join(label for label, is_set in labelled_flags if is_set)
    log.info(
        f'{req.original_file_name} | Finished processing request (#{req.request_id}). {status_extra}'
    )
def assert_func(rfile, headers):
    """Verify a run that produced a searchable PDF and text, but no tables.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``fn`` and ``log`` are expected to come from the
    enclosing scope.
    """
    log.info('Text extraction results are ready...')
    request_status: RequestStatus = RequestStatus.from_json(rfile)

    assert request_status.status == 'DONE'
    source_name = os.path.basename(fn)
    assert source_name == request_status.original_file_name

    # No conversion to a cleaned PDF is expected, but a searchable PDF is.
    assert request_status.converted_cleaned_pdf is False
    assert request_status.searchable_pdf_created

    # Text artifacts are expected, tables are not.
    assert not request_status.tables_extracted
    assert request_status.plain_text_extracted
    assert request_status.text_structure_extracted

    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Verify a run that extracted tables in addition to text.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``fn``, ``client`` and ``log`` are expected to
    come from the enclosing scope.
    """
    log.info('Text extraction results are ready...')
    outcome: RequestStatus = RequestStatus.from_json(rfile)

    assert outcome.status == 'DONE'
    assert os.path.basename(fn) == outcome.original_file_name
    assert outcome.converted_cleaned_pdf is False
    assert outcome.searchable_pdf_created
    assert outcome.tables_extracted
    assert outcome.plain_text_extracted
    assert outcome.text_structure_extracted

    # Fetch the extracted tables and check how many were found.
    extracted: TableList = client.get_extracted_tables_as_msgpack(
        outcome.request_id)
    table_count = len(extracted.tables)
    assert table_count == 6

    log.info('Text extraction results look good. All assertions passed.')
def to_request_status(self) -> RequestStatus:
    """Build a ``RequestStatus`` from the fields of this object.

    Identification, status, error message, OCR flag, output format and page
    rotation angles are copied as they are. Each ``*_extracted`` /
    ``*_created`` flag is ``True`` when the corresponding attribute of this
    object is not ``None``.
    """
    # Flag name in RequestStatus -> whether the matching attribute is set.
    artifact_flags = {
        'converted_cleaned_pdf': self.converted_to_pdf is not None,
        'searchable_pdf_created': self.ocred_pdf is not None,
        'corrected_pdf_created': self.corrected_pdf is not None,
        'tables_extracted': self.tables_file is not None,
        'plain_text_extracted': self.plain_text_file is not None,
        'text_structure_extracted': self.text_structure_file is not None,
        'pdf_coordinates_extracted': self.pdf_coordinates_file is not None,
    }
    callback_info = self.request_callback_info
    return RequestStatus(
        request_id=self.request_id,
        original_file_name=self.original_file_name,
        status=self.status,
        error_message=self.error_message,
        pdf_pages_ocred=self.pdf_pages_ocred,
        additional_info=callback_info.call_back_additional_info,
        output_format=self.output_format,
        page_rotate_angles=self.page_rotate_angles,
        **artifact_flags,
    )
def assert_func(rfile, headers):
    """Verify text extraction for the 4-page license document.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``fn``, ``client`` and ``log`` are expected to
    come from the enclosing scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf
    assert status.plain_text_extracted
    assert status.text_structure_extracted
    assert status.additional_info == 'hello world'

    extracted_text = client.get_plain_text(status.request_id)
    structure: PlainTextStructure = client.get_extracted_text_structure_as_msgpack(
        status.request_id)
    assert len(structure.pages) == 4

    # One characteristic fragment per page of the document.
    assert 'REPRODUCTION, AND DISTRIBUTION' in extracted_text  # page 1
    assert 'subsequently incorporated' in extracted_text  # page 2
    assert 'conditions stated in this License. ' in extracted_text  # page 3
    assert 'See the License for the specific language governing' in extracted_text  # page 4

    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Verify a single-page run where the detected language is 'en' or 'ru'.

    The expected text fragments depend on which of the two languages was
    detected. ``rfile`` is parsed with ``RequestStatus.from_json``;
    ``headers`` is accepted but not used. ``fn``, ``client``, ``log`` and
    ``pikepdf`` are expected to come from the enclosing scope.
    """
    log.info('Text extraction results are ready...')
    status: RequestStatus = RequestStatus.from_json(rfile)

    assert status.status == 'DONE'
    assert os.path.basename(fn) == status.original_file_name
    assert status.converted_cleaned_pdf is False
    assert status.tables_extracted is False
    assert status.plain_text_extracted
    assert status.text_structure_extracted

    plain_text = client.get_plain_text(status.request_id)

    with client.get_pdf_as_local_file(status.request_id) as pdf_path, \
            pikepdf.open(pdf_path) as pdf:
        assert len(pdf.pages) == 1

    structure: PlainTextStructure = client.get_extracted_text_structure_as_msgpack(
        status.request_id)
    assert structure.language in ('en', 'ru')

    # The text must contain only the fragments of the detected language.
    if structure.language == 'en':
        assert 'This is top secret' in plain_text
        assert 'Top.' in plain_text
        assert 'являлся Тор.' not in plain_text
    elif structure.language == 'ru':
        assert 'This is top secret' not in plain_text
        assert 'Top.' not in plain_text
        assert 'являлся Тор.' in plain_text

    assert len(structure.pages) == 1

    # Every paragraph and sentence must carry the document-level language.
    assert len(structure.paragraphs) == 1
    for paragraph in structure.paragraphs:
        assert paragraph.language == structure.language

    assert len(structure.sentences) == 3
    for sentence in structure.sentences:
        assert sentence.language == structure.language

    log.info('Text extraction results look good. All assertions passed.')
def assert_func(rfile, headers):
    """Verify that the callback reports a failed request.

    ``rfile`` is parsed with ``RequestStatus.from_json``; ``headers`` is
    accepted but not used. ``log`` is expected to come from the enclosing
    scope.
    """
    log.info('Text extraction results are ready...')
    reported: RequestStatus = RequestStatus.from_json(rfile)
    assert reported.status == 'FAILURE'
    log.info('Text extraction results look good. All assertions passed.')
def get_data_extraction_task_status(self, request_id: str) -> RequestStatus:
    """Fetch the status of a data extraction task.

    Issues a GET to ``<base_url>/api/v1/data_extraction_tasks/<request_id>/status.json``,
    passes the response to ``self.raise_for_status`` and parses the response
    body with ``RequestStatus.from_json``.

    :param request_id: id of the request whose status is fetched.
    :return: the parsed ``RequestStatus``.
    """
    response = requests.get(
        f'{self.base_url}/api/v1/data_extraction_tasks/{request_id}/status.json')
    self.raise_for_status(response)
    return RequestStatus.from_json(response.content)