def run(self, blob_name, skip_status_table=False, gt_df=None):

    if self.container_client is not None and self.table_service is not None:

        file_content = storage_helpers.download_blob(self.container_client, blob_name)

        # Check document status to see if it was already processed
        doctype = blob_name.split('/')[0]
        file_name = blob_name.split('/')[-1]
        status = "new"
        if not skip_status_table:
            status = storage_helpers.query_entity_status(
                self.table_service, self.app_settings.status_table, doctype, file_name)

        # If status is "done", we do nothing; if status is "ocr-done", we only find labels
        if status != 'done':

            ocr_output_path = blob_name + '.ocr.json'
            analyze_result = None

            if status != 'ocr-done':
                # Creating OCR file for document
                logging.info(f"Creating OCR file for document {blob_name}...")
                analyze_result = fr_helpers.analyze_layout(
                    self.app_settings.fr_region, self.app_settings.fr_key, file_content, blob_name)
                analyze_result_string = json.dumps(analyze_result)
                storage_helpers.upload_blob(self.container_client, ocr_output_path, analyze_result_string)

                # Updating status
                if not skip_status_table:
                    entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'ocr-done'}
                    if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                        logging.info(f"Updated {blob_name} status in status table.")
                    else:
                        logging.error(f"Could not update {blob_name} status in status table.")

            else:
                logging.info(f"OCR file for document {blob_name} already created, getting it from storage.")
                ocr_file = storage_helpers.download_blob(self.container_client, ocr_output_path, 'text')
                if ocr_file is not None:
                    analyze_result = json.loads(ocr_file)

            # Creating labels file for document
            if analyze_result is not None:
                key_field_names = self.fields
                labels_result, keys = autolabeling.analyze_labels(
                    gt_df if gt_df is not None else self.app_settings.gt_path,
                    blob_name, analyze_result, key_field_names, self.app_settings.lookup_path)
                logging.info(keys)
                if labels_result is not None and len(keys) > 1:
                    labels_output_path = blob_name + '.labels.json'
                    labels_result_string = json.dumps(labels_result)
                    storage_helpers.upload_blob(self.container_client, labels_output_path, labels_result_string)

                    # Updating status
                    if not skip_status_table:
                        entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': 'done'}
                        if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                            logging.info(f"Updated {blob_name} status in status table.")
                        else:
                            logging.error(f"Could not update {blob_name} status in status table.")
            else:
                logging.error(f"Could not continue processing for blob {blob_name} as analyze result is missing.")
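# Usage sketch (illustrative only, not part of the repo): the class and constructor names
# below are assumptions; only run(), storage_helpers.list_blobs and the '<doctype>/<file>'
# blob layout come from the surrounding code.
#
#   processor = Processor(app_settings, container_client, table_service, fields)   # hypothetical wiring
#   for blob_name in storage_helpers.list_blobs(container_client, 'invoices'):     # 'invoices' is an example doc type
#       processor.run(blob_name, skip_status_table=False)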
def test_download_bytes_blob_when_blob_name_invalid(self):
    # Expecting failure when blob name is invalid and we are downloading bytes
    result = storage_helpers.download_blob(self.container_client, "abcd")
    assert result is None
def test_download_bytes_blob_when_valid(self):
    # Expecting success when all parameters are valid and we are downloading bytes
    result = storage_helpers.download_blob(self.container_client, self.blob_name_download)
    assert result is not None
def test_download_bytes_blob_when_container_client_invalid(self):
    # Expecting failure when container client is invalid and we are downloading bytes
    result = storage_helpers.download_blob(None, self.blob_name_download)
    assert result is None
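# Hedged sketch of an extra test, not in the original suite: it exercises the optional third
# argument ('text') that the processing code passes to download_blob, and assumes the same
# fixture blob (self.blob_name_download) can also be downloaded in that mode.
def test_download_text_blob_when_valid(self):
    # Expecting success when all parameters are valid and we are downloading text
    result = storage_helpers.download_blob(
        self.container_client, self.blob_name_download, 'text')
    assert result is not None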
def run(self, doctype, reuse=False):

    folders = storage_helpers.list_doctype_folders(self.container_client)

    if doctype in folders:
        logging.info(f"Found {doctype} folder in storage.")
        testing_path = doctype + '/test'
        blobs = storage_helpers.list_blobs(self.container_client, testing_path)

        if len(blobs) > 0:

            # Getting model ID from doctype name
            partition_key = self.app_settings.environment + '_supervised'
            model_id = storage_helpers.query_entity_model(
                self.table_service, self.app_settings.models_table, partition_key, doctype)

            if model_id is not None:
                logging.info(f"Found model id {model_id} for doc type {doctype}")
                evaluation_output_path = doctype + '/evaluation_file.json'
                evaluation = None

                # "reuse" may arrive as a boolean or as the string 'False'
                if reuse in (False, 'False'):
                    logging.warning("REUSE FALSE")
                    # Batch predictions on all test blobs
                    logging.info("Predicting for test set...")
                    predictions, count_analyzed, count_total = fr_helpers.batch_predictions(
                        blobs, model_id, self.app_settings.storage_account_url, self.app_settings.container,
                        self.app_settings.sas, self.app_settings.fr_region, self.app_settings.fr_key)
                    evaluation = model_evaluation.evaluate(
                        predictions, self.app_settings.gt_path, self.app_settings.lookup_path,
                        count_analyzed, count_total)
                    evaluation_file = json.dumps(evaluation)
                    storage_helpers.upload_blob(self.container_client, evaluation_output_path, evaluation_file)
                else:
                    logging.info(f"Evaluation file for doc type {doctype} already created, getting it from storage.")
                    evaluation_file = storage_helpers.download_blob(self.container_client, evaluation_output_path, 'text')
                    if evaluation_file is not None:
                        evaluation = json.loads(evaluation_file)

                if evaluation is not None:
                    model_eval_json, mismatches = model_evaluation.create_eval_file(
                        evaluation, model_id, self.app_settings.lookup_path)
                    response = {}
                    response['text'] = f"Evaluation for doc type {doctype} done."
                    response['eval'] = model_eval_json.copy()
                    model_eval_json['mismatches'] = mismatches
                    model_eval_file = json.dumps(model_eval_json)
                    model_eval_output_path = doctype + '/model_eval.json'
                    storage_helpers.upload_blob(self.container_client, model_eval_output_path, model_eval_file)

                    # Bell sound when the process finishes
                    print("\a")
                    return response

            else:
                logging.error("Could not continue as model id could not be retrieved.")
                raise EnvironmentError("Could not retrieve model id.")
        else:
            logging.warning(f"Didn't find any testing files in storage for {doctype}")
            raise Warning("No test files.")
    else:
        logging.warning(f"Didn't find {doctype} folder in storage.")
        raise Warning(f"{doctype} not in storage.")
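# Illustrative call (a sketch; the "Evaluator" class name and constructor are assumptions):
# run() returns a dict with 'text' and 'eval' keys and uploads <doctype>/model_eval.json
# alongside <doctype>/evaluation_file.json in the container.
#
#   evaluator = Evaluator(app_settings, container_client, table_service)   # hypothetical wiring
#   response = evaluator.run('invoices', reuse=False)                      # 'invoices' is an example doc type
#   print(response['text'])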