def run(self, blob_name, skip_status_table = False, gt_df = None):
        """Process one document blob: run Form Recognizer layout OCR, persist the
        OCR result, then auto-generate and upload a labels file.

        Parameters
        ----------
        blob_name : str
            Blob path of the form '<doctype>/.../<file_name>'.
        skip_status_table : bool, optional
            When True, the status table is neither queried nor updated.
        gt_df : optional
            Pre-loaded ground truth (presumably a DataFrame); falls back to
            ``self.app_settings.gt_path`` when None.
        """
        # Guard clause: both storage clients are required to do anything.
        if self.container_client is None or self.table_service is None:
            return

        file_content = storage_helpers.download_blob(self.container_client, blob_name)

        # Blob paths look like '<doctype>/.../<file_name>'.
        doctype = blob_name.split('/')[0]
        file_name = blob_name.split('/')[-1]

        def _set_status(status_value):
            # Best-effort progress record in the status table.
            entity = {'PartitionKey': doctype, 'RowKey': file_name, 'status': status_value}
            if storage_helpers.insert_or_replace_entity(self.table_service, self.app_settings.status_table, entity):
                logging.info(f"Updated {blob_name} status in status table.")
            else:
                logging.error(f"Could not update {blob_name} status in status table.")

        status = "new"
        if not skip_status_table:
            status = storage_helpers.query_entity_status(self.table_service, self.app_settings.status_table, doctype, file_name)

        # status == 'done'     -> nothing left to do
        # status == 'ocr-done' -> OCR already ran; only the labels step remains
        if status == 'done':
            return

        ocr_output_path = blob_name + '.ocr.json'
        # BUGFIX: initialize so a failed cached-OCR download below cannot leave
        # this name unbound (previously raised NameError at the labels step).
        analyze_result = None

        if status != 'ocr-done':
            # Creating OCR file for document.
            logging.info(f"Creating OCR file for document {blob_name}...")
            analyze_result = fr_helpers.analyze_layout(self.app_settings.fr_region, self.app_settings.fr_key, file_content, blob_name)
            storage_helpers.upload_blob(self.container_client, ocr_output_path, json.dumps(analyze_result))
            if not skip_status_table:
                _set_status('ocr-done')
        else:
            logging.info(f"OCR file for document {blob_name} already created, getting it from storage.")
            ocr_file = storage_helpers.download_blob(self.container_client, ocr_output_path, 'text')
            if ocr_file is not None:
                analyze_result = json.loads(ocr_file)

        if analyze_result is None:
            logging.error(f"Could not continue processing for blob {blob_name} as analyze result is missing.")
            return

        # Creating labels file for document.
        key_field_names = self.fields
        labels_result, keys = autolabeling.analyze_labels(gt_df if gt_df is not None else self.app_settings.gt_path, blob_name, analyze_result, key_field_names, self.app_settings.lookup_path)
        logging.info(keys)
        if labels_result is not None and len(keys) > 1:
            labels_output_path = blob_name + '.labels.json'
            storage_helpers.upload_blob(self.container_client, labels_output_path, json.dumps(labels_result))
            if not skip_status_table:
                _set_status('done')
# Example 2
    def test_download_bytes_blob_when_blob_name_invalid(self):
        """Downloading bytes fails (returns None) for a nonexistent blob name."""
        downloaded = storage_helpers.download_blob(self.container_client, "abcd")
        assert downloaded is None
# Example 3
    def test_download_bytes_blob_when_valid(self):
        """Downloading bytes succeeds when client and blob name are both valid."""
        downloaded = storage_helpers.download_blob(self.container_client, self.blob_name_download)
        assert downloaded is not None
# Example 4
    def test_download_bytes_blob_when_container_client_invalid(self):
        """Downloading bytes fails (returns None) when the container client is None."""
        downloaded = storage_helpers.download_blob(None, self.blob_name_download)
        assert downloaded is None
    def run(self, doctype, reuse=False):
        """Evaluate the trained supervised model for *doctype* on its test set.

        Runs batch predictions over '<doctype>/test' blobs (or reuses a
        previously stored evaluation file), builds a model-evaluation report,
        uploads it, and returns a response dict with the report.

        Parameters
        ----------
        doctype : str
            Document type; must match a top-level folder in the container.
        reuse : bool or str, optional
            NOTE(review): compared against the *string* 'False' below, so
            callers apparently pass 'False'/'True' strings — confirm before
            changing to a plain boolean test.

        Returns
        -------
        dict or None
            {'text': ..., 'eval': ...} on success; None if evaluation could
            not be produced.

        Raises
        ------
        EnvironmentError
            If no model id is registered for the doc type.
        Warning
            If the doctype folder or its test files are missing.
        """
        folders = storage_helpers.list_doctype_folders(self.container_client)

        if doctype not in folders:
            logging.warning(f"Didn't find {doctype} folder in storage.")
            raise Warning(f"{doctype} not in storage.")

        logging.info(f"Found {doctype} folder in storage.")
        testing_path = doctype + '/test'
        blobs = storage_helpers.list_blobs(self.container_client, testing_path)
        if len(blobs) == 0:
            logging.warning(f"Didn't find any testing files in storage for {doctype}")
            raise Warning(f"No test files.")

        # Model ids are partitioned by environment; look up this doctype's model.
        partition_key = self.app_settings.environment + '_supervised'
        model_id = storage_helpers.query_entity_model(
            self.table_service, self.app_settings.models_table,
            partition_key, doctype)

        if model_id is None:
            logging.error(f"Could not continue as model id could not be retrieved.")
            raise EnvironmentError(f"Could not retrieve model id.")

        logging.info(f"Found model id {model_id} for doc type {doctype}")

        evaluation_output_path = doctype + '/evaluation_file.json'
        # BUGFIX: initialize so a failed cached-evaluation download cannot
        # leave this name unbound (previously raised NameError below).
        evaluation = None

        if reuse == 'False':
            logging.warning("REUSE FALSE")
            # Batch predictions on all test blobs.
            logging.info(f"Predicting for test set...")
            predictions, count_analyzed, count_total = fr_helpers.batch_predictions(
                blobs, model_id,
                self.app_settings.storage_account_url,
                self.app_settings.container, self.app_settings.sas,
                self.app_settings.fr_region,
                self.app_settings.fr_key)
            evaluation = model_evaluation.evaluate(
                predictions, self.app_settings.gt_path,
                self.app_settings.lookup_path, count_analyzed,
                count_total)
            storage_helpers.upload_blob(self.container_client,
                                        evaluation_output_path,
                                        json.dumps(evaluation))
        else:
            logging.info(
                f"Evaluation file for doc type {doctype} already created, getting it from storage."
            )
            evaluation_file = storage_helpers.download_blob(
                self.container_client, evaluation_output_path, 'text')
            if evaluation_file is not None:
                evaluation = json.loads(evaluation_file)

        if evaluation is not None:
            # Build the evaluation report and upload it next to the doc type.
            model_eval_json, mismatches = model_evaluation.create_eval_file(
                evaluation, model_id, self.app_settings.lookup_path)

            response = {}
            response['text'] = f"Evaluation for doc type {doctype} done."
            # Copy before adding mismatches so the response stays lean.
            response['eval'] = model_eval_json.copy()

            model_eval_json['mismatches'] = mismatches
            model_eval_output_path = doctype + '/model_eval.json'
            storage_helpers.upload_blob(self.container_client,
                                        model_eval_output_path,
                                        json.dumps(model_eval_json))

            # Bell sound when the process finishes.
            print("\a")

            return response