# Imports inferred from the calls in this section; the `utils` module and the
# `run_automl_single` / `run_automl_text` helpers are defined elsewhere in the
# repository.
import logging
import os
import re
import tempfile

from google.cloud import automl_v1beta1 as automl
from google.cloud import bigquery
from google.cloud import storage
from PIL import Image

import utils

logger = logging.getLogger(__name__)


def classify_write(bucket_name, prefix, selected_pdf_folder, prediction_client,
                   storage_client, bq_client, bq_dataset, bq_table,
                   score_threshold, service_account, input_path,
                   model_full_id):
    """Classifies PNG pages with AutoML Vision and writes the results to BigQuery."""
    bucket = storage_client.bucket(bucket_name)
    params = {}
    schema = [
        bigquery.SchemaField('file', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('class', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('class_confidence', 'STRING', mode='REQUIRED'),
    ]
    table = utils.create_table(bq_client, bq_dataset, bq_table, schema)

    if score_threshold:
        params = {"score_threshold": str(score_threshold)}

    for blob in bucket.list_blobs(prefix=str(prefix + "/")):
        if not blob.name.endswith(".png"):
            continue
        logger.info(os.path.basename(blob.name))
        content = utils.sample_handler(storage_client, bucket_name, blob.name)
        payload = {"image": {"image_bytes": content}}
        response = prediction_client.predict(model_full_id, payload, params)

        for result in response.payload:
            logger.info("File location: {}".format(
                os.path.join('gs://', bucket_name, blob.name)))
            logger.info("Predicted class name: {}".format(result.display_name))
            logger.info("Predicted class score: {}\n".format(
                result.classification.score))

            if result.display_name != "datasheets":
                # Copy the PDF version from the input folder to the
                # selected_pdf_folder.
                filename = os.path.basename(blob.name).replace('.png', '.pdf')
                input_pdf_path = os.path.join(input_path, filename)
                selected_pdf_path = os.path.join(selected_pdf_folder, filename)
                bucket_input, blob_input = utils.get_bucket_blob(
                    input_pdf_path)
                bucket_output, blob_output = utils.get_bucket_blob(
                    selected_pdf_path)
                utils.copy_blob(bucket_input, blob_input, bucket_output,
                                blob_output, service_account)

            rows_to_insert = [
                (str(blob.name).replace(".png", ".pdf").replace(
                    prefix, "").replace("/", ""),
                 result.display_name,
                 result.classification.score),
            ]
            bq_client.insert_rows(table, rows_to_insert)
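# The `utils` helpers used above are defined elsewhere in the repository. As an
# illustration only, a minimal sketch of what `utils.get_bucket_blob` might do
# (the real implementation may differ): split a `gs://bucket/path/to/blob` URI
# into its bucket name and blob path.
#
#   def get_bucket_blob(gcs_uri):
#       # Hypothetical sketch, not the repository's implementation.
#       without_scheme = gcs_uri.replace("gs://", "", 1)
#       bucket_name, _, blob_path = without_scheme.partition("/")
#       return bucket_name, blob_path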
def predict(main_project_id, input_path, demo_dataset, demo_table, model_id,
            service_acct, compute_region, score_threshold=0.5):
    """Reads the PNG pages, classifies them, and copies the PDF version of the
    non-datasheet ones to a new folder.

    Args:
      main_project_id: Project ID where the AutoML model lives.
      input_path: GCS path to the folder containing the input PDFs.
      demo_dataset: Existing BigQuery dataset (and GCS sub-folder) that
        contains the table the results will be written to.
      demo_table: BigQuery table that the results will be written to.
      model_id: ID of the AutoML classification model.
      service_acct: Path to the service account key used to access AutoML,
        BigQuery and Cloud Storage.
      compute_region: Compute region of the AutoML model.
      score_threshold: The minimum confidence required for AutoML to return
        a prediction.
    """
    logger.info("Starting image classification.")
    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_folder_png = f"gs://{input_bucket_name}/{demo_dataset}/png"
    selected_pdf_folder = f"gs://{input_bucket_name}/{demo_dataset}/valid_pdf"

    # Set up the client for the AutoML Vision model.
    # Note: the service account needs AutoML permission within the
    # pdf-processing-219114 project.
    automl_client = automl.AutoMlClient.from_service_account_json(service_acct)
    model_full_id = automl_client.model_path(
        main_project_id, compute_region, model_id)
    prediction_client = automl.PredictionServiceClient.from_service_account_json(
        service_acct)

    # Set up clients for BigQuery and Cloud Storage.
    storage_client = storage.Client.from_service_account_json(service_acct)
    bq_client = bigquery.Client.from_service_account_json(service_acct)

    bucket_name, file_name = utils.get_bucket_blob(input_folder_png)

    classify_write(
        bucket_name,
        file_name,
        selected_pdf_folder,
        prediction_client,
        storage_client,
        bq_client,
        demo_dataset,
        demo_table,
        score_threshold,
        service_acct,
        input_path,
        model_full_id,
    )
    logger.info("Image classification finished.\n")
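# Illustrative call of the image-classification step. All values below are
# placeholders (this sketch assumes a GCS layout of gs://<bucket>/<demo_dataset>/png
# produced by an earlier conversion step); substitute your own project, model
# and key paths.
#
#   predict(
#       main_project_id="my-gcp-project",        # placeholder
#       input_path="gs://my-bucket/pdfs",        # placeholder
#       demo_dataset="pdf_demo",                 # placeholder
#       demo_table="image_classification",       # placeholder
#       model_id="ICN0000000000000000000",       # placeholder AutoML Vision model ID
#       service_acct="/path/to/key.json",        # placeholder
#       compute_region="us-central1",
#       score_threshold=0.5,
#   )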
def predict(main_project_id, input_path, demo_dataset, demo_table, model_id,
            service_acct, compute_region, config):
    """Runs AutoML NER on a folder of OCR text files and writes the results to BigQuery.

    Args:
      main_project_id: Project ID where the AutoML NER model lives.
      input_path: GCS path to the folder containing the input PDFs.
      demo_dataset: BigQuery dataset (and GCS sub-folder) used by the demo.
      demo_table: BigQuery table the NER results are written to.
      model_id: AutoML model ID (NER).
      service_acct: Path to the service account key used to access the NER
        model, BigQuery and Cloud Storage.
      compute_region: Compute region of the NER model.
      config: Parsed configuration; the field names to extract are read from
        config["model_ner"]["fields_to_extract"].
    """
    logger.info('Starting entity extraction.')
    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_txt_folder = f"gs://{input_bucket_name}/{demo_dataset}/txt"

    list_fields = [
        x['field_name'] for x in config["model_ner"]["fields_to_extract"]
    ]
    list_fields.remove('gcs_path')

    storage_client = storage.Client.from_service_account_json(service_acct)
    bucket_name, path = utils.get_bucket_blob(input_txt_folder)
    bucket = storage_client.get_bucket(bucket_name)

    list_results = []
    for blob in bucket.list_blobs(prefix=path):
        full_filename = os.path.join(input_txt_folder,
                                     os.path.basename(blob.name))
        logger.info(full_filename)
        result = run_automl_single(ocr_path=full_filename,
                                   list_fields=list_fields,
                                   service_acct=service_acct,
                                   model_id=model_id,
                                   main_project_id=main_project_id,
                                   compute_region=compute_region)
        list_results.append(result)

    schema = [bigquery.SchemaField('file', 'STRING', mode='NULLABLE')]
    for field in list_fields:
        schema.append(bigquery.SchemaField(field, 'STRING', mode='NULLABLE'))

    utils.save_to_bq(demo_dataset, demo_table, list_results, service_acct,
                     _create_table=True, schema=schema)
    logger.info('Entity extraction finished.\n')
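# Illustrative call of the entity-extraction step. Based on the code above, the
# `config` argument is assumed to contain a "model_ner" section whose
# "fields_to_extract" list includes a 'gcs_path' entry plus the entity fields;
# the field names and all other values below are placeholders, not values from
# this repository.
#
#   example_config = {
#       "model_ner": {
#           "fields_to_extract": [
#               {"field_name": "gcs_path"},
#               {"field_name": "manufacturer"},   # hypothetical field
#               {"field_name": "part_number"},    # hypothetical field
#           ]
#       }
#   }
#   predict(
#       main_project_id="my-gcp-project",
#       input_path="gs://my-bucket/pdfs",
#       demo_dataset="pdf_demo",
#       demo_table="ner_results",
#       model_id="TEN0000000000000000000",       # placeholder AutoML NER model ID
#       service_acct="/path/to/key.json",
#       compute_region="us-central1",
#       config=example_config,
#   )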
def predict(main_project_id, input_path, demo_dataset, demo_table, model_id,
            service_acct, compute_region):
    """Runs the AutoML Text classifier on a GCS folder and pushes the results to BigQuery."""
    logger.info("Starting text classification.\n")
    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_txt_folder = f"gs://{input_bucket_name}/{demo_dataset}/txt"

    # Set up the Cloud Storage client.
    storage_client = storage.Client.from_service_account_json(service_acct)
    bucket_name, path = utils.get_bucket_blob(input_txt_folder)
    bucket = storage_client.get_bucket(bucket_name)

    results = []
    for document_path in bucket.list_blobs(prefix=path):
        logger.info('Extracting the subject for file: {}'.format(
            document_path.name))
        document_abs_path = os.path.join('gs://', bucket_name,
                                         document_path.name)
        content = utils.download_string(document_abs_path,
                                        service_acct).read()
        subject, score = run_automl_text(content, main_project_id, model_id,
                                         service_acct, compute_region)
        logger.info(f"Predicted subject: {subject}.")
        logger.info(f"Predicted class score: {score}.")

        results.append({
            'file': os.path.basename(document_abs_path.replace('.txt', '.pdf')),
            'subject': subject,
            'score': score
        })

    schema = [
        bigquery.SchemaField('file', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('subject', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('score', 'FLOAT', mode='NULLABLE'),
    ]
    utils.save_to_bq(demo_dataset, demo_table, results, service_acct,
                     _create_table=True, schema=schema)
    logger.info('Text classification finished.\n')
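# Illustrative call of the text-classification step; all values are
# placeholders.
#
#   predict(
#       main_project_id="my-gcp-project",
#       input_path="gs://my-bucket/pdfs",
#       demo_dataset="pdf_demo",
#       demo_table="text_classification",
#       model_id="TCN0000000000000000000",       # placeholder AutoML Text model ID
#       service_acct="/path/to/key.json",
#       compute_region="us-central1",
#   )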
def detect_object(gcs_image_folder, gcs_cropped_image_folder, main_project_id,
                  model_id, bq_dataset_output, bq_table_output,
                  prediction_client, storage_client, bq_client):
    """Runs AutoML object detection on PNG pages, crops the detected figures
    and writes the bounding boxes to BigQuery."""
    match = re.match(r'gs://([^/]+)/(.+)', gcs_image_folder)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.bucket(bucket_name)
    schema = [
        bigquery.SchemaField('file', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('object', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('confidence', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('x_min', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('x_max', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('y_min', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('y_max', 'STRING', mode='REQUIRED'),
    ]
    table = utils.create_table(bq_client, bq_dataset_output, bq_table_output,
                               schema)

    for blob in bucket.list_blobs(prefix=str(prefix + "/")):
        if not blob.name.endswith(".png"):
            continue
        logger.info("File location: {}".format(
            os.path.join('gs://', bucket_name, blob.name)))
        content = utils.sample_handler(storage_client, bucket_name, blob.name)
        # Note: the model path is hard-coded to the us-central1 region.
        name = 'projects/{}/locations/us-central1/models/{}'.format(
            main_project_id, model_id)
        payload = {'image': {'image_bytes': content}}
        params = {}
        response = prediction_client.predict(name, payload, params)

        for result in response.payload:
            logger.info("Figure detected in file.")
            bounding_box = result.image_object_detection.bounding_box
            rows_to_insert = [
                (str(blob.name).replace(".png", ".pdf").replace(
                    prefix, "").replace("/", ""),
                 result.display_name,
                 result.image_object_detection.score,
                 bounding_box.normalized_vertices[0].x,
                 bounding_box.normalized_vertices[1].x,
                 bounding_box.normalized_vertices[0].y,
                 bounding_box.normalized_vertices[1].y),
            ]
            bq_client.insert_rows(table, rows_to_insert)

            # Crop the detected object and save the cropped part as a
            # separate image file.
            _, temp_local_filename = tempfile.mkstemp()
            blob.download_to_filename(temp_local_filename)
            im = Image.open(temp_local_filename)
            width, height = im.size
            r_xmin = width * bounding_box.normalized_vertices[0].x
            r_ymin = height * bounding_box.normalized_vertices[0].y
            r_xmax = width * bounding_box.normalized_vertices[1].x
            r_ymax = height * bounding_box.normalized_vertices[1].y
            box = (r_xmin, r_ymin, r_xmax, r_ymax)
            im2 = im.crop(box)
            # Overwrite the temp file with the crop before uploading it.
            im2.save(temp_local_filename, 'png')

            # Upload the cropped image to the GCS bucket.
            new_file_name = os.path.join(
                gcs_cropped_image_folder,
                os.path.basename(blob.name).replace('.png', '-crop.png'))
            _, new_file_name = utils.get_bucket_blob(new_file_name)
            new_blob = blob.bucket.blob(new_file_name)
            new_blob.upload_from_filename(temp_local_filename)
            os.remove(temp_local_filename)
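# Illustrative call of the object-detection step. The clients are built the
# same way as in the image-classification `predict` above; all values below
# are placeholders. Remember that the model path inside `detect_object` is
# hard-coded to the us-central1 region.
#
#   storage_client = storage.Client.from_service_account_json("/path/to/key.json")
#   bq_client = bigquery.Client.from_service_account_json("/path/to/key.json")
#   prediction_client = automl.PredictionServiceClient.from_service_account_json(
#       "/path/to/key.json")
#   detect_object(
#       gcs_image_folder="gs://my-bucket/pdf_demo/png",
#       gcs_cropped_image_folder="gs://my-bucket/pdf_demo/cropped",
#       main_project_id="my-gcp-project",
#       model_id="IOD0000000000000000000",       # placeholder object-detection model ID
#       bq_dataset_output="pdf_demo",
#       bq_table_output="object_detection",
#       prediction_client=prediction_client,
#       storage_client=storage_client,
#       bq_client=bq_client,
#   )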