Example 1
def classify_write(bucket_name, prefix, selected_pdf_folder, prediction_client,
                   storage_client, bq_client, bq_dataset, bq_table,
                   score_threshold, service_account, input_path,
                   model_full_id):
    bucket = storage_client.bucket(bucket_name)
    params = {}

    schema = [
        bigquery.SchemaField('file', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('class', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('class_confidence', 'STRING', mode='REQUIRED'),
    ]
    table = utils.create_table(bq_client, bq_dataset, bq_table, schema)
    if score_threshold:
        params = {"score_threshold": str(score_threshold)}

    for blob in bucket.list_blobs(prefix=str(prefix + "/")):
        if blob.name.endswith(".png"):
            logger.info(os.path.basename(blob.name))
            content = utils.sample_handler(storage_client, bucket_name,
                                           blob.name)
            payload = {"image": {"image_bytes": content}}
            response = prediction_client.predict(model_full_id, payload,
                                                 params)
            for result in response.payload:
                logger.info("File location: {}".format(
                    os.path.join('gs://', bucket_name, blob.name)))
                logger.info("Predicted class name: {}".format(
                    result.display_name))
                logger.info("Predicted class score: {}\n".format(
                    result.classification.score))

                if result.display_name != "datasheets":
                    # Copy from the pdf folder to the selected_pdf_folder
                    filename = os.path.basename(blob.name).replace(
                        '.png', '.pdf')
                    input_pdf_path = os.path.join(input_path, filename)
                    selected_pdf_path = os.path.join(selected_pdf_folder,
                                                     filename)
                    bucket_input, blob_input = utils.get_bucket_blob(
                        input_pdf_path)
                    bucket_output, blob_output = utils.get_bucket_blob(
                        selected_pdf_path)

                    utils.copy_blob(bucket_input, blob_input, bucket_output,
                                    blob_output, service_account)

                rows_to_insert = [
                    (str(blob.name).replace(".png", ".pdf").replace(
                        prefix, "").replace("/", ""), result.display_name,
                     result.classification.score),
                ]
                errors = bq_client.insert_rows(table, rows_to_insert)
                if errors:
                    logger.error("BigQuery insert errors: {}".format(errors))
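
The classify_write snippet relies on helpers from a project-local utils module that are not shown here. Below is a minimal sketch of what utils.create_table and utils.sample_handler might look like, inferred only from the call sites above; the bodies are assumptions, not the original implementation:

from google.api_core.exceptions import NotFound
from google.cloud import bigquery


def create_table(bq_client, dataset_id, table_id, schema):
    """Returns the BigQuery table, creating it with the given schema if it does not exist (sketch)."""
    table_ref = bq_client.dataset(dataset_id).table(table_id)
    try:
        return bq_client.get_table(table_ref)
    except NotFound:
        return bq_client.create_table(bigquery.Table(table_ref, schema=schema))


def sample_handler(storage_client, bucket_name, blob_name):
    """Downloads a blob's raw bytes so they can be passed to the AutoML predict call (sketch)."""
    return storage_client.bucket(bucket_name).blob(blob_name).download_as_string()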
Example 2
def predict(main_project_id,
            input_path,
            demo_dataset,
            demo_table,
            model_id,
            service_acct,
            compute_region,
            score_threshold=0.5):
    """Reads some PNG, classifies them and copies the non-datasheet ones (PDF version) to new folder.

    Args:
      input_folder_png: Path to the folder containing images.
      input_path: Path to the folder containing pdfs.
      selected_pdf_folder: Folder where to put the valid pdfs.
      main_project_id: Project ID where the model lives.
      model_id: ID of the AutoML classification model.
      bq_dataset: Existing BigQuery dataset that contains the table that the results will be written to.
      bq_table: BigQuery table that the results will be written to.
      service_account: API key needed to access BigQuery.
      score_threshold: The required confidence level for AutoML to make a prediction.
      compute_region: Compute region for AutoML model.
    """
    logger.info("Starting image classification.")
    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_folder_png = f"gs://{input_bucket_name}/{demo_dataset}/png"

    selected_pdf_folder = f"gs://{input_bucket_name}/{demo_dataset}/valid_pdf"

    # Set up the client for the AutoML Vision model.
    # Note: the service account needs AutoML permission in the project that hosts the model (pdf-processing-219114).
    automl_client = automl.AutoMlClient.from_service_account_json(service_acct)
    model_full_id = automl_client.model_path(
        main_project_id, compute_region, model_id)
    prediction_client = automl.PredictionServiceClient.from_service_account_json(
        service_acct)

    # Set up client for BigQuery and GCS.
    storage_client = storage.Client.from_service_account_json(
        service_acct)
    bq_client = bigquery.Client.from_service_account_json(
        service_acct)

    bucket_name, file_name = utils.get_bucket_blob(input_folder_png)
    classify_write(
        bucket_name,
        file_name,
        selected_pdf_folder,
        prediction_client,
        storage_client,
        bq_client,
        demo_dataset,
        demo_table,
        score_threshold,
        service_acct,
        input_path,
        model_full_id,
    )
    logger.info("Image classification finished.\n")
Example 3
def predict(main_project_id, input_path, demo_dataset, demo_table, model_id,
            service_acct, compute_region, config):
    """Runs AutoML NER on a folder and writes results to BigQuery.

  Args:
    gcs_ocr_text_folder: JSON folder (outputs of OCR).    
    dataset_bq: BiqQuery dataset name.
    table_bq_output: BigQuery table where the ner results are written to.
    project_id_ner: Project ID for AutoML Ner.
    project_id_bq: Project ID for BigQuery Table.
    ner_model_id: AutoML Model ID (NER).
    list_fields: List of field_names to extract (list of string).
    service_account_ner: Location of service account key to access the NER model.
    service_account_gcs_bq: Location of service account key to access BQ and Storage.
    compute_region: Compute Region for NER model.
  """
    logger.info('Starting entity extraction.')

    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_txt_folder = f"gs://{input_bucket_name}/{demo_dataset}/txt"

    list_fields = [
        x['field_name'] for x in config["model_ner"]["fields_to_extract"]
    ]
    list_fields.remove('gcs_path')

    storage_client = storage.Client.from_service_account_json(service_acct)
    bucket_name, path = utils.get_bucket_blob(input_txt_folder)
    bucket = storage_client.get_bucket(bucket_name)

    list_results = []
    for file in bucket.list_blobs(prefix=path):
        full_filename = os.path.join(input_txt_folder,
                                     os.path.basename(file.name))
        logger.info(full_filename)
        result = run_automl_single(ocr_path=full_filename,
                                   list_fields=list_fields,
                                   service_acct=service_acct,
                                   model_id=model_id,
                                   main_project_id=main_project_id,
                                   compute_region=compute_region)
        list_results.append(result)

    schema = [bigquery.SchemaField('file', 'STRING', mode='NULLABLE')]
    for field in list_fields:
        schema.append(bigquery.SchemaField(field, 'STRING', mode='NULLABLE'))

    utils.save_to_bq(demo_dataset,
                     demo_table,
                     list_results,
                     service_acct,
                     _create_table=True,
                     schema=schema)

    logger.info('Entity extraction finished.\n')
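
The results are persisted through utils.save_to_bq, which is also not shown. A minimal sketch, assuming it takes a list of row dicts and optionally creates the destination table first; the body is inferred from the call site and is an assumption, not the original helper:

from google.api_core.exceptions import Conflict
from google.cloud import bigquery


def save_to_bq(dataset_id, table_id, rows, service_acct,
               _create_table=False, schema=None):
    """Streams a list of row dicts into BigQuery, creating the table first if requested (sketch)."""
    bq_client = bigquery.Client.from_service_account_json(service_acct)
    table_ref = bq_client.dataset(dataset_id).table(table_id)
    if _create_table:
        try:
            bq_client.create_table(bigquery.Table(table_ref, schema=schema))
        except Conflict:
            pass  # Table already exists.
    table = bq_client.get_table(table_ref)
    errors = bq_client.insert_rows(table, rows)
    if errors:
        raise RuntimeError("BigQuery insert failed: {}".format(errors))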
Example 4
def predict(main_project_id, input_path, demo_dataset, demo_table, model_id,
            service_acct, compute_region):
    """Runs AutoML Text classifier on a GCS folder and pushes results to BigQuery."""
    logger.info("Starting text classification.\n")
    input_bucket_name = input_path.replace('gs://', '').split('/')[0]
    input_txt_folder = f"gs://{input_bucket_name}/{demo_dataset}/txt"

    # Set up storage client
    storage_client = storage.Client.from_service_account_json(service_acct)
    bucket_name, path = utils.get_bucket_blob(input_txt_folder)
    bucket = storage_client.get_bucket(bucket_name)

    results = []
    for document_path in bucket.list_blobs(prefix=path):
        logger.info('Extracting the subject for file: {}'.format(
            document_path.name))
        document_abs_path = os.path.join('gs://', bucket_name,
                                         document_path.name)
        content = utils.download_string(document_abs_path, service_acct).read()
        subject, score = run_automl_text(content, main_project_id, model_id,
                                         service_acct, compute_region)
        logger.info(f"Predicted subject: {subject}.")
        logger.info(f"Predicted class score: {score}.")

        results.append({
            'file': os.path.basename(document_abs_path.replace('.txt', '.pdf')),
            'subject': subject,
            'score': score,
        })

    schema = [
        bigquery.SchemaField('file', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('subject', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('score', 'FLOAT', mode='NULLABLE'),
    ]
    utils.save_to_bq(demo_dataset,
                     demo_table,
                     results,
                     service_acct,
                     _create_table=True,
                     schema=schema)
    logger.info('Text classification finished.\n')
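
# run_automl_text is not shown in this listing. A minimal sketch, assuming it sends the text to
# an AutoML Natural Language classification model and returns the top label with its score; the
# body below is inferred from the call above and is an assumption, not the original helper.
def run_automl_text(content, main_project_id, model_id, service_acct, compute_region):
    prediction_client = automl.PredictionServiceClient.from_service_account_json(
        service_acct)
    name = 'projects/{}/locations/{}/models/{}'.format(main_project_id,
                                                       compute_region, model_id)
    payload = {'text_snippet': {'content': content, 'mime_type': 'text/plain'}}
    response = prediction_client.predict(name, payload, {})
    best = max(response.payload, key=lambda result: result.classification.score)
    return best.display_name, best.classification.score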
def detect_object(gcs_image_folder, gcs_cropped_image_folder, main_project_id,
                  model_id, bq_dataset_output, bq_table_output,
                  prediction_client, storage_client, bq_client):
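    """Runs AutoML object detection on the PNGs under gcs_image_folder, writes one row per
    detected object (label, confidence, bounding box) to BigQuery, and uploads each cropped
    region to gcs_cropped_image_folder."""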

    match = re.match(r'gs://([^/]+)/(.+)', gcs_image_folder)
    bucket_name = match.group(1)
    prefix = match.group(2)
    dataset_ref = bq_client.dataset(bq_dataset_output)
    table_ref = dataset_ref.table(bq_table_output)
    bucket = storage_client.bucket(bucket_name)
    params = {"timeout": "60.0s"}
    lines = []

    schema = [
        bigquery.SchemaField('file', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('object', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('confidence', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('x_min', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('x_max', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('y_min', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('y_max', 'STRING', mode='REQUIRED'),
    ]
    table = utils.create_table(bq_client, bq_dataset_output, bq_table_output,
                               schema)

    for blob in bucket.list_blobs(prefix=str(prefix + "/")):
        if blob.name.endswith(".png"):
            logger.info("File location: {}".format(
                os.path.join('gs://', bucket_name, blob.name)))
            content = utils.sample_handler(storage_client, bucket_name,
                                           blob.name)
            name = 'projects/{}/locations/us-central1/models/{}'.format(
                main_project_id, model_id)
            payload = {'image': {'image_bytes': content}}
            params = {}
            request = prediction_client.predict(name, payload, params)

            for result in request.payload:
                logger.info("Figure detected in file.")
                vertices = result.image_object_detection.bounding_box.normalized_vertices
                rows_to_insert = [
                    (str(blob.name).replace(".png", ".pdf").replace(
                        prefix, "").replace("/", ""),
                     result.display_name,
                     result.image_object_detection.score,
                     vertices[0].x, vertices[1].x,
                     vertices[0].y, vertices[1].y),
                ]
                errors = bq_client.insert_rows(table, rows_to_insert)
                if errors:
                    logger.error("BigQuery insert errors: {}".format(errors))

                # Crop the detected object and save the cropped region as a separate image file.
                _, temp_local_filename = tempfile.mkstemp()
                blob.download_to_filename(temp_local_filename)
                im = Image.open(temp_local_filename)
                width, height = im.size
                box = (width * vertices[0].x, height * vertices[0].y,
                       width * vertices[1].x, height * vertices[1].y)
                crop_local_filename = temp_local_filename + '-crop.png'
                im.crop(box).save(crop_local_filename, 'PNG')

                # Upload cropped image to gcs bucket
                new_file_name = os.path.join(
                    gcs_cropped_image_folder,
                    os.path.basename(blob.name).replace('.png', '-crop.png'))
                new_file_bucket, new_file_name = utils.get_bucket_blob(
                    new_file_name)
                new_blob = storage_client.bucket(new_file_bucket).blob(new_file_name)
                new_blob.upload_from_filename(crop_local_filename)
                os.remove(temp_local_filename)
                os.remove(crop_local_filename)
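
A possible wiring for detect_object, mirroring the client setup of the earlier examples; every concrete value below is a placeholder:

service_acct = "/path/to/service-account.json"           # placeholder key path
prediction_client = automl.PredictionServiceClient.from_service_account_json(service_acct)
storage_client = storage.Client.from_service_account_json(service_acct)
bq_client = bigquery.Client.from_service_account_json(service_acct)

detect_object(
    gcs_image_folder="gs://my-bucket/demo/png",           # placeholder input folder of PNGs
    gcs_cropped_image_folder="gs://my-bucket/demo/crops",
    main_project_id="my-gcp-project",                     # placeholder project ID
    model_id="IOD0000000000000000000",                    # AutoML object detection model ID (placeholder)
    bq_dataset_output="demo_dataset",
    bq_table_output="object_detection_results",
    prediction_client=prediction_client,
    storage_client=storage_client,
    bq_client=bq_client,
)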