# Example #1
    def parse_document(self, input_uri, processor_id, token_path, project_id):
        """Run an EU-region Doc AI processor over a PDF from GCS.

        The processed response's Document is stored on ``self.document``;
        nothing is returned.

        Arguments:
            input_uri:    GCS location of the PDF to process
            processor_id: id of the Doc AI processor to invoke
            token_path:   path to a service-account JSON key file
            project_id:   the GCP project id
        """
        # Build an authenticated client pinned to the EU regional endpoint.
        creds = service_account.Credentials.from_service_account_file(token_path)
        client = documentai.DocumentProcessorServiceClient(
            credentials=creds,
            client_options={"api_endpoint": "eu-documentai.googleapis.com"},
        )

        processor_path = f"projects/{project_id}/locations/eu/processors/{processor_id}"

        # Pull the PDF bytes into memory (self.fs is presumably a GCS
        # filesystem wrapper — confirm against the class __init__).
        with self.fs.open(input_uri, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # Synchronous processing request; human review is NOT skipped here.
        request = documentai.types.ProcessRequest(
            name=processor_path,
            document={"content": pdf_bytes, "mime_type": "application/pdf"},
            skip_human_review=False,
        )
        result = client.process_document(request=request)

        self.document = result.document
def quickstart(project_id: str, location: str, processor_id: str,
               file_path: str):
    """Process a local PDF with a Doc AI processor and print its paragraphs.

    Arguments:
        project_id:   the GCP project id
        location:     the processor's region (e.g. 'us' or 'eu')
        processor_id: id of an existing Doc AI processor
        file_path:    path to a local PDF file
    """
    client = documentai.DocumentProcessorServiceClient()

    # Fully-qualified processor resource name, e.g.
    # projects/project-id/locations/location/processor/processor-id.
    # The processor must already exist (create it in the Cloud Console).
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Slurp the PDF into memory.
    with open(file_path, "rb") as pdf:
        content = pdf.read()

    # Configure and send the synchronous process request.
    request = {
        "name": name,
        "document": {"content": content, "mime_type": "application/pdf"},
    }
    doc = client.process_document(request=request).document

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor.
    print("The document contains the following paragraphs:")
    for page in doc.pages:
        for paragraph in page.paragraphs:
            print(paragraph)
            paragraph_text = get_text(paragraph.layout, doc)
            print(f"Paragraph text: {paragraph_text}")
    def parse_document(self,
                       input_uri,
                       token_path,
                       project_id,
                       processor_id="643a05097d4ab993"):
        """
        Facilitates sending a request to the Doc AI API (via a specified
        processor) and saves the response (a 'document') as a class attribute.

        Arguments:
            input_uri:      The gcs location of a pdf to be processed by Doc AI
            token_path:     Path to the location of json key for authorisation
            project_id:     The gcp project id
            processor_id:   The id of the processor created in the cloud
                            console
        Returns:
            None
        Raises:
            None
        """
        # Authenticated client pinned to the EU regional endpoint.
        creds = service_account.Credentials.from_service_account_file(token_path)
        client = documentai.DocumentProcessorServiceClient(
            credentials=creds,
            client_options={"api_endpoint": "eu-documentai.googleapis.com"},
        )

        processor_path = f"projects/{project_id}/locations/eu/processors/{processor_id}"

        # Read the PDF bytes from GCS into memory.
        with self.fs.open(input_uri, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # Synchronous processing request; human review is skipped here.
        request = documentai.types.ProcessRequest(
            name=processor_path,
            document={"content": pdf_bytes, "mime_type": "application/pdf"},
            skip_human_review=True,
        )
        result = client.process_document(request=request)

        self.document = result.document
# Example #4
def process_document_sample(project_id: str, location: str, processor_id: str,
                            file_path: str):
    """Process a local PDF with Doc AI and print the recognized paragraphs.

    Arguments:
        project_id:   the GCP project id
        location:     the processor's region ('us' or 'eu')
        processor_id: id of an existing Doc AI processor
        file_path:    path to a local PDF file
    """
    from google.cloud import documentai_v1beta3 as documentai

    # The api_endpoint must be set for any location other than 'us';
    # only the 'eu' regional endpoint is handled here.
    client_options = (
        {"api_endpoint": "eu-documentai.googleapis.com"}
        if location == "eu"
        else {}
    )
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Fully-qualified processor resource name, e.g.
    # projects/project-id/locations/location/processor/processor-id.
    # The processor must already exist (create it in the Cloud Console).
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the PDF into memory.
    with open(file_path, "rb") as pdf:
        content = pdf.read()

    # Configure and send the synchronous process request.
    request = {
        "name": name,
        "document": {"content": content, "mime_type": "application/pdf"},
    }
    doc = client.process_document(request=request).document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor.
    print("The document contains the following paragraphs:")
    for page in doc.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, doc)
            print(f"Paragraph text: {paragraph_text}")
# Example #5
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
):
    """Batch-process PDFs in GCS with Doc AI and print the extracted text.

    Arguments:
        project_id:            the GCP project id
        location:              the processor's region ('us' or 'eu')
        processor_id:          id of an existing Doc AI processor
        gcs_input_uri:         gs:// URI of the input PDF(s)
        gcs_output_uri:        gs:// bucket URI for results
        gcs_output_uri_prefix: object prefix under the output bucket
        timeout:               seconds to wait for the batch operation
    Returns:
        None (results are printed; JSON output is written to GCS)
    Raises:
        ValueError: if the output URI is not a gs:// path
        google.api_core.exceptions.GoogleAPICallError / concurrent.futures.TimeoutError:
            propagated from the long-running operation
    """
    # You must set the api_endpoint if you use a location other than 'us'.
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf")

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri)

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)

    # Results are written to GCS. Use a regex to split the destination
    # into bucket and object prefix so we can list the output files.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        # Fail loudly rather than with an opaque AttributeError below.
        raise ValueError(f"Invalid gs:// destination URI: {destination_uri}")
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # If JSON file, download the contents of this blob as a bytes object.
        if ".json" in blob.name:
            blob_as_bytes = blob.download_as_bytes()

            document = documentai.types.Document.from_json(blob_as_bytes)
            print(f"Fetched file {i + 1}")

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document

            # Read the text recognition output from the processor
            for page in document.pages:
                for form_field in page.form_fields:
                    field_name = get_text(form_field.field_name, document)
                    field_value = get_text(form_field.field_value, document)
                    print("Extracted key value pair:")
                    print(f"\t{field_name}, {field_value}")
                # BUG FIX: iterate this page's paragraphs, not document.pages —
                # the original printed whole-page layouts once per page pair.
                for paragraph in page.paragraphs:
                    paragraph_text = get_text(paragraph.layout, document)
                    print(f"Paragraph text:\n{paragraph_text}")
        else:
            print(f"Skipping non-supported file type {blob.name}")
# Example #6
    load_job.result()  # Waits for the job to complete.
    destination_table = bq_client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows".format(destination_table.num_rows) + " to " +
          table_id)


#---------------------------------------------------------------------------------------------------------------------

### Instantiate processor
# Fully-qualified Doc AI processor resource name; projectid, location and
# processorid are presumably defined earlier in this script — confirm.
processor_name = f'projects/{projectid}/locations/{location}/processors/{processorid}'

# Read the sample invoice PDF into memory and build the process request.
with open(sample_invoice, 'rb') as image:
    document = {'content': image.read(), 'mime_type': 'application/pdf'}
    request = {'name': processor_name, 'document': document}
### Capture processor results
# Synchronous Doc AI call; `results.document.entities` holds the parsed fields.
results = documentai.DocumentProcessorServiceClient().process_document(
    request=request)

#---------------------------------------------------------------------------------------------------------------------
### We will be using the parse data / entities for two tables viz., invoice and inventory
### Invoice Data --- add processor results to a Pandas Dataframe and transform for BQ ingestion
# One row per extracted entity: (type, text value, confidence to 4 dp).
results_frame = [[
    entity.type_, entity.mention_text,
    round(entity.confidence, 4)
] for entity in results.document.entities]
df = pd.DataFrame(results_frame, columns=['type', 'value', 'confidence'])
# Pivot long -> wide: entity types become column names, their values the
# single data row (confidence is dropped; the transposed header row is
# removed, then the index is reset for BQ ingestion).
df_t = df.rename(columns={'type': 'index'}).drop(columns=['confidence']).T
df_t.columns = df_t.iloc[0]
df_t = df_t.drop(df_t.index[0])
df_t = df_t.reset_index(drop=True)
# transform amount columns and create
for num_col in [col for col in df_t.columns if '_amount' in col]:
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout=None,
):
    """Batch-process PDFs in GCS with Doc AI and print the extracted text.

    Arguments:
        project_id:            the GCP project id
        location:              the processor's region ('us' or 'eu')
        processor_id:          id of an existing Doc AI processor
        gcs_input_uri:         gs:// URI of the input PDF(s)
        gcs_output_uri:        gs:// bucket URI for results
        gcs_output_uri_prefix: object prefix under the output bucket
        timeout:               optional seconds to wait for the batch
                               operation; None (the default) waits
                               indefinitely, as before
    Returns:
        None (results are printed; JSON output is written to GCS)
    Raises:
        ValueError: if the output URI is not a gs:// path
    """
    # NOTE(review): no api_endpoint is configured here, so 'eu' processors
    # may not be reachable — confirm against how this sample is used.
    client = documentai.DocumentProcessorServiceClient()

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf")

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri)

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish (forever when timeout is None).
    operation.result(timeout=timeout)

    # Results are written to GCS. Split the destination into bucket and
    # object prefix so we can list the output files.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(f"Invalid gs:// destination URI: {destination_uri}")
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Skip non-JSON artifacts instead of crashing in from_json
        # (matches the guarded sibling implementation of this function).
        if ".json" not in blob.name:
            continue

        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # BUG FIX: iterate this page's paragraphs, not document.pages —
            # the original printed whole-page layouts once per page pair.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
# Example #8
        print("Error: path to TIF files does not exists.")
        sys.exit(1)
    else:
        path = sys.argv[1]

# Check if there is a creds.json file in the current working directory;
# it is required for service-account authentication.
if not os.path.isfile("{}/creds.json".format(os.getcwd())):
    print("Error: creds.json missing.")
    sys.exit(1)
else:
    # Point the Google client libraries at the key file via the standard
    # GOOGLE_APPLICATION_CREDENTIALS environment variable.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "{}/creds.json".format(
        os.getcwd())

# Instantiate the Document AI client (empty client_options -> default
# 'us' endpoint).
opts = {}
client = documentai.DocumentProcessorServiceClient(client_options=opts)

# Fully-qualified processor resource name built from the settings module.
name = f"projects/{settings.project_id}/locations/{settings.location}/processors/{settings.processor_id}"


# https://github.com/googleapis/python-documentai/blob/master/samples/snippets/parse_form_v1beta2.py
def _get_text(el):
    """Convert text offset indexes into text snippets."""
    response = ""
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response