def parse_document(self, input_uri, processor_id, token_path, project_id):
    """Send the PDF at *input_uri* through a Document AI processor.

    Authenticates with a service-account key, calls the EU Document AI
    endpoint, and stores the processed document on ``self.document``.

    Arguments:
        input_uri: Location of the PDF, opened via ``self.fs``.
        processor_id: Id of the processor created in the cloud console.
        token_path: Path to the JSON service-account key used for auth.
        project_id: The GCP project id.
    """
    # Build an authenticated client pinned to the EU regional endpoint.
    credentials = service_account.Credentials.from_service_account_file(
        token_path)
    client_options = {"api_endpoint": "eu-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(
        credentials=credentials, client_options=client_options)

    name = f"projects/{project_id}/locations/eu/processors/{processor_id}"

    # Read the whole PDF into memory.
    with self.fs.open(input_uri, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request; human review is NOT skipped here.
    request = documentai.types.ProcessRequest(
        name=name, document=document, skip_human_review=False)

    # Recognizes text entities in the PDF document.
    result = client.process_document(request=request)
    self.document = result.document
def quickstart(project_id: str, location: str, processor_id: str, file_path: str):
    """Process a local PDF with Document AI and print every paragraph."""
    client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processor/processor-id
    # You must create new processors in the Cloud Console first.
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory.
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request.
    request = {"name": name, "document": document}

    result = client.process_document(request=request)
    document = result.document

    # For a full list of Document object attributes, please reference this page:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor.
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            print(paragraph)
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
def parse_document(self, input_uri, token_path, project_id, processor_id="643a05097d4ab993"):
    """
    Facilitates sending a request to the Doc AI API (via a specified
    processor) and saves the response (a 'document') as a class attribute.

    Arguments:
        input_uri: The gcs location of a pdf to be processed by Doc AI
        token_path: Path to the location of json key for authorisation
        project_id: The gcp project id
        processor_id: The id of the processor created in the cloud console

    Returns:
        None

    Raises:
        None
    """
    # Instantiate a client authenticated with the service-account key,
    # targeting the EU regional endpoint.
    credentials = service_account.Credentials.from_service_account_file(
        token_path)
    client_options = {"api_endpoint": "eu-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(
        credentials=credentials, client_options=client_options)

    name = f"projects/{project_id}/locations/eu/processors/{processor_id}"

    # Read the file into memory.
    with self.fs.open(input_uri, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request; human review is skipped for this flow.
    request = documentai.types.ProcessRequest(
        name=name, document=document, skip_human_review=True)

    # Recognizes text entities in the PDF document.
    result = client.process_document(request=request)
    self.document = result.document
def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str):
    """Run a local PDF through a Document AI processor and print its paragraphs."""
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {"api_endpoint": "eu-documentai.googleapis.com"} if location == "eu" else {}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processor/processor-id
    # You must create new processors in the Cloud Console first.
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory.
    with open(file_path, "rb") as image:
        image_content = image.read()

    document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request.
    request = {"name": name, "document": document}

    # Recognizes text entities in the PDF document.
    result = client.process_document(request=request)
    document = result.document
    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor.
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
    timeout: int = 300,
):
    """Batch-process a GCS PDF with Document AI and print the extracted text.

    Arguments:
        project_id: The GCP project id.
        location: Processor location, 'us' or 'eu'.
        processor_id: Id of the processor created in the Cloud Console.
        gcs_input_uri: gs:// URI of the input PDF.
        gcs_output_uri: gs:// bucket URI where results are written.
        gcs_output_uri_prefix: Prefix under the bucket for the output files.
        timeout: Seconds to wait for the batch operation to complete.
    """
    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf")

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri)

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)

    # Results are written to GCS. Use a regex to find output files.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # If JSON file, download the contents of this blob as a bytes object.
        if ".json" in blob.name:
            blob_as_bytes = blob.download_as_bytes()
            document = documentai.types.Document.from_json(blob_as_bytes)
            print(f"Fetched file {i + 1}")

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document

            # Read the text recognition output from the processor
            for page in document.pages:
                for form_field in page.form_fields:
                    field_name = get_text(form_field.field_name, document)
                    field_value = get_text(form_field.field_value, document)
                    print("Extracted key value pair:")
                    print(f"\t{field_name}, {field_value}")
                # BUG FIX: the original iterated `document.pages` here (with the
                # loop variable misleadingly named `paragraph`), printing whole
                # pages labelled as paragraph text. Iterate the current page's
                # paragraphs instead, matching quickstart()/process_document_sample().
                for paragraph in page.paragraphs:
                    paragraph_text = get_text(paragraph.layout, document)
                    print(f"Paragraph text:\n{paragraph_text}")
        else:
            print(f"Skipping non-supported file type {blob.name}")
# NOTE(review): flattened script fragment — waits for a BigQuery load job, sends a
# sample invoice to a Document AI processor, then pivots the returned entities into
# a one-row DataFrame for BQ ingestion. The final `for num_col in ...:` loop header
# is truncated in this chunk (its body is not visible), so the code is left
# byte-identical rather than guessed at. Relies on earlier-defined names
# (load_job, bq_client, table_id, projectid, location, processorid, sample_invoice).
load_job.result() # Waits for the job to complete. destination_table = bq_client.get_table(table_id) # Make an API request. print("Loaded {} rows".format(destination_table.num_rows) + " to " + table_id) #--------------------------------------------------------------------------------------------------------------------- ### Instantiate processor processor_name = f'projects/{projectid}/locations/{location}/processors/{processorid}' with open(sample_invoice, 'rb') as image: document = {'content': image.read(), 'mime_type': 'application/pdf'} request = {'name': processor_name, 'document': document} ### Capture processor results results = documentai.DocumentProcessorServiceClient().process_document( request=request) #--------------------------------------------------------------------------------------------------------------------- ### We will be using the parse data / entities for two tables viz., invoice and inventory ### Invoice Data --- add processor results to a Pandas Dataframe and transform for BQ ingestion results_frame = [[ entity.type_, entity.mention_text, round(entity.confidence, 4) ] for entity in results.document.entities] df = pd.DataFrame(results_frame, columns=['type', 'value', 'confidence']) df_t = df.rename(columns={'type': 'index'}).drop(columns=['confidence']).T df_t.columns = df_t.iloc[0] df_t = df_t.drop(df_t.index[0]) df_t = df_t.reset_index(drop=True) # transform amount columns and create for num_col in [col for col in df_t.columns if '_amount' in col]:
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a GCS PDF with Document AI and print the extracted text.

    Arguments:
        project_id: The GCP project id.
        location: Processor location, 'us' or 'eu'.
        processor_id: Id of the processor created in the Cloud Console.
        gcs_input_uri: gs:// URI of the input PDF.
        gcs_output_uri: gs:// bucket URI where results are written.
        gcs_output_uri_prefix: Prefix under the bucket for the output files.
    """
    client = documentai.DocumentProcessorServiceClient()

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf")

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri)

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish (no timeout: blocks until done).
    operation.result()

    # Results are written to GCS. Use a regex to find output files.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page:
        # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # BUG FIX: the original iterated `document.pages` here (loop variable
            # misleadingly named `paragraph`), printing whole pages labelled as
            # paragraph text. Iterate the current page's paragraphs instead,
            # matching quickstart()/process_document_sample().
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
# NOTE(review): flattened script fragment that starts mid-`if` — the condition
# (presumably validating the sys.argv[1] TIF path) is not visible in this chunk,
# so the code is left byte-identical rather than reconstructed. Also NOTE: the
# `_get_text` helper reads a module-level `document` that is not assigned here;
# it must exist before `_get_text` is called — TODO confirm against the caller.
print("Error: path to TIF files does not exists.") sys.exit(1) else: path = sys.argv[1] # Check if there is a creds.json file if not os.path.isfile("{}/creds.json".format(os.getcwd())): print("Error: creds.json missing.") sys.exit(1) else: os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "{}/creds.json".format( os.getcwd()) # Load google vision opts = {} client = documentai.DocumentProcessorServiceClient(client_options=opts) name = f"projects/{settings.project_id}/locations/{settings.location}/processors/{settings.processor_id}" # https://github.com/googleapis/python-documentai/blob/master/samples/snippets/parse_form_v1beta2.py def _get_text(el): """Convert text offset indexes into text snippets.""" response = "" # If a text segment spans several lines, it will # be stored in different text segments. for segment in el.text_anchor.text_segments: start_index = segment.start_index end_index = segment.end_index response += document.text[start_index:end_index] return response