def parse_with_model(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/invoice.pdf',
        automl_model_name='YOUR_AUTOML_MODEL_NAME'):
    """Run one document through Document AI using an AutoML model.

    Builds a single ``ProcessDocumentRequest`` that carries AutoML
    parameters, sends it, and prints the name and confidence of every
    label found on the returned document.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF
        automl_model_name: the AutoML model name formatted as
            `projects/[PROJECT_ID]/locations/[LOCATION]/models/[MODEL_ID]`
            where LOCATION is a Compute Engine region, e.g. `us-central1`
    """
    types = documentai.types

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=source_config,
        automl_params=types.AutoMlParams(model=automl_model_name))

    service = documentai.DocumentUnderstandingServiceClient()
    result = service.process_document(request=request)

    for detected in result.labels:
        print('Label detected: {}'.format(detected.name))
        print('Confidence: {}'.format(detected.confidence))
Exemple #2
0
def set_endpoint(project_id='YOUR_PROJECT_ID',
                 input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    """Process a single document through the EU Document AI endpoint.

    The client is created with ``eu-documentai.googleapis.com`` as its API
    endpoint and the request parent uses the ``eu`` location. The text of
    the returned document is printed.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF
    """

    # [START documentai_set_endpoint_beta]
    from google.cloud import documentai_v1beta2 as documentai

    client = documentai.DocumentUnderstandingServiceClient(
        client_options={'api_endpoint': 'eu-documentai.googleapis.com'})
    # [END documentai_set_endpoint_beta]

    types = documentai.types

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/eu'.format(project_id),
        input_config=source_config)

    result = client.process_document(request=request)

    # All text extracted from the document
    print('Document Text: {}'.format(result.text))
Exemple #3
0
def parse_invoice(project_id='temporal-tensor-307222',
                  input_uri='gs://cloud-samples-data/documentai/invoice.pdf'):
    """Process a single document with the Document AI API and print its text.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: the Cloud Storage URI of the input PDF.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)

    document = client.process_document(request=request)

    # All text extracted from the document. `document.content` holds the raw
    # inline bytes of the input rather than the extracted text, so the text
    # field is what belongs under the "Document Text" label.
    print('Document Text: {}'.format(document.text))
Exemple #4
0
def get_parsed_document(
    project_id="white-flame-244921",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
    mime_type="image/tiff",
):
    """Process a single document with the Document AI API and return it.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: the Cloud Storage URI of the input document.
        mime_type: MIME type declared for the input. Can be application/pdf,
            image/tiff, image/gif, or application/json. The default is kept
            at "image/tiff" for backward compatibility; note that it does not
            match the default ``input_uri`` (a PDF), so pass
            "application/pdf" when processing PDFs.

    Returns:
        The document object returned by ``process_document``.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source,
        mime_type=mime_type,
    )

    # Location can be 'us' or 'eu'
    # NOTE(review): the request targets the `eu` location while the client is
    # built with its default endpoint -- confirm whether an EU `api_endpoint`
    # client option is needed here.
    parent = "projects/{}/locations/eu".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config
    )

    return client.process_document(request=request)
Exemple #5
0
def get_form_fields(bucket, filename):
    """Extract the form fields of a PDF stored in Cloud Storage.

    The project id is read from the ``PROJECT_ID`` environment variable.

    Args:
        bucket: name of the Cloud Storage bucket that holds the file.
        filename: object name of the PDF inside ``bucket``.

    Returns:
        A list with one dict per detected form field, with the keys
        "filename", "page", "form_field_name" and "form_field_value".
    """
    types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=f"gs://{bucket}/{filename}"),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing results. Each key is text
    # that is likely to appear in the document as a form field name
    # (i.e. "DOB"). Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        types.KeyValuePairHint(key='Emergency Contact',
                               value_types=['NAME']),
        types.KeyValuePairHint(key='Referred By'),
    ]

    # enabled=True switches form extraction on
    extraction = types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(os.environ["PROJECT_ID"]),
        input_config=source_config,
        form_extraction_params=extraction)

    document = service.process_document(request=request)

    def _anchor_text(element):
        """Return the document text covered by ``element``'s text anchor.

        Form fields are identified by offsets into the document text; text
        that spans several lines is stored as several segments, which are
        concatenated in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in element.text_anchor.text_segments)

    # One record per form field, page by page
    fields = []
    for page in document.pages:
        for form_field in page.form_fields:
            fields.append({
                "filename": filename,
                "page": page.page_number,
                "form_field_name": _anchor_text(form_field.field_name),
                "form_field_value": _anchor_text(form_field.field_value),
            })
    return fields
Exemple #6
0
 def __init__(self, pdf):
     """Process ``pdf`` with Document AI and cache data derived from it.

     Args:
         pdf: the input document; stored as ``self.doc_in``.
     """
     self.doc_in = pdf
     self.project_id = 'teak-span-275205'
     self.client = documentai.DocumentUnderstandingServiceClient()
     # generate_request() is defined elsewhere in the class; presumably it
     # reads doc_in / project_id set above -- keep this ordering.
     self.document = self.client.process_document(
         request=self.generate_request())
     # get_tables() / get_max_cols() are defined elsewhere in the class;
     # presumably they read self.document (and self.tables) -- verify
     # before reordering.
     self.tables = self.get_tables()
     self.max_cols = self.get_max_cols()
     self.table_indices = []
Exemple #7
0
def _extract_skills(lines):
    """Collect the lines that follow every line containing 'SKILLS'.

    Starting right after each such line, lines are collected (with the
    bullet characters "●" and "•" removed) until a line is reached that has
    no bullet and is entirely upper-case, or until the text ends.
    """
    skills = []
    for index, line in enumerate(lines):
        if 'SKILLS' not in line:
            continue
        # Slicing bounds the scan at the end of the text, so a skills
        # section that is last in the document no longer runs off the list.
        for candidate in lines[index + 1:]:
            has_bullet = "●" in candidate or "•" in candidate
            if not has_bullet and candidate.isupper():
                break
            skills.append(candidate.replace("●", "").replace("•", ""))
    return skills


def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Process a document and return the lines listed under 'SKILLS'.

    The document is processed with form extraction enabled; the result is
    then derived from the document's plain text only.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF

    Returns:
        A list of strings: the lines following each line that contains
        'SKILLS', with bullet characters removed, up to the next line that
        is entirely upper-case and has no bullet (or the end of the text).
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)
    return _extract_skills(document.text.split('\n'))
Exemple #8
0
def parse_file(input_uri):
    """Process a PDF with Document AI and return its entity mention texts.

    ``project_id`` is not a parameter: it is resolved from the enclosing
    module scope when the request parent is built.

    Args:
        input_uri: the Cloud Storage URI of the input PDF.

    Returns:
        A list with the ``mention_text`` of every entity on the returned
        document. The entities are also printed.
    """
    types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=source_config)
    result = service.process_document(request=request)

    print(result.entities)

    mentions = []
    for entity in result.entities:
        mentions.append(entity.mention_text)
    return mentions
def get_document_text(input_uri, project_id):
    """Process a PDF with Document AI and return the extracted text.

    Args:
        input_uri: the Cloud Storage URI of the input PDF.
        project_id: Google Cloud project id used to build the request parent.

    Returns:
        The ``text`` field of the document returned by the API.
    """
    types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=source_config)

    return service.process_document(request=request).text
def main(project_id='document-ai-project-291706',
         input_uri='gs://analysis_report_samples/sample_ocr_1.pdf'):
    """Process a single document with the Document AI API, including
    text extraction and entity extraction.

    The document text and, for every entity, its anchored text and mention
    text are printed and also written to the file ``output.text`` in the
    current working directory.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: the Cloud Storage URI of the input PDF.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    # The context manager closes the file even if printing or writing
    # raises; UTF-8 is explicit so non-ASCII document text can be written
    # regardless of the platform's default encoding.
    with open('output.text', 'w', encoding='utf-8') as file_:
        # All text extracted from the document
        document_line = 'Document Text: {}'.format(document.text)
        print(document_line)
        file_.write(document_line)

        for entity in document.entities:
            text_line = 'Text: {}'.format(_get_text(entity))
            mention_line = 'Mention text: {}\n'.format(entity.mention_text)
            # NOTE: unlike print(), write() adds no newline, so the 'Text'
            # entry runs straight into the following 'Mention text' entry
            # in the file. Kept as-is to preserve the output format.
            print(text_line)
            file_.write(text_line)
            print(mention_line)
            file_.write(mention_line)
def sample_analyze_entities(input_uri):
    """Extract a PDF's text with Document AI, then analyze its entities.

    The PDF at ``input_uri`` is processed with Document AI; the resulting
    text is passed as a plain-text document to the Natural Language API's
    entity analysis and the returned entities are printed.

    ``credentials`` and ``project_id`` are not parameters: they are resolved
    from the enclosing module scope.

    Args:
        input_uri: the Cloud Storage URI of the input PDF.
    """
    types = documentai.types

    ocr_client = documentai.DocumentUnderstandingServiceClient(credentials=credentials)
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')
    ocr_request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=source_config)
    ocr_document = ocr_client.process_document(request=ocr_request)

    language_client = language_v1.LanguageServiceClient(credentials=credentials)

    language_document = {
        "content": ocr_document.text,
        "type": enums.Document.Type.PLAIN_TEXT,
    }

    response = language_client.analyze_entities(
        language_document, encoding_type=enums.EncodingType.UTF8)

    # Walk the entities returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))
        # Entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))
        # Salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))

        # Metadata associated with the entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for key, value in entity.metadata.items():
            print(u"{}: {}".format(key, value))

        # Mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Mention type, e.g. PROPER for proper noun
            print(u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name))

    """
Exemple #12
0
def main(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
):
    """Process one document with Document AI and print its text and entities.

    Prints the full extracted text, then for every entity its type, the
    text covered by its text anchor, and its mention text.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF
    """
    types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type="application/pdf")

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent="projects/{}/locations/us".format(project_id),
        input_config=source_config)

    document = service.process_document(request=request)

    # All text extracted from the document
    print("Document Text: {}".format(document.text))

    def _anchor_text(element):
        """Return the document text covered by ``element``'s text anchor."""
        # Text that spans several lines is stored as several segments;
        # they are concatenated in order.
        return "".join(
            document.text[segment.start_index:segment.end_index]
            for segment in element.text_anchor.text_segments)

    for entity in document.entities:
        print("Entity type: {}".format(entity.type_))
        print("Text: {}".format(_anchor_text(entity)))
        print("Mention text: {}\n".format(entity.mention_text))
Exemple #13
0
def parse_form(project_id='quantiphi-ttest',
               input_uri='gs://document_ai1/Payslip_11176322 (2).pdf'):
    """Extract form fields from a document and print them as JSON.

    The document is processed with form extraction enabled and a set of
    key hints. Every detected field name/value is printed with its
    confidence, and a dict mapping cleaned field names to cleaned field
    values is printed as a JSON string.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: the Cloud Storage URI of the input PDF.
    """
    types = documentai.types
    client = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing results. Each key is text
    # that is likely to appear in the document as a form field name
    # (i.e. "DOB"). Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    untyped_hint_keys = [
        'Bank',
        'Bank A/c No',
        'DOJ',
        'LOP Days',
        'PF No.',
        'Location',
        'Facility',
        'Department',
        'INCOME TAX',
        'PROFESSIONAL TAX',
        'GROSS DEDUCTIONS',
        'PROVIDENT FUND',
        'NGO CONTRIBUTION',
        'PF – UAN',
    ]
    key_value_pair_hints = [
        types.KeyValuePairHint(key='Personnel No'),
        types.KeyValuePairHint(key='Name', value_types=['NAME']),
    ]
    key_value_pair_hints.extend(
        types.KeyValuePairHint(key=hint_key)
        for hint_key in untyped_hint_keys)

    # enabled=True switches form extraction on
    form_extraction_params = types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _anchor_text(element):
        """Return the document text covered by ``element``'s text anchor.

        Form fields are identified by offsets into the document text; text
        that spans several lines is stored as several segments, which are
        concatenated in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in element.text_anchor.text_segments)

    jsonDict = {}
    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            name_text = _anchor_text(form_field.field_name)
            value_text = _anchor_text(form_field.field_value)

            print('Field Name: {}\tConfidence: {}'.format(
                name_text, form_field.field_name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                value_text, form_field.field_value.confidence))

            # Keys are stripped and the 'PF – UAN' label is shortened to
            # 'UAN'; values lose their newlines and surrounding whitespace.
            cleaned_name = name_text.strip().replace('PF \u2013 UAN', 'UAN')
            jsonDict[cleaned_name] = value_text.replace('\n', '').strip()

    print(json.dumps(jsonDict))

    # The names below are assigned but not used further in this function.
    client = bigquery.Client()
    filename = '/path/to/file/in/nd-format.json'
    dataset_id = 'quantiphi'
    table_id = 'dataLoad'
def parse_table(project_id, input_uri, filename, cred):
    """Extract the tables of a PDF and return their rows as one DataFrame.

    Every header row and body row of every table is printed and also turned
    into a small DataFrame (one row per newline-separated line of the
    tab-joined cell texts) tagged with its row number, table number and page
    number. All of these are concatenated into the result.

    When at least one body row was found, the result is also pickled
    (protocol 2) next to ``filename``, with the extension replaced by
    ``.pkl``. The pickle is written once, after all pages were processed,
    instead of being rewritten after every body row.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: the Cloud Storage URI of the input PDF.
        filename: path whose extension is replaced by ``.pkl`` to name the
            pickle output.
        cred: credentials passed to the Document AI client.

    Returns:
        A pandas DataFrame with the collected rows; empty when the document
        has no table rows.
    """

    client = documentai.DocumentUnderstandingServiceClient(credentials=cred)

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    def _row_frame(cells, row_column, row_num, table_num, page_number):
        """Turn one tab-joined row into a DataFrame tagged with its origin."""
        frame = pd.DataFrame(
            [line.split('\t') for line in cells.split('\n')])
        frame[row_column] = row_num
        frame['Table'] = table_num
        frame['Page'] = page_number
        return frame

    # Frames are collected and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    found_body_row = False

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for table_num, table in enumerate(page.tables):
            print('Table {}: '.format(table_num))
            for row_num, row in enumerate(table.header_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Header Row {}: {}'.format(row_num, cells))
                frames.append(_row_frame(
                    cells, 'RowNum_Header', row_num, table_num,
                    page.page_number))

            for row_num, row in enumerate(table.body_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Row {}: {}'.format(row_num, cells))
                # Rows are collected into a DataFrame in addition to
                # being printed.
                frames.append(_row_frame(
                    cells, 'RowNum', row_num, table_num, page.page_number))
                found_body_row = True

    if not frames:
        # pd.concat rejects an empty list; mirror the original empty result.
        return pd.DataFrame([])

    rows = pd.concat(frames)

    if found_body_row:
        pickle_path = os.path.splitext(filename)[0] + '.pkl'
        rows.to_pickle(pickle_path, protocol=2)

    return rows
Exemple #15
0
def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Run a batch table-extraction job and list the output files.

    Submits one document for batch processing with table extraction
    enabled, waits for the operation to finish, then prints the names of
    the blobs found under the destination prefix.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF
        destination_uri: Cloud Storage URI prefix where results are written,
            in the form ``gs://bucket/prefix``.

    Raises:
        ValueError: if ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """

    # Validate the destination before any work is submitted, so a malformed
    # URI fails fast instead of surfacing as an AttributeError on a failed
    # regex match after the batch operation has already run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, got: '
            '{!r}'.format(destination_uri))
    output_bucket = match.group(1)
    prefix = match.group(2)

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request])

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS under the validated bucket/prefix
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)
def parse_table(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/invoice.pdf",
):
    """Extract the tables of a document and print their rows.

    The document is processed with table extraction enabled, using a
    bounding-box hint that covers the whole first page. For every page and
    table, each header row and body row is printed as tab-joined cell text.

    Args:
        project_id: your Google Cloud project id
        input_uri: the Cloud Storage URI of your input PDF
    """
    types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    source_config = types.InputConfig(
        gcs_source=types.GcsSource(uri=input_uri),
        mime_type="application/pdf")

    # Optional bounding box telling the service where tables appear.
    # Each vertex coordinate must be a number between 0 and 1; the corners
    # are listed as top left, top right, bottom right, bottom left.
    whole_page = types.BoundingPoly(
        normalized_vertices=[
            types.geometry.NormalizedVertex(x=0, y=0),
            types.geometry.NormalizedVertex(x=1, y=0),
            types.geometry.NormalizedVertex(x=1, y=1),
            types.geometry.NormalizedVertex(x=0, y=1),
        ])
    bound_hints = [
        types.TableBoundHint(page_number=1, bounding_box=whole_page)
    ]

    # enabled=True switches table extraction on
    extraction = types.TableExtractionParams(
        enabled=True, table_bound_hints=bound_hints)

    # Location can be 'us' or 'eu'
    request = types.ProcessDocumentRequest(
        parent="projects/{}/locations/us".format(project_id),
        input_config=source_config,
        table_extraction_params=extraction,
    )

    document = service.process_document(request=request)

    def _anchor_text(element):
        """Return the document text covered by ``element``'s text anchor."""
        # Text that spans several lines is stored as several segments;
        # they are concatenated in order.
        return "".join(
            document.text[segment.start_index:segment.end_index]
            for segment in element.text_anchor.text_segments)

    for page in document.pages:
        print("Page number: {}".format(page.page_number))
        for table_index, table in enumerate(page.tables):
            print("Table {}: ".format(table_index))
            # Header rows are printed first, then body rows; each group
            # is numbered from zero.
            row_groups = (
                ("Header Row {}: {}", table.header_rows),
                ("Row {}: {}", table.body_rows),
            )
            for template, rows in row_groups:
                for row_index, row in enumerate(rows):
                    line = "\t".join(
                        _anchor_text(cell.layout) for cell in row.cells)
                    print(template.format(row_index, line))
Exemple #17
0
def parse_form(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
):
    """Extract form fields from a PDF with Document AI and print them.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the document to process.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    # The document is submitted as application/pdf; per the sample this
    # field can also be image/tiff, image/gif or application/json.
    source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(gcs_source=source,
                                                mime_type="application/pdf")

    # Key-value pair hints name text that is likely to appear as a form
    # field name (i.e. "DOB"). Value types are optional and may be one or
    # more of: ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        documentai.types.KeyValuePairHint(key="Emergency Contact",
                                          value_types=["NAME"]),
        documentai.types.KeyValuePairHint(key="Referred By"),
    ]

    # enabled=True switches form extraction on for this request.
    extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # The location segment of the parent can be 'us' or 'eu'.
    request = documentai.types.ProcessDocumentRequest(
        parent="projects/{}/locations/us".format(project_id),
        input_config=input_config,
        form_extraction_params=extraction_params,
    )

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the slice(s) of ``document.text`` that ``el`` points at.

        Form fields are stored as offsets into the document text; text that
        spans several lines arrives as several segments, which are
        concatenated in order.
        """
        return "".join(
            document.text[seg.start_index:seg.end_index]
            for seg in el.text_anchor.text_segments)

    for page in document.pages:
        print("Page number: {}".format(page.page_number))
        for form_field in page.form_fields:
            name_part = form_field.field_name
            value_part = form_field.field_value
            print("Field Name: {}\tConfidence: {}".format(
                _get_text(name_part), name_part.confidence))
            print("Field Value: {}\tConfidence: {}".format(
                _get_text(value_part), value_part.confidence))
Exemple #18
0
    def parse_table(
            project_id='ons-companies-house-dev',
            input_uri='gs://ons-companies-house-dev-scraped-pdf-data/doc_ai_outputs/bs_pdfs/04391694_active_bs.pdf',
            print_stuff=False):
        """Run Document AI table extraction on a PDF and return the result.

        Args:
            project_id: Google Cloud project id used to build the request
                parent.
            input_uri: Cloud Storage URI of the PDF to process.
            print_stuff: accepted but not read anywhere in this function.

        Returns:
            Whatever ``client.process_document`` returns for the request.
        """

        client = documentai.DocumentUnderstandingServiceClient()

        # The document is submitted as application/pdf; per the sample this
        # field can also be image/tiff, image/gif or application/json.
        source = documentai.types.GcsSource(uri=input_uri)
        input_config = documentai.types.InputConfig(
            gcs_source=source, mime_type='application/pdf')

        # Optional hint telling the parser where tables are. Coordinates are
        # normalized to [0, 1]; these four corners (top left, top right,
        # bottom right, bottom left) cover all of page 1.
        corners = ((0, 0), (1, 0), (1, 1), (0, 1))
        whole_page = documentai.types.BoundingPoly(
            normalized_vertices=[
                documentai.types.geometry.NormalizedVertex(x=x, y=y)
                for x, y in corners
            ])
        table_bound_hints = [
            documentai.types.TableBoundHint(page_number=1,
                                            bounding_box=whole_page)
        ]

        # enabled=True switches table extraction on for this request.
        table_extraction_params = documentai.types.TableExtractionParams(
            enabled=True,
            table_bound_hints=table_bound_hints,
            model_version="builtin/latest",
            header_hints=["At 31 December\n", "2019\n", "$ million2018\n"])

        # The location segment of the parent can be 'us' or 'eu'.
        request = documentai.types.ProcessDocumentRequest(
            parent='projects/{}/locations/us'.format(project_id),
            input_config=input_config,
            table_extraction_params=table_extraction_params)

        return client.process_document(request=request)
def parse_table(filename, condense=False):
    """Upload a local PDF, extract its tables with Document AI, store the text.

    The file is uploaded to the module-level ``bucket`` under a random name,
    processed with table extraction enabled, and every table row is rendered
    as one ``|``-separated line. The resulting text is uploaded back to the
    bucket under a second random name.

    Args:
        filename: path of the local PDF to upload and process.
        condense: when true, the text is passed through ``summarize`` before
            it is stored and returned.

    Returns:
        A ``(out_uri, data)`` tuple: the blob name the text was stored under
        and the text itself.
    """
    # Random blob name for the uploaded input document.
    input_uri = secrets.token_hex(nbytes=16)
    blob = bucket.blob(input_uri)
    blob.upload_from_filename(filename)

    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=BUCKET_URL + input_uri)

    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type="application/pdf")

    # Bounding box hint covering all of page 1 (normalized coordinates:
    # top left, top right, bottom right, bottom left).
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(normalized_vertices=[
                documentai.types.geometry.NormalizedVertex(x=0, y=0),
                documentai.types.geometry.NormalizedVertex(x=1, y=0),
                documentai.types.geometry.NormalizedVertex(x=1, y=1),
                documentai.types.geometry.NormalizedVertex(x=0, y=1),
            ]),
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    parent = "projects/{}/locations/us".format(PROJECT_ID)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params,
    )

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the cell text for ``el`` as a single cleaned-up line."""
        response = "".join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)
        # These characters would break the one-row-per-line, pipe-separated
        # layout (or are just noise), so they become spaces.
        for char in ["\n", "\t", "•", ":"]:
            response = response.replace(char, " ")
        # Collapse runs of spaces. The condition must test ``response``:
        # testing the leftover loop variable never triggers the collapse.
        while "  " in response:
            response = response.replace("  ", " ")
        return response.strip()

    out = []
    for page in document.pages:
        for table in page.tables:
            # Header rows first, then body rows, one output line per row.
            for rows in (table.header_rows, table.body_rows):
                for row in rows:
                    out.append("|".join(
                        [_get_text(cell.layout) for cell in row.cells]))
            # Blank line between tables.
            out.append("")
    # Cap the table text at 6000 characters.
    data = "\n".join(out)[:6000]

    # Too little table text: fall back to the full document text.
    if len(data) < 1000:
        data = document.text

    if condense:
        data = summarize(data)

    # Random blob name for the stored result.
    out_uri = secrets.token_hex(nbytes=16)
    blob = bucket.blob(out_uri)
    blob.upload_from_string(data)
    return out_uri, data
Exemple #20
0
def parse_invoice(
    project_id='temporal-tensor-307222',
    input_uri='gs://docu_test/AC8BR-05U-2.pdf'):
    """Process a single document with the Document AI API and load its
    paragraphs into BigQuery.

    The raw API response is dumped to ``file.txt``. Each paragraph's text is
    printed, JSON-encoded and collected; after every page the collected rows
    are written to ``data.txt`` and uploaded to the ``docu_test`` bucket as
    ``data.json``. Finally ``gs://docu_test/data.json`` is loaded into the
    BigQuery table ``temporal-tensor-307222.docu_test.printer_tab``.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the PDF to process.
    """

    # Only read by the disabled batch-output branch at the end.
    destination_uri = 'gs://docu_test/'

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent, input_config=input_config)

    document = client.process_document(request=request)

    # Debug dump of the whole response; the context manager closes the file
    # even if repr() or the write fails.
    with open('file.txt', 'w') as f:
        f.write('dict = ' + repr(document) + '\n')

    rows_to_insert = []
    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, document)
            print(paragraph_text)

            # the result is a JSON string:
            y = json.dumps(paragraph_text)
            print(y)

            rows_to_insert.append(y)
        print(rows_to_insert)
        # The accumulated rows are rewritten and re-uploaded after each page.
        # NOTE(review): this writes one JSON array of already-JSON-encoded
        # strings, while the load job below declares NEWLINE_DELIMITED_JSON
        # -- confirm the intended file layout.
        with open('data.txt', 'w') as outfile:
            json.dump(rows_to_insert, outfile)
        upload_blob("docu_test", "data.txt", "data.json")

    from google.cloud import bigquery

    # Construct a BigQuery client object.
    bq_client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "temporal-tensor-307222.docu_test.printer_tab"

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    uri = "gs://docu_test/data.json"
    load_job = bq_client.load_table_from_uri(
        uri, table_id, job_config=job_config)  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = bq_client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))

    # Disabled on purpose (the condition is always false): reads batch output
    # JSON files back from Cloud Storage and prints their form fields.
    if 1 == 2:

        # Results are written to GCS. Use a regex to find
        # output files
        match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
        output_bucket = match.group(1)
        prefix = match.group(2)

        storage_client = storage.Client()
        bucket = storage_client.get_bucket(output_bucket)
        blob_list = list(bucket.list_blobs(prefix=prefix))
        print("Output files:")

        for i, blob in enumerate(blob_list):
            # If JSON file, download the contents of this blob as a bytes object.
            if ".json" in blob.name:
                blob_as_bytes = blob.download_as_bytes()

                document = documentai.types.Document.from_json(blob_as_bytes)
                print(f"Fetched file {i + 1}")

                # Read the text recognition output from the processor
                for page in document.pages:
                    for form_field in page.form_fields:
                        field_name = get_text(form_field.field_name, document)
                        field_value = get_text(form_field.field_value,
                                               document)
                        print("Extracted key value pair:")
                        print(f"\t{field_name}, {field_value}")
                    # Iterate this page's paragraphs (not the page list).
                    for paragraph in page.paragraphs:
                        paragraph_text = get_text(paragraph.layout, document)
            else:
                print(f"Skipping non-supported file type {blob.name}")
Exemple #21
0
def parse_table(project_id, input_uri,
                output_dir='/home/srinidhi/angular/upload/'):
    """Extract tables from a PDF with Document AI and append them to a file.

    Every header and body row of every table is appended to
    ``<output_dir><name>.txt`` as ``Row:`` followed by the tab-joined cell
    texts, where ``<name>`` is derived from ``input_uri``.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the PDF to process.
        output_dir: prefix prepended to the output file name by plain string
            concatenation, so it should end with a path separator.

    Raises:
        IndexError: if no file name can be derived from ``input_uri``.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(gcs_source=gcs_source,
                                                mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1)
                ]))
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    # First run of word/dot/hyphen characters that is immediately followed
    # by a dot. NOTE(review): for a URI whose bucket or folder names contain
    # a dot this is not the file stem -- confirm expected inputs.
    pdf_filename = re.findall(r"[\w.-]+?(?=\.)", input_uri)[0]
    print("pdf_filename", pdf_filename)
    file_name_txt = output_dir + pdf_filename + '.txt'
    for page in document.pages:
        # Opened per page in append mode, so nothing is created for a
        # document without pages; ``with`` closes the file on errors too.
        with open(file_name_txt, "a") as out_file:
            for table in page.tables:
                for rows in (table.header_rows, table.body_rows):
                    for row in rows:
                        cells = '\t'.join(
                            [_get_text(cell.layout) for cell in row.cells])
                        out_file.write('Row:{}'.format(cells))
Exemple #22
0
def parse_form(input_uri):
    """Extract form fields from a PDF and return them as a dict.

    Args:
        input_uri: Cloud Storage URI of the PDF to process.

    Returns:
        A dict mapping each field name to its field value, both with
        trailing whitespace removed. When a name occurs more than once the
        last occurrence wins.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(gcs_source=source,
                                                mime_type='application/pdf')

    # Key-value pair hints name text that is likely to appear as a form
    # field name (i.e. "DOB"). Value types are optional and may be one or
    # more of: ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # enabled=True switches form extraction on for this request.
    extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # The location segment can be 'us' or 'eu'.
    # NOTE(review): ``project_id`` is not a parameter of this function; it
    # has to resolve from an enclosing or module scope -- confirm it exists.
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the slice(s) of ``document.text`` that ``el`` points at.

        Form fields are stored as offsets into the document text; text that
        spans several lines arrives as several segments, which are
        concatenated in order.
        """
        return ''.join(
            document.text[seg.start_index:seg.end_index]
            for seg in el.text_anchor.text_segments)

    payload = {}
    for page in document.pages:
        for field in page.form_fields:
            key = _get_text(field.field_name).rstrip()
            payload[key] = _get_text(field.field_value).rstrip()

    return payload
Exemple #23
0
def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Run form extraction as a batch job and list the output files.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the PDF to process.
        destination_uri: Cloud Storage URI prefix the results are written to.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    # The document is submitted as application/pdf; per the sample this
    # field can also be image/tiff, image/gif or application/json.
    source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(gcs_source=source,
                                                mime_type='application/pdf')

    # Results go to destination_uri, one document page per output shard.
    destination = documentai.types.GcsDestination(uri=destination_uri)
    output_config = documentai.types.OutputConfig(
        gcs_destination=destination, pages_per_shard=1)

    # Key-value pair hints name text that is likely to appear as a form
    # field name (i.e. "DOB"). Value types are optional and may be one or
    # more of: ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        documentai.types.KeyValuePairHint(key='Emergency Contact',
                                          value_types=['NAME']),
        documentai.types.KeyValuePairHint(key='Referred By')
    ]

    # enabled=True switches form extraction on for this request.
    extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # The location segment of the parent can be 'us' or 'eu'.
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=extraction_params)

    # The batch carries a single ProcessDocumentRequest.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request])

    operation = client.batch_process_documents(batch_request)

    # Block until the batch operation has finished.
    operation.result()

    # Split the destination URI into bucket name and object prefix so the
    # output files can be listed.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    output_bucket, prefix = match.group(1), match.group(2)

    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    output_blobs = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for output_blob in output_blobs:
        print(output_blob.name)
Exemple #24
0
def main(project_id='document-ai-project-291706',
         input_uri='gs://analysis_report_samples/sample_ocr_7.pdf'):
    """Process a single document with the Document AI API and report on it.

    Table extraction is enabled for the request. The document text, the
    entities and every table row are printed and also written to
    ``data_report_6.text`` in the current working directory.

    Args:
        project_id: Google Cloud project id used to build the request parent.
        input_uri: Cloud Storage URI of the PDF to process.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Bounding box hint covering all of page 1. Each vertex coordinate
    # must be a number between 0 and 1.
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1),
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets."""
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    # The context manager closes the report even if printing or writing
    # raises part-way through.
    with open('data_report_6.text', 'w') as file_:
        file_.write('*BEGIN_TEXT_EXTRACTION \n\n')
        # All text extracted from the document
        print('Document Text: {}'.format(document.text))
        file_.write('Document Text: {}'.format(document.text))

        for entity in document.entities:
            print('Text: {}'.format(_get_text(entity)))
            file_.write('Text: {}'.format(_get_text(entity)))
            print('Mention text: {}\n'.format(entity.mention_text))
            file_.write('Mention text: {}\n'.format(entity.mention_text))

        file_.write('\n *BEGIN_TABLE_EXTRACTION \n\n')

        for page in document.pages:
            print('Page number: {}'.format(page.page_number))
            file_.write('Page number: {}'.format(page.page_number))
            for table_num, table in enumerate(page.tables):
                print('Table {}: '.format(table_num))
                file_.write('Table {}: '.format(table_num))
                for row_num, row in enumerate(table.header_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    print('Header Row {}: {}'.format(row_num, cells))
                    file_.write('Header Row {}: {}'.format(row_num, cells))
                for row_num, row in enumerate(table.body_rows):
                    cells = '\t'.join(
                        [_get_text(cell.layout) for cell in row.cells])
                    print('Row {}: {}'.format(row_num, cells))
                    file_.write('Row {}: {}'.format(row_num, cells))

        file_.write('\n\n END_OCR')