Ejemplo n.º 1
0
def get_tables_string(textract_json_string: str,
                      table_format: Pretty_Print_Table_Format = Pretty_Print_Table_Format.github,
                      with_confidence: bool = False,
                      with_geo: bool = False) -> str:
    """
    Return every table of a Textract response rendered into one string.

    textract_json_string: Textract response serialized as JSON; it is parsed with json.loads and wrapped in trp.Document (https://github.com/aws-samples/amazon-textract-response-parser/tree/master/src-python)
    table_format: Pretty_Print_Table_Format.csv writes the tables with csv.writer, each table followed by one empty row. Any other member is passed by name to tabulate as tablefmt, each table followed by two newline characters. See https://pypi.org/project/tabulate/ for a list of table format values
    with_confidence: forwarded to convert_table_to_list; output confidence scores as well
    with_geo: forwarded to convert_table_to_list; output geo information as well
    """
    logger.debug(f"table_format: {table_format}")
    document = trp.Document(json.loads(textract_json_string))

    # Lazily yields one list-of-rows per table, in page order then table order,
    # so each table is converted right before it is rendered.
    rows_per_table = (
        convert_table_to_list(tbl,
                              with_confidence=with_confidence,
                              with_geo=with_geo)
        for pg in document.pages
        for tbl in pg.tables
    )

    if table_format == Pretty_Print_Table_Format.csv:
        logger.debug("pretty print - csv")
        buffer = StringIO()
        writer = csv.writer(buffer,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for rows in rows_per_table:
            writer.writerows(rows)
            # Empty row acts as the separator after each table.
            writer.writerow([])
        return buffer.getvalue()

    return "".join(
        tabulate(rows, tablefmt=table_format.name) + "\n\n"
        for rows in rows_per_table)
Ejemplo n.º 2
0
def get_forms_string(textract_json_string: str,
                     table_format: Pretty_Print_Table_Format = Pretty_Print_Table_Format.github,
                     with_confidence: bool = False,
                     with_geo: bool = False) -> str:
    """
    Return the form (key/value) data of a Textract response as one string.

    textract_json_string: Textract response serialized as JSON; parsed with json.loads and wrapped in trp.Document
    table_format: Pretty_Print_Table_Format.csv writes all pages with csv.writer and appends a single empty row after the last page. Any other member is passed by name to tabulate as tablefmt, one rendered block per page, each followed by two newline characters
    with_confidence: forwarded to convert_form_to_list
    with_geo: forwarded to convert_form_to_list
    """
    logger.debug(f"table_format: {table_format}")
    document = trp.Document(json.loads(textract_json_string))

    # Lazily yields the converted form of each page, in page order.
    forms_per_page = (
        convert_form_to_list(pg.form,
                             with_confidence=with_confidence,
                             with_geo=with_geo)
        for pg in document.pages
    )

    if table_format == Pretty_Print_Table_Format.csv:
        logger.debug("pretty print - csv")
        buffer = StringIO()
        writer = csv.writer(buffer,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for form_rows in forms_per_page:
            writer.writerows(form_rows)
        # A single empty row terminates the output (not one per page).
        writer.writerow([])
        return buffer.getvalue()

    return "".join(
        tabulate(form_rows, tablefmt=table_format.name) + "\n\n"
        for form_rows in forms_per_page)
Ejemplo n.º 3
0
def extractTextract(bucket, textractObjectName):
    """
    Start a Textract TABLES analysis for an S3 object and return the parsed result.

    bucket: name of the S3 bucket holding the document
    textractObjectName: key of the document inside the bucket

    Polls get_document_analysis until the job status is no longer
    "IN_PROGRESS", then passes the job id and the last response to
    extract_text and wraps the result in trp.Document. The final status is
    not inspected, so any status other than "IN_PROGRESS" ends the wait.
    """
    document_location = {
        'S3Object': {
            'Bucket': bucket,
            'Name': textractObjectName
        }
    }
    started = textract.start_document_analysis(
        DocumentLocation=document_location,
        FeatureTypes=['TABLES'])

    job_id = started["JobId"]
    print('job id is: ', job_id)

    # Initial wait before the first status check.
    time.sleep(15)
    latest = textract.get_document_analysis(JobId=job_id)
    job_status = latest["JobStatus"]

    # Re-check every 5 seconds while the job is still running.
    while job_status == "IN_PROGRESS":
        time.sleep(5)
        latest = textract.get_document_analysis(JobId=job_id)
        job_status = latest["JobStatus"]
        print("Textract Job status: {}".format(job_status))

    return trp.Document(extract_text(job_id, latest))
def test_tblock_order_block_by_geo_multi_page():
    """
    Check that order_blocks_by_geo keeps the expected table order on page one.

    Loads data/gib_multi_page_tables.json (relative to this file), orders the
    blocks and asserts the text of the first cell of the first two tables on
    the first page.
    """
    p = os.path.dirname(os.path.realpath(__file__))
    # Context manager so the fixture file handle is closed deterministically.
    with open(os.path.join(p, "data/gib_multi_page_tables.json")) as f:
        j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = order_blocks_by_geo(t_document)
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    assert "Page 1 - Value 1.1.1" == doc.pages[0].tables[0].rows[0].cells[0].text.strip()
    assert "Page 1 - Value 2.1.1" == doc.pages[0].tables[1].rows[0].cells[0].text.strip()
def test_next_token_response():
    """
    Check that a response carrying a NextToken can be loaded and enriched.

    Loads data/gib.json (relative to this file), asserts it has a truthy
    'NextToken', runs add_page_orientation and asserts the first page has a
    truthy custom attribute. The orientation of every page is printed.
    """
    p = os.path.dirname(os.path.realpath(__file__))
    # Context manager so the fixture file handle is closed deterministically.
    with open(os.path.join(p, "data/gib.json")) as f:
        j = json.load(f)
    assert j['NextToken']
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    assert t_document.pages[0].custom

    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        print(page.custom['Orientation'])
Ejemplo n.º 6
0
def ExecuteTableValidations(t_doc: t2.TDocument,
                            header_footer_type: HeaderFooterType,
                            accuracy_percentage: float):
    """
    Find tables that continue from one page onto the next.

    The document is ordered with order_blocks_by_geo, then every pair of
    consecutive pages is examined: the last table of the page is compared
    with the first table of the following page. The pair is considered one
    logical table when

    1. __validate_objects_between_tables accepts the pair, and
    2. __compare_table_column_numbers or __compare_table_headers accepts it, and
    3. __compare_table_dimensions accepts it for accuracy_percentage.

    t_doc: the document to inspect
    header_footer_type: forwarded to __validate_objects_between_tables
    accuracy_percentage: forwarded to __compare_table_dimensions

    Returns a list of lists of table ids. Each inner list is one chain of
    tables to merge, in page order. A chain is extended while the table that
    ended it is the one being continued; otherwise a new chain is started.
    """
    from trp.t_pipeline import order_blocks_by_geo
    table_ids_merge_list = []
    ordered_doc = order_blocks_by_geo(t_doc)
    trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))
    pages = list(trp_doc.pages)

    # Walk consecutive (page, following page) pairs; the last page has no
    # successor and is therefore never the "current" page.
    for current_page, next_page in zip(pages, pages[1:]):
        # Both pages need at least one table. Checking the following page as
        # well avoids an IndexError on next_page.tables[0].
        # NOTE(review): processing stops at the first table-less page instead
        # of skipping it - kept as in the previous implementation; confirm
        # whether `continue` is the intended behavior.
        if len(current_page.tables) == 0 or len(next_page.tables) == 0:
            break

        current_page_table = current_page.tables[-1]
        next_page_table = next_page.tables[0]

        if not __validate_objects_between_tables(current_page,
                                                 current_page_table,
                                                 next_page,
                                                 next_page_table,
                                                 header_footer_type):
            continue

        same_column_count = __compare_table_column_numbers(
            current_page_table, next_page_table)
        same_headers = __compare_table_headers(current_page_table,
                                               next_page_table)
        if not (same_column_count or same_headers):
            continue

        if not __compare_table_dimensions(current_page_table,
                                          next_page_table,
                                          accuracy_percentage):
            continue

        if (table_ids_merge_list
                and table_ids_merge_list[-1][-1] == current_page_table.id):
            # The table being continued already ends the latest chain.
            table_ids_merge_list[-1].append(next_page_table.id)
        else:
            # Unrelated to the latest chain (or no chain yet): start a new one
            # rather than silently dropping the pair.
            table_ids_merge_list.append(
                [current_page_table.id, next_page_table.id])
    return table_ids_merge_list
Ejemplo n.º 7
0
def get_lines_string(textract_json_string: str,
                     with_page_number: bool = False) -> str:
    """
    Return the text of every line of a Textract response, one line per row.

    textract_json_string: Textract response serialized as JSON; parsed with json.loads and wrapped in trp.Document
    with_page_number: when True, each page is preceded by a header containing the zero-based page number and the page id. The header is not followed by a newline, so the first line of the page continues on the same row.

    Every line text is terminated by a newline character.
    """
    document = trp.Document(json.loads(textract_json_string))
    pieces = []
    for page_index, current_page in enumerate(document.pages):
        if with_page_number:
            pieces.append(
                f"--------- page number: {page_index} - page ID: {current_page.id} --------------"
            )
        pieces.extend(f"{text_line.text}\n" for text_line in current_page.lines)
    return "".join(pieces)
Ejemplo n.º 8
0
def test_kv_ocr_confidence(caplog):
    """
    Check the OCR confidence that add_kv_ocr_confidence attaches to form fields.

    Loads data/employment-application.json (relative to this file), runs
    add_kv_ocr_confidence and asserts the 'OCRConfidence' custom entry of the
    key and the value of the "Home Address:" and "Phone Number:" fields on
    every page.

    caplog: pytest log-capture fixture; its level is set to DEBUG.
    """
    caplog.set_level(logging.DEBUG)
    p = os.path.dirname(os.path.realpath(__file__))
    # Context manager so the fixture file handle is closed deterministically.
    with open(os.path.join(p, "data/employment-application.json")) as f:
        j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_kv_ocr_confidence(t_document)

    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        # The comparisons must be asserted; as bare expressions they are
        # evaluated and discarded, so the test could never fail on them.
        k1 = page.form.getFieldByKey("Home Address:")
        assert k1.key.custom['OCRConfidence'] == {'mean': 99.60698318481445}
        assert k1.value.custom['OCRConfidence'] == {'mean': 99.8596928914388}
        k1 = page.form.getFieldByKey("Phone Number:")
        assert k1.key.custom['OCRConfidence'] == {'mean': 99.55334854125977}
        assert k1.value.custom['OCRConfidence'] == {'mean': 99.23233032226562}
Ejemplo n.º 9
0
def get_words_string(textract_json: dict,
                     with_page_number: bool = False) -> str:
    """
    Return every word of a Textract response, one word per row.

    textract_json: Textract response as a dict; wrapped in trp.Document
    with_page_number: when True, each page is preceded by a header containing the zero-based page number and the page id. The header is not followed by a newline, so the first word of the page continues on the same row.

    Words are emitted line by line in the order the lines expose them, each
    terminated by a newline character.
    """
    document = trp.Document(textract_json)
    pieces = []
    for page_index, current_page in enumerate(document.pages):
        if with_page_number:
            pieces.append(
                f"--------- page number: {page_index} - page ID: {current_page.id} --------------"
            )
        pieces.extend(f"{token.text}\n"
                      for text_line in current_page.lines
                      for token in text_line.words)
    return "".join(pieces)
Ejemplo n.º 10
0
def test_adjust_bounding_boxes_and_polygons_to_orientation():
    """
    Smoke test for the orientation pipeline on a document rotated by 180 degrees.

    Loads data/gib__180_degrees.json (relative to this file), runs
    add_page_orientation and order_blocks_by_geo and rebuilds a t1.Document
    from the result. There are no assertions; the test only fails when one of
    these steps raises.
    """
    p = os.path.dirname(os.path.realpath(__file__))
    # Context manager so the fixture file handle is closed deterministically.
    with open(os.path.join(p, "data/gib__180_degrees.json")) as f:
        j = json.load(f)
    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
    t_document = add_page_orientation(t_document)
    # NOTE(review): the return value of order_blocks_by_geo is not used; the
    # document below is built from t_document, as before - confirm whether
    # the ordered document was meant to be dumped instead.
    order_blocks_by_geo(t_document)
    t1.Document(t2.TDocumentSchema().dump(t_document))
Ejemplo n.º 11
0
def lambda_handler(event, context):
    """
    Run Textract on the S3 object named in the event and store its text in S3.

    event: S3 notification; the bucket name and object key are read from
        event["Records"][0]["s3"].
    context: Lambda context object (unused).

    Steps:
    1. analyze_document is called with the TABLES and FORMS features.
    2. Line texts are collected from every page; the table content comes from
       hp.outputTable of the last page processed.
    3. Lines whose text occurs in a table row are replaced by the placeholder
       "table", the list is passed through hp.Remove, and the placeholder
       entries are dropped.
    4. The remaining lines (space separated) followed by the table rows (one
       row per line, cells tab separated) are written to
       "text_files/<object key up to the first dot>.txt" in the same bucket.

    Any exception raised along the way propagates to the caller.
    """
    # Amazon Textract
    textract = boto3.client(service_name='textract', region_name='us-east-1')

    # Amazon S3
    s3 = boto3.client('s3')

    obj = event["Records"][0]["s3"]
    bucket = str(obj["bucket"]["name"])
    file_name = str(obj["object"]["key"])
    file_name_final = file_name.split(".")

    response = textract.analyze_document(
        Document={'S3Object': {
            'Bucket': bucket,
            'Name': file_name
        }},
        FeatureTypes=['TABLES', 'FORMS'])

    # Parse the response with the Textract response parser.
    doc = trp.Document(response)
    line_content = []
    content_table = []
    # NOTE(review): the form items are collected but not written to the
    # output - confirm whether they should be.
    content_form = []
    for page in doc.pages:
        line_content.extend(line.text for line in page.lines)

        for items in hp.outputForm(page):
            content_form.extend(items)

        # Overwritten on every page, so only the last page's tables are kept.
        content_table = hp.outputTable(page)

    # Mark lines that are also part of a table so they are not emitted twice.
    for position, line in enumerate(line_content):
        if any(line in row for row in content_table):
            line_content[position] = "table"

    # presumably hp.Remove de-duplicates the list - verify against the helper.
    # Build a new list instead of removing entries while iterating, which
    # could skip entries or raise ValueError.
    final_line_list = [
        item for item in hp.Remove(line_content) if item != "table"
    ]

    # Free text first, then one line per table row.
    content = "".join(item + ' ' for item in final_line_list)
    for row in content_table:
        content += '\n' + "".join(cell + '\t' for cell in row)

    # Upload the extracted text into the source bucket.
    s3.put_object(Bucket=bucket,
                  Key="text_files/{}.txt".format(file_name_final[0]),
                  Body=content)
Ejemplo n.º 12
0
	Document={
		'S3Object': {
			'Bucket':'your_bucket_name',
			#'Name':str(sys.argv[1])
			'Name' : file_name
		}
	},
	FeatureTypes=['TABLES','FORMS'])


print('')




# Parse the Textract response and flatten every table into `content`:
# one output line per table row, each cell followed by a tab character.
# `response`, `outputTable`, `s3` and `file_name` are defined outside this
# fragment.
doc = trp.Document(response)
content =''
for page in doc.pages:
	table = outputTable(page)
	for items in table:
		#print('')
		# Each row starts on a new line.
		content += '\n'
		for item in items:
			content += item + '\t'

			#print(item,'\t',end=' ')
	#forms = outputForm(page)
# Store the result next to the input name, with '.txt' appended.
s3.Object('your_bucket_name',file_name+'.txt').put(Body=content)


def test_custom_page_orientation(json_response):
    """
    Check custom document attributes and the orientation add_page_orientation computes.

    json_response: Textract response fixture; expected to contain one page
        with 22 lines and 53 words.

    First verifies that a custom attribute set on the TDocument survives a
    dump. Then, for a series of rotated fixture documents under data/
    (relative to this file), asserts that the orientation stored on the first
    page lies strictly inside the expected interval. Finally asserts that
    every page of the last fixture document carries a truthy 'Orientation'.
    """
    doc = Document(json_response)
    assert 1 == len(doc.pages)
    lines = list(doc.pages[0].lines)
    assert 22 == len(lines)
    words = [word for line in lines for word in line.words]
    assert 53 == len(words)
    t_document: t2.TDocument = t2.TDocumentSchema().load(json_response)
    t_document.custom = {'orientation': 180}
    new_t_doc_json = t2.TDocumentSchema().dump(t_document)
    assert "Custom" in new_t_doc_json
    assert "orientation" in new_t_doc_json["Custom"]
    assert new_t_doc_json["Custom"]["orientation"] == 180

    # (fixture file, exclusive lower bound, exclusive upper bound) in degrees.
    expected_orientations = [
        ("gib.json", -1, 2),
        ("gib_10_degrees.json", 5, 15),
        ("gib__15_degrees.json", 10, 20),
        ("gib__25_degrees.json", 17, 30),
        ("gib__180_degrees.json", 170, 190),
        ("gib__270_degrees.json", -100, -80),
        ("gib__90_degrees.json", 80, 100),
        ("gib__minus_10_degrees.json", -10, 5),
    ]
    p = os.path.dirname(os.path.realpath(__file__))
    for file_name, lower, upper in expected_orientations:
        # Context manager so each fixture file handle is closed.
        with open(os.path.join(p, "data", file_name)) as f:
            j = json.load(f)
        t_document = t2.TDocumentSchema().load(j)
        t_document = add_page_orientation(t_document)
        assert lower < t_document.pages[0].custom['Orientation'] < upper, file_name

    # t_document is the last fixture of the list at this point.
    doc = t1.Document(t2.TDocumentSchema().dump(t_document))
    for page in doc.pages:
        assert page.custom['Orientation']
Ejemplo n.º 14
0
def test_Document():
    """Check that trp.Document built from the blocks_json file is truthy."""
    with open(blocks_json, "rt") as handle:
        payload = json.load(handle)
    parsed = trp.Document(payload)
    assert parsed
Ejemplo n.º 15
0
def get_bounding_boxes(
        textract_json: dict, overlay_features: List[Textract_Types],
        document_dimensions: DocumentDimensions) -> List[BoundingBox]:
    """
    Collect a BoundingBox for every requested element type of a Textract response.

    textract_json: Textract response as a dict; wrapped in trp.Document
    overlay_features: element types to produce boxes for.
        LINE and WORD select lines and their words; KEY and VALUE select the
        keys and values of form fields, and FORM selects both; TABLE and CELL
        select tables and their cells.
    document_dimensions: passed unchanged to every BoundingBox

    Returns the boxes in page order (page numbers start at 1). Within a page
    the order is lines/words, then form keys/values, then tables/cells.
    Elements that are falsy are skipped, except tables, which are always added.
    """
    doc = trp.Document(textract_json)
    bounding_box_list: List[BoundingBox] = []

    def add_box(element, box_type, page_number: int) -> None:
        # Single place that turns a parsed element into a BoundingBox.
        bounding_box_list.append(
            BoundingBox(geometry=element.geometry,
                        document_dimensions=document_dimensions,
                        box_type=box_type,
                        page_number=page_number))

    # Resolve the requested features once instead of inside every loop.
    want_line = Textract_Types.LINE in overlay_features
    want_word = Textract_Types.WORD in overlay_features
    want_form = Textract_Types.FORM in overlay_features
    want_key = want_form or Textract_Types.KEY in overlay_features
    want_value = want_form or Textract_Types.VALUE in overlay_features
    want_table = Textract_Types.TABLE in overlay_features
    want_cell = Textract_Types.CELL in overlay_features

    for page_number, page in enumerate(doc.pages, start=1):
        if want_line or want_word:
            for line in page.lines:
                if want_line and line:
                    add_box(line, Textract_Types.LINE, page_number)
                if want_word:
                    for word in line.words:
                        if word:
                            add_box(word, Textract_Types.WORD, page_number)

        # page.form is only touched when a form feature was requested.
        if want_key or want_value:
            for field in page.form.fields:
                if want_key and field and field.key:
                    add_box(field.key, Textract_Types.KEY, page_number)
                if want_value and field and field.value:
                    add_box(field.value, Textract_Types.VALUE, page_number)

        if want_table or want_cell:
            for table in page.tables:
                if want_table:
                    add_box(table, Textract_Types.TABLE, page_number)
                if want_cell:
                    for row in table.rows:
                        for cell in row.cells:
                            if cell:
                                add_box(cell, Textract_Types.CELL,
                                        page_number)

    return bounding_box_list