예제 #1
0
 def test_analyze_document_none_model_id(self, formrecognizer_test_endpoint,
                                         formrecognizer_test_api_key):
     """Check that passing ``model=None`` raises ``ValueError``."""
     credential = AzureKeyCredential(formrecognizer_test_api_key)
     client = DocumentAnalysisClient(formrecognizer_test_endpoint,
                                     credential)
     with self.assertRaises(ValueError):
         client.begin_analyze_document(model=None, document=b"xx")
 def test_analyze_document_empty_model_id(self, **kwargs):
     """Check that passing an empty-string model raises ``ValueError``."""
     # the endpoint and key arrive as keyword arguments and are popped in order
     endpoint = kwargs.pop("formrecognizer_test_endpoint")
     api_key = kwargs.pop("formrecognizer_test_api_key")
     client = DocumentAnalysisClient(endpoint, AzureKeyCredential(api_key))
     with pytest.raises(ValueError):
         client.begin_analyze_document(model="", document=b"xx")
예제 #3
0
def analyze_identity_documents():
    """Analyze the sample license image with ``prebuilt-idDocument`` and print its fields.

    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables. For every returned document, each known field that
    is present is printed together with its confidence.
    """
    this_file = os.path.abspath(__file__)
    license_path = os.path.abspath(
        os.path.join(this_file, "..", "..",
                     "./sample_forms/id_documents/license.jpg"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(license_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-idDocument",
                                               document=image)
    analysis = poller.result()

    # (field key, printed label) pairs, in the order they are reported
    reported_fields = (
        ("FirstName", "First Name"),
        ("LastName", "Last Name"),
        ("DocumentNumber", "Document Number"),
        ("DateOfBirth", "Date of Birth"),
        ("DateOfExpiration", "Date of Expiration"),
        ("Sex", "Sex"),
        ("Address", "Address"),
        ("CountryRegion", "Country/Region"),
        ("Region", "Region"),
    )

    for number, id_document in enumerate(analysis.documents, start=1):
        print("--------Recognizing ID document #{}--------".format(number))
        for field_key, label in reported_fields:
            field = id_document.fields.get(field_key)
            if not field:
                continue
            print("{}: {} has confidence: {}".format(label, field.value,
                                                     field.confidence))
예제 #4
0
def analyze_read():
    """Analyze the sample form with ``prebuilt-read`` and print what was found.

    Prints the detected languages, then for every page its dimensions, its
    lines (with their words) and its selection marks. The service endpoint and
    key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables.
    """
    this_file = os.path.abspath(__file__)
    form_path = os.path.abspath(
        os.path.join(this_file, "..", "..",
                     "./sample_forms/forms/Form_1.jpg"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(form_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-read",
                                               document=image)
    result = poller.result()

    # message templates used by the loops below
    page_template = "Page has width: {} and height: {}, measured with unit: {}"
    line_template = (
        "...Line # {} has {} words and text '{}' within bounding box '{}'")
    mark_template = (
        "...Selection mark is '{}' within bounding box '{}' "
        "and has a confidence of {}")

    print("----Languages detected in the document----")
    for detected in result.languages:
        print("Language code: '{}' with confidence {}".format(
            detected.language_code, detected.confidence))

    for page in result.pages:
        print("----Analyzing document from page #{}----".format(
            page.page_number))
        print(page_template.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_box = format_bounding_box(line.bounding_box)
            print(
                line_template.format(line_number, len(line_words),
                                     line.content, line_box))

            for word in line_words:
                print("......Word '{}' has a confidence of {}".format(
                    word.content, word.confidence))

        for mark in page.selection_marks:
            mark_box = format_bounding_box(mark.bounding_box)
            print(mark_template.format(mark.state, mark_box, mark.confidence))

    print("----------------------------------------")
예제 #5
0
    def test_authentication_bad_key(self, formrecognizer_test_endpoint,
                                    formrecognizer_test_api_key, **kwargs):
        """Check that an invalid key raises ``ClientAuthenticationError``."""
        bad_credential = AzureKeyCredential("xxxx")
        client = DocumentAnalysisClient(formrecognizer_test_endpoint,
                                        bad_credential)
        with pytest.raises(ClientAuthenticationError):
            client.begin_analyze_document("prebuilt-receipt", b"xx")

        return {}
 def test_receipt_bad_endpoint(self, formrecognizer_test_endpoint,
                               formrecognizer_test_api_key, **kwargs):
     """Check that an unreachable endpoint raises ``ServiceRequestError``."""
     with open(self.receipt_jpg, "rb") as receipt_file:
         receipt_bytes = receipt_file.read()
     # client construction stays inside the expectation, alongside the request
     with pytest.raises(ServiceRequestError):
         credential = AzureKeyCredential(formrecognizer_test_api_key)
         client = DocumentAnalysisClient("http://notreal.azure.com",
                                         credential)
         client.begin_analyze_document("prebuilt-receipt", receipt_bytes)
예제 #7
0
def analyze_custom_documents(custom_model_id):
    """Analyze the sample form with a custom model and print what was found.

    Prints every analyzed document's type, confidence and fields, then the
    lines, words and selection marks on each page, and finally each table with
    the pages it appears on and its cells.

    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.

    :param custom_model_id: Model ID to use when the ``CUSTOM_BUILT_MODEL_ID``
        environment variable is not set.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(os.path.abspath(__file__), "..", "..",
                     "./sample_forms/forms/Form_1.jpg"))
    # [START analyze_custom_documents]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    # the environment variable takes precedence over the argument
    model_id = os.getenv("CUSTOM_BUILT_MODEL_ID", custom_model_id)

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key))

    # Make sure your document's type is included in the list of document types the custom model can analyze
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            model=model_id, document=f)
    result = poller.result()

    for idx, document in enumerate(result.documents):
        print("--------Analyzing document #{}--------".format(idx + 1))
        print("Document has type {}".format(document.doc_type))
        print("Document has confidence {}".format(document.confidence))
        print("Document was analyzed by model with ID {}".format(
            result.model_id))
        # only the field objects are needed here, not their names
        for field in document.fields.values():
            # fall back to the raw content when the field has no truthy value
            field_value = field.value if field.value else field.content
            print(
                "......found field of type '{}' with value '{}' and with confidence {}"
                .format(field.value_type, field_value, field.confidence))

    # iterate over tables, lines, and selection marks on each page
    for page in result.pages:
        print("\nLines found on page {}".format(page.page_number))
        for line in page.lines:
            print("...Line '{}'".format(line.content))
        for word in page.words:
            print("...Word '{}' has a confidence of {}".format(
                word.content, word.confidence))
        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' and has a confidence of {}".format(
                    selection_mark.state, selection_mark.confidence))

    for i, table in enumerate(result.tables):
        print("\nTable {} can be found on page:".format(i + 1))
        for region in table.bounding_regions:
            # print the page number of the region; the single placeholder
            # previously received the table number instead
            print("...{}".format(region.page_number))
        for cell in table.cells:
            print("...Cell[{}][{}] has content '{}'".format(
                cell.row_index, cell.column_index, cell.content))
    print("-----------------------------------")
예제 #8
0
def get_words_on_document_line():
    """Analyze the sample form with ``prebuilt-document`` and print each line's words.

    For every page, prints the page dimensions, then each line with its word
    count, text and bounding box, followed by the line's individual words and
    their confidences. The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.
    """
    this_file = os.path.abspath(__file__)
    form_path = os.path.abspath(
        os.path.join(this_file, "..", "..",
                     "./sample_forms/forms/Form_1.jpg"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(form_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-document",
                                               document=image)
    result = poller.result()

    # message templates used by the loops below
    page_template = "Page has width: {} and height: {}, measured with unit: {}"
    line_template = (
        "...Line # {} has word count {} and text '{}' "
        "within bounding box '{}'")

    # pages are numbered from 1 by their position in the result
    for page_position, page in enumerate(result.pages, start=1):
        print("----Analyzing lines and words from page #{}----".format(
            page_position))
        print(page_template.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_box = format_bounding_box(line.bounding_box)
            print(
                line_template.format(line_number, len(line_words),
                                     line.content, line_box))

            for word in line_words:
                print("......Word '{}' has a confidence of {}".format(
                    word.content, word.confidence))

    print("----------------------------------------")
예제 #9
0
def convert_to_and_from_dict():
    """Round-trip an ``AnalyzeResult`` through a dictionary and a JSON file.

    Analyzes the sample form with ``prebuilt-document``, converts the result
    to a dictionary, writes that dictionary to ``data.json``, rebuilds an
    ``AnalyzeResult`` from the dictionary and prints a few of its attributes.
    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.
    """
    this_file = os.path.abspath(__file__)
    form_path = os.path.abspath(
        os.path.join(this_file, "..", "..",
                     "./sample_forms/forms/Form_1.jpg"))

    from azure.core.serialization import AzureJSONEncoder
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(form_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-document",
                                               document=image)
    result = poller.result()

    # model -> dictionary
    as_dict = result.to_dict()

    # dictionary -> JSON file; AzureJSONEncoder helps make types such as dates
    # JSON serializable.
    # NOTE: AzureJSONEncoder is only available with azure.core>=1.18.0.
    with open('data.json', 'w') as json_file:
        json.dump(as_dict, json_file, cls=AzureJSONEncoder)

    # dictionary -> model, which can then be used as normal
    restored = AnalyzeResult.from_dict(as_dict)

    print("----Converted from dictionary AnalyzeResult----")
    print("Model ID: '{}'".format(restored.model_id))
    print("Number of pages analyzed {}".format(len(restored.pages)))
    print("API version used: {}".format(restored.api_version))

    print("----------------------------------------")
class AnalyzeDocumentRequestPreparation(PerfStressTest):
    """Perf test that starts a ``prebuilt-document`` analysis of a sample form.

    The sample image is read once in ``__init__``; each run passes those bytes
    to ``begin_analyze_document`` on the sync or async client.
    """

    def __init__(self, arguments):
        super().__init__(arguments)

        # read the sample form into memory so every run reuses the same bytes
        this_file = os.path.abspath(__file__)
        form_path = os.path.abspath(
            os.path.join(this_file, "..",
                         "./../sample_forms/forms/Form_1.jpg"))
        with open(form_path, "rb") as form_file:
            self.document_jpg = form_file.read()

        # test configuration comes from the environment
        endpoint = os.environ["FORMRECOGNIZER_TEST_ENDPOINT"]
        account_key = os.environ["FORMRECOGNIZER_TEST_API_KEY"]

        # one client per flavour, each with its own credential object
        self.service_client = DocumentAnalysisClient(
            endpoint, AzureKeyCredential(account_key))
        self.async_service_client = AsyncDocumentAnalysisClient(
            endpoint, AzureKeyCredential(account_key))

    async def close(self):
        """Close both clients, then the base class; run after cleanup."""
        await self.async_service_client.close()
        self.service_client.close()
        await super().close()

    def run_sync(self):
        """Start one analysis with the synchronous client."""
        poller = self.service_client.begin_analyze_document(
            "prebuilt-document", self.document_jpg)
        assert poller

    async def run_async(self):
        """Start one analysis with the asynchronous client."""
        poller = await self.async_service_client.begin_analyze_document(
            "prebuilt-document", self.document_jpg)
        assert poller
예제 #11
0
def analyze_tax_us_w2():
    """Analyze the sample W-2 image with ``prebuilt-tax.us.w2`` and print its fields.

    For every returned document, each known field that is present is printed
    with its confidence. Nested fields (employee, employer) and list fields
    (additional info, state and local tax info) are printed under a heading.
    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.
    """
    this_file = os.path.abspath(__file__)
    w2_path = os.path.abspath(
        os.path.join(this_file, "..", "..",
                     "./sample_forms/tax/sample_w2.png"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    def show_fields(container, entries):
        # print "<label>: <value> has confidence: <confidence>" for each
        # (key, label) entry whose field is present in the container
        for field_key, label in entries:
            field = container.get(field_key)
            if field:
                print("{}: {} has confidence: {}".format(
                    label, field.value, field.confidence))

    def show_group(container, group_key, heading, entries):
        # print a heading and the sub-fields of one nested field, if present
        group = container.get(group_key)
        if group:
            print(heading)
            show_fields(group.value, entries)

    def show_items(container, list_key, heading, entries):
        # print a heading and the sub-fields of every item of a list field
        listing = container.get(list_key)
        if listing:
            print(heading)
            for item in listing.value:
                show_fields(item.value, entries)

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(w2_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-tax.us.w2",
                                               document=image,
                                               locale="en-US")
    w2s = poller.result()

    # (field key, printed label) tables, listed in reporting order
    form_entries = (
        ("W2FormVariant", "Form variant"),
        ("TaxYear", "Tax year"),
        ("W2Copy", "W-2 Copy"),
    )
    employee_entries = (
        ("Name", "...Name"),
        ("SocialSecurityNumber", "...SSN"),
        ("Address", "...Address"),
        ("ZipCode", "...Zipcode"),
    )
    control_entries = (("ControlNumber", "Control Number"), )
    employer_entries = (
        ("Name", "...Name"),
        ("IdNumber", "...ID Number"),
        ("Address", "...Address"),
        ("ZipCode", "...Zipcode"),
    )
    amount_entries = (
        ("WagesTipsAndOtherCompensation",
         "Wages, tips, and other compensation"),
        ("FederalIncomeTaxWithheld", "Federal income tax withheld"),
        ("SocialSecurityWages", "Social Security wages"),
        ("SocialSecurityTaxWithheld", "Social Security tax withheld"),
        ("MedicareWagesAndTips", "Medicare wages and tips"),
        ("MedicareTaxWithheld", "Medicare tax withheld"),
        ("SocialSecurityTips", "Social Security tips"),
        ("AllocatedTips", "Allocated tips"),
        ("VerificationCode", "Verification code"),
        ("DependentCareBenefits", "Dependent care benefits"),
        ("NonQualifiedPlans", "Non-qualified plans"),
    )
    additional_entries = (
        ("LetterCode", "...Letter code"),
        ("Amount", "...Amount"),
    )
    flag_entries = (
        ("IsStatutoryEmployee", "Is statutory employee"),
        ("IsRetirementPlan", "Is retirement plan"),
        ("IsThirdPartySickPay", "Is third party sick pay"),
        ("Other", "Other information"),
    )
    state_entries = (
        ("State", "...State"),
        ("EmployerStateIdNumber", "...Employer state ID number"),
        ("StateWagesTipsEtc", "...State wages, tips, etc"),
        ("StateIncomeTax", "...State income tax"),
    )
    local_entries = (
        ("LocalWagesTipsEtc", "...Local wages, tips, etc"),
        ("LocalIncomeTax", "...Local income tax"),
        ("LocalityName", "...Locality name"),
    )

    for number, w2 in enumerate(w2s.documents, start=1):
        print("--------Recognizing US Tax W-2 Form #{}--------".format(number))
        fields = w2.fields
        show_fields(fields, form_entries)
        show_group(fields, "Employee", "Employee data:", employee_entries)
        show_fields(fields, control_entries)
        show_group(fields, "Employer", "Employer data:", employer_entries)
        show_fields(fields, amount_entries)
        show_items(fields, "AdditionalInfo", "Additional information:",
                   additional_entries)
        show_fields(fields, flag_entries)
        show_items(fields, "StateTaxInfos", "State Tax info:", state_entries)
        show_items(fields, "LocalTaxInfos", "Local Tax info:", local_entries)
예제 #12
0
def analyze_business_card():
    """Analyze the sample business card with ``prebuilt-businessCard`` and print its fields.

    For every returned document, prints the contact names and then each item
    of every known list field that is present, together with its confidence.
    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.
    """
    this_file = os.path.abspath(__file__)
    card_path = os.path.abspath(
        os.path.join(
            this_file, "..", "..",
            "./sample_forms/business_cards/business-card-english.jpg"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(endpoint=endpoint,
                                    credential=AzureKeyCredential(key))

    with open(card_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-businessCard",
                                               document=image,
                                               locale="en-US")
    business_cards = poller.result()

    item_template = "{}: {} has confidence: {}"
    # list fields whose items are reported through their ``value``
    value_lists = (
        ("CompanyNames", "Company Name"),
        ("Departments", "Department"),
        ("JobTitles", "Job Title"),
        ("Emails", "Email"),
        ("Websites", "Website"),
        ("Addresses", "Address"),
    )
    # list fields whose items are reported through their ``content``
    content_lists = (
        ("MobilePhones", "Mobile phone number"),
        ("Faxes", "Fax number"),
        ("WorkPhones", "Work phone number"),
    )

    for number, business_card in enumerate(business_cards.documents, start=1):
        print("--------Analyzing business card #{}--------".format(number))
        fields = business_card.fields

        contact_names = fields.get("ContactNames")
        if contact_names:
            for contact_name in contact_names.value:
                first_name = contact_name.value["FirstName"]
                print(
                    item_template.format("Contact First Name",
                                         first_name.value,
                                         first_name.confidence))
                last_name = contact_name.value["LastName"]
                print(
                    item_template.format("Contact Last Name", last_name.value,
                                         last_name.confidence))

        for field_key, label in value_lists:
            listing = fields.get(field_key)
            if listing:
                for item in listing.value:
                    print(
                        item_template.format(label, item.value,
                                             item.confidence))

        for field_key, label in content_lists:
            listing = fields.get(field_key)
            if listing:
                for item in listing.value:
                    print(
                        item_template.format(label, item.content,
                                             item.confidence))

        # unlike the other phone lists, these items are reported by ``value``
        other_phones = fields.get("OtherPhones")
        if other_phones:
            for other_phone in other_phones.value:
                print(
                    item_template.format("Other phone number",
                                         other_phone.value,
                                         other_phone.confidence))
예제 #13
0
def analyze_layout():
    """Analyze the sample form with ``prebuilt-layout`` and print its layout.

    Prints, in order: whether each style is handwritten; every page's
    dimensions, lines (with words) and selection marks; and every table's
    size, location and cells. The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.
    """
    this_file = os.path.abspath(__file__)
    form_path = os.path.abspath(
        os.path.join(
            this_file, "..", "..", "./sample_forms/forms/form_selection_mark.png"
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(form_path, "rb") as image:
        poller = client.begin_analyze_document("prebuilt-layout", document=image)
    result = poller.result()

    # message templates used by the loops below
    page_template = "Page has width: {} and height: {}, measured with unit: {}"
    line_template = (
        "...Line # {} has word count {} and text '{}' within bounding box '{}'"
    )
    mark_template = (
        "...Selection mark is '{}' within bounding box '{}' "
        "and has a confidence of {}"
    )

    for style in result.styles:
        if style.is_handwritten:
            writing = "handwritten"
        else:
            writing = "no handwritten"
        print("Document contains {} content".format(writing))

    for page in result.pages:
        print("----Analyzing layout from page #{}----".format(page.page_number))
        print(page_template.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_box = format_bounding_box(line.bounding_box)
            print(
                line_template.format(
                    line_number, len(line_words), line.content, line_box
                )
            )

            for word in line_words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for mark in page.selection_marks:
            mark_box = format_bounding_box(mark.bounding_box)
            print(mark_template.format(mark.state, mark_box, mark.confidence))

    for table_number, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_number, table.row_count, table.column_count
            )
        )
        for table_region in table.bounding_regions:
            table_box = format_bounding_box(table_region.bounding_box)
            print(
                "Table # {} location on page: {} is {}".format(
                    table_number, table_region.page_number, table_box
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index, cell.column_index, cell.content
                )
            )
            for cell_region in cell.bounding_regions:
                cell_box = format_bounding_box(cell_region.bounding_box)
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        cell_region.page_number, cell_box
                    )
                )

    print("----------------------------------------")
예제 #14
0
def analyze_invoice():
    """Analyze the sample invoice with the ``prebuilt-invoice`` model and print its fields.

    The sample image ``sample_forms/forms/sample_invoice.jpg`` is resolved
    relative to the parent of this file's directory. The service endpoint and
    key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables.

    For every document in the analysis result, each known invoice field that
    is present is printed together with its confidence. Line items are only
    printed when the result actually carries an ``Items`` field.

    Raises:
        KeyError: If either environment variable is not set.
        OSError: If the sample image cannot be opened.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/sample_invoice.jpg",
        ))

    # [START analyze_invoices]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key))
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-invoice", document=f, locale="en-US")
    invoices = poller.result()

    # (field key in the result, label used in the printed line), listed in
    # the order they are reported. The line items are printed between the
    # two groups.
    fields_before_items = (
        ("VendorName", "Vendor Name"),
        ("VendorAddress", "Vendor Address"),
        ("VendorAddressRecipient", "Vendor Address Recipient"),
        ("CustomerName", "Customer Name"),
        ("CustomerId", "Customer Id"),
        ("CustomerAddress", "Customer Address"),
        ("CustomerAddressRecipient", "Customer Address Recipient"),
        ("InvoiceId", "Invoice Id"),
        ("InvoiceDate", "Invoice Date"),
        ("InvoiceTotal", "Invoice Total"),
        ("DueDate", "Due Date"),
        ("PurchaseOrder", "Purchase Order"),
        ("BillingAddress", "Billing Address"),
        ("BillingAddressRecipient", "Billing Address Recipient"),
        ("ShippingAddress", "Shipping Address"),
        ("ShippingAddressRecipient", "Shipping Address Recipient"),
    )
    item_fields = (
        ("Description", "Description"),
        ("Quantity", "Quantity"),
        ("Unit", "Unit"),
        ("UnitPrice", "Unit Price"),
        ("ProductCode", "Product Code"),
        ("Date", "Date"),
        ("Tax", "Tax"),
        ("Amount", "Amount"),
    )
    fields_after_items = (
        ("SubTotal", "Subtotal"),
        ("TotalTax", "Total Tax"),
        ("PreviousUnpaidBalance", "Previous Unpaid Balance"),
        ("AmountDue", "Amount Due"),
        ("ServiceStartDate", "Service Start Date"),
        ("ServiceEndDate", "Service End Date"),
        ("ServiceAddress", "Service Address"),
        ("ServiceAddressRecipient", "Service Address Recipient"),
        ("RemittanceAddress", "Remittance Address"),
        ("RemittanceAddressRecipient", "Remittance Address Recipient"),
    )

    def print_field(field, label, indent=""):
        # ``.get`` yields None for a field that is missing from the result;
        # such fields are skipped silently.
        if field:
            print("{}{}: {} has confidence: {}".format(
                indent, label, field.value, field.confidence))

    for idx, invoice in enumerate(invoices.documents):
        print("--------Recognizing invoice #{}--------".format(idx + 1))
        for field_key, label in fields_before_items:
            print_field(invoice.fields.get(field_key), label)

        # "Items" may be missing from the result; dereferencing ``.value`` on
        # the None returned by ``.get`` would raise AttributeError.
        items = invoice.fields.get("Items")
        if items:
            print("Invoice items:")
            # A separate index name keeps the invoice index from being shadowed.
            for item_idx, item in enumerate(items.value):
                print("...Item #{}".format(item_idx + 1))
                for field_key, label in item_fields:
                    print_field(item.value.get(field_key), label, indent="......")

        for field_key, label in fields_after_items:
            print_field(invoice.fields.get(field_key), label)
예제 #15
0
def analyze_receipts():
    """Analyze the sample receipt with the ``prebuilt-receipt`` model and print its fields.

    The sample image ``sample_forms/receipt/contoso-allinone.jpg`` is resolved
    relative to the parent of this file's directory. The service endpoint and
    key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables.

    For each document in the result the receipt type is printed, followed by
    every known receipt field (and line item) that is present, each with its
    confidence.
    """
    this_file = os.path.abspath(__file__)
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            this_file, "..", "..",
            "./sample_forms/receipt/contoso-allinone.jpg"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)
    with open(path_to_sample_documents, "rb") as sample:
        poller = client.begin_analyze_document(
            "prebuilt-receipt", document=sample, locale="en-US")
    receipts = poller.result()

    def report(template, field):
        # Print value and confidence through ``template``; absent fields
        # (falsy lookups) produce no output.
        if field:
            print(template.format(field.value, field.confidence))

    for receipt_no, receipt in enumerate(receipts.documents, start=1):
        print("--------Recognizing receipt #{}--------".format(receipt_no))
        print("Receipt type: {}".format(receipt.doc_type or "N/A"))
        fields = receipt.fields

        report("Merchant Name: {} has confidence: {}",
               fields.get("MerchantName"))
        report("Transaction Date: {} has confidence: {}",
               fields.get("TransactionDate"))

        # The item list is optional; its header is only shown when present.
        items = fields.get("Items")
        if items:
            print("Receipt items:")
            for item_no, item in enumerate(items.value, start=1):
                print("...Item #{}".format(item_no))
                report("......Item Name: {} has confidence: {}",
                       item.value.get("Name"))
                report("......Item Quantity: {} has confidence: {}",
                       item.value.get("Quantity"))
                report("......Individual Item Price: {} has confidence: {}",
                       item.value.get("Price"))
                report("......Total Item Price: {} has confidence: {}",
                       item.value.get("TotalPrice"))

        report("Subtotal: {} has confidence: {}", fields.get("Subtotal"))
        report("Tax: {} has confidence: {}", fields.get("Tax"))
        report("Tip: {} has confidence: {}", fields.get("Tip"))
        report("Total: {} has confidence: {}", fields.get("Total"))
        print("--------------------------------------")
def get_elements_with_spans():
    """Show how spans relate tables to lines and entities to text styles.

    The sample image ``sample_forms/forms/Form_1.jpg`` (resolved relative to
    the parent of this file's directory) is analyzed with the
    ``prebuilt-document`` model. The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables.

    For every table, the lines returned by ``get_lines`` for the table's spans
    on each page the table appears on are printed. For every entity, the
    styles returned by ``get_styles`` for the entity's spans are inspected and
    handwritten ones are reported.
    """
    this_file = os.path.abspath(__file__)
    path_to_sample_documents = os.path.abspath(
        os.path.join(this_file, "..", "..", "./sample_forms/forms/Form_1.jpg")
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)
    with open(path_to_sample_documents, "rb") as sample:
        poller = client.begin_analyze_document("prebuilt-document", document=sample)
    result = poller.result()

    # Spans can be used to look up the lines that belong to an element.
    # DocumentTable is used here, but any element exposing a `spans` or `span`
    # field can be matched against related elements (lines, in this case) the
    # same way. For the word-level equivalent, which relies on a `span` field,
    # see `sample_get_words_on_document_line.py` under the samples v3.2-beta
    # directory.
    for table_no, table in enumerate(result.tables):
        summary = "Table # {} has {} rows and {} columns".format(
            table_no, table.row_count, table.column_count
        )
        print(summary)

        table_lines = []
        for region in table.bounding_regions:
            location = "Table # {} location on page: {}".format(
                table_no,
                region.page_number,
            )
            print(location)
            # Collect the lines of this page that the helper matches to the
            # table's spans.
            page = get_page(region.page_number, result.pages)
            table_lines += get_lines(table.spans, page)

        print("Found # {} lines in the table".format(len(table_lines)))
        for table_line in table_lines:
            print(
                "...Line '{}' is within bounding box: '{}'".format(
                    table_line.content,
                    table_line.bounding_box,
                )
            )

    # Spans can likewise be used to look up the text style of an element.
    # DocumentEntity is used here, but any element exposing a `spans` or
    # `span` field works the same way.
    for entity in result.entities:
        entity_styles = get_styles(entity.spans, result.styles)
        heading = "Found entity '{}' of type '{}' with style:".format(
            entity.content, entity.category,
        )
        print(heading)
        if not entity_styles:
            print("...no handwritten text found")
        for entity_style in entity_styles:
            if not entity_style.is_handwritten:
                continue
            print(
                "...handwritten with confidence {}".format(entity_style.confidence)
            )
    print("----------------------------------------")
예제 #17
0
def analyze_general_documents():
    """Analyze the sample form with the ``prebuilt-document`` model and print the result.

    The sample image ``sample_forms/forms/form_selection_mark.png`` is
    resolved relative to the parent of this file's directory. The service
    endpoint and key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables.

    Printed, in order: handwritten content, key-value pairs, entities,
    per-page lines/words/selection marks, and tables with their cells.
    """
    this_file = os.path.abspath(__file__)
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            this_file, "..", "..",
            "./sample_forms/forms/form_selection_mark.png"))

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)
    with open(path_to_sample_documents, "rb") as sample:
        poller = client.begin_analyze_document(
            "prebuilt-document", document=sample)
    result = poller.result()

    # Handwritten content: each span is an (offset, length) window into the
    # full document text.
    for style in result.styles:
        if not style.is_handwritten:
            continue
        print("Document contains handwritten content: ")
        fragments = []
        for span in style.spans:
            start = span.offset
            fragments.append(result.content[start:start + span.length])
        print(",".join(fragments))

    print("----Key-value pairs found in document----")
    for pair in result.key_value_pairs:
        # Either side of a pair may be missing, so each is checked on its own.
        pair_key = pair.key
        if pair_key:
            key_regions = format_bounding_region(pair_key.bounding_regions)
            print("Key '{}' found within '{}' bounding regions".format(
                pair_key.content, key_regions))
        pair_value = pair.value
        if pair_value:
            value_regions = format_bounding_region(pair_value.bounding_regions)
            print("Value '{}' found within '{}' bounding regions\n".format(
                pair_value.content, value_regions))

    print("----Entities found in document----")
    for entity in result.entities:
        print("Entity of category '{}' with sub-category '{}'".format(
            entity.category, entity.sub_category))
        print("...has content '{}'".format(entity.content))
        entity_regions = format_bounding_region(entity.bounding_regions)
        print("...within '{}' bounding regions".format(entity_regions))
        print("...with confidence {}\n".format(entity.confidence))

    for page in result.pages:
        print("----Analyzing document from page #{}----".format(
            page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit))

        for line_no, line in enumerate(page.lines):
            line_words = line.get_words()
            word_count = len(line_words)
            print(
                "...Line # {} has {} words and text '{}' within bounding box '{}'"
                .format(line_no, word_count, line.content,
                        format_bounding_box(line.bounding_box)))

            for line_word in line_words:
                print("......Word '{}' has a confidence of {}".format(
                    line_word.content, line_word.confidence))

        for mark in page.selection_marks:
            print(
                "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}"
                .format(mark.state,
                        format_bounding_box(mark.bounding_box),
                        mark.confidence))

    for table_no, table in enumerate(result.tables):
        print("Table # {} has {} rows and {} columns".format(
            table_no, table.row_count, table.column_count))
        for table_region in table.bounding_regions:
            print("Table # {} location on page: {} is {}".format(
                table_no,
                table_region.page_number,
                format_bounding_box(table_region.bounding_box)))
        for cell in table.cells:
            print("...Cell[{}][{}] has content '{}'".format(
                cell.row_index, cell.column_index, cell.content))
            for cell_region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'\n".format(
                        cell_region.page_number,
                        format_bounding_box(cell_region.bounding_box)))
    print("----------------------------------------")