def test_analyze_document_none_model_id(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Check that ``begin_analyze_document`` raises ``ValueError`` for ``model=None``."""
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    analysis_client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential)
    with self.assertRaises(ValueError):
        analysis_client.begin_analyze_document(model=None, document=b"xx")
def test_analyze_document_empty_model_id(self, **kwargs):
    """Check that ``begin_analyze_document`` raises ``ValueError`` for an empty model ID."""
    # The endpoint and key arrive as keyword arguments; pop them in the
    # same order as before so a missing key fails on the same name.
    endpoint = kwargs.pop("formrecognizer_test_endpoint")
    api_key = kwargs.pop("formrecognizer_test_api_key")
    analysis_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(api_key))
    with pytest.raises(ValueError):
        analysis_client.begin_analyze_document(model="", document=b"xx")
def analyze_identity_documents():
    """Analyze the sample license image with ``prebuilt-idDocument`` and print its fields.

    The endpoint and key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT``
    and ``AZURE_FORM_RECOGNIZER_KEY`` environment variables. Each field is
    printed with its value and confidence, and only when it is present.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/id_documents/license.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as stream:
        poller = client.begin_analyze_document("prebuilt-idDocument", document=stream)
    analysis = poller.result()

    # (field key, output template) in the order the fields are reported.
    reported_fields = (
        ("FirstName", "First Name: {} has confidence: {}"),
        ("LastName", "Last Name: {} has confidence: {}"),
        ("DocumentNumber", "Document Number: {} has confidence: {}"),
        ("DateOfBirth", "Date of Birth: {} has confidence: {}"),
        ("DateOfExpiration", "Date of Expiration: {} has confidence: {}"),
        ("Sex", "Sex: {} has confidence: {}"),
        ("Address", "Address: {} has confidence: {}"),
        ("CountryRegion", "Country/Region: {} has confidence: {}"),
        ("Region", "Region: {} has confidence: {}"),
    )

    for number, id_document in enumerate(analysis.documents, start=1):
        print("--------Recognizing ID document #{}--------".format(number))
        for field_key, template in reported_fields:
            entry = id_document.fields.get(field_key)
            if entry:
                print(template.format(entry.value, entry.confidence))
def analyze_read():
    """Analyze the sample form with ``prebuilt-read`` and print what was read.

    Prints the detected languages, then for every page its dimensions, its
    lines (with their words) and its selection marks. The endpoint and key
    come from ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY``.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as stream:
        poller = client.begin_analyze_document("prebuilt-read", document=stream)
    result = poller.result()

    print("----Languages detected in the document----")
    for detected in result.languages:
        message = "Language code: '{}' with confidence {}"
        print(message.format(detected.language_code, detected.confidence))

    for page in result.pages:
        print("----Analyzing document from page #{}----".format(page.page_number))
        page_summary = "Page has width: {} and height: {}, measured with unit: {}"
        print(page_summary.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_summary = "...Line # {} has {} words and text '{}' within bounding box '{}'"
            print(
                line_summary.format(
                    line_number,
                    len(line_words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in line_words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for mark in page.selection_marks:
            mark_summary = "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}"
            print(
                mark_summary.format(
                    mark.state,
                    format_bounding_box(mark.bounding_box),
                    mark.confidence,
                )
            )

    print("----------------------------------------")
def test_authentication_bad_key(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Check that an invalid API key makes the analyze call raise ``ClientAuthenticationError``."""
    invalid_credential = AzureKeyCredential("xxxx")
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, invalid_credential)
    with pytest.raises(ClientAuthenticationError):
        client.begin_analyze_document("prebuilt-receipt", b"xx")

    # The empty dict return value is kept exactly as in the original test.
    return {}
def test_receipt_bad_endpoint(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Check that analyzing against a non-existent endpoint raises ``ServiceRequestError``."""
    with open(self.receipt_jpg, "rb") as receipt_stream:
        receipt_bytes = receipt_stream.read()

    # Client construction stays inside the ``raises`` block, as in the original.
    with pytest.raises(ServiceRequestError):
        unreachable_client = DocumentAnalysisClient(
            "http://notreal.azure.com",
            AzureKeyCredential(formrecognizer_test_api_key),
        )
        unreachable_client.begin_analyze_document("prebuilt-receipt", receipt_bytes)
def analyze_custom_documents(custom_model_id):
    """Analyze the sample form with a custom model and print the results.

    The model ID is taken from the ``CUSTOM_BUILT_MODEL_ID`` environment
    variable when it is set, otherwise from ``custom_model_id``. The endpoint
    and key come from ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY``.

    :param custom_model_id: Fallback model ID used when
        ``CUSTOM_BUILT_MODEL_ID`` is not set.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )
    # [START analyze_custom_documents]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    model_id = os.getenv("CUSTOM_BUILT_MODEL_ID", custom_model_id)

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Make sure your document's type is included in the list of document types the custom model can analyze
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            model=model_id, document=f
        )
    result = poller.result()

    for idx, document in enumerate(result.documents):
        print("--------Analyzing document #{}--------".format(idx + 1))
        print("Document has type {}".format(document.doc_type))
        print("Document has confidence {}".format(document.confidence))
        print("Document was analyzed by model with ID {}".format(result.model_id))
        # Only the field objects are used, so iterate the values directly.
        for field in document.fields.values():
            # Fall back to the raw content when the field has no (truthy) value.
            field_value = field.value if field.value else field.content
            print(
                "......found field of type '{}' with value '{}' and with confidence {}".format(
                    field.value_type, field_value, field.confidence
                )
            )

    # iterate over tables, lines, and selection marks on each page
    for page in result.pages:
        print("\nLines found on page {}".format(page.page_number))
        for line in page.lines:
            print("...Line '{}'".format(line.content))
        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )
        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' and has a confidence of {}".format(
                    selection_mark.state, selection_mark.confidence
                )
            )

    for i, table in enumerate(result.tables):
        print("\nTable {} can be found on page:".format(i + 1))
        for region in table.bounding_regions:
            # Print the page number. The previous call passed the table index
            # as the first of two arguments to a single-placeholder format
            # string, so the page number was silently dropped.
            print("...{}".format(region.page_number))
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index, cell.column_index, cell.content
                )
            )
    print("-----------------------------------")
def get_words_on_document_line():
    """Analyze the sample form with ``prebuilt-document`` and print each line's words.

    For every page the dimensions are printed, followed by every line (word
    count, text and bounding box) and the confidence of each word on it. The
    endpoint and key come from ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY``.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as stream:
        poller = client.begin_analyze_document("prebuilt-document", document=stream)
    result = poller.result()

    # Pages are numbered from 1 in the output, based on their position.
    for page_position, page in enumerate(result.pages, start=1):
        print("----Analyzing lines and words from page #{}----".format(page_position))
        page_summary = "Page has width: {} and height: {}, measured with unit: {}"
        print(page_summary.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_summary = "...Line # {} has word count {} and text '{}' within bounding box '{}'"
            print(
                line_summary.format(
                    line_number,
                    len(line_words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in line_words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

    print("----------------------------------------")
def convert_to_and_from_dict():
    """Round-trip an ``AnalyzeResult`` through a dictionary and a JSON file.

    The sample form is analyzed with ``prebuilt-document``, the result is
    converted with ``to_dict()``, dumped to ``data.json`` in the current
    working directory, rebuilt with ``AnalyzeResult.from_dict`` and
    summarized on stdout.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.serialization import AzureJSONEncoder
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as document_stream:
        poller = client.begin_analyze_document(
            "prebuilt-document", document=document_stream
        )
    result = poller.result()

    # Turn the received model into a plain dictionary.
    result_as_dict = result.to_dict()

    # Persist the dictionary as JSON. AzureJSONEncoder makes types such as
    # dates JSON serializable.
    # NOTE: AzureJSONEncoder is only available with azure.core>=1.18.0.
    with open("data.json", "w") as json_file:
        json.dump(result_as_dict, json_file, cls=AzureJSONEncoder)

    # Rebuild the model from the dictionary and use it as normal.
    restored = AnalyzeResult.from_dict(result_as_dict)

    print("----Converted from dictionary AnalyzeResult----")
    print("Model ID: '{}'".format(restored.model_id))
    print("Number of pages analyzed {}".format(len(restored.pages)))
    print("API version used: {}".format(restored.api_version))
    print("----------------------------------------")
class AnalyzeDocumentRequestPreparation(PerfStressTest):
    """Perf test that starts a ``prebuilt-document`` analysis with the sync and async clients."""

    def __init__(self, arguments):
        """Load the sample image into memory and build both clients.

        The endpoint and key are read from ``FORMRECOGNIZER_TEST_ENDPOINT``
        and ``FORMRECOGNIZER_TEST_API_KEY``.
        """
        super().__init__(arguments)

        sample_path = os.path.abspath(
            os.path.join(
                os.path.abspath(__file__),
                "..",
                "./../sample_forms/forms/Form_1.jpg",
            )
        )
        with open(sample_path, "rb") as image_stream:
            self.document_jpg = image_stream.read()

        # Test-related environment variables.
        endpoint = os.environ["FORMRECOGNIZER_TEST_ENDPOINT"]
        account_key = os.environ["FORMRECOGNIZER_TEST_API_KEY"]

        # Clients used by the perf runs.
        self.service_client = DocumentAnalysisClient(
            endpoint, AzureKeyCredential(account_key)
        )
        self.async_service_client = AsyncDocumentAnalysisClient(
            endpoint, AzureKeyCredential(account_key)
        )

    async def close(self):
        """Close the async client, then the sync client, then the base class (run after cleanup)."""
        await self.async_service_client.close()
        self.service_client.close()
        await super().close()

    def run_sync(self):
        """Run the synchronous perf test: start an analysis and assert a poller came back."""
        started = self.service_client.begin_analyze_document(
            "prebuilt-document", self.document_jpg
        )
        assert started

    async def run_async(self):
        """Run the asynchronous perf test: start an analysis and assert a poller came back."""
        started = await self.async_service_client.begin_analyze_document(
            "prebuilt-document", self.document_jpg
        )
        assert started
def analyze_tax_us_w2():
    """Analyze the sample W-2 image with ``prebuilt-tax.us.w2`` and print its fields.

    Each field is printed with its value and confidence, and only when it is
    present. Nested groups (employee, employer, additional info, state and
    local tax info) get a heading followed by their sub-fields. The endpoint
    and key come from ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY``.
    """

    def show(mapping, field_key, template):
        # Print one field's value and confidence when the field is present.
        entry = mapping.get(field_key)
        if entry:
            print(template.format(entry.value, entry.confidence))

    def show_all(mapping, pairs):
        # Print several fields from the same mapping, in the given order.
        for field_key, template in pairs:
            show(mapping, field_key, template)

    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/tax/sample_w2.png",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as stream:
        poller = client.begin_analyze_document(
            "prebuilt-tax.us.w2", document=stream, locale="en-US"
        )
    w2s = poller.result()

    form_header = (
        ("W2FormVariant", "Form variant: {} has confidence: {}"),
        ("TaxYear", "Tax year: {} has confidence: {}"),
        ("W2Copy", "W-2 Copy: {} has confidence: {}"),
    )
    employee_details = (
        ("Name", "...Name: {} has confidence: {}"),
        ("SocialSecurityNumber", "...SSN: {} has confidence: {}"),
        ("Address", "...Address: {} has confidence: {}"),
        ("ZipCode", "...Zipcode: {} has confidence: {}"),
    )
    employer_details = (
        ("Name", "...Name: {} has confidence: {}"),
        ("IdNumber", "...ID Number: {} has confidence: {}"),
        ("Address", "...Address: {} has confidence: {}"),
        ("ZipCode", "...Zipcode: {} has confidence: {}"),
    )
    wage_and_tax_boxes = (
        ("WagesTipsAndOtherCompensation", "Wages, tips, and other compensation: {} has confidence: {}"),
        ("FederalIncomeTaxWithheld", "Federal income tax withheld: {} has confidence: {}"),
        ("SocialSecurityWages", "Social Security wages: {} has confidence: {}"),
        ("SocialSecurityTaxWithheld", "Social Security tax withheld: {} has confidence: {}"),
        ("MedicareWagesAndTips", "Medicare wages and tips: {} has confidence: {}"),
        ("MedicareTaxWithheld", "Medicare tax withheld: {} has confidence: {}"),
        ("SocialSecurityTips", "Social Security tips: {} has confidence: {}"),
        ("AllocatedTips", "Allocated tips: {} has confidence: {}"),
        ("VerificationCode", "Verification code: {} has confidence: {}"),
        ("DependentCareBenefits", "Dependent care benefits: {} has confidence: {}"),
        ("NonQualifiedPlans", "Non-qualified plans: {} has confidence: {}"),
    )
    additional_info_details = (
        ("LetterCode", "...Letter code: {} has confidence: {}"),
        ("Amount", "...Amount: {} has confidence: {}"),
    )
    status_boxes = (
        ("IsStatutoryEmployee", "Is statutory employee: {} has confidence: {}"),
        ("IsRetirementPlan", "Is retirement plan: {} has confidence: {}"),
        ("IsThirdPartySickPay", "Is third party sick pay: {} has confidence: {}"),
        ("Other", "Other information: {} has confidence: {}"),
    )
    state_tax_details = (
        ("State", "...State: {} has confidence: {}"),
        ("EmployerStateIdNumber", "...Employer state ID number: {} has confidence: {}"),
        ("StateWagesTipsEtc", "...State wages, tips, etc: {} has confidence: {}"),
        ("StateIncomeTax", "...State income tax: {} has confidence: {}"),
    )
    local_tax_details = (
        ("LocalWagesTipsEtc", "...Local wages, tips, etc: {} has confidence: {}"),
        ("LocalIncomeTax", "...Local income tax: {} has confidence: {}"),
        ("LocalityName", "...Locality name: {} has confidence: {}"),
    )

    for number, w2 in enumerate(w2s.documents, start=1):
        print("--------Recognizing US Tax W-2 Form #{}--------".format(number))
        fields = w2.fields

        show_all(fields, form_header)

        employee = fields.get("Employee")
        if employee:
            print("Employee data:")
            show_all(employee.value, employee_details)

        show(fields, "ControlNumber", "Control Number: {} has confidence: {}")

        employer = fields.get("Employer")
        if employer:
            print("Employer data:")
            show_all(employer.value, employer_details)

        show_all(fields, wage_and_tax_boxes)

        additional_info = fields.get("AdditionalInfo")
        if additional_info:
            print("Additional information:")
            for info_item in additional_info.value:
                show_all(info_item.value, additional_info_details)

        show_all(fields, status_boxes)

        state_tax_info = fields.get("StateTaxInfos")
        if state_tax_info:
            print("State Tax info:")
            for state_tax in state_tax_info.value:
                show_all(state_tax.value, state_tax_details)

        local_tax_info = fields.get("LocalTaxInfos")
        if local_tax_info:
            print("Local Tax info:")
            for local_tax in local_tax_info.value:
                show_all(local_tax.value, local_tax_details)
def analyze_business_card():
    """Analyze the sample business card with ``prebuilt-businessCard`` and print its fields.

    Each recognized entry is printed with its confidence, and only when the
    field is present. The endpoint and key come from
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/business_cards/business-card-english.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-businessCard", document=f, locale="en-US"
        )
    business_cards = poller.result()

    # (field key, output template, attribute printed for each list entry).
    # NOTE(review): "OtherPhones" prints ``value`` while the other phone
    # fields print ``content``; kept as in the original - confirm the intent.
    list_fields = (
        ("CompanyNames", "Company Name: {} has confidence: {}", "value"),
        ("Departments", "Department: {} has confidence: {}", "value"),
        ("JobTitles", "Job Title: {} has confidence: {}", "value"),
        ("Emails", "Email: {} has confidence: {}", "value"),
        ("Websites", "Website: {} has confidence: {}", "value"),
        ("Addresses", "Address: {} has confidence: {}", "value"),
        ("MobilePhones", "Mobile phone number: {} has confidence: {}", "content"),
        ("Faxes", "Fax number: {} has confidence: {}", "content"),
        ("WorkPhones", "Work phone number: {} has confidence: {}", "content"),
        ("OtherPhones", "Other phone number: {} has confidence: {}", "value"),
    )

    for idx, business_card in enumerate(business_cards.documents):
        print("--------Analyzing business card #{}--------".format(idx + 1))

        contact_names = business_card.fields.get("ContactNames")
        if contact_names:
            for contact_name in contact_names.value:
                # Use .get() so a contact with only one name part does not
                # raise KeyError, matching how every other field is read.
                first_name = contact_name.value.get("FirstName")
                if first_name:
                    print(
                        "Contact First Name: {} has confidence: {}".format(
                            first_name.value, first_name.confidence
                        )
                    )
                last_name = contact_name.value.get("LastName")
                if last_name:
                    print(
                        "Contact Last Name: {} has confidence: {}".format(
                            last_name.value, last_name.confidence
                        )
                    )

        for field_key, template, attribute in list_fields:
            field = business_card.fields.get(field_key)
            if field:
                for entry in field.value:
                    print(template.format(getattr(entry, attribute), entry.confidence))
def analyze_layout():
    """Analyze the sample selection-mark form with ``prebuilt-layout`` and print its layout.

    Prints whether each style is handwritten, then every page's dimensions,
    lines, words and selection marks, and finally every table with its
    location and cells. The endpoint and key come from
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/form_selection_mark.png",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(sample_path, "rb") as stream:
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
    result = poller.result()

    for style in result.styles:
        if style.is_handwritten:
            handwriting = "handwritten"
        else:
            handwriting = "no handwritten"
        print("Document contains {} content".format(handwriting))

    for page in result.pages:
        print("----Analyzing layout from page #{}----".format(page.page_number))
        page_summary = "Page has width: {} and height: {}, measured with unit: {}"
        print(page_summary.format(page.width, page.height, page.unit))

        for line_number, line in enumerate(page.lines):
            line_words = line.get_words()
            line_summary = "...Line # {} has word count {} and text '{}' within bounding box '{}'"
            print(
                line_summary.format(
                    line_number,
                    len(line_words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in line_words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for mark in page.selection_marks:
            mark_summary = "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}"
            print(
                mark_summary.format(
                    mark.state,
                    format_bounding_box(mark.bounding_box),
                    mark.confidence,
                )
            )

    for table_number, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_number, table.row_count, table.column_count
            )
        )
        for table_region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_number,
                    table_region.page_number,
                    format_bounding_box(table_region.bounding_box),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index, cell.column_index, cell.content
                )
            )
            for cell_region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        cell_region.page_number,
                        format_bounding_box(cell_region.bounding_box),
                    )
                )

    print("----------------------------------------")
def analyze_invoice():
    """Analyze the sample invoice with ``prebuilt-invoice`` and print its fields.

    Each field is printed with its value and confidence, and only when it is
    present. The "Invoice items:" heading is always printed; the line items
    follow only when the invoice has an ``Items`` field. The endpoint and key
    come from ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY``.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/sample_invoice.jpg",
        )
    )
    # [START analyze_invoices]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-invoice", document=f, locale="en-US"
        )
    invoices = poller.result()

    def print_fields(mapping, pairs):
        # Print value and confidence of every listed field that is present.
        for field_key, template in pairs:
            entry = mapping.get(field_key)
            if entry:
                print(template.format(entry.value, entry.confidence))

    # Fields printed before the line items, in output order.
    header_fields = (
        ("VendorName", "Vendor Name: {} has confidence: {}"),
        ("VendorAddress", "Vendor Address: {} has confidence: {}"),
        ("VendorAddressRecipient", "Vendor Address Recipient: {} has confidence: {}"),
        ("CustomerName", "Customer Name: {} has confidence: {}"),
        ("CustomerId", "Customer Id: {} has confidence: {}"),
        ("CustomerAddress", "Customer Address: {} has confidence: {}"),
        ("CustomerAddressRecipient", "Customer Address Recipient: {} has confidence: {}"),
        ("InvoiceId", "Invoice Id: {} has confidence: {}"),
        ("InvoiceDate", "Invoice Date: {} has confidence: {}"),
        ("InvoiceTotal", "Invoice Total: {} has confidence: {}"),
        ("DueDate", "Due Date: {} has confidence: {}"),
        ("PurchaseOrder", "Purchase Order: {} has confidence: {}"),
        ("BillingAddress", "Billing Address: {} has confidence: {}"),
        ("BillingAddressRecipient", "Billing Address Recipient: {} has confidence: {}"),
        ("ShippingAddress", "Shipping Address: {} has confidence: {}"),
        ("ShippingAddressRecipient", "Shipping Address Recipient: {} has confidence: {}"),
    )
    # Sub-fields printed for each line item.
    item_fields = (
        ("Description", "......Description: {} has confidence: {}"),
        ("Quantity", "......Quantity: {} has confidence: {}"),
        ("Unit", "......Unit: {} has confidence: {}"),
        ("UnitPrice", "......Unit Price: {} has confidence: {}"),
        ("ProductCode", "......Product Code: {} has confidence: {}"),
        ("Date", "......Date: {} has confidence: {}"),
        ("Tax", "......Tax: {} has confidence: {}"),
        ("Amount", "......Amount: {} has confidence: {}"),
    )
    # Fields printed after the line items, in output order.
    summary_fields = (
        ("SubTotal", "Subtotal: {} has confidence: {}"),
        ("TotalTax", "Total Tax: {} has confidence: {}"),
        ("PreviousUnpaidBalance", "Previous Unpaid Balance: {} has confidence: {}"),
        ("AmountDue", "Amount Due: {} has confidence: {}"),
        ("ServiceStartDate", "Service Start Date: {} has confidence: {}"),
        ("ServiceEndDate", "Service End Date: {} has confidence: {}"),
        ("ServiceAddress", "Service Address: {} has confidence: {}"),
        ("ServiceAddressRecipient", "Service Address Recipient: {} has confidence: {}"),
        ("RemittanceAddress", "Remittance Address: {} has confidence: {}"),
        ("RemittanceAddressRecipient", "Remittance Address Recipient: {} has confidence: {}"),
    )

    for idx, invoice in enumerate(invoices.documents):
        print("--------Recognizing invoice #{}--------".format(idx + 1))
        print_fields(invoice.fields, header_fields)

        print("Invoice items:")
        # An invoice without an "Items" field used to raise AttributeError on
        # ``None.value``; treat it as having no line items instead. The loop
        # variable no longer shadows the outer ``idx``.
        items = invoice.fields.get("Items")
        for item_idx, item in enumerate(items.value if items else []):
            print("...Item #{}".format(item_idx + 1))
            print_fields(item.value, item_fields)

        print_fields(invoice.fields, summary_fields)
def analyze_receipts():
    """Run the prebuilt receipt model on the sample receipt and print the results.

    The service endpoint and key are read from the
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``
    environment variables. For every document returned, the receipt type,
    merchant, transaction date, line items and totals are printed together
    with their confidence; a field is skipped when the lookup yields nothing
    (or a falsy value).
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/receipt/contoso-allinone.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=credential
    )
    with open(path_to_sample_documents, "rb") as sample_file:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-receipt", document=sample_file, locale="en-US"
        )
    receipts = poller.result()

    # (field name, output template) pairs, listed in the order they are printed.
    leading_fields = (
        ("MerchantName", "Merchant Name: {} has confidence: {}"),
        ("TransactionDate", "Transaction Date: {} has confidence: {}"),
    )
    per_item_fields = (
        ("Name", "......Item Name: {} has confidence: {}"),
        ("Quantity", "......Item Quantity: {} has confidence: {}"),
        ("Price", "......Individual Item Price: {} has confidence: {}"),
        ("TotalPrice", "......Total Item Price: {} has confidence: {}"),
    )
    trailing_fields = (
        ("Subtotal", "Subtotal: {} has confidence: {}"),
        ("Tax", "Tax: {} has confidence: {}"),
        ("Tip", "Tip: {} has confidence: {}"),
        ("Total", "Total: {} has confidence: {}"),
    )

    def print_present(field_map, specs):
        # Look up each field in turn and print it only when it is present.
        for field_name, template in specs:
            field = field_map.get(field_name)
            if field:
                print(template.format(field.value, field.confidence))

    for receipt_number, receipt in enumerate(receipts.documents, start=1):
        print("--------Recognizing receipt #{}--------".format(receipt_number))
        print("Receipt type: {}".format(receipt.doc_type or "N/A"))
        print_present(receipt.fields, leading_fields)

        items_field = receipt.fields.get("Items")
        if items_field:
            print("Receipt items:")
            for item_number, item in enumerate(items_field.value, start=1):
                print("...Item #{}".format(item_number))
                print_present(item.value, per_item_fields)

        print_present(receipt.fields, trailing_fields)
        print("--------------------------------------")
def get_elements_with_spans():
    """Show how spans relate tables to lines and entities to text styles.

    Submits the sample form ``Form_1.jpg`` to the ``prebuilt-document`` model
    using the endpoint and key from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables. For each table it
    prints the table size, the pages it appears on and the lines returned by
    ``get_lines``; for each entity it prints the handwritten styles returned
    by ``get_styles``. Those helpers and ``get_page`` are defined outside this
    function.
    """
    sample_location = os.path.join(
        os.path.abspath(__file__),
        "..",
        "..",
        "./sample_forms/forms/Form_1.jpg",
    )
    path_to_sample_documents = os.path.abspath(sample_location)

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=credential
    )
    with open(path_to_sample_documents, "rb") as sample_file:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=sample_file
        )
    result = poller.result()

    # Searching for the lines of a particular element by using spans.
    # DocumentTable is used here, but any element with a `spans` or `span`
    # field can be used to search for related elements, such as lines.
    # For an example that searches for words (which have a `span` field), see
    # `sample_get_words_on_document_line.py` under the samples v3.2-beta directory.
    for table_idx, table in enumerate(result.tables):
        table_summary = "Table # {} has {} rows and {} columns".format(
            table_idx, table.row_count, table.column_count
        )
        print(table_summary)

        table_lines = []
        for region in table.bounding_regions:
            page_number = region.page_number
            print(
                "Table # {} location on page: {}".format(table_idx, page_number)
            )
            page = get_page(page_number, result.pages)
            table_lines += get_lines(table.spans, page)

        print("Found # {} lines in the table".format(len(table_lines)))
        for table_line in table_lines:
            print(
                "...Line '{}' is within bounding box: '{}'".format(
                    table_line.content, table_line.bounding_box
                )
            )

    # Searching for the style of a particular element by using spans.
    # DocumentEntity is used here, but any element with a `spans` or `span`
    # field can be used to search for document text style.
    for entity in result.entities:
        entity_styles = get_styles(entity.spans, result.styles)
        print(
            "Found entity '{}' of type '{}' with style:".format(
                entity.content, entity.category
            )
        )
        if not entity_styles:
            print("...no handwritten text found")
        for entity_style in entity_styles:
            if not entity_style.is_handwritten:
                continue
            print(
                "...handwritten with confidence {}".format(
                    entity_style.confidence
                )
            )

    print("----------------------------------------")
def analyze_general_documents():
    """Run the prebuilt general document model on a sample form and print the results.

    Submits ``form_selection_mark.png`` to the ``prebuilt-document`` model
    using the endpoint and key from the ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and
    ``AZURE_FORM_RECOGNIZER_KEY`` environment variables, then prints, in order:
    handwritten content, key-value pairs, entities, per-page lines, words and
    selection marks, and finally tables with their cells. Bounding regions and
    boxes are rendered with ``format_bounding_region`` and
    ``format_bounding_box``, which are defined outside this function.
    """
    sample_location = os.path.join(
        os.path.abspath(__file__),
        "..",
        "..",
        "./sample_forms/forms/form_selection_mark.png",
    )
    path_to_sample_documents = os.path.abspath(sample_location)

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    credential = AzureKeyCredential(key)
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=credential
    )
    with open(path_to_sample_documents, "rb") as sample_file:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=sample_file
        )
    result = poller.result()

    # Handwritten content: slice the document text by each span of the style.
    for style in result.styles:
        if not style.is_handwritten:
            continue
        print("Document contains handwritten content: ")
        handwritten_parts = [
            result.content[span.offset:span.offset + span.length]
            for span in style.spans
        ]
        print(",".join(handwritten_parts))

    print("----Key-value pairs found in document----")
    for kv_pair in result.key_value_pairs:
        key_element = kv_pair.key
        if key_element:
            print(
                "Key '{}' found within '{}' bounding regions".format(
                    key_element.content,
                    format_bounding_region(key_element.bounding_regions),
                )
            )
        value_element = kv_pair.value
        if value_element:
            print(
                "Value '{}' found within '{}' bounding regions\n".format(
                    value_element.content,
                    format_bounding_region(value_element.bounding_regions),
                )
            )

    print("----Entities found in document----")
    for entity in result.entities:
        print(
            "Entity of category '{}' with sub-category '{}'".format(
                entity.category, entity.sub_category
            )
        )
        print("...has content '{}'".format(entity.content))
        entity_regions = format_bounding_region(entity.bounding_regions)
        print("...within '{}' bounding regions".format(entity_regions))
        print("...with confidence {}\n".format(entity.confidence))

    for page in result.pages:
        print(
            "----Analyzing document from page #{}----".format(page.page_number)
        )
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            line_words = line.get_words()
            print(
                "...Line # {} has {} words and text '{}' within bounding box '{}'".format(
                    line_idx,
                    len(line_words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in line_words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for mark in page.selection_marks:
            print(
                "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    mark.state,
                    format_bounding_box(mark.bounding_box),
                    mark.confidence,
                )
            )

    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for table_region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    table_region.page_number,
                    format_bounding_box(table_region.bounding_box),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index, cell.column_index, cell.content
                )
            )
            for cell_region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'\n".format(
                        cell_region.page_number,
                        format_bounding_box(cell_region.bounding_box),
                    )
                )
    print("----------------------------------------")