def test_active_directory_auth(self):
    """Analyze a receipt URL with a client authenticated by an OAuth token.

    The test passes when the long-running operation yields a non-None result.
    """
    credential = self.generate_oauth_token()
    client = DocumentAnalysisClient(self.get_oauth_endpoint(), credential)
    analysis = client.begin_analyze_document_from_url(
        "prebuilt-receipt", self.receipt_url_jpg
    ).result()
    assert analysis is not None
def test_receipt_url_auth_bad_key(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Expect ClientAuthenticationError when the client uses the key "xxxx"."""
    bad_credential = AzureKeyCredential("xxxx")
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, bad_credential)
    with self.assertRaises(ClientAuthenticationError):
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_mock_quota_exceeded_403(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Check that a mocked 403 quota response surfaces as HttpResponseError.

    A mock transport returns a canned 403 response for every request; the
    raised error must carry the status code and the service error message.
    """
    quota_message = (
        "Out of call volume quota for FormRecognizer F0 pricing tier. "
        "Please retry after 1 day. To increase your call volume switch to a paid tier."
    )
    error_body = json.dumps({"error": {"code": "403", "message": quota_message}})

    response = mock.Mock(
        status_code=403,
        headers={"Retry-After": 186688, "Content-Type": "application/json"},
        reason="Bad Request",
    )
    response.text = lambda encoding=None: error_body
    response.content_type = "application/json"

    # Every request sent through this transport gets the canned response.
    transport = mock.Mock(send=lambda request, **kwargs: response)
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential, transport=transport)

    with pytest.raises(HttpResponseError) as e:
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)

    raised = e.value
    assert raised.status_code == 403
    assert raised.error.message == quota_message
def test_analyze_document_none_model_id(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Expect ValueError when begin_analyze_document is called with model=None."""
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential)
    with self.assertRaises(ValueError):
        client.begin_analyze_document(model=None, document=b"xx")
def analyze_identity_documents():
    """Analyze the sample license image with the "prebuilt-idDocument" model.

    The endpoint and key are read from the AZURE_FORM_RECOGNIZER_ENDPOINT and
    AZURE_FORM_RECOGNIZER_KEY environment variables. For every document in
    the result, each field that is present is printed with its value and
    confidence.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/id_documents/license.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(sample_path, "rb") as document_stream:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-idDocument", document=document_stream
        )
    id_documents = poller.result()

    # (field key, output template) pairs, in the order they are reported.
    reported_fields = (
        ("FirstName", "First Name: {} has confidence: {}"),
        ("LastName", "Last Name: {} has confidence: {}"),
        ("DocumentNumber", "Document Number: {} has confidence: {}"),
        ("DateOfBirth", "Date of Birth: {} has confidence: {}"),
        ("DateOfExpiration", "Date of Expiration: {} has confidence: {}"),
        ("Sex", "Sex: {} has confidence: {}"),
        ("Address", "Address: {} has confidence: {}"),
        ("CountryRegion", "Country/Region: {} has confidence: {}"),
        ("Region", "Region: {} has confidence: {}"),
    )

    for number, id_document in enumerate(id_documents.documents, 1):
        print("--------Recognizing ID document #{}--------".format(number))
        for field_key, template in reported_fields:
            field = id_document.fields.get(field_key)
            if field:
                print(template.format(field.value, field.confidence))
def analyze_read():
    """Run the "prebuilt-read" model on the sample form and print the result.

    Prints the detected languages, then for each page its dimensions, its
    lines (with the words on each line) and its selection marks. The endpoint
    and key come from the AZURE_FORM_RECOGNIZER_ENDPOINT and
    AZURE_FORM_RECOGNIZER_KEY environment variables.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(sample_path, "rb") as document_stream:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-read", document=document_stream
        )
    result = poller.result()

    print("----Languages detected in the document----")
    for detected in result.languages:
        print(
            "Language code: '{}' with confidence {}".format(
                detected.language_code, detected.confidence
            )
        )

    for page in result.pages:
        print("----Analyzing document from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            words = line.get_words()
            line_report = "...Line # {} has {} words and text '{}' within bounding box '{}'"
            print(
                line_report.format(
                    line_idx,
                    len(words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for mark in page.selection_marks:
            mark_report = "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}"
            print(
                mark_report.format(
                    mark.state,
                    format_bounding_box(mark.bounding_box),
                    mark.confidence,
                )
            )

    print("----------------------------------------")
def test_receipt_url_auth_bad_key(self, formrecognizer_test_endpoint, **kwargs):
    """Expect ClientAuthenticationError when the client uses the key "xxxx"."""
    # this can be reverted to set_bodiless_matcher() after tests are re-recorded and don't contain these headers
    ignored_headers = "Authorization,Content-Length,x-ms-client-request-id,x-ms-request-id"
    set_custom_default_matcher(compare_bodies=False, excluded_headers=ignored_headers)

    bad_credential = AzureKeyCredential("xxxx")
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, bad_credential)
    with pytest.raises(ClientAuthenticationError):
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_authentication_bad_key(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Expect ClientAuthenticationError when analyzing bytes with the key "xxxx".

    Returns an empty dict once the expected error has been observed.
    """
    bad_credential = AzureKeyCredential("xxxx")
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, bad_credential)
    with pytest.raises(ClientAuthenticationError):
        client.begin_analyze_document("prebuilt-receipt", b"xx")

    return {}
def test_document_analysis_none_model(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Expect ValueError when begin_analyze_document_from_url gets model=None."""
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential)
    with self.assertRaises(ValueError):
        client.begin_analyze_document_from_url(model=None, document_url="https://badurl.jpg")
def test_receipt_url_auth_bad_key(self, formrecognizer_test_endpoint, **kwargs):
    """Expect ClientAuthenticationError when the client uses the key "xxxx"."""
    set_bodiless_matcher()

    bad_credential = AzureKeyCredential("xxxx")
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, bad_credential)
    with pytest.raises(ClientAuthenticationError):
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_receipt_url_bad_endpoint(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Expect ServiceRequestError when the client targets "http://notreal.azure.com"."""
    unreachable_endpoint = "http://notreal.azure.com"
    with self.assertRaises(ServiceRequestError):
        credential = AzureKeyCredential(formrecognizer_test_api_key)
        client = DocumentAnalysisClient(unreachable_endpoint, credential)
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_receipt_url_bad_endpoint(self, formrecognizer_test_api_key, **kwargs):
    """Expect ServiceRequestError when the client targets "http://notreal.azure.com"."""
    # this can be reverted to set_bodiless_matcher() after tests are re-recorded and don't contain these headers
    ignored_headers = "Authorization,Content-Length,x-ms-client-request-id,x-ms-request-id"
    set_custom_default_matcher(compare_bodies=False, excluded_headers=ignored_headers)

    unreachable_endpoint = "http://notreal.azure.com"
    with pytest.raises(ServiceRequestError):
        credential = AzureKeyCredential(formrecognizer_test_api_key)
        client = DocumentAnalysisClient(unreachable_endpoint, credential)
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_receipt_bad_endpoint(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Expect ServiceRequestError when sending receipt bytes to "http://notreal.azure.com"."""
    with open(self.receipt_jpg, "rb") as receipt_file:
        receipt_bytes = receipt_file.read()

    unreachable_endpoint = "http://notreal.azure.com"
    with pytest.raises(ServiceRequestError):
        credential = AzureKeyCredential(formrecognizer_test_api_key)
        client = DocumentAnalysisClient(unreachable_endpoint, credential)
        client.begin_analyze_document("prebuilt-receipt", receipt_bytes)
def test_analyze_document_empty_model_id(self, **kwargs):
    """Expect ValueError when begin_analyze_document is called with model=""."""
    endpoint = kwargs.pop("formrecognizer_test_endpoint")
    api_key = kwargs.pop("formrecognizer_test_api_key")

    client = DocumentAnalysisClient(endpoint, AzureKeyCredential(api_key))
    with pytest.raises(ValueError):
        client.begin_analyze_document(model="", document=b"xx")
def test_receipt_url_bad_endpoint(self, formrecognizer_test_api_key, **kwargs):
    """Expect ServiceRequestError when the client targets "http://notreal.azure.com"."""
    set_bodiless_matcher()

    unreachable_endpoint = "http://notreal.azure.com"
    with pytest.raises(ServiceRequestError):
        credential = AzureKeyCredential(formrecognizer_test_api_key)
        client = DocumentAnalysisClient(unreachable_endpoint, credential)
        client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
def test_document_analysis_empty_model_id(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Expect ValueError when begin_analyze_document_from_url gets model=""."""
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential)
    with pytest.raises(ValueError):
        client.begin_analyze_document_from_url(model="", document_url="https://badurl.jpg")
def test_document_analysis_none_model(self, **kwargs):
    """Expect ValueError when begin_analyze_document_from_url gets model=None."""
    endpoint = kwargs.pop("formrecognizer_test_endpoint")
    api_key = kwargs.pop("formrecognizer_test_api_key")

    client = DocumentAnalysisClient(endpoint, AzureKeyCredential(api_key))
    with pytest.raises(ValueError):
        client.begin_analyze_document_from_url(model=None, document_url="https://badurl.jpg")
def test_polling_interval(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Check client-level and per-call polling_interval values.

    The client is built with polling_interval=7. A call that passes
    polling_interval=6 must poll with 6; a following call without the keyword
    must poll with 7 again.
    """
    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential, polling_interval=7)
    self.assertEqual(client._client._config.polling_interval, 7)

    overridden = client.begin_analyze_document_from_url(
        "prebuilt-receipt", self.receipt_url_jpg, polling_interval=6
    )
    overridden.wait()
    self.assertEqual(overridden._polling_method._timeout, 6)

    defaulted = client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
    defaulted.wait()
    # goes back to client default
    self.assertEqual(defaulted._polling_method._timeout, 7)
def analyze_custom_documents(custom_model_id):
    """Analyze the sample form with a custom model and print the result.

    Prints every analyzed document with its fields, then the lines, words and
    selection marks of each page, then every table with the pages it appears
    on and its cells.

    The endpoint and key are read from the AZURE_FORM_RECOGNIZER_ENDPOINT and
    AZURE_FORM_RECOGNIZER_KEY environment variables.

    :param custom_model_id: Model ID used when the CUSTOM_BUILT_MODEL_ID
        environment variable is not set.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )
    # [START analyze_custom_documents]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]
    model_id = os.getenv("CUSTOM_BUILT_MODEL_ID", custom_model_id)

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Make sure your document's type is included in the list of document types the custom model can analyze
    with open(path_to_sample_documents, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            model=model_id, document=f
        )
    result = poller.result()

    for idx, document in enumerate(result.documents):
        print("--------Analyzing document #{}--------".format(idx + 1))
        print("Document has type {}".format(document.doc_type))
        print("Document has confidence {}".format(document.confidence))
        print("Document was analyzed by model with ID {}".format(result.model_id))
        for field in document.fields.values():
            # Fall back to the raw content when the field has no (truthy) value.
            field_value = field.value if field.value else field.content
            print(
                "......found field of type '{}' with value '{}' and with confidence {}".format(
                    field.value_type, field_value, field.confidence
                )
            )

    # iterate over tables, lines, and selection marks on each page
    for page in result.pages:
        print("\nLines found on page {}".format(page.page_number))
        for line in page.lines:
            print("...Line '{}'".format(line.content))
        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )
        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' and has a confidence of {}".format(
                    selection_mark.state, selection_mark.confidence
                )
            )

    for i, table in enumerate(result.tables):
        print("\nTable {} can be found on page:".format(i + 1))
        for region in table.bounding_regions:
            # Print the page number of the region; the table index was
            # previously printed here because the format string had a single
            # placeholder and the index was passed first.
            print("...{}".format(region.page_number))
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index, cell.column_index, cell.content
                )
            )
    print("-----------------------------------")
def authentication_with_api_key_credential_document_analysis_client():
    """Create a DocumentAnalysisClient from an API key and run a layout analysis.

    The endpoint and key are read from the AZURE_FORM_RECOGNIZER_ENDPOINT and
    AZURE_FORM_RECOGNIZER_KEY environment variables. The analyzed ``url`` is
    not defined in this function and must be available from the enclosing
    scope.
    """
    # [START create_da_client_with_key]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(key))
    # [END create_da_client_with_key]

    # Wait for the analysis to finish; the result itself is not used.
    layout_poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", url)
    layout_poller.result()
def test_polling_interval(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Check client-level and per-call polling_interval values.

    The client is built with polling_interval=7. A call that passes
    polling_interval=6 must poll with 6; a following call without the keyword
    must poll with 7 again.
    """
    # this can be reverted to set_bodiless_matcher() after tests are re-recorded and don't contain these headers
    ignored_headers = "Authorization,Content-Length,x-ms-client-request-id,x-ms-request-id"
    set_custom_default_matcher(compare_bodies=False, excluded_headers=ignored_headers)

    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential, polling_interval=7)
    assert client._client._config.polling_interval == 7

    overridden = client.begin_analyze_document_from_url(
        "prebuilt-receipt", self.receipt_url_jpg, polling_interval=6
    )
    overridden.wait()
    assert overridden._polling_method._timeout == 6

    defaulted = client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
    defaulted.wait()
    # goes back to client default
    assert defaulted._polling_method._timeout == 7
def analyze_document(endpoint, key, data):
    """Analyze the form referenced by *data* and shape the output by model.

    :param endpoint: Endpoint passed to DocumentAnalysisClient.
    :param key: API key, wrapped in an AzureKeyCredential.
    :param data: Mapping providing "recordId", "formUrl", "formSasToken" and
        "model"; the URL and SAS token are concatenated to form the document
        URL.
    :returns: A dict whose keys depend on the model: "tables"/"pages" for
        "prebuilt-layout"; "kvp"/"entities"/"tables"/"pages" for
        "prebuilt-document"; "fields"/"tables"/"pages" for the receipt,
        idDocument and invoice models; "kvp"/"tables"/"pages" otherwise.
    :raises KeyError: If one of the required keys is missing from *data*.
    """
    # The value is unused here, but the lookup still raises KeyError when the
    # key is absent.
    _record_id = data['recordId']
    document_url = data["formUrl"] + data["formSasToken"]
    model = data["model"]

    client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
    result = client.begin_analyze_document_from_url(model, document_url).result()

    if model == "prebuilt-layout":
        return {
            "tables": get_tables(result),
            "pages": get_pages(result),
        }

    if model == "prebuilt-document":
        return {
            "kvp": get_key_value_pairs(result),
            "entities": get_entities(result),
            "tables": get_tables(result),
            "pages": get_pages(result),
        }

    # The remaining models all report fields, tables and pages; only the key
    # under which the fields are stored differs.
    field_models = ("prebuilt-receipt", "prebuilt-idDocument", "prebuilt-invoice")
    fields_key = "fields" if model in field_models else "kvp"
    return {
        fields_key: get_fields(result),
        "tables": get_tables(result),
        "pages": get_pages(result),
    }
def authentication_with_azure_active_directory_document_analysis_client():
    # Builds a client from DefaultAzureCredential and runs a layout analysis
    # on ``url``, which is not defined in this function and must be available
    # from the enclosing scope.
    # [START create_da_client_with_aad]
    """DefaultAzureCredential will use the values from these environment
    variables: AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET
    """
    from azure.ai.formrecognizer import DocumentAnalysisClient
    from azure.identity import DefaultAzureCredential

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    credential = DefaultAzureCredential()

    document_analysis_client = DocumentAnalysisClient(endpoint, credential)
    # [END create_da_client_with_aad]

    # Wait for the analysis to finish; the result itself is not used.
    layout_poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", url)
    layout_poller.result()
def __init__(self, arguments):
    """Store the sample document URL and create the sync and async clients.

    The endpoint and API key are read from the FORMRECOGNIZER_TEST_ENDPOINT
    and FORMRECOGNIZER_TEST_API_KEY environment variables.
    """
    super().__init__(arguments)

    self.document_jpg_url = "https://raw.githubusercontent.com/Azure/azure-sdk-for-python/main/sdk/formrecognizer/azure-ai-formrecognizer/tests/sample_forms/forms/Form_1.jpg"

    # read test related env vars
    endpoint = os.environ["FORMRECOGNIZER_TEST_ENDPOINT"]
    account_key = os.environ["FORMRECOGNIZER_TEST_API_KEY"]

    # assign the clients that will be used in the perf tests
    self.service_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))
    self.async_service_client = AsyncDocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))
def get_words_on_document_line():
    """Analyze the sample form with "prebuilt-document" and print lines and words.

    For each page, prints its dimensions, then every line with its word count,
    text and bounding box, followed by each word on that line with its
    confidence. The endpoint and key come from the
    AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY environment
    variables.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(sample_path, "rb") as document_stream:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=document_stream
        )
    result = poller.result()

    for page_number, page in enumerate(result.pages, 1):
        print("----Analyzing lines and words from page #{}----".format(page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            words = line.get_words()
            line_report = "...Line # {} has word count {} and text '{}' within bounding box '{}'"
            print(
                line_report.format(
                    line_idx,
                    len(words),
                    line.content,
                    format_bounding_box(line.bounding_box),
                )
            )
            for word in words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

    print("----------------------------------------")
def test_polling_interval(self, formrecognizer_test_endpoint, formrecognizer_test_api_key, **kwargs):
    """Check client-level and per-call polling_interval values.

    The client is built with polling_interval=7. A call that passes
    polling_interval=6 must poll with 6; a following call without the keyword
    must poll with 7 again.
    """
    set_bodiless_matcher()

    credential = AzureKeyCredential(formrecognizer_test_api_key)
    client = DocumentAnalysisClient(formrecognizer_test_endpoint, credential, polling_interval=7)
    assert client._client._config.polling_interval == 7

    overridden = client.begin_analyze_document_from_url(
        "prebuilt-receipt", self.receipt_url_jpg, polling_interval=6
    )
    overridden.wait()
    assert overridden._polling_method._timeout == 6

    defaulted = client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
    defaulted.wait()
    # goes back to client default
    assert defaulted._polling_method._timeout == 7
def convert_to_and_from_dict():
    """Round-trip an AnalyzeResult through a dictionary and a JSON file.

    Analyzes the sample form with "prebuilt-document", converts the result to
    a dict, writes that dict to ``data.json``, rebuilds an AnalyzeResult from
    the dict and prints a few of its attributes. The endpoint and key come
    from the AZURE_FORM_RECOGNIZER_ENDPOINT and AZURE_FORM_RECOGNIZER_KEY
    environment variables.
    """
    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.serialization import AzureJSONEncoder
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    with open(sample_path, "rb") as document_stream:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=document_stream
        )
    result = poller.result()

    # convert the received model to a dictionary
    analyze_result_dict = result.to_dict()

    # save the dictionary as JSON content in a JSON file, use the AzureJSONEncoder
    # to help make types, such as dates, JSON serializable
    # NOTE: AzureJSONEncoder is only available with azure.core>=1.18.0.
    with open('data.json', 'w') as json_file:
        json.dump(analyze_result_dict, json_file, cls=AzureJSONEncoder)

    # convert the dictionary back to the original model
    restored = AnalyzeResult.from_dict(analyze_result_dict)

    # use the model as normal
    print("----Converted from dictionary AnalyzeResult----")
    print("Model ID: '{}'".format(restored.model_id))
    print("Number of pages analyzed {}".format(len(restored.pages)))
    print("API version used: {}".format(restored.api_version))

    print("----------------------------------------")
class AnalyzeDocumentRequestPreparation(PerfStressTest):
    """Perf test that starts a "prebuilt-document" analysis of a local JPG.

    Both the sync and async variants only begin the analysis and assert that
    a poller was returned; neither waits for the analysis result.
    """

    def __init__(self, arguments):
        """Load the sample document bytes and create the sync and async clients."""
        super().__init__(arguments)

        sample_path = os.path.abspath(
            os.path.join(
                os.path.abspath(__file__),
                "..",
                "./../sample_forms/forms/Form_1.jpg",
            )
        )
        with open(sample_path, "rb") as sample_file:
            self.document_jpg = sample_file.read()

        # read test related env vars
        endpoint = os.environ["FORMRECOGNIZER_TEST_ENDPOINT"]
        account_key = os.environ["FORMRECOGNIZER_TEST_API_KEY"]

        # assign the clients that will be used in the perf tests
        self.service_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))
        self.async_service_client = AsyncDocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))

    async def close(self):
        """Close the async client, then the sync client, then the base test."""
        await self.async_service_client.close()
        self.service_client.close()
        await super().close()

    def run_sync(self):
        """Begin one analysis with the synchronous client."""
        sync_poller = self.service_client.begin_analyze_document("prebuilt-document", self.document_jpg)
        assert sync_poller

    async def run_async(self):
        """Begin one analysis with the asynchronous client."""
        async_poller = await self.async_service_client.begin_analyze_document(
            "prebuilt-document", self.document_jpg
        )
        assert async_poller
def __init__(self, arguments):
    """Load the sample document bytes and create the sync and async clients.

    The endpoint and API key are read from the FORMRECOGNIZER_TEST_ENDPOINT
    and FORMRECOGNIZER_TEST_API_KEY environment variables.
    """
    super().__init__(arguments)

    sample_path = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "./../sample_forms/forms/Form_1.jpg",
        )
    )
    with open(sample_path, "rb") as sample_file:
        self.document_jpg = sample_file.read()

    # read test related env vars
    endpoint = os.environ["FORMRECOGNIZER_TEST_ENDPOINT"]
    account_key = os.environ["FORMRECOGNIZER_TEST_API_KEY"]

    # assign the clients that will be used in the perf tests
    self.service_client = DocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))
    self.async_service_client = AsyncDocumentAnalysisClient(endpoint, AzureKeyCredential(account_key))
def test_logging_info_dac_client(self, formrecognizer_test_endpoint, formrecognizer_test_api_key):
    """Check that INFO-level logging redacts only the API key header.

    A MockHandler is attached to the "azure" logger at INFO level while a
    receipt analysis runs. Every INFO record that mentions
    "Ocp-Apim-Subscription-Key" must contain "REDACTED"; every other INFO
    record must not.
    """
    client = DocumentAnalysisClient(
        formrecognizer_test_endpoint,
        AzureKeyCredential(formrecognizer_test_api_key),
    )
    mock_handler = MockHandler()

    logger = logging.getLogger("azure")
    previous_level = logger.level
    logger.addHandler(mock_handler)
    logger.setLevel(logging.INFO)
    try:
        poller = client.begin_analyze_document_from_url("prebuilt-receipt", self.receipt_url_jpg)
        poller.result()
    finally:
        # The "azure" logger is process-wide: detach the handler and restore
        # the level so this test does not affect tests that run after it.
        logger.removeHandler(mock_handler)
        logger.setLevel(previous_level)

    for record in mock_handler.messages:
        if record.levelname != "INFO":
            continue
        # not able to use json.loads here. At INFO level only API key should be REDACTED
        if "Ocp-Apim-Subscription-Key" in record.message:
            assert "REDACTED" in record.message
        else:
            assert "REDACTED" not in record.message