def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned():
    """Ad-hoc deny-list recognizers supplied in the request should produce
    their custom entities (MR_TITLE) alongside the built-in PERSON and
    US_DRIVER_LICENSE detections; a non-matching recognizer (MS_TITLE)
    contributes nothing."""
    payload = r"""
    {
        "text": "Mr. John Smith's drivers license is AC432223",
        "language": "en",
        "ad_hoc_recognizers": [
            {
                "name": "Mr. Recognizer",
                "supported_language": "en",
                "deny_list": ["Mr", "Mr.", "Mister"],
                "supported_entity": "MR_TITLE"
            },
            {
                "name": "Ms. Recognizer",
                "supported_language": "en",
                "deny_list": ["Ms", "Ms.", "Miss", "Mrs", "Mrs."],
                "supported_entity": "MS_TITLE"
            }
        ]
    }
    """
    status, body = analyze(payload)
    expected = """
    [
        {"entity_type": "PERSON", "start": 4, "end": 14, "score": 0.85,
         "analysis_explanation": null},
        {"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44,
         "score": 0.6499999999999999, "analysis_explanation": null},
        {"entity_type": "MR_TITLE", "start": 0, "end": 3, "score": 1.0,
         "analysis_explanation": null}
    ]
    """
    assert status == 200
    assert equal_json_strings(expected, body)
def test_given_ad_hoc_pattern_recognizer_context_raises_confidence():
    """A weak ad-hoc pattern (score 0.01) should be boosted by its context
    words ("zip", "code") to the expected final score of 0.4."""
    payload = r"""
    {
        "text": "John Smith drivers license is AC432223. Zip code: 10023",
        "language": "en",
        "ad_hoc_recognizers": [
            {
                "name": "Zip code Recognizer",
                "supported_language": "en",
                "patterns": [
                    {
                        "name": "zip code (weak)",
                        "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)",
                        "score": 0.01
                    }
                ],
                "context": ["zip", "code"],
                "supported_entity": "ZIP"
            }
        ]
    }
    """
    status, body = analyze(payload)
    expected = """
    [
        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
         "analysis_explanation": null},
        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38,
         "score": 0.6499999999999999, "analysis_explanation": null},
        {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.4,
         "analysis_explanation": null}
    ]
    """
    assert status == 200
    assert equal_json_strings(expected, body)
def test_given_wrong_ad_hoc_json_exception_is_given():
    """A pattern object with wrong field names ("type"/"bebex"/"confidence"
    instead of "name"/"regex"/"score") should be rejected with HTTP 400 and
    a descriptive parse error."""
    bad_payload = r"""
    {
        "text": "John Smith drivers license is AC432223. Zip code: 10023",
        "language": "en",
        "ad_hoc_recognizers": [
            {
                "name": "Zip code Recognizer",
                "supported_language": "en",
                "patterns": [
                    {
                        "type": "zip code (weak)",
                        "bebex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)",
                        "confidence": 0.01
                    }
                ],
                "supported_entity": "ZIP"
            }
        ]
    }
    """
    status, body = analyze(bad_payload)
    expected = """
    {
        "error": "Failed to parse /analyze request for AnalyzerEngine.analyze(). __init__() got an unexpected keyword argument \'type\'"
    }
    """
    # Content is checked before the status code, matching the original order.
    assert equal_json_strings(expected, body)
    assert status == 400
def test_given_decision_process_enabled_for_analyze_input_then_return_response_with_decision_process():
    """With "return_decision_process": true, every result should carry a full
    analysis_explanation object (recognizer, pattern, scores, context info)."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223",
        "language": "en",
        "return_decision_process": true
    }
    """
    status, body = analyze(payload)
    # The US_DRIVER_LICENSE pattern below is the server's own regex, echoed
    # back verbatim in the explanation — it must match byte-for-byte.
    expected = """
    [
        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
         "analysis_explanation": {
             "recognizer": "SpacyRecognizer",
             "pattern_name": null,
             "pattern": null,
             "original_score": 0.85,
             "score": 0.85,
             "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition",
             "score_context_improvement": 0,
             "supportive_context_word": "",
             "validation_result": null
         }
        },
        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38,
         "score": 0.6499999999999999,
         "analysis_explanation": {
             "recognizer": "UsLicenseRecognizer",
             "pattern_name": "Driver License - Alphanumeric (weak)",
             "pattern": "\\\\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\\\\b",
             "original_score": 0.3,
             "score": 0.6499999999999999,
             "textual_explanation": null,
             "score_context_improvement": 0.3499999999999999,
             "supportive_context_word": "driver",
             "validation_result": null
         }
        }
    ]
    """
    assert status == 200
    assert equal_json_strings(expected, body)
def test_given_no_analyze_text_input_then_return_error():
    """An empty request body (no "text" key) should yield HTTP 500 with a
    "No text provided" error payload."""
    status, body = analyze("{}")
    expected = """
    {"error": "No text provided"}
    """
    assert status == 500
    assert equal_json_strings(expected, body)
def test_given_a_trace_true_analyze_input_then_return_normal_response():
    """Enabling tracing ("trace": "1") must not break the request — a normal
    200 response is still returned."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223",
        "language": "en",
        "trace": "1"
    }
    """
    status, _ = analyze(payload)
    assert status == 200
def _read_normalized_text(path):
    """Read a UTF-8 text file and collapse it into one line: split on
    newlines, strip each row, and re-join with single spaces."""
    with open(path, encoding="utf-8") as f:
        rows = [row.strip() for row in f.read().split("\n")]
    return " ".join(rows)


def test_demo_website_text_returns_correct_anonymized_version():
    """End-to-end check: analyze the demo text, feed the results to the
    anonymizer, and compare against the pre-computed anonymized fixture.

    The read/strip/join normalization was previously duplicated inline for
    both fixture files; it is now factored into _read_normalized_text.
    """
    # Analyzer request info
    dir_path = Path(__file__).resolve().parent.parent
    text = _read_normalized_text(Path(dir_path, "resources", "demo.txt"))
    analyzer_request = {
        "text": text,
        "language": "en",
        # Threshold mirrors the demo website's default filtering.
        "score_threshold": 0.35,
    }

    # Call analyzer
    analyzer_status_code, analyzer_content = analyze(json.dumps(analyzer_request))
    analyzer_data = json.loads(analyzer_content)

    # Anonymizer request info: anonymize the same text using the analyzer's
    # entity spans.
    anonymizer_request = {
        "text": analyzer_request["text"],
        "analyzer_results": analyzer_data,
    }

    # Call anonymizer
    anonymizer_status_code, anonymizer_response = anonymize(
        json.dumps(anonymizer_request)
    )
    anonymizer_response_dict = json.loads(anonymizer_response)
    actual_anonymized_text = anonymizer_response_dict["text"]

    # Expected output, normalized the same way as the input text.
    expected_anonymized_text = _read_normalized_text(
        Path(dir_path, "resources", "demo_anonymized.txt")
    )

    # Assert equal
    assert expected_anonymized_text == actual_anonymized_text
def test_given_a_incorrect_analyze_language_input_then_return_error():
    """An unsupported language code ("zz") should yield HTTP 500 with a
    "no matching recognizers" error."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223",
        "language": "zz"
    }
    """
    status, body = analyze(payload)
    assert status == 500
    expected = """
    {"error": "No matching recognizers were found to serve the request."}
    """
    assert equal_json_strings(expected, body)
def test_given_analyze_text_no_language_input_then_return_error():
    """Omitting the "language" field should yield HTTP 500 with a
    "No language provided" error payload."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223"
    }
    """
    status, body = analyze(payload)
    expected = """
    {"error": "No language provided"}
    """
    assert status == 500
    assert equal_json_strings(expected, body)
def test_given_a_correct_analyze_input_then_return_full_response():
    """A minimal valid request should return both the PERSON and the
    US_DRIVER_LICENSE entities with their expected spans and scores."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223",
        "language": "en"
    }
    """
    status, body = analyze(payload)
    expected = """
    [
        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
         "analysis_explanation": null},
        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38,
         "score": 0.6499999999999999, "analysis_explanation": null}
    ]
    """
    assert status == 200
    assert equal_json_strings(expected, body)
def test_given_analyze_threshold_input_then_return_result_above_threshold():
    """With score_threshold 0.7, only PERSON (0.85) survives; the driver's
    license match (~0.65) is filtered out."""
    payload = """
    {
        "text": "John Smith drivers license is AC432223",
        "language": "en",
        "score_threshold": 0.7
    }
    """
    status, body = analyze(payload)
    expected = """
    [
        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
         "analysis_explanation": null}
    ]
    """
    assert status == 200
    assert equal_json_strings(expected, body)
def analyze_and_assert(analyzer_request, expected_response):
    """POST *analyzer_request* (a dict) to /analyze, assert a 200 response
    whose JSON body matches *expected_response*, and return the parsed
    result list for further use by the caller."""
    status, body = analyze(json.dumps(analyzer_request))
    assert status == 200
    assert equal_json_strings(expected_response, body)
    return json.loads(body)