def __init__(self, stopwords=[], punct_words=[], nlp_artifacts=None): self.stopwords = stopwords self.punct_words = punct_words if nlp_artifacts is None: self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") else: self.nlp_artifacts = nlp_artifacts
def loaded_analyzer_engine(loaded_registry, app_tracer):
    """Build an AnalyzerEngine wired with a mocked NLP engine.

    The mock engine carries empty English artifacts; decision-process
    logging is enabled so traces can be inspected in tests.
    """
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")
    nlp_mock = NlpEngineMock(
        stopwords=[], punct_words=[], nlp_artifacts=empty_artifacts
    )
    engine = AnalyzerEngine(
        loaded_registry,
        nlp_mock,
        app_tracer=app_tracer,
        log_decision_process=True,
    )
    return engine
def doc_to_nlp_artifact(self, doc, language):
    """Convert a processed spaCy doc into an NlpArtifacts bundle.

    Collects token texts, lemmas, and character offsets in a single pass
    over the doc, alongside the doc's named entities.
    """
    token_texts = []
    token_lemmas = []
    token_offsets = []
    for token in doc:
        token_texts.append(token.text)
        token_lemmas.append(token.lemma_)
        token_offsets.append(token.idx)
    return NlpArtifacts(
        entities=doc.ents,
        tokens=token_texts,
        tokens_indices=token_offsets,
        lemmas=token_lemmas,
        nlp_engine=self,
        language=language,
    )
def __init__(self, *args, **kwargs):
    """Set up shared test fixtures: registry, tracer, and a loaded engine."""
    super(TestAnalyzerEngine, self).__init__(*args, **kwargs)
    self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")
    self.app_tracer = AppTracerMock(enable_interpretability=True)
    # Engine under test uses a mocked NLP engine so no real model loads.
    nlp_mock = MockNlpEngine(
        stopwords=[], punct_words=[], nlp_artifacts=empty_artifacts
    )
    self.loaded_analyzer_engine = AnalyzerEngine(
        self.loaded_registry,
        nlp_mock,
        app_tracer=self.app_tracer,
        enable_trace_pii=True,
    )
    self.unit_test_guid = "00000000-0000-0000-0000-000000000000"
def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
    """Wrap a processed spaCy Doc as NlpArtifacts.

    NOTE(review): the Doc object itself is passed as ``tokens`` (a Doc is
    iterable over its tokens), rather than a list of token strings —
    presumably intentional so downstream code can access full token
    objects; confirm against NlpArtifacts consumers.
    """
    token_lemmas = [tok.lemma_ for tok in doc]
    token_starts = [tok.idx for tok in doc]
    return NlpArtifacts(
        entities=doc.ents,
        tokens=doc,
        tokens_indices=token_starts,
        lemmas=token_lemmas,
        nlp_engine=self,
        language=language,
    )
def test_text_with_context_improves_score(self):
    """Real NLP artifacts (with context) must yield strictly higher scores
    than empty artifacts for every context sentence."""
    nlp_engine = TESTS_NLP_ENGINE
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")
    for item in self.context_sentences:
        text, recognizer, entities = item[0], item[1], item[2]
        real_artifacts = nlp_engine.process_text(text, "en")
        no_context = recognizer.analyze(text, entities, empty_artifacts)
        with_context = recognizer.analyze(text, entities, real_artifacts)
        # Same matches either way; context only affects confidence.
        assert len(no_context) == len(with_context)
        for base, boosted in zip(no_context, with_context):
            assert base.score < boosted.score
def test_context_custom_recognizer(self):
    """A custom PatternRecognizer is also enhanced by context.

    This additionally covers a specific edge case: the pattern includes a
    preceding space (' rocket'), which misaligns the regex match against
    the tokenized text (the token is just 'rocket'). The context-window
    lookup must handle that misalignment to find the correct window.
    """
    nlp_engine = TESTS_NLP_ENGINE
    empty_artifacts = NlpArtifacts([], [], [], [], None, "en")
    rocket_recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        patterns=[Pattern("rocketpattern", "\\s+(rocket)", 0.3)],
    )
    text = "hi, this is a cool ROCKET"
    entities = ["ROCKET"]
    real_artifacts = nlp_engine.process_text(text, "en")
    no_context = rocket_recognizer.analyze(text, entities, empty_artifacts)
    with_context = rocket_recognizer.analyze(text, entities, real_artifacts)
    assert len(no_context) == len(with_context)
    for base, boosted in zip(no_context, with_context):
        assert base.score < boosted.score
def mock_nlp_artifacts():
    """Return empty English NlpArtifacts for tests that need no real NLP."""
    return NlpArtifacts(
        entities=[],
        tokens=[],
        tokens_indices=[],
        lemmas=[],
        nlp_engine=None,
        language="en",
    )