    def test_tokenization(self):
        """
        Tests whether the elasticsearch analyzer yields the right tokens for the
        German analyzer. Check the comments in mainapp.documents.index for more details
        """
        tokenizations = {
            "die": [],
            "hunde": ["hunde", "hund"],
            "wi-fi": ["wi", "fi"],
            "Feuerwehr": ["feuerwehr"],  # Would ideally split the words
            "oktopoden": ["oktopoden", "oktopod"],
            "Äpfel": ["äpfel", "apfel"],
            "ging": ["ging"],
            "schwierigste": ["schwierigste", "schwierig"],
            "1234/89": ["1234", "89"],  # Would be better if it included "1234/89"
        }

        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst-test-tokenization")
        if not elastic_index.exists():
            elastic_index.create()
        # The index has to be closed while the analyzer is registered
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word, expected_tokens in tokenizations.items():
            analysis = elastic_index.analyze(
                body={"analyzer": "text_analyzer", "text": word}
            )
            actual_tokens = [i["token"] for i in analysis["tokens"]]
            self.assertEqual(expected_tokens, actual_tokens, "Word was {}".format(word))
    def analyze(self, text: str) -> Dict[str, List[Dict]]:
        """Shows what elasticsearch does with the tokens"""
        elastic_index_file = Index(settings.ELASTICSEARCH_PREFIX + "-file")
        elastic_index_file.analyzer(autocomplete_analyzer)
        elastic_index_file.analyzer(text_analyzer)
        return elastic_index_file.analyze(
            body={"analyzer": "text_analyzer", "text": text}
        )
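# For reference, a minimal sketch of what an analyzer like the one returned by
# get_text_analyzer("german") could look like when built with elasticsearch-dsl.
# This is an illustrative assumption, not the actual definition from
# mainapp.documents.index: a keyword_repeat + light German stemmer chain would
# explain token pairs such as ["hunde", "hund"] and the empty result for the
# stop word "die" in the test above.
from elasticsearch_dsl import analyzer, token_filter

german_stop = token_filter("german_stop", type="stop", stopwords="_german_")
german_stemmer = token_filter("german_stemmer", type="stemmer", language="light_german")

text_analyzer_sketch = analyzer(
    "text_analyzer",
    tokenizer="standard",
    filter=[
        "lowercase",
        german_stop,
        "keyword_repeat",     # keep the original token next to its stemmed form
        german_stemmer,       # light_german also folds umlauts, e.g. "äpfel" -> "apfel"
        "remove_duplicates",  # drop the copy when stemming changes nothing, e.g. "ging"
    ],
)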