def test_tokenization(self):
    """
    Tests whether the elasticsearch analyzer yields the right tokens for the
    german analyzer. Check the comments in mainapp.documents.index for more details.
    """
    tokenizations = {
        "die": [],
        "hunde": ["hunde", "hund"],
        "wi-fi": ["wi", "fi"],
        "Feuerwehr": ["feuerwehr"],  # Would ideally split the words
        "oktopoden": ["oktopoden", "oktopod"],
        "Äpfel": ["äpfel", "apfel"],
        "ging": ["ging"],
        "schwierigste": ["schwierigste", "schwierig"],
        "1234/89": ["1234", "89"],  # Would be better if it included "1234/89"
    }

    text_analyzer = get_text_analyzer("german")
    elastic_index = Index("mst-test-tokenization")
    if not elastic_index.exists():
        elastic_index.create()
    elastic_index.close()
    elastic_index.analyzer(text_analyzer)
    elastic_index.save()
    elastic_index.open()
    elastic_index.flush()

    for word, expected_tokens in tokenizations.items():
        analysis = elastic_index.analyze(
            body={"analyzer": "text_analyzer", "text": word}
        )
        actual_tokens = [i["token"] for i in analysis["tokens"]]
        self.assertEqual(expected_tokens, actual_tokens, "Word was {}".format(word))
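# Hedged sketch (not from the original code): one way get_text_analyzer("german")
# could be built so the expected tokens above come out. The exact filter chain is
# an assumption; keyword_repeat keeps the unstemmed token and remove_duplicates
# drops it again when stemming changed nothing, which would explain pairs like
# ["hunde", "hund"] next to the single token ["ging"].
from elasticsearch_dsl import analyzer, token_filter


def get_text_analyzer(language: str):
    return analyzer(
        "text_analyzer",
        tokenizer="standard",
        filter=[
            "lowercase",
            token_filter(f"{language}_stop", type="stop", stopwords=f"_{language}_"),
            "keyword_repeat",  # emit the original token alongside the stemmed one
            token_filter(f"{language}_stemmer", type="stemmer", language=language),
            token_filter("unique_stem", type="remove_duplicates"),
        ],
    )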
def analyze(self, text: str) -> Dict[str, List[Dict]]:
    """Shows what elasticsearch does with the tokens"""
    elastic_index_file = Index(settings.ELASTICSEARCH_PREFIX + "-file")
    elastic_index_file.analyzer(autocomplete_analyzer)
    elastic_index_file.analyzer(text_analyzer)
    return elastic_index_file.analyze(
        body={"analyzer": "text_analyzer", "text": text}
    )
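# Hedged usage sketch (not from the original code): calling the analyze() helper,
# e.g. from a Django shell. The Debugger class name is hypothetical; the response
# shape ("tokens" entries with "token", "start_offset", "end_offset") is the
# standard Elasticsearch _analyze response.
result = Debugger().analyze("Fahrradwege in der Innenstadt")
for entry in result["tokens"]:
    print(entry["token"], entry["start_offset"], entry["end_offset"])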
# Create the analyzers
ru_analyzer = analyzer(
    'ru_analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', russian_stop_filter, russian_stemmer_filter],
)
en_analyzer = analyzer(
    'en_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        english_possessive_stemmer_filter,
        'lowercase',
        english_stop_filter,
        english_stemmer_filter,
    ],
)

# Register the analyzers on the index
movie_index.analyzer(ru_analyzer)
movie_index.analyzer(en_analyzer)


@movie_index.doc_type
class MovieDocument(Document):
    title = fields.TextField(
        analyzer=ru_analyzer,         # Analyzer used when indexing
        search_analyzer=ru_analyzer,  # Analyzer used for the search query
    )
    description = fields.TextField(
        analyzer=ru_analyzer,         # Analyzer used when indexing
        search_analyzer=ru_analyzer,  # Analyzer used for the search query
    )
    subtitles = fields.TextField(
        attr='get_subtitles',
    )
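# Hedged sketch (not from the original code): possible definitions for the token
# filters referenced above but not shown in the snippet. In a real module they
# would be defined before the analyzers; the filter types and parameters are
# assumptions based on Elasticsearch's built-in language support.
from elasticsearch_dsl import token_filter

russian_stop_filter = token_filter('russian_stop', type='stop', stopwords='_russian_')
russian_stemmer_filter = token_filter('russian_stemmer', type='stemmer', language='russian')

english_possessive_stemmer_filter = token_filter(
    'english_possessive_stemmer', type='stemmer', language='possessive_english'
)
english_stop_filter = token_filter('english_stop', type='stop', stopwords='_english_')
english_stemmer_filter = token_filter('english_stemmer', type='stemmer', language='english')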
from django.conf import settings
from django_elasticsearch_dsl import Index, DEDField, Integer
from elasticsearch_dsl import analyzer, token_filter


class RelatedToValueList(DEDField, Integer):
    def get_value_from_instance(self, data):
        return [obj.id for obj in super().get_value_from_instance(data)]


# Name of the Elasticsearch index
mainIndex = Index(settings.ELASTICSEARCH_INDEX)
# See Elasticsearch Indices API reference for available settings
mainIndex.settings(number_of_shards=1, number_of_replicas=0)

autocomplete_filter = token_filter(
    "autocomplete_filter",
    "edge_ngram",
    min_gram=1,
    max_gram=20,
)

# Using this analyzer with an empty field fails, so we're using methods instead that add a space
autocomplete_analyzer = analyzer(
    'autocomplete',
    tokenizer="standard",
    filter=["lowercase", autocomplete_filter],
)

mainIndex.analyzer(autocomplete_analyzer)
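# Hedged sketch (not from the original code): what the "methods that add a space"
# workaround mentioned above could look like in practice. The document class and
# field name are hypothetical; prepare_<field> is the standard
# django_elasticsearch_dsl hook for computing a field value per instance.
from django_elasticsearch_dsl import Document, fields


class PersonDocument(Document):
    name_autocomplete = fields.TextField(analyzer=autocomplete_analyzer)

    def prepare_name_autocomplete(self, instance):
        # Appending a space guarantees the edge_ngram chain never receives an
        # empty string, which is the failure mode the comment above describes.
        return (instance.name or "") + " "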
from django.conf import settings
from django.db.models import QuerySet
from django_elasticsearch_dsl import Index

from metarecord.models import Classification
from search_indices import get_finnish_analyzer
from search_indices.documents.base import BaseDocument

# Name of the Elasticsearch index
INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__])

finnish_analyzer = get_finnish_analyzer()

INDEX.analyzer(finnish_analyzer)
INDEX.settings(
    max_result_window=500000,
)


@INDEX.document
class ClassificationDocument(BaseDocument):
    class Django:
        model = Classification

    def get_queryset(self) -> QuerySet:
        return Classification.objects.latest_version()
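# Hedged usage sketch (not from the original code): the raised max_result_window
# setting above suggests paging deep into the result set. The "title" field and
# query string are assumptions about what ClassificationDocument exposes.
search = ClassificationDocument.search().query("match", title="terveydenhuolto")
# Slicing sets "from" and "size"; a page this deep would be rejected with the
# default max_result_window of 10 000.
page = search[499000:499100].execute()
for hit in page:
    print(hit.meta.id, hit.meta.score)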
def make_index(suffix: str) -> Index:
    """Builds the "<prefix>-<suffix>" index with the shared analyzers registered."""
    elastic_index = Index(settings.ELASTICSEARCH_PREFIX + "-" + suffix)
    elastic_index.analyzer(autocomplete_analyzer)
    elastic_index.analyzer(text_analyzer)
    return elastic_index
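# Hedged usage sketch (not from the original code): the "file" suffix matches the
# index name built in the analyze() helper earlier in this section; the "person"
# suffix is an assumption.
elastic_index_file = make_index("file")
elastic_index_person = make_index("person")
if not elastic_index_file.exists():
    elastic_index_file.create()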
# Imports assumed for this snippet; the original omits them.
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, token_filter, tokenizer

# Name of the Elasticsearch index
job_listing = Index('joblistings')
# See Elasticsearch Indices API reference for available settings
job_listing.settings(
    number_of_shards=1,
    number_of_replicas=0
)

custom_tokenizer = tokenizer(
    "pattern",
    "pattern",
    pattern=r"\s|-|\n|/|,|\.\s"
)

keyword_analyzer = analyzer(
    "default",
    type="custom",
    tokenizer=custom_tokenizer,
    filter=["lowercase"]
)
job_listing.analyzer(keyword_analyzer)

filter_shingle = token_filter(
    name_or_instance="filter_shingle",
    type="shingle",
    max_shingle_size=2,
    min_shingle_size=2,
    output_unigrams="false"
)
shingle_analyzer = analyzer(
    "shingle",
    tokenizer=custom_tokenizer,
    type="custom",
    filter=["lowercase", filter_shingle]
)
job_listing.analyzer(shingle_analyzer)

# triple_filter_shingle = token_filter(name_or_instance="triple_filter_shingle", type="shingle", max_shingle_size=3, min_shingle_size=3, output_unigrams="false")
# triple_shingle_analyzer = analyzer("triple_shingle", tokenizer=custom_tokenizer, type="custom", filter=["lowercase", triple_filter_shingle])
# job_listing.analyzer(triple_shingle_analyzer)


@job_listing.doc_type
class JobListingDocument(DocType):
    keywords = fields.TextField(attr="description", fielddata=True)
    # shingles = fields.TextField(attr="description", analyzer="shingle", fielddata=True)
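# Hedged usage sketch (not from the original code): fielddata=True on "keywords"
# suggests the field is meant for term aggregations. The index name matches the
# snippet above; the aggregation itself is an assumption.
from elasticsearch_dsl import Search

s = Search(index="joblistings")
s.aggs.bucket("top_keywords", "terms", field="keywords", size=25)
response = s.execute()
for bucket in response.aggregations.top_keywords.buckets:
    print(bucket.key, bucket.doc_count)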