def test_simulate_complex(client):
    a = analyzer(
        'my-analyzer',
        tokenizer=tokenizer('split_words', 'simple_pattern_split', pattern=':'),
        filter=['lowercase', token_filter('no-ifs', 'stop', stopwords=['if'])],
    )

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
def ngram(min_gram=2, max_gram=4):
    base_name = "ngram_%d_%d" % (min_gram, max_gram)
    return dsl.analyzer(
        base_name + "_analyzer",
        tokenizer=dsl.tokenizer(
            base_name + "_tokenizer",
            'nGram',
            min_gram=min_gram,
            max_gram=max_gram,
            token_chars=["letter", "digit"],
        ),
        filter=['lowercase'],
    )
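# Usage sketch (illustrative, not from the original module): the factory derives the
# analyzer and tokenizer names from the gram sizes, so each variant gets a distinct name.
trigram_analyzer = ngram(min_gram=3, max_gram=3)  # named "ngram_3_3_analyzer"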
def get_default_ngram_analyzer():
    return analyzer(
        'froide_ngram_analyzer',
        tokenizer=tokenizer(
            'froide_ngram_tokenzier',
            type='edge_ngram',
            min_gram=1,
            max_gram=15,
            token_chars=['letter', 'digit'],
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ],
    )
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
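# A minimal usage sketch, assuming a hypothetical Document mapping; the class name,
# field, and index name below are illustrative and not part of the original snippet.
from elasticsearch_dsl import Document, Text

class Contact(Document):
    # Addresses are broken into trigrams at index time, which enables partial
    # matching on fragments of the local part or the domain.
    email = Text(analyzer=email_ngram_analyzer())

    class Index:
        name = 'contacts'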
def bigram_analyzer():
    """
    An n-gram analyzer of length 2.

    Bigrams provide nice partial, fuzzy matching.

    Returns:
        Analyzer
    """
    return analyzer(
        'bigram',
        tokenizer=tokenizer(
            'bigram_tokenizer',
            type='ngram',
            min_gram=2,
            max_gram=2,
            token_chars=['letter', 'digit'],
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ],
    )
"""Elastic search fields for Resolwe.""" import elasticsearch_dsl as dsl # pylint: disable=invalid-name # Process type analyzer. During indexing we tokenize by type paths, during search, # we do not tokenize at all. process_type_tokenizer = dsl.tokenizer('process_type_tokenizer', type='path_hierarchy', delimiter=':') process_type_analyzer = dsl.analyzer('process_type_analyzer', tokenizer=process_type_tokenizer, filter=['lowercase']) process_type_search_analyzer = dsl.analyzer('process_type_search_analyzer', tokenizer='keyword', filter=['lowercase']) # Name analyzer. name_analyzer = dsl.analyzer( 'name_analyzer', type='pattern', # The pattern matches token separators. pattern=r''' ([^\p{L}\d]+) # swallow non letters and numbers, | (?<=\D)(?=\d) # or non-number followed by number, | (?<=\d)(?=\D) # or number followed by non-number, ''', flags='CASE_INSENSITIVE|COMMENTS', lowercase=True, ) # During indexing, we lowercase terms and tokenize using edge_ngram. ngrams_analyzer = dsl.analyzer( 'ngrams_index', tokenizer='standard', filter=[ 'lowercase', dsl.token_filter(
import pickle
from datetime import datetime

from elasticsearch_dsl import connections, Document, Completion, Text, Integer, Float, Keyword, analyzer, tokenizer, \
    Date, Index

ELASTIC_INDEX = 'movies-suggestions'

custom_analyzer = analyzer(
    'my_analyzer',
    tokenizer=tokenizer('bigram', 'nGram', min_gram=2, max_gram=2),
    filter=['lowercase']
)


class MoviesIndex(Document):
    # title = Text(fields={'keyword': Keyword()})
    title = Text()
    rating = Float()
    year = Integer()
    genre = Text()
    suggest = Completion(analyzer=custom_analyzer)
    created = Date()

    def clean(self):
        self.suggest = {
            'input': self.title.split(),
            'weight': round(self.rating)
        }

    class Index:
        name = ELASTIC_INDEX
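# Completion-suggester sketch (assumes the index has been created and populated and
# that a default connection is configured; the suggestion name and query text are
# illustrative, not from the original module):
from elasticsearch_dsl import Search

s = Search(index=ELASTIC_INDEX)
s = s.suggest('title_suggestions', 'god', completion={'field': 'suggest'})
response = s.execute()
for option in response.suggest.title_suggestions[0].options:
    print(option.text, option._score)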
from elasticsearch_dsl import String, Nested, Boolean, DocType, tokenizer, analyzer

# Required for case sensitivity.
# To add an analyzer to an existing mapping requires the mapping to be "closed".
case_sensitive_analyzer = analyzer("case_sensitive_analyzer", tokenizer=tokenizer("keyword"))


class Metadata(DocType):
    property_list = Nested(
        properties={
            "name": String(analyzer=case_sensitive_analyzer),
            "value": String(analyzer=case_sensitive_analyzer),
            "immutable": Boolean(),
        }
    )

    def update_all(self, metadata):
        """
        Updates all metadata related to an artifact.

        Args:
            metadata (dict): collection of metadata for the document.
        """
        self.property_list = metadata.values()
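# Sketch of the "closed mapping" workflow the comments above refer to, assuming the
# index already exists (the index name is illustrative, not from the original):
# analysis settings can only be added while the index is closed.
from elasticsearch_dsl import Index

idx = Index('artifacts')
idx.analyzer(case_sensitive_analyzer)  # register the analyzer in the index settings
idx.close()                            # close before pushing new analysis settings
idx.save()                             # update the settings on the existing index
idx.open()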
kadaster_object_aanduiding = analysis.token_filter(
    'kad_obj_aanduiding_filter',
    type='ngram',
    min_gram=4,
    max_gram=16,
)

####################################
#            Analyzers             #
####################################

bouwblok = es.analyzer(
    'bouwblok',
    tokenizer=tokenizer(
        'edge_ngram_filter',
        type='edge_ngram',
        min_gram=2,
        max_gram=4,
        token_chars=["letter", "digit"]),
    filter=['lowercase', divider_stripper],
    # char_filter=[divider_stripper]
)

adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['lowercase', 'asciifolding', synonym_filter],
    # filter=['lowercase', 'asciifolding'],
    char_filter=[naam_stripper],
)

straatnaam = es.analyzer(
import datetime
import logging

from elasticsearch import RequestError
from elasticsearch_dsl import Date, Keyword, Text, Index, analyzer, Integer, tokenizer, Document, Double, GeoPoint, \
    Search, A
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.query import MultiMatch, MatchAll, Query, MoreLikeThis

autocomplete = analyzer(
    'autocomplete',
    tokenizer=tokenizer('ngram', 'edge_ngram', min_gram=2, max_gram=15, token_chars=["letter", "digit"]),
    filter=['lowercase']
)
autocomplete_search = analyzer(
    'autocomplete_search',
    tokenizer=tokenizer('lowercase')
)


# Star Documents are ElasticSearch documents and can be used to index an Event,
# Location, Resource, or Study.
class StarDocument(Document):
    type = Keyword()
    label = Keyword()
    id = Integer()
    title = Text(analyzer=autocomplete, search_analyzer=autocomplete_search)
    date = Date()
    last_updated = Date()
    content = Text(analyzer=autocomplete, search_analyzer=autocomplete_search)
    description = Text(analyzer=autocomplete, search_analyzer=autocomplete_search)
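# Query sketch (the helper and index name are assumptions, not part of the original
# module): a MultiMatch against the autocomplete-analyzed fields matches documents
# whose edge n-grams cover the characters typed so far.
def autocomplete_query(text, index='star_index'):
    return Search(index=index).query(
        MultiMatch(query=text, fields=['title', 'content', 'description'])
    ).execute()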
from elasticsearch_dsl import analyzer, tokenizer
from django_elasticsearch_dsl.registries import registry
from django_elasticsearch_dsl import Document, Index, fields

from .models import Note

INDEX = Index('search_notes')
INDEX.settings(number_of_shards=1, number_of_replicas=1)

my_analyzer = analyzer(
    'my_analyzer',
    tokenizer=tokenizer('standard'),
    filter=['lowercase', 'stop', 'trim'],
    char_filter=["html_strip"],
)


@registry.register_document
@INDEX.document
class NoteDocument(Document):
    """Note Elastic search document."""

    class Django:
        model = Note

    id = fields.StringField(attr='id', analyzer="standard")
    title = fields.StringField(fields={
        'raw': fields.KeywordField(),
    })
    user = fields.StringField(attr='user_indexing', fields={
        'raw': fields.KeywordField(),
def search_resources(self, search):
    resource_search = ResourceSearch(
        search.query,
        search.jsonFilters(),
        search.sort,
        index=self.resource_index_name)
    resource_search = resource_search[search.start:search.start + search.size]
    return resource_search.execute()


autocomplete = analyzer(
    'autocomplete',
    tokenizer=tokenizer(
        'ngram',
        'edge_ngram',
        min_gram=2,
        max_gram=15,
        token_chars=["letter", "digit"]),
    filter=['lowercase'])

autocomplete_search = analyzer(
    'autocomplete_search',
    tokenizer=tokenizer('lowercase'))


class ElasticResource(DocType):
    id = Integer()
    name = Text(analyzer=autocomplete, search_analyzer=autocomplete_search)
    last_updated = Date()
    description = Text()
    type = Keyword()
    institution = Keyword()
    website = Keyword()
#!/usr/bin/env python3
import sys
import fileinput
import json
import logging

from elasticsearch_dsl import Document, field, InnerDoc
from elasticsearch_dsl import analyzer, tokenizer

logger = logging.getLogger('chibi_gob_mx_elasticsearch.models.open_data')

category = analyzer(
    'category',
    tokenizer=tokenizer('trigram', 'ngram', min_gram=4, max_gram=5),
    filter=["asciifolding", "lowercase"],
)
titles = analyzer(
    'titles',
    tokenizer=tokenizer('trigram', 'ngram', min_gram=4, max_gram=5),
    filter=["asciifolding", "lowercase"],
)
titles_space = analyzer(
    'titles_space',
    tokenizer='whitespace',
    filter=["asciifolding", "lowercase"],
)


class Data_set_resource(InnerDoc):
    stopwords='_french_')
fr_stem_filter = token_filter('fr_stem_filter', type='stemmer', language='minimal_french')

# Deal with French specific aspects.
fr_elision = token_filter('fr_elision', type='elision', articles=[
    'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
    'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
])

# Languages related analyzers.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')])

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')])

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
    char_filter=[char_filter('html_strip')])
lowercase = analysis.normalizer('lowercase_keyword', filter=['lowercase'])

strip_zero = analysis.CustomCharFilter(
    "strip_zero",
    builtin_type="pattern_replace",
    pattern="^0+(.*)",
    replacement="$1",
)

####################################
#            Analyzers             #
####################################

bouwblokid = es.analyzer(
    'bouwbloknummer',
    tokenizer=tokenizer(
        'bouwbloktokens',
        'edge_ngram',
        min_gram=1,
        max_gram=4,
        token_chars=["letter", "digit"]),
    filter=['lowercase', divider_stripper],
)

adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['lowercase', 'asciifolding', synonym_filter],
    # filter=['lowercase', 'asciifolding'],
    char_filter=[naam_stripper],
)

straatnaam = es.analyzer(
    'straatnaam',
import logging

from elasticsearch.helpers import bulk
from elasticsearch_dsl import (Boolean, DocType, Index, Integer, String,
                               analyzer, tokenizer)
from elasticsearch_dsl.connections import connections

logger = logging.getLogger(__name__)

# Tokenizer that splits on tokens matching a hex number,
# a decimal number, or anything non-alphanumeric.
message_tokenizer = tokenizer('message_tokenizer', 'pattern',
                              pattern=r"0x[0-9a-fA-F]+|[\W0-9]+?")

# Note: the keyword argument is ``filter``; ``filters`` would just be passed
# through to Elasticsearch as an unknown setting.
message_analyzer = analyzer('message_analyzer',
                            type="custom",
                            tokenizer=message_tokenizer,
                            filter=[])


class RoutedDocType(DocType):
    _routing_key = None

    @property
    def routing(self):
        return getattr(self, self._routing_key)
        location += '?{}'.format(query_string)
        return location

    class Index:
        name = 'publication'
        settings = {
            'number_of_shards': 1
        }


autocomplete_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer=tokenizer(
        'edge_ngram_tokenizer',
        type='edge_ngram',
        min_gram=3,
        max_gram=10,
        token_chars=["letter", "digit"]),
    filter=['lowercase', 'asciifolding', 'trim'])


def get_search_index(model):
    lookup = {
        Author: AuthorDoc,
        Container: ContainerDoc,
        Platform: PlatformDoc,
        Sponsor: SponsorDoc,
        Tag: TagDoc,
    }
from elasticmodels import BooleanField, DateField, Index, IntegerField, StringField
from elasticsearch_dsl import MetaField, analyzer, token_filter, tokenizer

from .models import Report

# Override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores.
name = analyzer(
    "name",
    # the standard tokenizer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything, but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # this emulates a 3-4 ngram, but also includes the whole token itself
        # (which prevents us from having to create multifields)
        token_filter("simple_edge",
                     type="pattern_capture",
                     patterns=["(?=(...))(?=(....))"]),
    ],
)


class ReportIndex(Index):
    category = StringField(
        attr="category.name",
        # need a non_analyzed field for sorting
        fields={"raw": StringField(index="not_analyzed")},
    )

    category_id = IntegerField(attr="category.pk")
from elasticsearch_dsl import analyzer, token_filter, tokenizer

label_autocomplete = analyzer(
    'label_autocomplete',
    tokenizer=tokenizer(
        'trigram',
        'edge_ngram',
        min_gram=2,
        max_gram=25,
        token_chars=["letter", "digit"]
    ),
    filter=['lowercase', token_filter('ascii_fold', 'asciifolding')]
)

synonym_filter = token_filter(
    'synonym_filter_en',
    'synonym',
    synonyms_path='/usr/share/elasticsearch/config/synonyms/synonyms_en.txt'
)

synonym_analyzer = analyzer(
    'synonym_analyzer_en',
    type='custom',
    tokenizer='standard',
    filter=[
        synonym_filter,
        'lowercase'
    ])
    DocType, Keyword, Text, Index, analyzer, tokenizer, token_filter, Date
)

namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)

namesAutocompleteSearchAnalyzer = analyzer(
    "namesAutocompleteSearchAnalyzer",
    tokenizer=tokenizer("lowercase")
)

ukrainianAddressesStopwordsAnalyzer = analyzer(
    "ukrainianAddressesStopwordsAnalyzer",
    type="ukrainian",
    filter=[
        token_filter(
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import

from elasticsearch_dsl import analyzer, tokenizer

# autocomplete tokenizer
edge_ngram_tokenizer = tokenizer(
    'edge_ngram_tokenizer',
    type='edge_ngram',
    min_gram=1,
    max_gram=20,
    token_chars=['letter', 'digit']
)

# autocomplete analyzer
edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    tokenizer=edge_ngram_tokenizer,
    filter=['lowercase', 'asciifolding'],
)

# autocomplete *search* tokenizer
edge_ngram_search_tokenizer = tokenizer(
    'edge_ngram_search_tokenizer',
    type='edge_ngram',
    token_chars=['letter', 'digit']
)

search_tokenizer = tokenizer(
    'search_tokenizer',
    type='standard',
fr_stem_filter = token_filter(
    'fr_stem_filter',
    type='stemmer',
    language='minimal_french')

# Deal with French specific aspects.
fr_elision = token_filter(
    'fr_elision',
    type='elision',
    articles=[
        'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
        'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
    ]
)

# Languages related analyzers.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')]
)

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')]
)

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
class NoneAwareDate(Date):
    """Elasticsearch DSL Date field chokes on None values and parses empty
    strings as the current date, hence the workaround.
    TODO: move this upstream in some form."""

    def _to_python(self, data):
        if data is None:
            return data
        return super(NoneAwareDate, self)._to_python(data)


namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)

namesAutocompleteSearchAnalyzer = analyzer(
    "namesAutocompleteSearchAnalyzer",
    tokenizer=tokenizer("whitespace"),
    filter=["lowercase"],
)


class AbstractDeclaration(object):
    def infocard(self):
        # NotImplemented is a value, not an exception; raise NotImplementedError instead.
        raise NotImplementedError()
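# Usage sketch (the class and field names are illustrative, not from the original):
# NoneAwareDate is a drop-in replacement for Date wherever the source data may
# contain nulls or empty strings.
from elasticsearch_dsl import DocType

class DeclarationDoc(DocType):
    declaration_date = NoneAwareDate()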