def test_mapping_can_collect_all_analyzers(): a1 = analysis.analyzer('my_analyzer1', tokenizer='keyword', filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])], ) a2 = analysis.analyzer('english') a3 = analysis.analyzer('unknown_custom') a4 = analysis.analyzer('my_analyzer2', tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3), filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])], ) m = mapping.Mapping('article') m.field('title', 'string', analyzer=a1, fields={ 'english': String(index_analyzer=a2), 'unknown': String(search_analyzer=a3), } ) m.field('comments', Nested(properties={ 'author': String(index_analyzer=a4) })) assert { 'analyzer': { 'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'}, 'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}}, 'filter': { 'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'}, 'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}, }, 'tokenizer': { 'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}, } } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers(): a1 = analysis.analyzer('my_analyzer1', tokenizer='keyword', filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])], ) a2 = analysis.analyzer('english') a3 = analysis.analyzer('unknown_custom') a4 = analysis.analyzer('my_analyzer2', tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3), filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])], ) m = mapping.Mapping('article') m.field('title', 'string', analyzer=a1, fields={ 'english': String(analyzer=a2), 'unknown': String(analyzer=a3), } ) m.field('comments', Nested(properties={ 'author': String(analyzer=a4) })) assert { 'analyzer': { 'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'}, 'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}}, 'filter': { 'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'}, 'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}, }, 'tokenizer': { 'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}, } } == m._collect_analysis()
def test_multiplexer_with_custom_filter(): a = analysis.analyzer( "my_analyzer", tokenizer="keyword", filter=[ analysis.token_filter( "my_multi", "multiplexer", filters=[ [analysis.token_filter("en", "snowball", language="English")], "lowercase, stop", ], ) ], ) assert { "analyzer": { "my_analyzer": { "filter": ["my_multi"], "tokenizer": "keyword", "type": "custom", } }, "filter": { "en": {"type": "snowball", "language": "English"}, "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"}, }, } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers(): a1 = analysis.analyzer( "my_analyzer1", tokenizer="keyword", filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])], ) a2 = analysis.analyzer( "my_analyzer2", tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3), filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])], ) m = mapping.Mapping("article") m.field("title", "string", analyzer=a1, search_analyzer=a2) m.field( "text", "string", analyzer=a1, fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)}, ) assert { "analyzer": { "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"}, "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"}, }, "filter": { "my_filter1": {"stopwords": ["a", "b"], "type": "stop"}, "my_filter2": {"stopwords": ["c", "d"], "type": "stop"}, }, "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}}, } == m._collect_analysis()
def test_multiplexer_with_custom_filter(): a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[ analysis.token_filter('my_multi', 'multiplexer', filters=[[ analysis.token_filter( 'en', 'snowball', language='English') ], 'lowercase, stop']) ]) assert { "analyzer": { "my_analyzer": { "filter": ["my_multi"], "tokenizer": "keyword", "type": "custom" } }, "filter": { "en": { "type": "snowball", "language": "English" }, "my_multi": { "filters": ["en", "lowercase, stop"], "type": "multiplexer" } } } == a.get_analysis_definition()
def test_conditional_token_filter(): a = analysis.analyzer( "my_cond", tokenizer=analysis.tokenizer("keyword"), filter=[ analysis.token_filter( "testing", "condition", script={"source": "return true"}, filter=[ "lowercase", analysis.token_filter("en", "snowball", language="English"), ], ), "stop", ], ) assert { "analyzer": { "my_cond": { "filter": ["testing", "stop"], "tokenizer": "keyword", "type": "custom", } }, "filter": { "en": {"language": "English", "type": "snowball"}, "testing": { "script": {"source": "return true"}, "filter": ["lowercase", "en"], "type": "condition", }, }, } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers(): a1 = analysis.analyzer( 'my_analyzer1', tokenizer='keyword', filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])], ) a2 = analysis.analyzer( 'my_analyzer2', tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3), filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])], ) m = mapping.Mapping() m.field('title', 'text', analyzer=a1, search_analyzer=a2) m.field( 'text', 'text', analyzer=a1, fields={ 'english': Text(analyzer=a1), 'unknown': Keyword(analyzer=a1, search_analyzer=a2), } ) assert { 'analyzer': { 'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'}, 'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}}, 'filter': { 'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'}, 'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}}, 'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}} } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers_and_normalizers(): a1 = analysis.analyzer('my_analyzer1', tokenizer='keyword', filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])], ) a2 = analysis.analyzer('english') a3 = analysis.analyzer('unknown_custom') a4 = analysis.analyzer('my_analyzer2', tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3), filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])], ) a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword') n1 = analysis.normalizer('my_normalizer1', filter=['lowercase'] ) n2 = analysis.normalizer('my_normalizer2', filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])] ) n3 = analysis.normalizer('unknown_custom') m = mapping.Mapping() m.field('title', 'text', analyzer=a1, fields={ 'english': Text(analyzer=a2), 'unknown': Keyword(search_analyzer=a3), } ) m.field('comments', Nested(properties={ 'author': Text(analyzer=a4) })) m.field('normalized_title', 'keyword', normalizer=n1) m.field('normalized_comment', 'keyword', normalizer=n2) m.field('unknown', 'keyword', normalizer=n3) m.meta('_all', analyzer=a5) assert { 'analyzer': { 'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'}, 'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}, 'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'}, }, 'normalizer': { 'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'}, 'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'}, }, 'filter': { 'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'}, 'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}, 'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'}, }, 'tokenizer': { 'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}, } } == m._collect_analysis() assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
class Analyzers: first_name_synonym_analyzer = analyzer( 'name_search_synonym_analyzer', type="custom", tokenizer="standard", filter=[ 'lowercase', analysis.token_filter( 'name_search_synonym_filter', type="synonym", expand=True, lenient=True, synonyms_path="common_first_name_synonyms.txt", ) ]) @staticmethod def analyze_first_name_synonym(text): return Analyzers.first_name_synonym_analyzer.simulate(text) nysis_phonetic_analyzer = analyzer('name_search_nysiis_analyzer', type="custom", tokenizer="standard", filter=[ 'lowercase', analysis.token_filter( 'name_search_nysiis_filter', type="phonetic", encoder="nysiis", replace="true") ]) @staticmethod def analyze_nysis_phonetic(text): return Analyzers.nysis_phonetic_analyzer.simulate(text) beider_morse_phonetic_analyzer = analyzer( 'name_search_beider_morse_analyzer', type="custom", tokenizer="standard", filter=[ 'lowercase', analysis.token_filter('name_search_beider_morse_filter', type="phonetic", encoder="beider_morse", replace="true") ]) @staticmethod def analyze_beider_morse_phonetic(text): return Analyzers.beider_morse_phonetic_analyzer.simulate(text)
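# A rough usage sketch for the helpers above (not part of the original snippet):
# analyzer.simulate() is available in elasticsearch-dsl 7.x+ and issues an
# _analyze request, so a default connection must be registered first and the
# phonetic plugin must be installed on the cluster; the host below is an assumption.
from elasticsearch_dsl import connections

connections.create_connection(hosts=["http://localhost:9200"])

response = Analyzers.analyze_nysis_phonetic("Katherine")
print([t.token for t in response.tokens])  # NYSIIS phonetic encodings of the input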
def test_stemmer_analyzer_can_pass_name(): t = analysis.token_filter('my_english_filter', name="minimal_english", type="stemmer") assert t.to_dict() == 'my_english_filter' assert { "type" : "stemmer", "name" : "minimal_english" } == t.get_definition()
def test_custom_analyzer_can_collect_custom_items(): trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3) my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b']) umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue']) a = analysis.analyzer( 'my_analyzer', tokenizer=trigram, filter=['lowercase', my_stop], char_filter=['html_strip', umlauts] ) assert a.to_dict() == 'my_analyzer' assert { 'analyzer': { 'my_analyzer': { 'type': 'custom', 'tokenizer': 'trigram', 'filter': ['lowercase', 'my_stop'], 'char_filter': ['html_strip', 'umlauts'] } }, 'tokenizer': { 'trigram': trigram.get_definition() }, 'filter': { 'my_stop': my_stop.get_definition() }, 'char_filter': { 'umlauts': umlauts.get_definition() } } == a.get_analysis_definition()
def test_simple_multiplexer_filter(): a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[ analysis.token_filter( 'my_multi', 'multiplexer', filters=['lowercase', 'lowercase, stop']) ]) assert { "analyzer": { "my_analyzer": { "filter": ["my_multi"], "tokenizer": "keyword", "type": "custom" } }, "filter": { "my_multi": { "filters": ["lowercase", "lowercase, stop"], "type": "multiplexer" } } } == a.get_analysis_definition()
def test_simple_multiplexer_filter(): a = analysis.analyzer( "my_analyzer", tokenizer="keyword", filter=[ analysis.token_filter( "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"] ) ], ) assert { "analyzer": { "my_analyzer": { "filter": ["my_multi"], "tokenizer": "keyword", "type": "custom", } }, "filter": { "my_multi": { "filters": ["lowercase", "lowercase, stop"], "type": "multiplexer", } }, } == a.get_analysis_definition()
def test_custom_analyzer_can_collect_custom_items(): trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3) my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"]) umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"]) a = analysis.analyzer( "my_analyzer", tokenizer=trigram, filter=["lowercase", my_stop], char_filter=["html_strip", umlauts], ) assert a.to_dict() == "my_analyzer" assert { "analyzer": { "my_analyzer": { "type": "custom", "tokenizer": "trigram", "filter": ["lowercase", "my_stop"], "char_filter": ["html_strip", "umlauts"], } }, "tokenizer": {"trigram": trigram.get_definition()}, "filter": {"my_stop": my_stop.get_definition()}, "char_filter": {"umlauts": umlauts.get_definition()}, } == a.get_analysis_definition()
def test_custom_analyzer_can_collect_custom_items(): trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3) my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b']) umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue']) a = analysis.analyzer('my_analyzer', tokenizer=trigram, filter=['lowercase', my_stop], char_filter=['html_strip', umlauts]) assert a.to_dict() == 'my_analyzer' assert { 'analyzer': { 'my_analyzer': { 'type': 'custom', 'tokenizer': 'trigram', 'filter': ['lowercase', 'my_stop'], 'char_filter': ['html_strip', 'umlauts'] } }, 'tokenizer': { 'trigram': trigram.get_definition() }, 'filter': { 'my_stop': my_stop.get_definition() }, 'char_filter': { 'umlauts': umlauts.get_definition() } } == a.get_analysis_definition()
def test_conflicting_nested_filters_cause_error(): a = analysis.analyzer( "my_cond", tokenizer=analysis.tokenizer("keyword"), filter=[ analysis.token_filter("en", "stemmer", language="english"), analysis.token_filter( "testing", "condition", script={"source": "return true"}, filter=[ "lowercase", analysis.token_filter("en", "snowball", language="English"), ], ), ], ) with raises(ValueError): a.get_analysis_definition()
def test_conflicting_nested_filters_cause_error(): a = analysis.analyzer('my_cond', tokenizer=analysis.tokenizer('keyword'), filter=[ analysis.token_filter('en', 'stemmer', language='english'), analysis.token_filter( 'testing', 'condition', script={'source': 'return true'}, filter=[ 'lowercase', analysis.token_filter('en', 'snowball', language='English') ]) ]) with raises(ValueError): a.get_analysis_definition()
def autocomplete(self): autocomplete_filter = analysis.token_filter( "autocomplete_filter", "edge_ngram", min_gram=1, max_gram=20 ) return analyzer( "autocomplete", type="custom", tokenizer="standard", filter=["lowercase", autocomplete_filter] )
def name_delimiter_analyzer(): ''' Analyzer for the fields with composed parts (dash, underscore, ...) ''' word_delimiter_graph_preserve_original = analysis.token_filter( 'word_delimiter_graph_preserve_original', type="word_delimiter_graph", preserve_original=True) return analyzer('name_delimiter', tokenizer="keyword", filter=[ word_delimiter_graph_preserve_original, "flatten_graph", "lowercase", "stop", "snowball", "remove_duplicates" ])
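# Hypothetical usage sketch (the document and field names are made up, not part of
# the original snippet): the delimiter analyzer above targets names composed with
# dashes or underscores, so it would typically be attached to a Text field.
from elasticsearch_dsl import Document, Text

class Part(Document):
    # e.g. "jean-pierre_smith" is split by word_delimiter_graph, flattened,
    # lowercased, stop-filtered, stemmed and de-duplicated by the filter chain above.
    name = Text(analyzer=name_delimiter_analyzer())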
def get_analyzer(lang_analyzer, delete_old_index, user_dictionary_file='', synonyms=None): """ Return analyzer for specific language. If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated (no delete required and no new synonyms) then return only the name of the analyzer. :param lang_analyzer: ``str`` which analyzer to get e.g. 'standard','kuromoji','english' :param delete_old_index: (only Japanese) ``bool`` if list is empty and index is not deleted, keep previous analyzer with synonyms :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞 See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html :param synonyms: (only Japanese) ``list`` of synonyms to be used in the form of ['京産大, 京都産業大学','a, b'] if list is empty and index is not deleted, keep previous analyzer with synonyms :return: ``analyzer`` or ``str`` of analyzer to be used """ if synonyms is None: synonyms = [] if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']: # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built) if (not delete_old_index) & (len(synonyms) == 0): analyzer_lang = '{0}_custom'.format( lang_analyzer) # Use existing analyzer with existing synonyms else: analyzer_lang = analysis.analyzer( '{0}_custom'.format(lang_analyzer), tokenizer=analysis.tokenizer( 'kuromoji_tokenizer_user_dict', type='kuromoji_tokenizer', user_dictionary=user_dictionary_file), filter=[ 'kuromoji_baseform', 'kuromoji_part_of_speech', 'cjk_width', 'ja_stop', 'kuromoji_stemmer', 'lowercase', analysis.token_filter( 'synonym', type='synonym', synonyms=synonyms), # ['京産大, 京都産業大学'] ]) # Extra token filters: kuromoji_number, kuromoji_readingform # Extra character filter: kuromoji_iteration_mark # user_dictionary="userdict_ja.txt") # /etc/elasticsearch/ else: analyzer_lang = analysis.analyzer(lang_analyzer) return analyzer_lang
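# Hypothetical call sketch (assumes constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']
# resolves to 'kuromoji'; the dictionary file and synonym list are illustrative
# values, and the kuromoji plugin must be installed on the cluster).
ja_analyzer = get_analyzer(
    'kuromoji',
    delete_old_index=True,
    user_dictionary_file='userdict_ja.txt',
    synonyms=['京産大, 京都産業大学'],
)

# With delete_old_index=False and no new synonyms, only the analyzer name
# ('kuromoji_custom') is returned so the existing index analysis is kept.
existing_name = get_analyzer('kuromoji', delete_old_index=False)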
def test_conditional_token_filter(): a = analysis.analyzer('my_cond', tokenizer=analysis.tokenizer('keyword'), filter=[ analysis.token_filter( 'testing', 'condition', script={'source': 'return true'}, filter=[ 'lowercase', analysis.token_filter('en', 'snowball', language='English') ]), 'stop' ]) assert { "analyzer": { "my_cond": { "filter": ["testing", "stop"], "tokenizer": "keyword", "type": "custom" } }, "filter": { "en": { "language": "English", "type": "snowball" }, "testing": { "script": { "source": "return true" }, "filter": ["lowercase", "en"], "type": "condition" } } } == a.get_analysis_definition()
def test_mapping_can_collect_all_analyzers(): a1 = analysis.analyzer( "my_analyzer1", tokenizer="keyword", filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])], ) a2 = analysis.analyzer("english") a3 = analysis.analyzer("unknown_custom") a4 = analysis.analyzer( "my_analyzer2", tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3), filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])], ) a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword") m = mapping.Mapping("article") m.field( "title", "string", analyzer=a1, fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)} ) m.field("comments", Nested(properties={"author": String(analyzer=a4)})) m.meta("_all", analyzer=a5) assert { "analyzer": { "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"}, "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"}, "my_analyzer3": {"tokenizer": "keyword", "type": "custom"}, }, "filter": { "my_filter1": {"stopwords": ["a", "b"], "type": "stop"}, "my_filter2": {"stopwords": ["c", "d"], "type": "stop"}, }, "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}}, } == m._collect_analysis() assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
def get_settings(self): shingle_filter = analysis.token_filter( 'filter_shingle', 'shingle', max_shingle_size=5, min_shingle_size=2, output_unigrams=True) shingle_analyzer = analysis.analyzer( 'analyzer_shingle', tokenizer='standard', filter=['standard', 'lowercase', shingle_filter]) return { 'settings': { 'index': { 'analysis': shingle_analyzer.get_analysis_definition() } } }
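# Hypothetical follow-up sketch (index name is made up): the dict returned by
# get_settings() above is shaped for index creation with the low-level
# elasticsearch-py 7.x client.
from elasticsearch import Elasticsearch

def create_shingle_index(settings_body):
    # settings_body is the {'settings': {'index': {'analysis': ...}}} dict
    # produced by get_settings(); the client defaults to localhost:9200.
    client = Elasticsearch()
    client.indices.create(index="phrase_suggest", body=settings_body)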
# Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from elasticsearch_dsl.document import DocType from elasticsearch_dsl.field import Text, Date, Keyword, Integer, String, Completion, Float from elasticsearch_dsl.analysis import token_filter, analyzer from elasticsearch_dsl import Index from elasticsearch import Elasticsearch es = Elasticsearch() ngram_filter = token_filter('ngram_filter', type='nGram', min_gram=1, max_gram=20) ngram_analyzer = analyzer('ngram_analyzer', type='custom', tokenizer='whitespace', filter=['lowercase', 'asciifolding', ngram_filter]) class ImdbspiderPipeline(object): def __init__(self): movies = Index('imdb', using=es) movies.doc_type(Movie) movies.delete(ignore=404) movies.create()
from elasticsearch_dsl.analysis import analyzer, token_filter edge_ngram_analyzer = analyzer( 'edge_ngram_analyzer', type='custom', tokenizer='standard', filter=[ 'lowercase', token_filter( 'edge_ngram_filter', type='edgeNGram', min_gram=2, max_gram=20 ) ] )
def test_stemmer_analyzer_can_pass_name(): t = analysis.token_filter('my_english_filter', name="minimal_english", type="stemmer") assert t.to_dict() == 'my_english_filter' assert {"type": "stemmer", "name": "minimal_english"} == t.get_definition()
import logging import elasticsearch_dsl as es from elasticsearch_dsl import analysis from django.conf import settings from datasets.bag.models import Nummeraanduiding from datasets.hr.models import DataSelectie log = logging.getLogger(__name__) edge_ngram_filter = analysis.token_filter( 'edge_ngram_filter', type='edge_ngram', min_gram=1, max_gram=15 ) autocomplete = es.analyzer( 'autocomplete', tokenizer='standard', filter=['lowercase', edge_ngram_filter] ) class Inschrijving(es.DocType): """ Elastic data of 'vestigingen' or 'mac' from handelsregister
from elasticsearch_dsl import analysis orderings = dict( openbare_ruimte=10, kadastraal_subject=25, adres=50, kadastraal_object=100, ) synonym_filter = analysis.token_filter( 'synonyms', type='synonym', synonyms=[ '1e=>eerste', '2e=>tweede', '3e=>derde', '4e=>vierde', ] ) huisnummer_generate = analysis.char_filter( 'huisnummer_expand', type='pattern_replace', pattern='(\d+)', replacement=""" $1-1 $1- $1-2 $1-3 $1a $1b $1a-1 $1b-1 $1-a $1-b $1b 1-b $1c 1-c
from apps.core.models import Article from apps.user.models import UserProfile from elasticsearch_dsl import analyzer, token_filter, tokenizer ngram_analyzer = analyzer( 'ngram_anl', type='custom', tokenizer=tokenizer( 'ngram_tkn', type='char_group', tokenize_on_chars=['\n'] ), filter=[ token_filter( 'ngram_tkf', type='ngram', min_gram=1, max_gram=10, ), 'lowercase' ] ) newline_analyzer = analyzer( 'nl_anl', type='custom', tokenizer=tokenizer( 'nl_tkn', type='char_group', tokenize_on_chars=['\n'] ),
from elasticsearch_dsl import (Boolean, Document, Date, Nested, InnerDoc, Keyword, Text, Float, Integer, analyzer, tokenizer, analysis) # define custom token filters and analyzers bigram_token_filter = analysis.token_filter('bigram', 'shingle', min_shingle_size=2, max_shingle_size=2, output_unigrams=False) trigram_token_filter = analysis.token_filter('trigram', 'shingle', min_shingle_size=3, max_shingle_size=3, output_unigrams=False) quadragram_token_filter = analysis.token_filter('quadragram', 'shingle', min_shingle_size=4, max_shingle_size=4, output_unigrams=False) pentagram_token_filter = analysis.token_filter('pentagram', 'shingle', min_shingle_size=5, max_shingle_size=5, output_unigrams=False) stemmer_token_filter = analysis.token_filter('english_stemmer', 'stemmer', name="english") standard = analyzer( 'standard_analyzer',
def test_mapping_can_collect_multiple_analyzers(): a1 = analysis.analyzer( "my_analyzer1", tokenizer="keyword", filter=[ "lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]), ], ) a2 = analysis.analyzer( "my_analyzer2", tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3), filter=[ analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"]) ], ) m = mapping.Mapping() m.field("title", "text", analyzer=a1, search_analyzer=a2) m.field( "text", "text", analyzer=a1, fields={ "english": Text(analyzer=a1), "unknown": Keyword(analyzer=a1, search_analyzer=a2), }, ) assert { "analyzer": { "my_analyzer1": { "filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom", }, "my_analyzer2": { "filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom", }, }, "filter": { "my_filter1": { "stopwords": ["a", "b"], "type": "stop" }, "my_filter2": { "stopwords": ["c", "d"], "type": "stop" }, }, "tokenizer": { "trigram": { "max_gram": 3, "min_gram": 3, "type": "nGram" } }, } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers_and_normalizers(): a1 = analysis.analyzer( "my_analyzer1", tokenizer="keyword", filter=[ "lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]), ], ) a2 = analysis.analyzer("english") a3 = analysis.analyzer("unknown_custom") a4 = analysis.analyzer( "my_analyzer2", tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3), filter=[ analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"]) ], ) a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword") n1 = analysis.normalizer("my_normalizer1", filter=["lowercase"]) n2 = analysis.normalizer( "my_normalizer2", filter=[ "my_filter1", "my_filter2", analysis.token_filter("my_filter3", "stop", stopwords=["e", "f"]), ], ) n3 = analysis.normalizer("unknown_custom") m = mapping.Mapping() m.field( "title", "text", analyzer=a1, fields={ "english": Text(analyzer=a2), "unknown": Keyword(search_analyzer=a3) }, ) m.field("comments", Nested(properties={"author": Text(analyzer=a4)})) m.field("normalized_title", "keyword", normalizer=n1) m.field("normalized_comment", "keyword", normalizer=n2) m.field("unknown", "keyword", normalizer=n3) m.meta("_all", analyzer=a5) assert { "analyzer": { "my_analyzer1": { "filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom", }, "my_analyzer2": { "filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom", }, "my_analyzer3": { "tokenizer": "keyword", "type": "custom" }, }, "normalizer": { "my_normalizer1": { "filter": ["lowercase"], "type": "custom" }, "my_normalizer2": { "filter": ["my_filter1", "my_filter2", "my_filter3"], "type": "custom", }, }, "filter": { "my_filter1": { "stopwords": ["a", "b"], "type": "stop" }, "my_filter2": { "stopwords": ["c", "d"], "type": "stop" }, "my_filter3": { "stopwords": ["e", "f"], "type": "stop" }, }, "tokenizer": { "trigram": { "max_gram": 3, "min_gram": 3, "type": "nGram" } }, } == m._collect_analysis() assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
part_number_analyzer = analysis.analyzer( "part_number_analyzer", tokenizer=analysis.tokenizer("part_number_path_hierarchy", "path_hierarchy", delimiter="-"), filter=["lowercase", "trim"], ) reference_code_analyzer = analysis.analyzer( "reference_code_analyzer", tokenizer="path_hierarchy", filter=["lowercase", "trim"] ) descriptive_text_analyzer = analysis.analyzer( "descriptive_text_analyzer", tokenizer="classic", filter=["lowercase", "trim", "stemmer"] ) ngram_filter = analysis.token_filter("ngram_filter", type="ngram", min_gram=2, max_gram=20) ngram_analyzer = analysis.analyzer( "ngram_completion", tokenizer="whitespace", filter=["lowercase", "asciifolding", ngram_filter] ) whitespace_analyzer = analysis.analyzer( "whitespace_analyzer", tokenizer="whitespace", filter=["lowercase", "asciifolding"] ) lowercase_normalizer = analysis.normalizer("lowercase_normalizer", filter=["lowercase"]) email_analyzer = analysis.analyzer( "email_analyzer", type="custom", tokenizer=analysis.tokenizer(
from django_elasticsearch_dsl import DocType, Index, fields from elasticsearch_dsl.analysis import analyzer, token_filter from data_refinery_common.utils import get_supported_microarray_platforms, get_supported_rnaseq_platforms from .models import Sample, Experiment, Organism experiment_index = Index('experiments') experiment_index.settings(number_of_shards=1, number_of_replicas=0, max_result_window=9999999) # via https://django-elasticsearch-dsl-drf.readthedocs.io/en/0.17.2/advanced_usage_examples.html?highlight=ngram#id8 # via https://github.com/barseghyanartur/django-elasticsearch-dsl-drf/issues/110 edge_ngram_completion_filter = token_filter('edge_ngram_completion_filter', type="edge_ngram", min_gram=3, max_gram=12) html_strip = analyzer('html_strip', tokenizer="whitespace", filter=[ edge_ngram_completion_filter, "standard", "lowercase", "stop", "snowball" ], char_filter=["html_strip"]) html_strip_no_ngram = analyzer('html_strip_no_ngram', tokenizer="standard", filter=["standard", "lowercase", "stop"], char_filter=["html_strip"]) html_strip_no_stop = analyzer('html_strip_no_stop', tokenizer="whitespace", filter=["standard", "lowercase"],
from elasticsearch_dsl import analyzer, analysis from django_elasticsearch_dsl import Document, fields, Index from movie.models import MovieModel movie_index = Index('movies') # Create the token filters from the documentation russian_stop_filter = analysis.token_filter('russian_stop', type='stop', stopwords='_russian_') russian_stemmer_filter = analysis.token_filter('russian_stemmer', type='stemmer', language='russian') english_stop_filter = analysis.token_filter('english_stop', type='stop', stopwords='_english_') english_stemmer_filter = analysis.token_filter('english_stemmer', type='stemmer', language='english') english_possessive_stemmer_filter = analysis.token_filter( 'english_possessive_stemmer', type='stemmer', language='possessive_english') # Create the analyzers ru_analyzer = analyzer( 'ru_analyzer', type='custom', tokenizer='standard', filter=['lowercase', russian_stop_filter, russian_stemmer_filter], ) en_analyzer = analyzer('en_analyzer',
import elasticsearch_dsl as es from elasticsearch_dsl import analysis, tokenizer #################################### # Filters # #################################### # Replaces the number street shortening with the actual word synonym_filter = analysis.token_filter( 'synonyms', type='synonym', synonyms=[ '1e, eerste => 1e, eerste', '2e, tweede => 2e, tweede', '3e, derde => 3e, derde', '4e, vierde => 4e, vierde', ] ) strip_zero = analysis.CustomCharFilter( "strip_zero", builtin_type="pattern_replace", pattern="^0+(.*)", replacement="$1" ) # Change dash and dot to space naam_stripper = analysis.char_filter(
def _keywords_filter(keywords): return token_filter( "keywords_autophrase_syn", type="synonym", synonyms=_autophrased_synonyms(keywords), )
'kadastraal_subject': 25, 'adres': 50, 'kadastraal_object': 100 } #################################### # Filters # #################################### # Replaces the number street shortening with the actual word synonym_filter = analysis.token_filter( 'synonyms', type='synonym', synonyms=[ '1e, eerste => 1e, eerste', '2e, tweede => 2e, tweede', '3e, derde => 3e, derde', '4e, vierde => 4e, vierde', ] ) huisnummer_expand = analysis.token_filter( 'huisnummer_expand', type='word_delimiter', generate_number_parts=True, preserve_original=True ) # Change dash and dot to space
- Part 1: Define the documents, in which you specify which fields will be indexed. - Part 2: Build the elasticsearch JSON queries. Both of these parts are equally important to have a search API that returns relevant results. """ import logging import typing import elasticsearch_dsl as es from elasticsearch_dsl import analysis log = logging.getLogger(__name__) edge_ngram_filter = analysis.token_filter('edge_ngram_filter', type='edge_ngram', min_gram=1, max_gram=15) autocomplete = es.analyzer( 'autocomplete', tokenizer='standard', filter=['standard', 'asciifolding', 'lowercase', edge_ngram_filter]) class User(es.DocType): """Elastic document describing user.""" objectID = es.Keyword() username = es.Text(fielddata=True, analyzer=autocomplete) username_exact = es.Keyword()
# INFO - G.M - 2019-05-31 - Analyzer/indexing explained: # Instead of relying on wildcards for autocompletion, which is costly and makes some features # (for example ranking) not work correctly, we use an ngram mechanism. # This means that for the word "elephant", we would index things like "ele", "elep", "lepha", etc. # As we don't want *text* matching but only autocomplete matching like text*, we use # edge_ngram, so for "elephant" we only index "ele", "elep", "eleph", etc. # We want "ele" to match "elephant", but we do not want "elephant" to match "ele"; # that's why we set different analyzers for search (we search the word as given) and indexing (we index # ngrams of the content label to allow autocompletion). # INFO - G.M - 2019-05-23 - search analyzer: search for content given a word or some similar word folding = analyzer("folding", tokenizer="standard", filter=["lowercase", "asciifolding"]) # INFO - G.M - 2019-05-23 - index analyzers: index edge ngrams for autocompletion and strip HTML for indexing edge_ngram_token_filter = analysis.token_filter( "edge_ngram_filter", type="edge_ngram", min_gram=2, max_gram=20 ) edge_ngram_folding = analyzer( "edge_ngram_folding", tokenizer="standard", filter=["lowercase", "asciifolding", edge_ngram_token_filter], ) html_folding = analyzer( "html_folding", tokenizer="standard", filter=["lowercase", "asciifolding", edge_ngram_token_filter], char_filter="html_strip", ) class DigestUser(InnerDoc):
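# Minimal sketch of the idea described in the comments above (the document and
# field names are made up): index with the edge-ngram analyzer but search with
# the plain folding analyzer, so "ele" matches "elephant" and not vice versa.
from elasticsearch_dsl import Document, Text

class IndexedContent(Document):
    label = Text(analyzer=edge_ngram_folding, search_analyzer=folding)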