def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=3)
        ]
    ))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter(
                "simple_edge", type="edgeNGram",
                min_gram=2, max_gram=4  # changed from 3 to 4
            )
        ]
    ))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )

    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text',
        'text',
        analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
        'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()

class Analyzer:
    # tokenizes and makes the tokens lowercase
    general_analyzer = analysis.analyzer(
        "general_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase"])

    # provides light stemming for english tokens
    stemming_analyzer = analysis.analyzer(
        "stemming_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase", "kstem"])

    # uses grammar based tokenization before analysis (e.g. "it's fine" -> ["it's", "fine"])
    english_analyzer = analysis.analyzer(
        "english_analyzer",
        tokenizer=tokenizer("standard_tokenizer", type="standard"),
        filter=[
            Filter.english_possessive_stemmer,
            "lowercase",
            Filter.english_stop,
            Filter.english_stemmer
        ])

    # tokenizes for words and numbers, removing all other characters before analysis
    # (e.g. "it's fine" -> ["it", "s", "fine"] or "hello_world" -> ["hello", "world"])
    alphanum_analyzer = analysis.analyzer(
        "alphanum_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=[
            Filter.english_possessive_stemmer,
            "lowercase",
            Filter.english_stop,
            Filter.english_stemmer
        ])

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=3)
        ]))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field(
        'name', 'string',
        analyzer=analysis.analyzer(
            "my_analyzer",
            tokenizer="standard",
            filter=[
                token_filter(
                    "simple_edge",
                    type="edgeNGram",
                    min_gram=2,
                    max_gram=4  # changed from 3 to 4
                )
            ]))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()

def get_analyzer(lang_analyzer, delete_old_index, user_dictionary_file='', synonyms=None):
    """
    Return analyzer for specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated
    (no delete required and no new synonyms), then return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get, e.g. 'standard', 'kuromoji', 'english'
    :param delete_old_index: (only Japanese) ``bool`` if the synonyms list is empty and the index
        is not deleted, keep the previous analyzer with its synonyms
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in
        the form of 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used, in the form of
        ['京産大, 京都産業大学', 'a, b']; if the list is empty and the index is not deleted,
        keep the previous analyzer with its synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            # Use existing analyzer with existing synonyms
            analyzer_lang = '{0}_custom'.format(lang_analyzer)
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter('synonym', type='synonym', synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang

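# Hypothetical usage sketch (not part of the original source): build a Japanese
# analyzer with fresh synonyms for an index that is about to be recreated.
# 'kuromoji' is assumed to be the value of constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'],
# and the user-dictionary path is made up for illustration.
ja_analyzer = get_analyzer(
    lang_analyzer='kuromoji',
    delete_old_index=True,
    user_dictionary_file='userdict_ja.txt',
    synonyms=['京産大, 京都産業大学'],
)
# With delete_old_index=False and synonyms=[], the same call would instead return
# the plain string 'kuromoji_custom', i.e. the name of the analyzer that is
# already registered on the existing index.
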
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    m = mapping.Mapping('other-type')
    m.field('title', 'string').field('categories', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'tags': {'index': 'not_analyzed', 'type': 'string'}
                    }
                },
                'other-type': {
                    'properties': {
                        'title': {'type': 'string'},
                        'categories': {'index': 'not_analyzed', 'type': 'string'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': new_analysis}})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)

    write_client.indices.create(
        index='test-mapping',
        body={'settings': {
            'analysis': analyzer.get_analysis_definition()
        }})
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {
                            'type': 'string',
                            'analyzer': 'my_analyzer'
                        },
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()

def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()

def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {
        'analysis': new_analysis
    }})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                    'title': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer("my_analyzer", tokenizer="keyword")
    m.field("name", "text", analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis["analyzer"]["other_analyzer"] = {
        "type": "custom",
        "tokenizer": "whitespace",
    }
    write_client.indices.create(
        index="test-mapping", body={"settings": {"analysis": new_analysis}}
    )

    m.field("title", "text", analyzer=analyzer)
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "title": {"type": "text", "analyzer": "my_analyzer"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

class Movie(Document):
    title = Text(fields={'raw': {'type': 'keyword'}})
    film_rating = Text()
    duration = Text()
    genre = Keyword(multi=True)
    release_date = Text()
    release_date_unix_time = Float()
    imdb_ratingValue = Float()
    imdb_bestRating = Float()
    imdb_ratingCount = Float()
    description = Text()
    storyline = Text()
    poster = Text()
    trailer_img = Text()
    director = Keyword(multi=True)
    creator = Keyword(multi=True)
    writer = Keyword(multi=True)
    stars = Keyword(multi=True)
    taglines = Keyword(multi=True)
    url = Keyword()
    req_headers = Object(enabled=False)
    res_headers = Object(enabled=False)
    suggest = Completion(analyzer=ngram_analyzer, search_analyzer=analyzer('standard'))

    class Index:
        name = 'imdb'

def test_multiplexer_with_custom_filter():
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[
        analysis.token_filter('my_multi', 'multiplexer', filters=[
            [analysis.token_filter('en', 'snowball', language='English')],
            'lowercase, stop'
        ])
    ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "type": "snowball",
                "language": "English"
            },
            "my_multi": {
                "filters": ["en", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))

    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_simple_multiplexer_filter():
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[
        analysis.token_filter('my_multi', 'multiplexer', filters=['lowercase', 'lowercase, stop'])
    ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer('whitespace', type='pattern', pattern=r'\\s+')
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()

def suggest_search(self, query):
    response = AdDocument.search().query(
        Q('match', title={
            'query': query,
            'analyzer': analyzer('simple'),
            'fuzziness': 1
        })).execute()
    data = [{'text': hit['_source']['title']} for hit in response.hits.hits]
    return data

class Movie(DocType):
    title = Text(fields={'raw': {'type': 'keyword'}})
    summary = Text()
    datePublished = Date()
    creators = Keyword(multi=True)
    genres = Keyword(multi=True)
    casts = Keyword(multi=True)
    time = Integer()
    countries = Keyword(multi=True)
    plot_keywords = Keyword(multi=True)
    languages = Keyword(multi=True)
    rating = Float()
    poster = Keyword()
    suggest = Completion(analyzer=ngram_analyzer, search_analyzer=analyzer('standard'))

    class Meta:
        index = 'imdb'

def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    m.field("tags", "keyword")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "tags": {"type": "keyword"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'keyword')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'tags': {'type': 'keyword'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)

    write_client.indices.create(
        index='test-mapping',
        body={'settings': {'analysis': analyzer.get_analysis_definition()}})
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
    """
    Add custom fields for Mails to the passed Index-mapping.

    :param mapping: ``Mapping`` elasticsearch-dsl mapping to add fields to
    :param analyzer_lang: ``analyzer`` or ``str`` of the analyzer to be used for language-specific fields
    :param analyzer_case_insensitive_sort: ``analyzer`` to be used for case-insensitive sorting
    :return: None (the mapping is modified in place!)
    """
    # Specific fields for email addresses
    analyzer_email = analysis.analyzer(
        'email',
        tokenizer=analysis.tokenizer('uax_url_email'),
        filter=['lowercase', 'unique'])

    mapping.field('fromName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('fromEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('toName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('toEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('replyToName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('replyToEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('subject', 'text', analyzer=analyzer_lang)
    mapping.field('date', 'date')
    mapping.field('body', 'text', analyzer=analyzer_lang)
    mapping.field('spam', 'boolean')
    mapping.field('hasAttachmet', 'boolean')
    mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)

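# Hypothetical usage sketch (not part of the original source): the method
# mutates the mapping it receives, so a typical call looks like the following.
# The `indexer` instance and the analyzers passed in are assumed names for
# illustration only.
m = mapping.Mapping()
lang_analyzer = analysis.analyzer('english')
sort_analyzer = analysis.analyzer('lowercase_sort', tokenizer='keyword', filter=['lowercase'])
indexer.add_mapping_fields(m, lang_analyzer, sort_analyzer)
# `m` now carries fromName, fromEmail, subject, body, ... and can be saved to an index.
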
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()

def get_settings(self):
    shingle_filter = analysis.token_filter(
        'filter_shingle',
        'shingle',
        max_shingle_size=5,
        min_shingle_size=2,
        output_unigrams=True)
    shingle_analyzer = analysis.analyzer(
        'analyzer_shingle',
        tokenizer='standard',
        filter=['standard', 'lowercase', shingle_filter])
    return {
        'settings': {
            'index': {
                'analysis': shingle_analyzer.get_analysis_definition()
            }
        }
    }

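# Hypothetical usage sketch (not part of the original source): the returned dict
# already has the {'settings': {'index': {'analysis': ...}}} shape expected by the
# indices API, so it can be passed straight to index creation. `es` is assumed to
# be an elasticsearch.Elasticsearch client and `obj` an instance exposing
# get_settings() as above; the index name is made up for illustration.
es.indices.create(index='my-index', body=obj.get_settings())
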
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )

    write_client.indices.create(index="test-mapping")

    with raises(exceptions.IllegalOperation):
        m.save("test-mapping", using=write_client)

    write_client.cluster.health(index="test-mapping", wait_for_status="yellow")
    write_client.indices.close(index="test-mapping")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {"name": {"type": "text", "analyzer": "my_analyzer"}}
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond', tokenizer=analysis.tokenizer('keyword'), filter=[
        analysis.token_filter('en', 'stemmer', language='english'),
        analysis.token_filter('testing', 'condition',
            script={'source': 'return true'},
            filter=[
                'lowercase',
                analysis.token_filter('en', 'snowball', language='English')
            ])
    ])

    with raises(ValueError):
        a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))

    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping("some_type")
    m.field("title", "string", analyzer=a1)

    assert {
        "analyzer": {"whitespace": {"type": "pattern", "pattern": r"\\s+"}}
    } == m._collect_analysis()

from elasticsearch_dsl.analysis import analyzer, token_filter

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter(
            'edge_ngram_filter', type='edgeNGram',
            min_gram=2, max_gram=20
        )
    ]
)

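# Hypothetical usage sketch (not part of the original source): attach the analyzer
# above to a text field so edge n-grams of 2 to 20 characters are indexed, while
# queries are analyzed with the plain 'standard' analyzer. The Page document and
# the 'pages' index name are made up for illustration.
from elasticsearch_dsl import Document, Text

class Page(Document):
    title = Text(analyzer=edge_ngram_analyzer, search_analyzer='standard')

    class Index:
        name = 'pages'
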
def test_analyzer_serializes_as_name():
    a = analysis.analyzer('my_analyzer')

    assert 'my_analyzer' == a.to_dict()

class PathHierarchyTokenizer(analysis.Tokenizer):
    name = 'path_hierarchy'


class WhitespaceTokenizer(analysis.Tokenizer):
    name = 'whitespace'


path_analyzer = analysis.CustomAnalyzer('path', tokenizer='path_hierarchy', filter=['lowercase'])

lower_whitespace_analyzer = analysis.analyzer(
    'lower_whitespace',
    tokenizer='whitespace',
    filter=['lowercase', 'stop'],
    char_filter=['html_strip'])


class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.

    It stores a bit of meta data so we don't have to hit the db when rendering
    search results. The search view will be using the 'lang' and 'version' fields
    of the document's release to filter the search results, depending on which
    was found in the URL. The breadcrumbs are shown under the search result title.
    """