def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=3)
        ]
    ))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter(
                "simple_edge", type="edgeNGram",
                min_gram=2, max_gram=4  # changed from 3 to 4
            )
        ]
    ))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )

    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text',
        'text',
        analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
        'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()

class Analyzer:
    # tokenizes and makes the tokens lowercase
    general_analyzer = analysis.analyzer(
        "general_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase"])

    # provides light stemming for english tokens
    stemming_analyzer = analysis.analyzer(
        "stemming_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase", "kstem"])

    # uses grammar based tokenization before analysis (e.g. "it's fine" -> ["it's", "fine"])
    english_analyzer = analysis.analyzer(
        "english_analyzer",
        tokenizer=tokenizer("standard_tokenizer", type="standard"),
        filter=[
            Filter.english_possessive_stemmer,
            "lowercase",
            Filter.english_stop,
            Filter.english_stemmer
        ])

    # tokenizes for words and numbers, removing all other characters before analysis
    # (e.g. "it's fine" -> ["it", "s", "fine"] or "hello_world" -> ["hello", "world"])
    alphanum_analyzer = analysis.analyzer(
        "alphanum_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=[
            Filter.english_possessive_stemmer,
            "lowercase",
            Filter.english_stop,
            Filter.english_stemmer
        ])

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=3)
        ]))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field(
        'name', 'string',
        analyzer=analysis.analyzer(
            "my_analyzer",
            tokenizer="standard",
            filter=[
                token_filter(
                    "simple_edge",
                    type="edgeNGram",
                    min_gram=2,
                    max_gram=4  # changed from 3 to 4
                )
            ]))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()

def get_analyzer(lang_analyzer, delete_old_index, user_dictionary_file='', synonyms=None):
    """
    Return analyzer for specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated
    (no delete required and no new synonyms), then return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get, e.g. 'standard', 'kuromoji', 'english'
    :param delete_old_index: (only Japanese) ``bool`` if the synonyms list is empty and the index
        is not deleted, keep the previous analyzer with its synonyms
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in
        the form of 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used, in the form of
        ['京産大, 京都産業大学', 'a, b']; if the list is empty and the index is not deleted,
        keep the previous analyzer with its synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            # Use existing analyzer with existing synonyms
            analyzer_lang = '{0}_custom'.format(lang_analyzer)
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter('synonym', type='synonym', synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang

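# Hypothetical usage sketch (not part of the original source): build a Japanese
# analyzer with fresh synonyms for an index that is about to be recreated.
# 'kuromoji' is assumed to be the value of constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'],
# and the user-dictionary path is made up for illustration.
ja_analyzer = get_analyzer(
    lang_analyzer='kuromoji',
    delete_old_index=True,
    user_dictionary_file='userdict_ja.txt',
    synonyms=['京産大, 京都産業大学'],
)
# With delete_old_index=False and synonyms=[], the same call would instead return
# the plain string 'kuromoji_custom', i.e. the name of the analyzer that is
# already registered on the existing index.
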
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    m = mapping.Mapping('other-type')
    m.field('title', 'string').field('categories', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'tags': {'index': 'not_analyzed', 'type': 'string'}
                    }
                },
                'other-type': {
                    'properties': {
                        'title': {'type': 'string'},
                        'categories': {'index': 'not_analyzed', 'type': 'string'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': new_analysis}})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)

    write_client.indices.create(
        index='test-mapping',
        body={'settings': {
            'analysis': analyzer.get_analysis_definition()
        }})
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {
                            'type': 'string',
                            'analyzer': 'my_analyzer'
                        },
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()

def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()

def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()

def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {
        'analysis': new_analysis
    }})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                    'title': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer("my_analyzer", tokenizer="keyword")
    m.field("name", "text", analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis["analyzer"]["other_analyzer"] = {
        "type": "custom",
        "tokenizer": "whitespace",
    }
    write_client.indices.create(
        index="test-mapping", body={"settings": {"analysis": new_analysis}}
    )

    m.field("title", "text", analyzer=analyzer)
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "title": {"type": "text", "analyzer": "my_analyzer"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

class Movie(Document):
    title = Text(fields={'raw': {'type': 'keyword'}})
    film_rating = Text()
    duration = Text()
    genre = Keyword(multi=True)
    release_date = Text()
    release_date_unix_time = Float()
    imdb_ratingValue = Float()
    imdb_bestRating = Float()
    imdb_ratingCount = Float()
    description = Text()
    storyline = Text()
    poster = Text()
    trailer_img = Text()
    director = Keyword(multi=True)
    creator = Keyword(multi=True)
    writer = Keyword(multi=True)
    stars = Keyword(multi=True)
    taglines = Keyword(multi=True)
    url = Keyword()
    req_headers = Object(enabled=False)
    res_headers = Object(enabled=False)
    suggest = Completion(analyzer=ngram_analyzer, search_analyzer=analyzer('standard'))

    class Index:
        name = 'imdb'

def test_multiplexer_with_custom_filter():
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[
        analysis.token_filter('my_multi', 'multiplexer', filters=[
            [analysis.token_filter('en', 'snowball', language='English')],
            'lowercase, stop'
        ])
    ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "type": "snowball",
                "language": "English"
            },
            "my_multi": {
                "filters": ["en", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))

    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_simple_multiplexer_filter():
    a = analysis.analyzer('my_analyzer', tokenizer='keyword', filter=[
        analysis.token_filter('my_multi', 'multiplexer', filters=['lowercase', 'lowercase, stop'])
    ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer('whitespace', type='pattern', pattern=r'\\s+')
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()

def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()

def suggest_search(self, query):
    response = AdDocument.search().query(
        Q('match', title={
            'query': query,
            'analyzer': analyzer('simple'),
            'fuzziness': 1
        })).execute()
    data = [{'text': hit['_source']['title']} for hit in response.hits.hits]
    return data

class Movie(DocType):
    title = Text(fields={'raw': {'type': 'keyword'}})
    summary = Text()
    datePublished = Date()
    creators = Keyword(multi=True)
    genres = Keyword(multi=True)
    casts = Keyword(multi=True)
    time = Integer()
    countries = Keyword(multi=True)
    plot_keywords = Keyword(multi=True)
    languages = Keyword(multi=True)
    rating = Float()
    poster = Keyword()
    suggest = Completion(analyzer=ngram_analyzer, search_analyzer=analyzer('standard'))

    class Meta:
        index = 'imdb'

def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    m.field("tags", "keyword")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "tags": {"type": "keyword"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'keyword')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'tags': {'type': 'keyword'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)

    write_client.indices.create(
        index='test-mapping',
        body={'settings': {'analysis': analyzer.get_analysis_definition()}})
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
    """
    Add custom fields for Mails to the passed Index-mapping.

    :param mapping: ``Mapping`` elasticsearch-dsl mapping to add fields to
    :param analyzer_lang: ``analyzer`` or ``str`` of the analyzer to be used for language-specific fields
    :param analyzer_case_insensitive_sort: ``analyzer`` to be used for case-insensitive sorting
    :return: None (the mapping is modified in place!)
    """
    # Specific fields for email addresses
    analyzer_email = analysis.analyzer(
        'email',
        tokenizer=analysis.tokenizer('uax_url_email'),
        filter=['lowercase', 'unique'])

    mapping.field('fromName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('fromEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('toName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('toEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('replyToName', 'text', analyzer=analyzer_lang, fields={
        'keyword': 'keyword',
    })
    mapping.field('replyToEmail', 'text', analyzer=analyzer_email, fields={
        'keyword': 'keyword',
    })
    mapping.field('subject', 'text', analyzer=analyzer_lang)
    mapping.field('date', 'date')
    mapping.field('body', 'text', analyzer=analyzer_lang)
    mapping.field('spam', 'boolean')
    mapping.field('hasAttachmet', 'boolean')
    mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)

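# Hypothetical usage sketch (not part of the original source): the method
# mutates the mapping it receives, so a typical call looks like the following.
# The `indexer` instance and the analyzers passed in are assumed names for
# illustration only.
m = mapping.Mapping()
lang_analyzer = analysis.analyzer('english')
sort_analyzer = analysis.analyzer('lowercase_sort', tokenizer='keyword', filter=['lowercase'])
indexer.add_mapping_fields(m, lang_analyzer, sort_analyzer)
# `m` now carries fromName, fromEmail, subject, body, ... and can be saved to an index.
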
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()

def get_settings(self):
    shingle_filter = analysis.token_filter(
        'filter_shingle',
        'shingle',
        max_shingle_size=5,
        min_shingle_size=2,
        output_unigrams=True)
    shingle_analyzer = analysis.analyzer(
        'analyzer_shingle',
        tokenizer='standard',
        filter=['standard', 'lowercase', shingle_filter])
    return {
        'settings': {
            'index': {
                'analysis': shingle_analyzer.get_analysis_definition()
            }
        }
    }

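# Hypothetical usage sketch (not part of the original source): the returned dict
# already has the {'settings': {'index': {'analysis': ...}}} shape expected by the
# indices API, so it can be passed straight to index creation. `es` is assumed to
# be an elasticsearch.Elasticsearch client and `obj` an instance exposing
# get_settings() as above; the index name is made up for illustration.
es.indices.create(index='my-index', body=obj.get_settings())
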
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )

    write_client.indices.create(index="test-mapping")

    with raises(exceptions.IllegalOperation):
        m.save("test-mapping", using=write_client)

    write_client.cluster.health(index="test-mapping", wait_for_status="yellow")
    write_client.indices.close(index="test-mapping")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {"name": {"type": "text", "analyzer": "my_analyzer"}}
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")

def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond', tokenizer=analysis.tokenizer('keyword'), filter=[
        analysis.token_filter('en', 'stemmer', language='english'),
        analysis.token_filter('testing', 'condition',
            script={'source': 'return true'},
            filter=[
                'lowercase',
                analysis.token_filter('en', 'snowball', language='English')
            ])
    ])

    with raises(ValueError):
        a.get_analysis_definition()

def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))

    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')

def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping("some_type")
    m.field("title", "string", analyzer=a1)

    assert {
        "analyzer": {"whitespace": {"type": "pattern", "pattern": r"\\s+"}}
    } == m._collect_analysis()

from elasticsearch_dsl.analysis import analyzer, token_filter

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter(
            'edge_ngram_filter', type='edgeNGram',
            min_gram=2, max_gram=20
        )
    ]
)

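# Hypothetical usage sketch (not part of the original source): attach the analyzer
# above to a text field so edge n-grams of 2 to 20 characters are indexed, while
# queries are analyzed with the plain 'standard' analyzer. The Page document and
# the 'pages' index name are made up for illustration.
from elasticsearch_dsl import Document, Text

class Page(Document):
    title = Text(analyzer=edge_ngram_analyzer, search_analyzer='standard')

    class Index:
        name = 'pages'
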
def test_analyzer_serializes_as_name():
    a = analysis.analyzer('my_analyzer')

    assert 'my_analyzer' == a.to_dict()

class PathHierarchyTokenizer(analysis.Tokenizer):
    name = 'path_hierarchy'


class WhitespaceTokenizer(analysis.Tokenizer):
    name = 'whitespace'


path_analyzer = analysis.CustomAnalyzer('path', tokenizer='path_hierarchy', filter=['lowercase'])

lower_whitespace_analyzer = analysis.analyzer(
    'lower_whitespace',
    tokenizer='whitespace',
    filter=['lowercase', 'stop'],
    char_filter=['html_strip'])


class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.

    It stores a bit of meta data so we don't have to hit the db when rendering
    search results. The search view will be using the 'lang' and 'version' fields
    of the document's release to filter the search results, depending on which
    was found in the URL. The breadcrumbs are shown under the search result title.
    """