Example 1
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=3
            )]
        )
    )


    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)


    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=4 # changed from 3 to 4
            )]
        )
    )

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )
    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
Example 3
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )
    assert {
       'analyzer': {
           'my_analyzer1': {'filter': ['lowercase', 'my_filter1'],
                            'tokenizer': 'keyword',
                            'type': 'custom'},
           'my_analyzer2': {'filter': ['my_filter2'],
                            'tokenizer': 'trigram',
                            'type': 'custom'}},
       'filter': {
           'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
           'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
       'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()
Example 4
class Analyzer:
    # tokenizes and makes the tokens lowercase
    general_analyzer = analysis.analyzer(
        "general_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase"])

    # provides light stemming for english tokens
    stemming_analyzer = analysis.analyzer(
        "stemming_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase", "kstem"])

    # uses grammar-based tokenization before analysis (e.g. "it's fine" -> ["it's", "fine"])
    english_analyzer = analysis.analyzer(
        "english_analyzer",
        tokenizer=tokenizer("standard_tokenizer", type="standard"),
        filter=[
            Filter.english_possessive_stemmer, "lowercase",
            Filter.english_stop, Filter.english_stemmer
        ])

    # tokenizes into words and numbers, removing all other characters before analysis
    # (e.g. "it's fine" -> ["it", "s", "fine"] or "hello_world" -> ["hello", "world"])
    alphanum_analyzer = analysis.analyzer(
        "alphanum_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=[
            Filter.english_possessive_stemmer, "lowercase",
            Filter.english_stop, Filter.english_stemmer
        ])
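
The class above refers to `Tokenizer.alphanum_tokenizer` and several `Filter.*` attributes that are not part of this excerpt. A minimal sketch of what those companion classes might look like, assuming a pattern tokenizer and the usual English stemming/stop filters from the Elasticsearch reference (all names and parameters here are assumptions, not the original project's code):

from elasticsearch_dsl import analysis


class Tokenizer:
    # assumption: split on any run of non-alphanumeric characters
    alphanum_tokenizer = analysis.tokenizer(
        "alphanum_tokenizer", type="pattern", pattern="[^a-zA-Z0-9]+")


class Filter:
    # assumptions: the standard English possessive stemmer, stop-word and stemmer token filters
    english_possessive_stemmer = analysis.token_filter(
        "english_possessive_stemmer", type="stemmer", language="possessive_english")
    english_stop = analysis.token_filter(
        "english_stop", type="stop", stopwords="_english_")
    english_stemmer = analysis.token_filter(
        "english_stemmer", type="stemmer", language="english")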
Example 5
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Example 6
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name',
            'string',
            analyzer=analysis.analyzer("my_analyzer",
                                       tokenizer="standard",
                                       filter=[
                                           token_filter("simple_edge",
                                                        type="edgeNGram",
                                                        min_gram=2,
                                                        max_gram=3)
                                       ]))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field(
        'name',
        'string',
        analyzer=analysis.analyzer(
            "my_analyzer",
            tokenizer="standard",
            filter=[
                token_filter(
                    "simple_edge",
                    type="edgeNGram",
                    min_gram=2,
                    max_gram=4  # changed from 3 to 4
                )
            ]))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Example 8
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example 9
def get_analyzer(lang_analyzer,
                 delete_old_index,
                 user_dictionary_file='',
                 synonyms=None):
    """
    Return the analyzer for a specific language.

    If Japanese (``lang_analyzer == ja``) and the index does not need to be recreated (no deletion required and
    no new synonyms), only the name of the existing analyzer is returned.

    :param lang_analyzer: ``str`` which analyzer to get, e.g. 'standard', 'kuromoji', 'english'
    :param delete_old_index: (only Japanese) ``bool`` whether the old index will be deleted and rebuilt; if ``False``
        and ``synonyms`` is empty, the previous analyzer (with its synonyms) is kept
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of
        東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used in the form of ['京産大, 京都産業大学','a, b']
        if list is empty and index is not deleted, keep previous analyzer with synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            analyzer_lang = '{0}_custom'.format(
                lang_analyzer)  # Use existing analyzer with existing synonyms
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter(
                        'synonym', type='synonym',
                        synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang
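
A hedged usage sketch for the helper above; the 'kuromoji' language code, the synonym value, and the field name are assumptions rather than values from the original project:

# Hypothetical call: build a Japanese analyzer with one synonym group and attach it to a text field.
analyzer_ja = get_analyzer('kuromoji', delete_old_index=True, synonyms=['京産大, 京都産業大学'])

m = mapping.Mapping()
m.field('body', 'text', analyzer=analyzer_ja)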
Example 10
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    m = mapping.Mapping('other-type')
    m.field('title', 'string').field('categories', 'string', index='not_analyzed')

    m.save('test-mapping', using=write_client)


    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'tags': {'index': 'not_analyzed', 'type': 'string'}
                    }
                },
                'other-type': {
                    'properties': {
                        'title': {'type': 'string'},
                        'categories': {'index': 'not_analyzed', 'type': 'string'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': new_analysis}})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Example 12
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(
        write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)
    write_client.indices.create(
        index='test-mapping',
        body={'settings': {
            'analysis': analyzer.get_analysis_definition()
        }})

    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {
                            'type': 'string',
                            'analyzer': 'my_analyzer'
                        },
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Example 13
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()
def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()
Example 18
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(
        write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping',
                                body={'settings': {
                                    'analysis': new_analysis
                                }})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                    'title': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()
Example 20
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer("my_analyzer", tokenizer="keyword")
    m.field("name", "text", analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis["analyzer"]["other_analyzer"] = {
        "type": "custom",
        "tokenizer": "whitespace",
    }
    write_client.indices.create(
        index="test-mapping", body={"settings": {"analysis": new_analysis}}
    )

    m.field("title", "text", analyzer=analyzer)
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "title": {"type": "text", "analyzer": "my_analyzer"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
Example 21
class Movie(Document):
    title = Text(fields={'raw': {'type': 'keyword'}})
    film_rating = Text()
    duration = Text()
    genre = Keyword(multi=True)
    release_date = Text()
    release_date_unix_time = Float()
    imdb_ratingValue = Float()
    imdb_bestRating = Float()
    imdb_ratingCount = Float()
    description = Text()
    storyline = Text()
    poster = Text()
    trailer_img = Text()
    director = Keyword(multi=True)
    creator = Keyword(multi=True)
    writer = Keyword(multi=True)
    stars = Keyword(multi=True)
    taglines = Keyword(multi=True)
    url = Keyword()
    req_headers = Object(enabled=False)
    res_headers = Object(enabled=False)

    suggest = Completion(analyzer=ngram_analyzer,
                         search_analyzer=analyzer('standard'))

    class Index:
        name = 'imdb'
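
The `ngram_analyzer` used by the `suggest` field above is not defined in this excerpt. A plausible, assumed definition along the lines of the edge-ngram analyzer shown in Example 40:

from elasticsearch_dsl import analyzer, token_filter

# assumption: an edge-ngram analyzer so that partial titles match the completion field
ngram_analyzer = analyzer(
    'ngram_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('ngram_filter', type='edgeNGram', min_gram=1, max_gram=20),
    ],
)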
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter('my_multi',
                                                    'multiplexer',
                                                    filters=[[
                                                        analysis.token_filter(
                                                            'en',
                                                            'snowball',
                                                            language='English')
                                                    ], 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "type": "snowball",
                "language": "English"
            },
            "my_multi": {
                "filters": ["en", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
Example 23
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field('name',
            'text',
            analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
def test_simple_multiplexer_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter(
                                  'my_multi',
                                  'multiplexer',
                                  filters=['lowercase', 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()
Example 26
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer('whitespace', type='pattern', pattern=r'\\s+')
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title", "string", analyzer=a1, fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example 28
    def suggest_search(self, query):

        response = AdDocument.search().query(
            Q('match',
              title={
                  'query': query,
                  'analyzer': analyzer('simple'),
                  'fuzziness': 1
              })).execute()

        data = [{
            'text': hit['_source']['title']
        } for hit in response.hits.hits]

        return data
Example 29
class Movie(DocType):
    title = Text(fields={'raw': {'type': 'keyword'}})
    summary = Text()
    datePublished = Date()
    creators = Keyword(multi=True)
    genres = Keyword(multi=True)
    casts = Keyword(multi=True)
    time = Integer()
    countries = Keyword(multi=True)
    plot_keywords = Keyword(multi=True)
    languages = Keyword(multi=True)
    rating = Float()
    poster = Keyword()
    suggest = Completion(analyzer=ngram_analyzer,
                         search_analyzer=analyzer('standard'))

    class Meta:
        index = 'imdb'
Example 30
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    m.field("tags", "keyword")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "tags": {"type": "keyword"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'keyword')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'tags': {'type': 'keyword'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Example 32
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': analyzer.get_analysis_definition()}})

    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Example 33
    def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
        """
        Add the custom mail fields to the passed index mapping.

        :param mapping: ``Mapping`` Elasticsearch DSL mapping to add the fields to
        :param analyzer_lang: ``analyzer`` or ``str`` naming the analyzer to be used for language-specific fields
        :param analyzer_case_insensitive_sort: ``analyzer`` to be used for case-insensitive sorting
        :return: None (the mapping is modified in place!)
        """
        # Specific fields email
        analyzer_email = analysis.analyzer('email', tokenizer=analysis.tokenizer('uax_url_email'),
                                           filter=['lowercase', 'unique'])
        mapping.field('fromName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('fromEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('subject', 'text', analyzer=analyzer_lang)
        mapping.field('date', 'date')
        mapping.field('body', 'text', analyzer=analyzer_lang)
        mapping.field('spam', 'boolean')
        mapping.field('hasAttachmet', 'boolean')
        mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)
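
A possible call site for add_mapping_fields (the `indexer` instance, `client`, the analyzers, and the index name are assumptions; `get_analyzer` is the helper shown in Example 9):

m = mapping.Mapping()
indexer.add_mapping_fields(
    m,
    analyzer_lang=get_analyzer('english', delete_old_index=True),
    analyzer_case_insensitive_sort=analysis.analyzer(
        'case_insensitive_sort', tokenizer='keyword', filter=['lowercase']),
)
m.save('mails', using=client)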
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()
Example 35
    def get_settings(self):
        shingle_filter = analysis.token_filter(
            'filter_shingle',
            'shingle',
            max_shingle_size=5,
            min_shingle_size=2,
            output_unigrams=True)

        shingle_analyzer = analysis.analyzer(
            'analyzer_shingle',
            tokenizer='standard',
            filter=['standard', 'lowercase', shingle_filter])

        return {
            'settings': {
                'index': {
                    'analysis': shingle_analyzer.get_analysis_definition()
                }
            }
        }
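
A hedged usage sketch passing the returned settings to index creation (the `builder` instance, `client`, and index name are assumptions):

settings = builder.get_settings()  # `builder` is an instance of the class defining get_settings above
client.indices.create(index='articles', body=settings)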
Example 36
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    write_client.indices.create(index="test-mapping")

    with raises(exceptions.IllegalOperation):
        m.save("test-mapping", using=write_client)

    write_client.cluster.health(index="test-mapping", wait_for_status="yellow")
    write_client.indices.close(index="test-mapping")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {"name": {"type": "text", "analyzer": "my_analyzer"}}
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter('en',
                                                    'stemmer',
                                                    language='english'),
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ])
                          ])

    with raises(ValueError):
        a.get_analysis_definition()
Example 38
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)


    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping("some_type")
    m.field("title", "string", analyzer=a1)

    assert {"analyzer": {"whitespace": {"type": "pattern", "pattern": r"\\s+"}}} == m._collect_analysis()
Example 40
from elasticsearch_dsl.analysis import analyzer, token_filter

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter(
            'edge_ngram_filter',
            type='edgeNGram',
            min_gram=2,
            max_gram=20
        )
    ]
)
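
A brief usage sketch attaching the analyzer above to a document field; the `Product` class and index name are hypothetical. Pairing an edge-ngram index analyzer with a plain 'standard' search analyzer is common, so that query terms are not n-grammed a second time:

from elasticsearch_dsl import Document, Text


class Product(Document):
    name = Text(analyzer=edge_ngram_analyzer, search_analyzer='standard')

    class Index:
        name = 'products'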
Example 41
def test_analyzer_serializes_as_name():
    a = analysis.analyzer('my_analyzer')

    assert 'my_analyzer' == a.to_dict()
Example 42
class PathHierarchyTokenizer(analysis.Tokenizer):
    name = 'path_hierarchy'


class WhitespaceTokenizer(analysis.Tokenizer):
    name = 'whitespace'


path_analyzer = analysis.CustomAnalyzer('path',
                                        tokenizer='path_hierarchy',
                                        filter=['lowercase'])


lower_whitespace_analyzer = analysis.analyzer('lower_whitespace',
                                              tokenizer='whitespace',
                                              filter=['lowercase', 'stop'],
                                              char_filter=['html_strip'])


class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.
    It stores a bit of metadata so we don't have to hit the db
    when rendering search results.

    The search view uses the 'lang' and 'version' fields of the
    document's release to filter the search results, depending on
    which was found in the URL.

    The breadcrumbs are shown under the search result title.
    """