Code Example #1
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=3
            )]
        )
    )

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=4 # changed from 3 to 4
            )]
        )
    )

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
Code Example #2
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )
    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
Code Example #3
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )
    assert {
       'analyzer': {
           'my_analyzer1': {'filter': ['lowercase', 'my_filter1'],
                            'tokenizer': 'keyword',
                            'type': 'custom'},
           'my_analyzer2': {'filter': ['my_filter2'],
                            'tokenizer': 'trigram',
                            'type': 'custom'}},
       'filter': {
           'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
           'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
       'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()
Code Example #4
class Analyzer:
    # tokenizes and makes the tokens lowercase
    general_analyzer = analysis.analyzer(
        "general_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase"])

    # provides light stemming for english tokens
    stemming_analyzer = analysis.analyzer(
        "stemming_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=["lowercase", "kstem"])

    # uses grammar based tokenization before analysis (e.g. "it's fine" -> ["it's", "fine"])
    english_analyzer = analysis.analyzer(
        "english_analyzer",
        tokenizer=tokenizer("standard_tokenizer", type="standard"),
        filter=[
            Filter.english_possessive_stemmer, "lowercase",
            Filter.english_stop, Filter.english_stemmer
        ])

    # tokenizes for words and numbers, removing all other characters before analysis
    # (e.g. "it's fine" -> ["it", "s", "fine"] or "hello_world" -> ["hello", "world"])
    alphanum_analyzer = analysis.analyzer(
        "alphanum_analyzer",
        tokenizer=Tokenizer.alphanum_tokenizer,
        filter=[
            Filter.english_possessive_stemmer, "lowercase",
            Filter.english_stop, Filter.english_stemmer
        ])
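The Tokenizer and Filter classes referenced above live elsewhere in the same project and are not shown. As a hedged illustration only (the names, pattern, and filter settings are assumptions, not the project's actual code), companion definitions along these lines would make the snippet self-contained:

from elasticsearch_dsl import analysis
from elasticsearch_dsl.analysis import tokenizer

class Tokenizer:
    # hypothetical: split on any run of non-alphanumeric characters
    alphanum_tokenizer = tokenizer(
        "alphanum_tokenizer", type="pattern", pattern="[^a-zA-Z0-9]+")

class Filter:
    # hypothetical reconstructions of the standard English filter chain
    english_possessive_stemmer = analysis.token_filter(
        "english_possessive_stemmer", type="stemmer", language="possessive_english")
    english_stop = analysis.token_filter(
        "english_stop", type="stop", stopwords="_english_")
    english_stemmer = analysis.token_filter(
        "english_stemmer", type="stemmer", language="english")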
Code Example #5
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Code Example #6
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name',
            'string',
            analyzer=analysis.analyzer("my_analyzer",
                                       tokenizer="standard",
                                       filter=[
                                           token_filter("simple_edge",
                                                        type="edgeNGram",
                                                        min_gram=2,
                                                        max_gram=3)
                                       ]))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field(
        'name',
        'string',
        analyzer=analysis.analyzer(
            "my_analyzer",
            tokenizer="standard",
            filter=[
                token_filter(
                    "simple_edge",
                    type="edgeNGram",
                    min_gram=2,
                    max_gram=4  # changed from 3 to 4
                )
            ]))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
Code Example #7
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Code Example #8
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Code Example #9
File: helpers.py Project: Luke31/howler
def get_analyzer(lang_analyzer,
                 delete_old_index,
                 user_dictionary_file='',
                 synonyms=None):
    """
    Return analyzer for specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated (no delete required and
    no new synonyms) then return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get e.g. 'standard','kuromoji','english'
    :param delete_old_index: (only Japanese) ``bool`` if list is empty and index is not deleted, keep previous analyzer
        with synonyms
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of
        東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used in the form of ['京産大, 京都産業大学','a, b']
        if list is empty and index is not deleted, keep previous analyzer with synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            analyzer_lang = '{0}_custom'.format(
                lang_analyzer)  # Use existing analyzer with existing synonyms
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter(
                        'synonym', type='synonym',
                        synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang
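A hedged usage sketch, not taken from the original project: the return value (either an analyzer object or a plain analyzer name) can be passed directly as the analyzer of a mapping field; the constant lookup and the synonym string simply reuse what the docstring describes.

ja_analyzer = get_analyzer(constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'],
                           delete_old_index=True,
                           synonyms=['京産大, 京都産業大学'])
m = mapping.Mapping()
m.field('body', 'text', analyzer=ja_analyzer)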
Code Example #10
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'string', index='not_analyzed')
    m.save('test-mapping', using=write_client)

    m = mapping.Mapping('other-type')
    m.field('title', 'string').field('categories', 'string', index='not_analyzed')

    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                        'tags': {'index': 'not_analyzed', 'type': 'string'}
                    }
                },
                'other-type': {
                    'properties': {
                        'title': {'type': 'string'},
                        'categories': {'index': 'not_analyzed', 'type': 'string'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #11
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': new_analysis}})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'title': {'type': 'text', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #12
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(
        write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)
    write_client.indices.create(
        index='test-mapping',
        body={'settings': {
            'analysis': analyzer.get_analysis_definition()
        }})

    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {
                            'type': 'string',
                            'analyzer': 'my_analyzer'
                        },
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #13
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
Code Example #14
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts',
                                   'pattern_replace',
                                   mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
                          tokenizer=trigram,
                          filter=['lowercase', my_stop],
                          char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
Code Example #15
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()
Code Example #16
def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()
Code Example #17
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()
Code Example #18
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(
        write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'text', analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis['analyzer']['other_analyzer'] = {
        'type': 'custom',
        'tokenizer': 'whitespace'
    }
    write_client.indices.create(index='test-mapping',
                                body={'settings': {
                                    'analysis': new_analysis
                                }})

    m.field('title', 'text', analyzer=analyzer)
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                    'title': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #19
def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()
Code Example #20
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping()
    analyzer = analysis.analyzer("my_analyzer", tokenizer="keyword")
    m.field("name", "text", analyzer=analyzer)

    new_analysis = analyzer.get_analysis_definition()
    new_analysis["analyzer"]["other_analyzer"] = {
        "type": "custom",
        "tokenizer": "whitespace",
    }
    write_client.indices.create(
        index="test-mapping", body={"settings": {"analysis": new_analysis}}
    )

    m.field("title", "text", analyzer=analyzer)
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "title": {"type": "text", "analyzer": "my_analyzer"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
Code Example #21
class Movie(Document):
    title = Text(fields={'raw': {'type': 'keyword'}})
    film_rating = Text()
    duration = Text()
    genre = Keyword(multi=True)
    release_date = Text()
    release_date_unix_time = Float()
    imdb_ratingValue = Float()
    imdb_bestRating = Float()
    imdb_ratingCount = Float()
    description = Text()
    storyline = Text()
    poster = Text()
    trailer_img = Text()
    director = Keyword(multi=True)
    creator = Keyword(multi=True)
    writer = Keyword(multi=True)
    stars = Keyword(multi=True)
    taglines = Keyword(multi=True)
    url = Keyword()
    req_headers = Object(enabled=False)
    res_headers = Object(enabled=False)

    suggest = Completion(analyzer=ngram_analyzer,
                         search_analyzer=analyzer('standard'))

    class Index:
        name = 'imdb'
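The ngram_analyzer passed to the Completion field is defined elsewhere in that project and is not shown; purely as a hypothetical illustration (names and gram sizes are assumptions), it could resemble the edge_ngram_analyzer of Code Example #40:

from elasticsearch_dsl.analysis import analyzer, token_filter

# hypothetical reconstruction; the project's actual ngram_analyzer may differ
ngram_analyzer = analyzer(
    'ngram_analyzer',
    tokenizer='standard',
    filter=['lowercase',
            token_filter('ngram_filter', type='edgeNGram', min_gram=2, max_gram=20)],
)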
Code Example #22
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter('my_multi',
                                                    'multiplexer',
                                                    filters=[[
                                                        analysis.token_filter(
                                                            'en',
                                                            'snowball',
                                                            language='English')
                                                    ], 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "type": "snowball",
                "language": "English"
            },
            "my_multi": {
                "filters": ["en", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
Code Example #23
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field('name',
            'text',
            analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'properties': {
                    'name': {
                        'type': 'text',
                        'analyzer': 'my_analyzer'
                    },
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #24
def test_simple_multiplexer_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter(
                                  'my_multi',
                                  'multiplexer',
                                  filters=['lowercase', 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
Code Example #25
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()
Code Example #26
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer('whitespace', type='pattern', pattern=r'\\s+')
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1)

    assert {
        "analyzer": {
            "whitespace": {
                "type": "pattern",
                "pattern": r"\\s+"
            }
        }
    } == m._collect_analysis()
Code Example #27
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title", "string", analyzer=a1, fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Code Example #28
File: search.py Project: farolanf/pytoko
    def suggest_search(self, query):

        response = AdDocument.search().query(
            Q('match',
              title={
                  'query': query,
                  'analyzer': analyzer('simple'),
                  'fuzziness': 1
              })).execute()

        data = [{
            'text': hit['_source']['title']
        } for hit in response.hits.hits]

        return data
Code Example #29
class Movie(DocType):
    title = Text(fields={'raw': {'type': 'keyword'}})
    summary = Text()
    datePublished = Date()
    creators = Keyword(multi=True)
    genres = Keyword(multi=True)
    casts = Keyword(multi=True)
    time = Integer()
    countries = Keyword(multi=True)
    plot_keywords = Keyword(multi=True)
    languages = Keyword(multi=True)
    rating = Float()
    poster = Keyword()
    suggest = Completion(analyzer=ngram_analyzer,
                         search_analyzer=analyzer('standard'))

    class Meta:
        index = 'imdb'
Code Example #30
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    m.field("tags", "keyword")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {
                    "name": {"type": "text", "analyzer": "my_analyzer"},
                    "tags": {"type": "keyword"},
                }
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
Code Example #31
def test_mapping_saved_into_es(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'text', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    m.field('tags', 'keyword')
    m.save('test-mapping', using=write_client)

    assert write_client.indices.exists_type(index='test-mapping', doc_type='test-type')
    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'text', 'analyzer': 'my_analyzer'},
                        'tags': {'type': 'keyword'}
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #32
def test_mapping_saved_into_es_when_index_already_exists_with_analysis(write_client):
    m = mapping.Mapping('test-type')
    analyzer = analysis.analyzer('my_analyzer', tokenizer='keyword')
    m.field('name', 'string', analyzer=analyzer)
    write_client.indices.create(index='test-mapping', body={'settings': {'analysis': analyzer.get_analysis_definition()}})

    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #33
File: index.py Project: Luke31/howler
    def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
        """
        Add custom fields for Mails to the passed Index-mapping.

        :param mapping: ``Mapping`` Elastic-search DSL mapping to add fields to
        :param analyzer_lang: ``analyzer`` or ``str`` of analyzer to be used for language-specific fields
        :param analyzer_case_insensitive_sort: ``analyzer`` of analyzer to be used
        :return: None (Mapping is modified!)
        """
        # Specific fields email
        analyzer_email = analysis.analyzer('email', tokenizer=analysis.tokenizer('uax_url_email'),
                                           filter=['lowercase', 'unique'])
        mapping.field('fromName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('fromEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('subject', 'text', analyzer=analyzer_lang)
        mapping.field('date', 'date')
        mapping.field('body', 'text', analyzer=analyzer_lang)
        mapping.field('spam', 'boolean')
        mapping.field('hasAttachmet', 'boolean')
        mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)
Code Example #34
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()
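The ValueError here is raised because two token filters share the name 'en' but carry different definitions (a stemmer and a snowball filter), so the collected analysis cannot be serialized unambiguously. A minimal sketch of the usual fix, assuming nothing beyond what the test shows, is to give the nested filter its own name:

# giving the nested snowball filter a distinct name avoids the conflict
a = analysis.analyzer(
    'my_cond',
    tokenizer=analysis.tokenizer('keyword'),
    filter=[
        analysis.token_filter('en', 'stemmer', language='english'),
        analysis.token_filter(
            'testing', 'condition',
            script={'source': 'return true'},
            filter=['lowercase',
                    analysis.token_filter('en_snowball', 'snowball', language='English')],
        ),
    ],
)
a.get_analysis_definition()  # collects both filters without raising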
Code Example #35
File: __init__.py Project: Darriall/editorsnotes
    def get_settings(self):
        shingle_filter = analysis.token_filter(
            'filter_shingle',
            'shingle',
            max_shingle_size=5,
            min_shingle_size=2,
            output_unigrams=True)

        shingle_analyzer = analysis.analyzer(
            'analyzer_shingle',
            tokenizer='standard',
            filter=['standard', 'lowercase', shingle_filter])

        return {
            'settings': {
                'index': {
                    'analysis': shingle_analyzer.get_analysis_definition()
                }
            }
        }
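A hedged usage sketch (the client, instance, and index name below are hypothetical): the returned dict already has the shape indices.create expects, so it can be passed straight through as the request body.

from elasticsearch import Elasticsearch

client = Elasticsearch()  # hypothetical connection
# 'doc_index' stands for an instance of the class that defines get_settings() above
client.indices.create(index='editorsnotes', body=doc_index.get_settings())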
Code Example #36
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping()
    m.field(
        "name", "text", analyzer=analysis.analyzer("my_analyzer", tokenizer="keyword")
    )
    write_client.indices.create(index="test-mapping")

    with raises(exceptions.IllegalOperation):
        m.save("test-mapping", using=write_client)

    write_client.cluster.health(index="test-mapping", wait_for_status="yellow")
    write_client.indices.close(index="test-mapping")
    m.save("test-mapping", using=write_client)

    assert {
        "test-mapping": {
            "mappings": {
                "properties": {"name": {"type": "text", "analyzer": "my_analyzer"}}
            }
        }
    } == write_client.indices.get_mapping(index="test-mapping")
Code Example #37
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter('en',
                                                    'stemmer',
                                                    language='english'),
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ])
                          ])

    with raises(ValueError):
        a.get_analysis_definition()
Code Example #38
def test_mapping_saved_into_es_when_index_already_exists_closed(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer('my_analyzer', tokenizer='keyword'))
    write_client.indices.create(index='test-mapping')

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)

    write_client.cluster.health(index='test-mapping', wait_for_status='yellow')
    write_client.indices.close(index='test-mapping')
    m.save('test-mapping', using=write_client)

    assert {
        'test-mapping': {
            'mappings': {
                'test-type': {
                    'properties': {
                        'name': {'type': 'string', 'analyzer': 'my_analyzer'},
                    }
                }
            }
        }
    } == write_client.indices.get_mapping(index='test-mapping')
Code Example #39
def test_even_non_custom_analyzers_can_have_params():
    a1 = analysis.analyzer("whitespace", type="pattern", pattern=r"\\s+")
    m = mapping.Mapping("some_type")
    m.field("title", "string", analyzer=a1)

    assert {"analyzer": {"whitespace": {"type": "pattern", "pattern": r"\\s+"}}} == m._collect_analysis()
Code Example #40
from elasticsearch_dsl.analysis import analyzer, token_filter

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter(
            'edge_ngram_filter',
            type='edgeNGram',
            min_gram=2,
            max_gram=20
        )
    ]
)
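As a follow-up sketch (not part of the original snippet; the Document subclass and index name are hypothetical), an analyzer like this is typically attached to a field at index time and paired with a plain analyzer at search time, so that queries are not themselves split into edge n-grams:

from elasticsearch_dsl import Document, Text

class Post(Document):
    # index with edge n-grams, search with the standard analyzer
    title = Text(analyzer=edge_ngram_analyzer, search_analyzer='standard')

    class Index:
        name = 'posts'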
Code Example #41
def test_analyzer_serializes_as_name():
    a = analysis.analyzer('my_analyzer')

    assert 'my_analyzer' == a.to_dict()
Code Example #42
File: search.py Project: Alissonps/djangoproject.com
class PathHierarchyTokenizer(analysis.Tokenizer):
    name = 'path_hierarchy'


class WhitespaceTokenizer(analysis.Tokenizer):
    name = 'whitespace'


path_analyzer = analysis.CustomAnalyzer('path',
                                        tokenizer='path_hierarchy',
                                        filter=['lowercase'])


lower_whitespace_analyzer = analysis.analyzer('lower_whitespace',
                                              tokenizer='whitespace',
                                              filter=['lowercase', 'stop'],
                                              char_filter=['html_strip'])


class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.
    It stores a bit of meta data so we don't have to hit the db
    when rendering search results.

    The search view will be using the 'lang' and 'version' fields
    of the document's release to filter the search results, depending
    which was found in the URL.

    The breadcrumbs are shown under the search result title.
    """