Example #1
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Example #2
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
Example #3
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()
Example #4
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )
    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
Example #5
def test_multiplexer_with_custom_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter('my_multi',
                                                    'multiplexer',
                                                    filters=[[
                                                        analysis.token_filter(
                                                            'en',
                                                            'snowball',
                                                            language='English')
                                                    ], 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "type": "snowball",
                "language": "English"
            },
            "my_multi": {
                "filters": ["en", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
Example #6
def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()
Example #7
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )
    assert {
       'analyzer': {
           'my_analyzer1': {'filter': ['lowercase', 'my_filter1'],
                            'tokenizer': 'keyword',
                            'type': 'custom'},
           'my_analyzer2': {'filter': ['my_filter2'],
                            'tokenizer': 'trigram',
                            'type': 'custom'}},
       'filter': {
           'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
           'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
       'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()
Example #8
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example #9
class Analyzers:
    first_name_synonym_analyzer = analyzer(
        'name_search_synonym_analyzer',
        type="custom",
        tokenizer="standard",
        filter=[
            'lowercase',
            analysis.token_filter(
                'name_search_synonym_filter',
                type="synonym",
                expand=True,
                lenient=True,
                synonyms_path="common_first_name_synonyms.txt",
            )
        ])

    @staticmethod
    def analyze_first_name_synonym(text):
        return Analyzers.first_name_synonym_analyzer.simulate(text)

    nysis_phonetic_analyzer = analyzer('name_search_nysiis_analyzer',
                                       type="custom",
                                       tokenizer="standard",
                                       filter=[
                                           'lowercase',
                                           analysis.token_filter(
                                               'name_search_nysiis_filter',
                                               type="phonetic",
                                               encoder="nysiis",
                                               replace="true")
                                       ])

    @staticmethod
    def analyze_nysis_phonetic(text):
        return Analyzers.nysis_phonetic_analyzer.simulate(text)

    beider_morse_phonetic_analyzer = analyzer(
        'name_search_beider_morse_analyzer',
        type="custom",
        tokenizer="standard",
        filter=[
            'lowercase',
            analysis.token_filter('name_search_beider_morse_filter',
                                  type="phonetic",
                                  encoder="beider_morse",
                                  replace="true")
        ])

    @staticmethod
    def analyze_beider_morse_phonetic(text):
        return Analyzers.beider_morse_phonetic_analyzer.simulate(text)
Example #10
def test_stemmer_analyzer_can_pass_name():
    t = analysis.token_filter('my_english_filter', name="minimal_english", type="stemmer")
    assert t.to_dict() == 'my_english_filter'
    assert {
        "type" : "stemmer",
        "name" : "minimal_english"
    } == t.get_definition()
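The split shown above matters once the filter is wired into an analyzer: the filter is registered in the analysis section under its own name ('my_english_filter'), while `name` selects the stemmer variant inside the filter body. A minimal sketch reusing the same objects; the analyzer name is illustrative, and the expected dictionary follows the pattern of the other get_analysis_definition() examples in this collection:

from elasticsearch_dsl import analysis

filter_with_name = analysis.token_filter('my_english_filter', name="minimal_english", type="stemmer")
a = analysis.analyzer(
    'my_english_analyzer',  # illustrative analyzer name
    tokenizer='standard',
    filter=['lowercase', filter_with_name],
)

# The filter body keeps {"type": "stemmer", "name": "minimal_english"}
# under the 'my_english_filter' key of the analysis definition.
assert a.get_analysis_definition() == {
    'analyzer': {
        'my_english_analyzer': {
            'type': 'custom',
            'tokenizer': 'standard',
            'filter': ['lowercase', 'my_english_filter'],
        }
    },
    'filter': {
        'my_english_filter': {'type': 'stemmer', 'name': 'minimal_english'},
    },
}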
Example #11
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
Example #12
def test_simple_multiplexer_filter():
    a = analysis.analyzer('my_analyzer',
                          tokenizer='keyword',
                          filter=[
                              analysis.token_filter(
                                  'my_multi',
                                  'multiplexer',
                                  filters=['lowercase', 'lowercase, stop'])
                          ])

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer"
            }
        }
    } == a.get_analysis_definition()
Example #13
def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )

    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()
Example #14
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()
Example #15
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts',
                                   'pattern_replace',
                                   mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
                          tokenizer=trigram,
                          filter=['lowercase', my_stop],
                          char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
Example #16
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()
Example #17
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter('en',
                                                    'stemmer',
                                                    language='english'),
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ])
                          ])

    with raises(ValueError):
        a.get_analysis_definition()
Example #18
    def autocomplete(self):
        autocomplete_filter = analysis.token_filter(
            "autocomplete_filter",
            "edge_ngram",
            min_gram=1,
            max_gram=20
        )
        return analyzer(
            "autocomplete",
            type="custom",
            tokenizer="standard",
            filter=["lowercase", autocomplete_filter]
        )
Example #19
def name_delimiter_analyzer():
    ''' Analyzer for the fields with composed parts (dash, underscore, ...) '''
    word_delimiter_graph_preserve_original = analysis.token_filter(
        'word_delimiter_graph_preserve_original',
        type="word_delimiter_graph",
        preserve_original=True)
    return analyzer('name_delimiter',
                    tokenizer="keyword",
                    filter=[
                        word_delimiter_graph_preserve_original,
                        "flatten_graph", "lowercase", "stop", "snowball",
                        "remove_duplicates"
                    ])
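A hedged sketch of how this analyzer might be attached to a field and inspected with `simulate()` (the same method used by the `Analyzers` class earlier); the document and field names are illustrative, and `simulate()` requires a configured connection:

from elasticsearch_dsl import Document, Text

name_analyzer = name_delimiter_analyzer()


class Part(Document):  # illustrative document class
    name = Text(analyzer=name_analyzer)


# With a live cluster, simulate() shows how a composed value such as
# "wi-fi_adapter" is split by the word_delimiter_graph filter:
# tokens = name_analyzer.simulate("wi-fi_adapter").tokens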
Example #20
def get_analyzer(lang_analyzer,
                 delete_old_index,
                 user_dictionary_file='',
                 synonyms=None):
    """
    Return analyzer for specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated (no delete required and
    no new synonyms) then return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get e.g. 'standard','kuromoji','english'
    :param delete_old_index: (only Japanese) ``bool`` if list is empty and index is not deleted, keep previous analyzer
        with synonyms
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of
        東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used in the form of ['京産大, 京都産業大学','a, b']
        if list is empty and index is not deleted, keep previous analyzer with synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            analyzer_lang = '{0}_custom'.format(
                lang_analyzer)  # Use existing analyzer with existing synonyms
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter(
                        'synonym', type='synonym',
                        synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang
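A hedged usage sketch for the function above; the synonym list is illustrative, `constants` is assumed to be imported as in the function body, and the commented result assumes the 'ja' entry of constants.SUPPORTED_LANG_CODES_ANALYZERS maps to 'kuromoji':

# Rebuilding the index: a full custom kuromoji analyzer with synonyms is returned.
ja_analyzer = get_analyzer(
    constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'],
    delete_old_index=True,
    synonyms=['京産大, 京都産業大学'],
)

# Keeping the existing index with no new synonyms: only the analyzer name is returned.
existing_name = get_analyzer(
    constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'],
    delete_old_index=False,
)
# existing_name == 'kuromoji_custom'  (assuming the 'ja' entry is 'kuromoji')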
Example #21
def test_conditional_token_filter():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ]), 'stop'
                          ])

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "language": "English",
                "type": "snowball"
            },
            "testing": {
                "script": {
                    "source": "return true"
                },
                "filter": ["lowercase", "en"],
                "type": "condition"
            }
        }
    } == a.get_analysis_definition()
Example #22
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title", "string", analyzer=a1, fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example #23
    def get_settings(self):
        shingle_filter = analysis.token_filter(
            'filter_shingle',
            'shingle',
            max_shingle_size=5,
            min_shingle_size=2,
            output_unigrams=True)

        shingle_analyzer = analysis.analyzer(
            'analyzer_shingle',
            tokenizer='standard',
            filter=['standard', 'lowercase', shingle_filter])

        return {
            'settings': {
                'index': {
                    'analysis': shingle_analyzer.get_analysis_definition()
                }
            }
        }
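A hedged sketch of how the returned settings could be applied when creating an index with the low-level client; the owning class, client instance, and index name are illustrative:

from elasticsearch import Elasticsearch

es = Elasticsearch()
settings_body = SearchIndexConfig().get_settings()  # hypothetical owner of get_settings()

# The body already nests the analysis definition under settings.index.analysis.
es.indices.create(index='articles', body=settings_body)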
Example #24
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting (see the sketch after this example)
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from elasticsearch_dsl.document import DocType
from elasticsearch_dsl.field import Text, Date, Keyword, Integer, String, Completion, Float
from elasticsearch_dsl.analysis import token_filter, analyzer
from elasticsearch_dsl import Index
from elasticsearch import Elasticsearch

es = Elasticsearch()

ngram_filter = token_filter('ngram_filter',
                            type='nGram',
                            min_gram=1,
                            max_gram=20)

ngram_analyzer = analyzer('ngram_analyzer',
                          type='custom',
                          tokenizer='whitespace',
                          filter=['lowercase', 'asciifolding', ngram_filter])


class ImdbspiderPipeline(object):
    def __init__(self):
        movies = Index('imdb', using=es)
        movies.doc_type(Movie)
        movies.delete(ignore=404)
        movies.create()
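The header comment notes that the pipeline has to be registered in Scrapy's ITEM_PIPELINES setting; a minimal sketch of that entry (the project module path is hypothetical):

# settings.py (hypothetical project path)
ITEM_PIPELINES = {
    'imdbspider.pipelines.ImdbspiderPipeline': 300,
}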
Example #25
from elasticsearch_dsl.analysis import analyzer, token_filter

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter(
            'edge_ngram_filter',
            type='edgeNGram',
            min_gram=2,
            max_gram=20
        )
    ]
)
Example #26
def test_stemmer_analyzer_can_pass_name():
    t = analysis.token_filter('my_english_filter',
                              name="minimal_english",
                              type="stemmer")
    assert t.to_dict() == 'my_english_filter'
    assert {"type": "stemmer", "name": "minimal_english"} == t.get_definition()
Example #27
import logging

import elasticsearch_dsl as es
from elasticsearch_dsl import analysis

from django.conf import settings

from datasets.bag.models import Nummeraanduiding
from datasets.hr.models import DataSelectie

log = logging.getLogger(__name__)


edge_ngram_filter = analysis.token_filter(
    'edge_ngram_filter',
    type='edge_ngram',
    min_gram=1,
    max_gram=15
)


autocomplete = es.analyzer(
    'autocomplete',
    tokenizer='standard',
    filter=['lowercase', edge_ngram_filter]
)


class Inschrijving(es.DocType):
    """
    Elastic data of 'vestigingen' or 'mac'
    from handelsregister
Example #28
from elasticsearch_dsl import analysis


orderings = dict(
    openbare_ruimte=10,
    kadastraal_subject=25,
    adres=50,
    kadastraal_object=100,
)


synonym_filter = analysis.token_filter(
    'synonyms',
    type='synonym',
    synonyms=[
        '1e=>eerste',
        '2e=>tweede',
        '3e=>derde',
        '4e=>vierde',
    ]
)


huisnummer_generate = analysis.char_filter(
    'huisnummer_expand',
    type='pattern_replace',
    pattern=r'(\d+)',
    replacement="""
        $1-1 $1- $1-2 $1-3
        $1a $1b $1a-1 $1b-1 $1-a $1-b
        $1b 1-b
        $1c 1-c
Example #29
from elasticsearch_dsl import analyzer, token_filter, tokenizer

from apps.core.models import Article
from apps.user.models import UserProfile


ngram_analyzer = analyzer(
    'ngram_anl',
    type='custom',
    tokenizer=tokenizer(
        'ngram_tkn',
        type='char_group',
        tokenize_on_chars=['\n']
    ),
    filter=[
        token_filter(
            'ngram_tkf',
            type='ngram',
            min_gram=1,
            max_gram=10,
        ),
        'lowercase'
    ]
)


newline_analyzer = analyzer(
    'nl_anl',
    type='custom',
    tokenizer=tokenizer(
        'nl_tkn',
        type='char_group',
        tokenize_on_chars=['\n']
    ),
Example #30
from elasticsearch_dsl import (Boolean, Document, Date, Nested, InnerDoc,
                               Keyword, Text, Float, Integer, analyzer,
                               tokenizer, analysis)

# define custom token filters and analyzers
bigram_token_filter = analysis.token_filter('bigram',
                                            'shingle',
                                            min_shingle_size=2,
                                            max_shingle_size=2,
                                            output_unigrams=False)
trigram_token_filter = analysis.token_filter('trigram',
                                             'shingle',
                                             min_shingle_size=3,
                                             max_shingle_size=3,
                                             output_unigrams=False)
quadragram_token_filter = analysis.token_filter('quadragram',
                                                'shingle',
                                                min_shingle_size=4,
                                                max_shingle_size=4,
                                                output_unigrams=False)
pentagram_token_filter = analysis.token_filter('pentagram',
                                               'shingle',
                                               min_shingle_size=5,
                                               max_shingle_size=5,
                                               output_unigrams=False)
stemmer_token_filter = analysis.token_filter('english_stemmer',
                                             'stemmer',
                                             name="english")

standard = analyzer(
    'standard_analyzer',
Example #31
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram",
                                     "nGram",
                                     min_gram=3,
                                     max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])
        ],
    )
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a1),
            "unknown": Keyword(analyzer=a1, search_analyzer=a2),
        },
    )
    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {
                "stopwords": ["a", "b"],
                "type": "stop"
            },
            "my_filter2": {
                "stopwords": ["c", "d"],
                "type": "stop"
            },
        },
        "tokenizer": {
            "trigram": {
                "max_gram": 3,
                "min_gram": 3,
                "type": "nGram"
            }
        },
    } == m._collect_analysis()
Example #32
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram",
                                     "nGram",
                                     min_gram=3,
                                     max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])
        ],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")
    n1 = analysis.normalizer("my_normalizer1", filter=["lowercase"])
    n2 = analysis.normalizer(
        "my_normalizer2",
        filter=[
            "my_filter1",
            "my_filter2",
            analysis.token_filter("my_filter3", "stop", stopwords=["e", "f"]),
        ],
    )
    n3 = analysis.normalizer("unknown_custom")

    m = mapping.Mapping()
    m.field(
        "title",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a2),
            "unknown": Keyword(search_analyzer=a3)
        },
    )
    m.field("comments", Nested(properties={"author": Text(analyzer=a4)}))
    m.field("normalized_title", "keyword", normalizer=n1)
    m.field("normalized_comment", "keyword", normalizer=n2)
    m.field("unknown", "keyword", normalizer=n3)
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
            "my_analyzer3": {
                "tokenizer": "keyword",
                "type": "custom"
            },
        },
        "normalizer": {
            "my_normalizer1": {
                "filter": ["lowercase"],
                "type": "custom"
            },
            "my_normalizer2": {
                "filter": ["my_filter1", "my_filter2", "my_filter3"],
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {
                "stopwords": ["a", "b"],
                "type": "stop"
            },
            "my_filter2": {
                "stopwords": ["c", "d"],
                "type": "stop"
            },
            "my_filter3": {
                "stopwords": ["e", "f"],
                "type": "stop"
            },
        },
        "tokenizer": {
            "trigram": {
                "max_gram": 3,
                "min_gram": 3,
                "type": "nGram"
            }
        },
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example #33
part_number_analyzer = analysis.analyzer(
    "part_number_analyzer",
    tokenizer=analysis.tokenizer("part_number_path_hierarchy", "path_hierarchy", delimiter="-"),
    filter=["lowercase", "trim"],
)

reference_code_analyzer = analysis.analyzer(
    "reference_code_analyzer", tokenizer="path_hierarchy", filter=["lowercase", "trim"]
)

descriptive_text_analyzer = analysis.analyzer(
    "descriptive_text_analyzer", tokenizer="classic", filter=["lowercase", "trim", "stemmer"]
)

ngram_filter = analysis.token_filter("ngram_filter", type="ngram", min_gram=2, max_gram=20)

ngram_analyzer = analysis.analyzer(
    "ngram_completion", tokenizer="whitespace", filter=["lowercase", "asciifolding", ngram_filter]
)

whitespace_analyzer = analysis.analyzer(
    "whitespace_analyzer", tokenizer="whitespace", filter=["lowercase", "asciifolding"]
)

lowercase_normalizer = analysis.normalizer("lowercase_normalizer", filter=["lowercase"])

email_analyzer = analysis.analyzer(
    "email_analyzer",
    type="custom",
    tokenizer=analysis.tokenizer(
Example #34
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl.analysis import analyzer, token_filter

from data_refinery_common.utils import get_supported_microarray_platforms, get_supported_rnaseq_platforms
from .models import Sample, Experiment, Organism

experiment_index = Index('experiments')
experiment_index.settings(number_of_shards=1,
                          number_of_replicas=0,
                          max_result_window=9999999)

# via https://django-elasticsearch-dsl-drf.readthedocs.io/en/0.17.2/advanced_usage_examples.html?highlight=ngram#id8
# via https://github.com/barseghyanartur/django-elasticsearch-dsl-drf/issues/110
edge_ngram_completion_filter = token_filter('edge_ngram_completion_filter',
                                            type="edge_ngram",
                                            min_gram=3,
                                            max_gram=12)
html_strip = analyzer('html_strip',
                      tokenizer="whitespace",
                      filter=[
                          edge_ngram_completion_filter, "standard",
                          "lowercase", "stop", "snowball"
                      ],
                      char_filter=["html_strip"])
html_strip_no_ngram = analyzer('html_strip_no_ngram',
                               tokenizer="standard",
                               filter=["standard", "lowercase", "stop"],
                               char_filter=["html_strip"])
html_strip_no_stop = analyzer('html_strip_no_stop',
                              tokenizer="whitespace",
                              filter=["standard", "lowercase"],
Example #35
from elasticsearch_dsl import analyzer, analysis
from django_elasticsearch_dsl import Document, fields, Index

from movie.models import MovieModel

movie_index = Index('movies')

# Create the token filters described in the documentation
russian_stop_filter = analysis.token_filter('russian_stop',
                                            type='stop',
                                            stopwords='_russian_')
russian_stemmer_filter = analysis.token_filter('russian_stemmer',
                                               type='stemmer',
                                               language='russian')
english_stop_filter = analysis.token_filter('english_stop',
                                            type='stop',
                                            stopwords='_english_')
english_stemmer_filter = analysis.token_filter('english_stemmer',
                                               type='stemmer',
                                               language='english')
english_possessive_stemmer_filter = analysis.token_filter(
    'english_possessive_stemmer', type='stemmer', language='possessive_english')

# Create the analyzers
ru_analyzer = analyzer(
    'ru_analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', russian_stop_filter, russian_stemmer_filter],
)
en_analyzer = analyzer('en_analyzer',
Example #36
import elasticsearch_dsl as es

from elasticsearch_dsl import analysis, tokenizer


####################################
#            Filters               #
####################################

# Replaces the shortened street ordinal (e.g. '1e') with the actual word
synonym_filter = analysis.token_filter(
    'synonyms',
    type='synonym',
    synonyms=[
        '1e, eerste => 1e, eerste',
        '2e, tweede => 2e, tweede',
        '3e, derde  => 3e, derde',
        '4e, vierde => 4e, vierde',
    ]
)


strip_zero = analysis.CustomCharFilter(
    "strip_zero",
    builtin_type="pattern_replace",
    pattern="^0+(.*)",
    replacement="$1"
)

# Change dash and dot to space
naam_stripper = analysis.char_filter(
Example #37
def _keywords_filter(keywords):
    return token_filter(
        "keywords_autophrase_syn",
        type="synonym",
        synonyms=_autophrased_synonyms(keywords),
    )
Example #38
    'kadastraal_subject': 25,
    'adres': 50,
    'kadastraal_object': 100
}


####################################
#            Filters               #
####################################

# Replaces the shortened street ordinal (e.g. '1e') with the actual word
synonym_filter = analysis.token_filter(
    'synonyms',
    type='synonym',
    synonyms=[
        '1e, eerste => 1e, eerste',
        '2e, tweede => 2e, tweede',
        '3e, derde  => 3e, derde',
        '4e, vierde => 4e, vierde',
    ]
)


huisnummer_expand = analysis.token_filter(
    'huisnummer_expand',
    type='word_delimiter',
    generate_number_parts=True,
    preserve_original=True
)


# Change dash and dot to space
Example #39
- Part 1: Define the documents, in which you declare which fields will be indexed.
- Part 2: Build the Elasticsearch JSON queries.

BOTH of these parts are equally important to have a search API that returns
relevant results.
"""
import logging
import typing

import elasticsearch_dsl as es
from elasticsearch_dsl import analysis

log = logging.getLogger(__name__)

edge_ngram_filter = analysis.token_filter('edge_ngram_filter',
                                          type='edge_ngram',
                                          min_gram=1,
                                          max_gram=15)

autocomplete = es.analyzer(
    'autocomplete',
    tokenizer='standard',
    filter=['standard', 'asciifolding', 'lowercase', edge_ngram_filter])


class User(es.DocType):
    """Elastic document describing user."""

    objectID = es.Keyword()

    username = es.Text(fielddata=True, analyzer=autocomplete)
    username_exact = es.Keyword()
Example #40
# INFO - G.M - 2019-05-31 - Analyzer/indexing explained:
# Instead of relying on wildcards for autocompletion, which is costly and breaks some features
# (ranking, for example), we use an n-gram mechanism.
# This means that for the word "elephant" we would index things like "ele", "elep", "lepha", etc.
# Since we do not want full *text* matching but only autocomplete matching like text*, we use
# edge_ngram, so for "elephant" we only index "ele", "elep", "eleph", etc.
# We want "ele" to match the "elephant" result, but we do not want "elephant" to match the "ele"
# result; that is why we set different analyzers for search (we search the word as given) and for
# indexing (we index n-grams of the content label to allow autocompletion).
# See the field sketch after the analyzer definitions below.

# INFO - G.M - 2019-05-23 - search analyzer: searches for content given a word or a similar one
folding = analyzer("folding", tokenizer="standard", filter=["lowercase", "asciifolding"])
# INFO - G.M - 2019-05-23 - index analyzers: index edge n-grams for autocompletion and strip HTML when indexing
edge_ngram_token_filter = analysis.token_filter(
    "edge_ngram_filter", type="edge_ngram", min_ngram=2, max_gram=20
)
edge_ngram_folding = analyzer(
    "edge_ngram_folding",
    tokenizer="standard",
    filter=["lowercase", "asciifolding", edge_ngram_token_filter],
)
html_folding = analyzer(
    "html_folding",
    tokenizer="standard",
    filter=["lowercase", "asciifolding", edge_ngram_token_filter],
    char_filter="html_strip",
)
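As noted in the comments above, the edge n-gram analyzers are meant for indexing while plain `folding` is used at search time; a hedged sketch of a field wiring the two together (document and field names are illustrative):

from elasticsearch_dsl import Document, Text


class Content(Document):  # illustrative document class
    # Index edge n-grams of the label for autocompletion, but analyze queries
    # with the plain folding analyzer so that "ele" matches "elephant" while
    # "elephant" does not match "ele".
    label = Text(analyzer=edge_ngram_folding, search_analyzer=folding)
    raw_content = Text(analyzer=html_folding, search_analyzer=folding)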


class DigestUser(InnerDoc):