def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(index_analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
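
# A minimal sketch (not part of the original test; assumes a reachable cluster
# and the same legacy doc-type API used above): the analysis section returned by
# _collect_analysis() is what ends up under the index settings once the
# analyzers are registered on an Index object.
def _example_register_analysis_on_index(a1, a4, m):
    from elasticsearch_dsl import Index

    idx = Index('articles')
    idx.analyzer(a1)  # pulls in my_analyzer1 plus its custom stop filter
    idx.analyzer(a4)  # pulls in my_analyzer2, the trigram tokenizer, my_filter2
    idx.mapping(m)    # attaches the 'article' mapping built above
    return idx        # idx.create() would PUT these settings to Elasticsearch
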
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )
    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )
    assert {
       'analyzer': {
           'my_analyzer1': {'filter': ['lowercase', 'my_filter1'],
                            'tokenizer': 'keyword',
                            'type': 'custom'},
           'my_analyzer2': {'filter': ['my_filter2'],
                            'tokenizer': 'trigram',
                            'type': 'custom'}},
       'filter': {
           'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
           'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'}},
       'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}}
    } == m._collect_analysis()
def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2', 
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': String(analyzer=a4)
    }))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'}},
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts',
                                   'pattern_replace',
                                   mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
                          tokenizer=trigram,
                          filter=['lowercase', my_stop],
                          char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
def test_tokenizer():
    t = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)

    assert t.to_dict() == "trigram"
    assert {
        "type": "nGram",
        "min_gram": 3,
        "max_gram": 3
    } == t.get_definition()
def test_tokenizer():
    t = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)

    assert t.to_dict() == 'trigram'
    assert {
        'type': 'nGram',
        'min_gram': 3,
        'max_gram': 3
    } == t.get_definition()
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
def get_analyzer(lang_analyzer,
                 delete_old_index,
                 user_dictionary_file='',
                 synonyms=None):
    """
    Return analyzer for specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated (no delete required and
    no new synonyms) then return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get e.g. 'standard','kuromoji','english'
    :param delete_old_index: (only Japanese) ``bool`` whether the index will be recreated; if it is not and no new
        synonyms are given, the previous analyzer (with its synonyms) is kept
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of
        東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used in the form of ['京産大, 京都産業大学','a, b']
        if list is empty and index is not deleted, keep previous analyzer with synonyms
    :return: ``analyzer`` or ``str`` of analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if new synonyms list is empty. (Only if index is not re-built)
        if not delete_old_index and len(synonyms) == 0:
            analyzer_lang = '{0}_custom'.format(
                lang_analyzer)  # Use existing analyzer with existing synonyms
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter(
                        'synonym', type='synonym',
                        synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang
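
# A minimal usage sketch (values are hypothetical; it assumes that
# constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'] resolves to 'kuromoji', as the
# docstring above suggests): rebuild the Japanese analyzer with fresh synonyms,
# or fall back to a built-in analyzer name for any other language.
def _example_get_analyzers():
    ja_analyzer = get_analyzer('kuromoji', delete_old_index=True,
                               user_dictionary_file='userdict_ja.txt',
                               synonyms=['京産大, 京都産業大学'])
    en_analyzer = get_analyzer('english', delete_old_index=False)
    return ja_analyzer, en_analyzer
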
    def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
        """
        Add custom fields for Mails to the passed Index-mapping.

        :param mapping: ``Mapping`` elasticsearch-dsl mapping to add the fields to
        :param analyzer_lang: ``analyzer`` or ``str`` of the analyzer to be used for language-specific fields
        :param analyzer_case_insensitive_sort: ``analyzer`` to be used for case-insensitive sorting
        :return: None (the mapping is modified in place!)
        """
        # Specific fields email
        analyzer_email = analysis.analyzer('email', tokenizer=analysis.tokenizer('uax_url_email'),
                                           filter=['lowercase', 'unique'])
        mapping.field('fromName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('fromEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('toEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToName', 'text', analyzer=analyzer_lang,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('replyToEmail', 'text', analyzer=analyzer_email,
                      fields={
                          'keyword': 'keyword',
                      })
        mapping.field('subject', 'text', analyzer=analyzer_lang)
        mapping.field('date', 'date')
        mapping.field('body', 'text', analyzer=analyzer_lang)
        mapping.field('spam', 'boolean')
        mapping.field('hasAttachment', 'boolean')
        mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)
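
        # A minimal usage sketch (`indexer` is a hypothetical instance of this
        # class; the mapping is modified in place, nothing is returned):
        #
        #   m = Mapping('mail')
        #   lang = analysis.analyzer('standard')
        #   sort = analysis.analyzer('case_insensitive_sort',
        #                            tokenizer=analysis.tokenizer('keyword'),
        #                            filter=['lowercase'])
        #   indexer.add_mapping_fields(m, lang, sort)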
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter('en',
                                                    'stemmer',
                                                    language='english'),
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ])
                          ])

    with raises(ValueError):
        a.get_analysis_definition()
def test_conditional_token_filter():
    a = analysis.analyzer('my_cond',
                          tokenizer=analysis.tokenizer('keyword'),
                          filter=[
                              analysis.token_filter(
                                  'testing',
                                  'condition',
                                  script={'source': 'return true'},
                                  filter=[
                                      'lowercase',
                                      analysis.token_filter('en',
                                                            'snowball',
                                                            language='English')
                                  ]), 'stop'
                          ])

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {
                "language": "English",
                "type": "snowball"
            },
            "testing": {
                "script": {
                    "source": "return true"
                },
                "filter": ["lowercase", "en"],
                "type": "condition"
            }
        }
    } == a.get_analysis_definition()
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title", "string", analyzer=a1, fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
    def add_mapping_to_index(self, lang_code, lang_analyzer, delete_old_index=False, kuromoji_synonyms=None):
        """
        Add or update the mail/IRC mapping in the Elasticsearch index, create/update the required analyzers and add the fields.

        :param lang_code: ``str`` Language of index e.g. 'ja'
        :param lang_analyzer: ``str`` Name of analyzer for language e.g. 'kuromoji', 'standard' etc.
        :param delete_old_index: ``bool`` Delete index if existing? Default: False = Update existing index (Close, Update, Open)
        :param kuromoji_synonyms: ``list`` Synonyms for the kuromoji Japanese analyzer.
            The old synonyms are kept if this list is empty and the index is not deleted
        :return: None
        """
        if kuromoji_synonyms is None:
            kuromoji_synonyms = []
        analyzer_lang = helpers.get_analyzer(lang_analyzer, delete_old_index=delete_old_index,
                                             user_dictionary_file=self._user_dictionary_file,
                                             synonyms=kuromoji_synonyms)
        analyzer_case_insensitive_sort = analysis.analyzer('case_insensitive_sort',
                                                           tokenizer=analysis.tokenizer('keyword'),
                                                           filter=['lowercase'])
        mapping = Mapping(self._type_name)
        reopen_index = False
        index_name = self._index_prefix.format(lang_code)
        if self._es.indices.exists(index=index_name):
            if delete_old_index:
                self._es.indices.delete(index=index_name, ignore=[400, 404])
            else:
                self._es.indices.close(index=index_name)
                reopen_index = True
                mapping = Mapping.from_es(index_name, self._type_name, using=self._es)  # Get existing index from server

        self.add_mapping_fields(mapping, analyzer_lang, analyzer_case_insensitive_sort)

        mapping.save(index_name, using=self._es)  # Insert or update

        if reopen_index:
            self._es.indices.open(index=index_name)
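
        # A minimal usage sketch (`indexer` is a hypothetical instance of this
        # class, connected to a cluster): create or update the Japanese index
        # with custom kuromoji synonyms.
        #
        #   indexer.add_mapping_to_index('ja', 'kuromoji', delete_old_index=True,
        #                                kuromoji_synonyms=['京産大, 京都産業大学'])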
from django.conf import settings as config
from shop.search.documents import ProductDocument
from elasticsearch_dsl.analysis import analyzer, token_filter, tokenizer


settings = {
    'number_of_shards': 1,
    'number_of_replicas': 0,
}

for language, _ in config.LANGUAGES:
    analyzer_name = language + '_' + _ + '_analyzer'
    language_analizers = {
        language: analyzer('german_analyzer',
            type='custom',
            tokenizer=tokenizer('trigram', 'ngram', min_gram=3, max_gram=3),
            filter=[
                'lowercase',
                token_filter('asciifolding', type='asciifolding', preserve_original=False),
                token_filter('german_stop', type='stop', language='german'),
                token_filter('german_stemmer', type='snowball', language='german'),
            ],
            char_filter=['html_strip'],
        )
    }
    
    ProductDocument(language=language, settings=settings, language_analizers=language_analizers)

ELASTICSEARCH_INDEX_SETTINGS = {
    'max_result_window': settings.MAX_RESULT_WINDOW,
    'number_of_shards': 1,
    'number_of_replicas': 0
}

insitu_products.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_requirements.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_data.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_dataproviders.settings(**ELASTICSEARCH_INDEX_SETTINGS)

if not getattr(Search, '_patched', False):
    Search.order_by = Search.sort
    Search._patched = True

case_insensitive_analyzer = analyzer('case_insensitive_analyzer',
                                     tokenizer=tokenizer('trigram', 'nGram'),
                                     filter=['lowercase'])

case_insensitive_normalizer = normalizer(
    type="custom",
    name_or_instance='case_insensitive_normalizer',
    char_filter=[],
    filter="lowercase",
)


@insitu_products.doc_type
class ProductDoc(DocType):
    acronym = fields.KeywordField()
    description = fields.TextField()
    name = fields.TextField(analyzer=case_insensitive_analyzer,
                            preserve_original=True)

adres_split = analysis.char_filter(
    'adres_split',
    type='mapping',
    mappings=[
        "-=>' '",  # strip '-'
        ".=>' '",  # change '.' to separator
    ]
)

boutnummer_ngram = analysis.tokenizer(
    'boutnummer_ngram',
    'edgeNGram',
    min_gram=1,
    max_gram=8,
    token_chars=['letter', 'digit']
)

postcode_ngram = analysis.tokenizer(
    'postcode_ngram',
    'edgeNGram',
    min_gram=2,
    max_gram=4,
    token_chars=['letter', 'digit']
)

naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
from django_elasticsearch_dsl.registries import registry

from tardis.tardis_portal.models import Dataset, Experiment, \
    DataFile, Instrument, ObjectACL

logger = logging.getLogger(__name__)

elasticsearch_index_settings = getattr(settings,
                                       'ELASTICSEARCH_DSL_INDEX_SETTINGS', {
                                           'number_of_shards': 1,
                                           'number_of_replicas': 0
                                       })
elasticsearch_parallel_index_settings = getattr(
    settings, 'ELASTICSEARCH_PARALLEL_INDEX_SETTINGS', {})

trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)

analyzer = analyzer(
    "analyzer",
    tokenizer=trigram,
    filter='lowercase',
)


@registry.register_document
class ExperimentDocument(Document):
    def parallel_bulk(self, actions, **kwargs):
        Document.parallel_bulk(self,
                               actions=actions,
                               **elasticsearch_parallel_index_settings)

logger = getLogger(__name__)


# Normalises values to improve sorting (by keeping e, E, è, ê etc. together)
lowercase_asciifolding_normalizer = analysis.normalizer(
    'lowercase_asciifolding_normalizer',
    filter=('lowercase', 'asciifolding'),
)

# Trigram tokenizer enables us to support partial matching
trigram = analysis.tokenizer(
    'trigram',
    'nGram',
    min_gram=3,
    max_gram=3,
    token_chars=('letter', 'digit'),
)

# Filters out "-" so that t-shirt and tshirt can be matched
special_chars = analysis.char_filter('special_chars', 'mapping', mappings=('-=>',))
trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
    tokenizer=trigram,
    char_filter=special_chars,
    filter=('lowercase',),
)
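
# A minimal usage sketch (field names are hypothetical): the analyzer and the
# normalizer defined above would typically be attached to document fields, e.g.
#
#   name = Text(analyzer=trigram_analyzer)
#   name_sort = Keyword(normalizer=lowercase_asciifolding_normalizer)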

space_remover = analysis.token_filter(
    'space_remover',
""" Define base fields for all documents except ConceptDocument."""

# Use Any for typing because mypy does not recognize models as subtypes of ModelBase
from typing import Any, Dict

from django_elasticsearch_dsl import Document, fields
from elasticsearch_dsl import analyzer
from elasticsearch_dsl.analysis import tokenizer

from ddionrails.studies.models import Study

edge_ngram_completion = analyzer(
    "edge_ngram_completion",
    tokenizer=tokenizer("edge_ngram", "edge_ngram", min_gram=1, max_gram=10),
)
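
# With min_gram=1 and max_gram=10, a value such as "income" is indexed as the
# edge n-grams "i", "in", "inc", ..., "income", which is what gives the `name`
# field below its prefix-style matching behaviour.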


class GenericDocument(Document):
    """Base for search documents."""

    # attributes
    id = fields.TextField()
    name = fields.TextField(analyzer=edge_ngram_completion)
    label = fields.TextField(analyzer="english")
    label_de = fields.TextField(analyzer="german")
    description = fields.TextField(analyzer="english")
    description_de = fields.TextField(analyzer="german")

    # relations as attributes
    study = fields.ObjectField(
        properties={
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram",
                                     "nGram",
                                     min_gram=3,
                                     max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])
        ],
    )
    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a1),
            "unknown": Keyword(analyzer=a1, search_analyzer=a2),
        },
    )
    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {
                "stopwords": ["a", "b"],
                "type": "stop"
            },
            "my_filter2": {
                "stopwords": ["c", "d"],
                "type": "stop"
            },
        },
        "tokenizer": {
            "trigram": {
                "max_gram": 3,
                "min_gram": 3,
                "type": "nGram"
            }
        },
    } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram",
                                     "nGram",
                                     min_gram=3,
                                     max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])
        ],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")
    n1 = analysis.normalizer("my_normalizer1", filter=["lowercase"])
    n2 = analysis.normalizer(
        "my_normalizer2",
        filter=[
            "my_filter1",
            "my_filter2",
            analysis.token_filter("my_filter3", "stop", stopwords=["e", "f"]),
        ],
    )
    n3 = analysis.normalizer("unknown_custom")

    m = mapping.Mapping()
    m.field(
        "title",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a2),
            "unknown": Keyword(search_analyzer=a3)
        },
    )
    m.field("comments", Nested(properties={"author": Text(analyzer=a4)}))
    m.field("normalized_title", "keyword", normalizer=n1)
    m.field("normalized_comment", "keyword", normalizer=n2)
    m.field("unknown", "keyword", normalizer=n3)
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
            "my_analyzer3": {
                "tokenizer": "keyword",
                "type": "custom"
            },
        },
        "normalizer": {
            "my_normalizer1": {
                "filter": ["lowercase"],
                "type": "custom"
            },
            "my_normalizer2": {
                "filter": ["my_filter1", "my_filter2", "my_filter3"],
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {
                "stopwords": ["a", "b"],
                "type": "stop"
            },
            "my_filter2": {
                "stopwords": ["c", "d"],
                "type": "stop"
            },
            "my_filter3": {
                "stopwords": ["e", "f"],
                "type": "stop"
            },
        },
        "tokenizer": {
            "trigram": {
                "max_gram": 3,
                "min_gram": 3,
                "type": "nGram"
            }
        },
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import analysis, InnerDoc
from elasticsearch_dsl.field import Text

from django.db.models import Prefetch

from api.applications import models


address_analyzer = analysis.analyzer(
    "address_analyzer", tokenizer="whitespace", filter=["lowercase", "asciifolding", "trim",],
)

part_number_analyzer = analysis.analyzer(
    "part_number_analyzer",
    tokenizer=analysis.tokenizer("part_number_path_hierarchy", "path_hierarchy", delimiter="-"),
    filter=["lowercase", "trim"],
)

reference_code_analyzer = analysis.analyzer(
    "reference_code_analyzer", tokenizer="path_hierarchy", filter=["lowercase", "trim"]
)

descriptive_text_analyzer = analysis.analyzer(
    "descriptive_text_analyzer", tokenizer="classic", filter=["lowercase", "trim", "stemmer"]
)

ngram_filter = analysis.token_filter("ngram_filter", type="ngram", min_gram=2, max_gram=20)

ngram_analyzer = analysis.analyzer(
    "ngram_completion", tokenizer="whitespace", filter=["lowercase", "asciifolding", ngram_filter]