from elasticsearch_dsl import analysis


def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {
            'trigram': trigram.get_definition()
        },
        'filter': {
            'my_stop': my_stop.get_definition()
        },
        'char_filter': {
            'umlauts': umlauts.get_definition()
        }
    } == a.get_analysis_definition()
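
# Illustrative only (not part of the test above): a minimal sketch of how a custom
# analyzer built this way is typically attached to a field mapping. The Document
# subclass, field and index name here are assumptions.
from elasticsearch_dsl import Document, Text, analyzer, tokenizer

html_trigram = analyzer(
    'html_trigram',
    tokenizer=tokenizer('trigram', 'ngram', min_gram=3, max_gram=3),
    filter=['lowercase'],
    char_filter=['html_strip'],
)


class Article(Document):
    title = Text(analyzer=html_trigram)

    class Index:
        name = 'articles'  # hypothetical index name
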
Example #2
        '2e=>tweede',
        '3e=>derde',
        '4e=>vierde',
    ]
)


# Expand a house number into common addition/suffix variants ($1 = the matched number)
huisnummer_generate = analysis.char_filter(
    'huisnummer_expand',
    type='pattern_replace',
    pattern=r'(\d+)',
    replacement="""
        $1-1 $1- $1-2 $1-3
        $1a $1b $1a-1 $1b-1 $1-a $1-b
        $1b 1-b
        $1c 1-c
        $1d 1-d
        $1e 1-e
        $1f 1-f
        $1g 1-g
        $1h 1-h
        $1i 1-i
        $1j 1-j
    """
)
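
# Illustrative only (not part of the original file): like the custom items in the
# test at the top of this page, the char filter exposes its serialized body via
# get_definition(); for a pattern_replace filter that body carries the type,
# pattern and replacement, and Elasticsearch substitutes $1 with the matched number.
definition = huisnummer_generate.get_definition()
assert definition['type'] == 'pattern_replace'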


huisnummer_expand = analysis.token_filter(
    'huisnummer_expand',
    type='word_delimiter',
    generate_number_parts=True,
    preserve_original=True
)


# Change dash, dot and slash to space
naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
    mappings=[
        "-=>' '",  # change '-' to separator
        ".=>' '",  # change '.' to separator
        "/=>' '",  # change '/' to separator
    ]
)

# Replaces 'str.', '/' and '-' with a space
divider_normalizer = analysis.token_filter(
    'divider_normalizer',
    type='pattern_replace',
    pattern=r'(str\.|\/|-)',
    replacement=' '
)
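
# Illustrative only (an assumption, not code from the original project): a minimal
# sketch of how the char filter and token filter above can be combined into a
# custom analyzer, using the same analysis.analyzer() API as in the test at the
# top of this page. The analyzer name and tokenizer choice are placeholders.
naam_analyzer = analysis.analyzer(
    'naam_analyzer',
    tokenizer='standard',
    filter=['lowercase', divider_normalizer],
    char_filter=[naam_stripper],
)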


# Removes ., -, / and space from text
Example #4
                                           '1e, eerste => 1e, eerste',
                                           '2e, tweede => 2e, tweede',
                                           '3e, derde  => 3e, derde',
                                           '4e, vierde => 4e, vierde',
                                       ])

strip_zero = analysis.CustomCharFilter("strip_zero",
                                       builtin_type="pattern_replace",
                                       pattern="^0+(.*)",
                                       replacement="$1")

# Change dash, dot and slash to space
naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
    mappings=[
        "-=>' '",  # change '-' to separator
        ".=>' '",  # change '.' to separator
        "/=>' '",  # change '/' to separator
    ])

# Create edge ngram filtering to postcode
edge_ngram_filter = analysis.token_filter('edge_ngram_filter',
                                          type='edge_ngram',
                                          min_gram=1,
                                          max_gram=15)
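
# Illustrative only (an assumption, not code from the original project): a sketch
# of how an edge-ngram filter like the one above is typically wired into an
# analyzer for prefix matching on postcodes; the project's real analyzers follow
# in the section below.
postcode_ngram = analysis.analyzer(
    'postcode_ngram',
    tokenizer='whitespace',
    filter=['lowercase', edge_ngram_filter],
)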

####################################
#           Analyzers              #
####################################

adres = es.analyzer(