def test_custom_analyzer_can_collect_custom_items():
    """A custom analyzer reports its custom sub-components via get_analysis_definition()."""
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ΓΌ=>ue'])

    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts],
    )

    # Serializing the analyzer by itself yields only its registered name.
    assert a.to_dict() == 'my_analyzer'

    # The full analysis definition must include every custom piece, keyed by name,
    # while builtin pieces ('lowercase', 'html_strip') appear only as references.
    expected = {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts'],
            }
        },
        'tokenizer': {'trigram': trigram.get_definition()},
        'filter': {'my_stop': my_stop.get_definition()},
        'char_filter': {'umlauts': umlauts.get_definition()},
    }
    assert expected == a.get_analysis_definition()
'2e=>tweede', '3e=>derde', '4e=>vierde', ] ) huisnummer_generate = analysis.char_filter( 'huisnummer_expand', type='pattern_replace', pattern='(\d+)', replacement=""" $1-1 $1- $1-2 $1-3 $1a $1b $1a-1 $1b-1 $1-a $1-b $1b 1-b $1c 1-c $1d 1-d $1e 1-e $1f 1-f $1g 1-g $1h 1-h $1i 1-i $1j 1-j """ ) huisnummer_expand = analysis.token_filter( 'huisnummer_expand', type='word_delimiter', generate_numer_parts=True, preserve_original=True
# Split house-number tokens on letter/number boundaries, keeping the original.
huisnummer_expand = analysis.token_filter(
    'huisnummer_expand',
    type='word_delimiter',
    # fixed: was 'generate_numer_parts' — Elasticsearch's word_delimiter option
    # is 'generate_number_parts'; the misspelled key would be rejected.
    generate_number_parts=True,
    preserve_original=True
)

# Change dash and dot to space
naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
    mappings=[
        "-=>' '",  # change '-' to separator
        ".=>' '",  # change '.' to separator
        "/=>' '",  # change '/' to separator
    ]
)

# normalizes ., -, / to space from text
divider_normalizer = analysis.token_filter(
    'divider_normalizer',
    type='pattern_replace',
    # raw string: '\/' is an invalid escape in a plain string literal; the
    # regex value itself is unchanged.
    pattern=r'(str\.|\/|-)',
    replacement=' '
)

# Removes ., -, / and space from text
'1e, eerste => 1e, eerste', '2e, tweede => 2e, tweede', '3e, derde => 3e, derde', '4e, vierde => 4e, vierde', ]) strip_zero = analysis.CustomCharFilter("strip_zero", builtin_type="pattern_replace", pattern="^0+(.*)", replacement="$1") # Change dash and dot to space naam_stripper = analysis.char_filter( 'naam_stripper', type='mapping', mappings=[ "-=>' '", # change '-' to separator ".=>' '", # change '.' to separator "/=>' '", # change '/' to separator ]) # Create edge ngram filtering to postcode edge_ngram_filter = analysis.token_filter('edge_ngram_filter', type='edge_ngram', min_gram=1, max_gram=15) #################################### # Analyzers # #################################### adres = es.analyzer(