from elasticsearch_dsl import char_filter, token_filter


def mapping_func(position_filter_tuple):
    # `locale` and `char` come from the enclosing scope: `locale` prefixes
    # the generated filter name, `char` selects char_filter over token_filter.
    position, filter = position_filter_tuple
    if type(filter) is dict:
        name = f'{locale}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
from elasticsearch_dsl import char_filter, token_filter


def mapping_func(position_filter_tuple):
    # `analyzer_name`, `char`, and `config` come from the enclosing scope.
    position, filter = position_filter_tuple
    if type(filter) is dict:
        prefix = analyzer_name
        default_filters = config.ES_DEFAULT_ANALYZER["char_filter" if char else "filter"]
        if filter in default_filters:
            # Detect whether this filter already exists in the default
            # analyzer. If it does, reuse the default's name so the same
            # filter isn't redefined for every locale.
            prefix = config.ES_DEFAULT_ANALYZER_NAME
            position = default_filters.index(filter)
        name = f'{prefix}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
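# A minimal, self-contained sketch of how mapping_func is typically driven
# (an assumption, not shown in the snippets above): each entry in a locale's
# filter list is either a plain string naming a built-in filter (passed
# through unchanged) or a dict defining a custom filter (given a unique,
# position-based name). `locale`, `char`, and LOCALE_FILTERS are illustrative.
from elasticsearch_dsl import token_filter

locale = "fr"
char = False  # True would produce char_filter objects instead

LOCALE_FILTERS = [
    "lowercase",                                # built-in: passed through
    {"type": "stop", "stopwords": "_french_"},  # custom: needs a name
]

def mapping_func(position_filter_tuple):
    position, filter = position_filter_tuple
    if type(filter) is dict:
        name = f'{locale}_{position}_{filter["type"]}'
        return token_filter(name, **filter)
    return filter

filters = list(map(mapping_func, enumerate(LOCALE_FILTERS)))
# filters == ['lowercase', <token_filter named 'fr_1_stop'>]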
from elasticsearch_dsl import analyzer, char_filter, token_filter, tokenizer

# Deal with French-specific aspects.
fr_elision = token_filter(
    'fr_elision',
    type='elision',
    articles=[
        'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
        'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
    ]
)

# Language-specific analyzers. The *_stop_filter and *_stem_filter objects
# are defined elsewhere in the source file.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')]
)

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')]
)

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
    char_filter=[char_filter('html_strip')]
)
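# A hedged wiring sketch (the index name 'pages' and the choice of
# en_analyzer are assumptions for illustration): registering an analyzer on
# an Index object makes elasticsearch_dsl include its analysis definition
# (filters and char filters included) when the index is created.
from elasticsearch_dsl import Index

pages = Index('pages')
pages.analyzer(en_analyzer)
# pages.create() would now create the index with en_analyzer available in
# its analysis settings.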
"will", ], ) keep_html_char_filter = char_filter( "keep_html_char_filter", type="pattern_replace", pattern="<([a-z]+)>", # The magic here is that turning things like `<video>` to `_video_` # and `<a>` to `_a_` means that it gets analyzed as its own token like that. # This way you can search for `<a>` and find `<a>: The Anchor element`. # But note that a search for `<section>` will *also* match # `<nav>: The Navigation Section element` because `<section>` is turned in the # the following two tokens: `['_section_', 'section']`. # Not great. # However, what if the user wants to find the page about the `<section>` HTML # tag and they search for `section`. Then it's a good thing that that token # expands to both forms. # A more extreme variant would be something that doesn't get token delimited. # For example: # # `replacement="html$1html"` # # This would turn `<a>` to `htmlahtml` which means a search for `<a>` will # work expected but a search for `video` wouldn't match # the `<video>: The Video Embed element` page. replacement="_$1_", ) special_charater_name_char_filter = char_filter( "special_charater_name_char_filter", type="pattern_replace",
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
##############################################################################
from threatshell.doctypes.generic import (convert_cidr, BetterDate,
                                          email_analyzer, GenericDoc,
                                          ThreatshellIndex)
from elasticsearch_dsl import (Boolean, Date, Float, GeoPoint, Index,
                               Integer, Ip, Nested, String, analyzer,
                               char_filter)
import json

status_filter = char_filter(
    "opendns_status_filter",
    type="mapping",
    mappings=["1 => benign", "0 => unknown", "-1 => malicious"]
)

rir_filter = char_filter(
    "opendns_rir_filter",
    type="mapping",
    mappings=[
        "0 => Unknown",
        "1 => AfriNIC",
        "2 => APNIC",
        "3 => Arin",
        "4 => LACNIC",
        "5 => RIPE"
    ]
)

status_analyzer = analyzer(
    "opendns_status_analyzer",
    tokenizer="standard",
    char_filter=[status_filter]
)

rir_analyzer = analyzer(
    "opendns_rir_analyzer",
    tokenizer="standard",
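# A hedged usage sketch (the localhost URL and the create_connection() call
# are assumptions about the deployment, and analyzer.simulate() requires a
# recent elasticsearch_dsl release plus a live cluster): simulate() runs the
# analyzer server-side and shows the mapping char filter rewriting OpenDNS
# status codes before tokenization.
from elasticsearch_dsl import connections

connections.create_connection(hosts=["http://localhost:9200"])
tokens = status_analyzer.simulate("-1").tokens
print([t.token for t in tokens])  # expected: ['malicious']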
)

ngram_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer='uax_url_email',
    filter=[
        'lowercase',
        token_filter(
            'autocomplete_filter',
            type="edgeNGram",
            min_gram=1,
            max_gram=20
        )
    ]
)

oye_char_filter = char_filter(
    'oye_char_filter',
    type='mapping',
    mappings=[
        "$ => s"
    ]
)

lowercase_analyzer = analyzer(
    'lowercase_analyzer',
    char_filter=[
        oye_char_filter
    ],
    tokenizer='standard',
    filter=[
        'lowercase'
    ]
)
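# A hedged demonstration (the connection setup is an assumption, and the
# sample inputs are illustrative; note that newer Elasticsearch spells the
# filter type `edge_ngram` rather than the legacy `edgeNGram` used above):
# simulate() shows the edge n-gram expansion behind ngram_analyzer and the
# `$ => s` mapping feeding lowercase_analyzer.
from elasticsearch_dsl import connections

connections.create_connection(hosts=["http://localhost:9200"])

print([t.token for t in ngram_analyzer.simulate("oye").tokens])
# expected: ['o', 'oy', 'oye']

print([t.token for t in lowercase_analyzer.simulate("Ro$a").tokens])
# expected: ['rosa']  ('$' is mapped to 's' before the standard tokenizer)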