Example #1
def mapping_func(position_filter_tuple):
    position, filter = position_filter_tuple
    if type(filter) is dict:
        name = f'{locale}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
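
A minimal sketch of how this mapping function might be driven, assuming it closes over a `locale` string and a `char` flag and is mapped over `enumerate()` of a mixed list of filter names and dict definitions; everything below is illustrative, not taken from the original project:

from elasticsearch_dsl import char_filter, token_filter

locale = "fr"   # assumed closure variable: the locale being built
char = False    # assumed closure variable: build char filters instead of token filters

filters = ["lowercase", {"type": "stop", "stopwords": "_french_"}]
# Plain string names pass through unchanged; dict definitions get a unique,
# locale-prefixed name such as "fr_1_stop".
named_filters = list(map(mapping_func, enumerate(filters)))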
Example #2
def mapping_func(position_filter_tuple):
    position, filter = position_filter_tuple
    if type(filter) is dict:
        prefix = analyzer_name
        default_filters = config.ES_DEFAULT_ANALYZER["char_filter" if char else "filter"]
        if filter in default_filters:
            # detect if this filter exists in the default analyzer;
            # if it does, use the same name as the default
            # to avoid defining the same filter for each locale
            prefix = config.ES_DEFAULT_ANALYZER_NAME
            position = default_filters.index(filter)
        name = f'{prefix}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
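
This variant additionally deduplicates against the shared default analyzer. A rough guess at the shape of the settings it reads; the names and values here are assumptions, not the project's real configuration:

# Hypothetical config module contents.
ES_DEFAULT_ANALYZER_NAME = "default"
ES_DEFAULT_ANALYZER = {
    "char_filter": [],
    "filter": [
        "lowercase",
        {"type": "stop", "stopwords": "_english_"},
    ],
}

With settings like these, a locale-specific dict filter that matches the default's stop entry would be named default_1_stop and reused, instead of being re-declared once per locale.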
Example #3
# Deal with French specific aspects.
fr_elision = token_filter(
    'fr_elision',
    type='elision',
    articles=[
        'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
        'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
    ]
)

# Languages related analyzers.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')]
)

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')]
)

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
    char_filter=[char_filter('html_strip')]
)
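
The per-language stop and stem filters referenced by these analyzers are not part of the excerpt; they could be defined with the built-in stop and stemmer token filter types, roughly like this (names and parameter choices are illustrative):

from elasticsearch_dsl import token_filter

de_stop_filter = token_filter('de_stop_filter', type='stop', stopwords='_german_')
de_stem_filter = token_filter('de_stem_filter', type='stemmer', language='light_german')
en_stop_filter = token_filter('en_stop_filter', type='stop', stopwords='_english_')
en_stem_filter = token_filter('en_stem_filter', type='stemmer', language='minimal_english')
es_stop_filter = token_filter('es_stop_filter', type='stop', stopwords='_spanish_')
es_stem_filter = token_filter('es_stem_filter', type='stemmer', language='light_spanish')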
Example #4
        "will",
    ],
)

keep_html_char_filter = char_filter(
    "keep_html_char_filter",
    type="pattern_replace",
    pattern="<([a-z]+)>",
    # The magic here is that turning things like `<video>` to `_video_`
    # and `<a>` to `_a_` means that it gets analyzed as its own token like that.
    # This way you can search for `<a>` and find `<a>: The Anchor element`.
    # But note that a search for `<section>` will *also* match
    # `<nav>: The Navigation Section element` because `<section>` is turned
    # into the following two tokens: `['_section_', 'section']`.
    # Not great.
    # However, what if the user wants to find the page about the `<section>` HTML
    # tag and they search for `section`. Then it's a good thing that that token
    # expands to both forms.
    # A more extreme variant would be something that doesn't get token delimited.
    # For example:
    #
    #   `replacement="html$1html"`
    #
    # This would turn `<a>` to `htmlahtml` which means a search for `<a>` will
    # work as expected but a search for `video` wouldn't match
    # the `<video>: The Video Embed element` page.
    replacement="_$1_",
)
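
# Illustrative sketch, not part of the original file: one way to plug the char
# filter above into an analyzer (assuming elasticsearch_dsl's analyzer() is
# imported alongside char_filter()). The pattern_replace step rewrites "<video>"
# to "_video_" before the text reaches the tokenizer; the analyzer name,
# tokenizer and token filters here are guesses, not the project's real setup.
html_tag_analyzer = analyzer(
    "html_tag_analyzer",
    tokenizer="standard",
    char_filter=[keep_html_char_filter],
    filter=["lowercase"],
)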

special_charater_name_char_filter = char_filter(
    "special_charater_name_char_filter",
    type="pattern_replace",
Example #5
from threatshell.doctypes.generic import (convert_cidr, BetterDate,
                                          email_analyzer, GenericDoc,
                                          ThreatshellIndex)
from elasticsearch_dsl import (Boolean, Date, Float, GeoPoint, Index, Integer,
                               Ip, Nested, String, analyzer, char_filter)
import json

status_filter = char_filter(
    "opendns_status_filter",
    type="mapping",
    mappings=["1 => benign", "0 => unknown", "-1 => malicious"])

rir_filter = char_filter("opendns_rir_filter",
                         type="mapping",
                         mappings=[
                             "0 => Unknown", "1 => AfriNIC", "2 => APNIC",
                             "3 => Arin", "4 => LACNIC", "5 => RIPE"
                         ])

status_analyzer = analyzer("opendns_status_analyzer",
                           tokenizer="standard",
                           char_filter=[status_filter])
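
# Illustrative sketch, not part of the original module: attaching the status
# analyzer to a field, so that a raw OpenDNS code such as "1" is indexed as
# the token "benign". The class and field names below are assumptions.
class OpenDNSStatusDoc(GenericDoc):
    status = String(analyzer=status_analyzer)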

rir_analyzer = analyzer("opendns_rir_analyzer",
                        tokenizer="standard",
Example #7
ngram_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer='uax_url_email',
    filter=[
        'lowercase',
        token_filter('autocomplete_filter', type="edgeNGram", min_gram=1, max_gram=20)
    ]
)

oye_char_filter = char_filter(
    'oye_char_filter',
    type='mapping',
    mappings=[
        "$ => s"
    ]
)

lowercase_analyzer = analyzer(
    'lowercase_analyzer',
    char_filter=[
        oye_char_filter
    ],
    tokenizer='standard',
    filter=[
        'lowercase'
    ]
)
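
A minimal sketch of attaching these analyzers to a mapping, assuming a recent elasticsearch_dsl Document class; the class, field and index names are illustrative:

from elasticsearch_dsl import Document, Text

class Artist(Document):
    # Edge n-grams on the name give search-as-you-type behaviour.
    name = Text(analyzer=ngram_analyzer)
    # The "$ => s" char mapping plus lowercasing lets a query for "kesha"
    # match a stored value such as "Ke$ha".
    name_plain = Text(analyzer=lowercase_analyzer)

    class Index:
        name = "artists"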