예제 #1
0
    def __init__(self, entity_name, pattern, asr_enabled=False, re_flags=DEFAULT_FLAGS, max_matches=50,
                 language='en'):
        """
        Store the detector configuration and compile `pattern` into a regex object.

        Args:
            entity_name (str): an indicator value used as the tag that replaces detected values
            pattern (raw str or str or unicode): pattern to be compiled into a re object
            asr_enabled (bool): True if the message is from ASR and needs to be processed accordingly
            re_flags (int): flags to pass to re.compile. Defaults to `DEFAULT_FLAGS`
            max_matches (int): maximum number of matches to consider.
            language (str): source language for the message

        Raises:
            re.error: if the pattern fails to compile and no V0 fallback applies (i.e. the `regex` lib is not
                available or the V1 flag was not set). A failure of the fallback compile propagates as well.
        """
        self.entity_name = entity_name
        # Per-message text buffers all start out empty
        self.text = self.tagged_text = self.processed_text = ''
        self.asr_enabled = asr_enabled
        self.uncompiled_pattern = pattern
        self.language = language

        try:
            self.pattern = re.compile(pattern, flags=re_flags)
        except re.error:
            # Only patterns that were compiled with the `regex` lib's V1 engine get a second chance;
            # everything else is re-raised untouched.
            if not (_regex_available and (re_flags & re.V1)):
                raise
            # In very rare cases a pattern is invalid for the V1 engine but works just fine on the V0 engine /
            # Python's built in re. E.g. nested character sets '[[]]'. Swap the V1 bit for V0 and retry.
            fallback_flags = (re_flags ^ re.V1) | re.V0
            self.pattern = re.compile(pattern, flags=fallback_flags)
            ner_logger.warning(
                f'Failed to compile `{pattern}` with regex.V1, falling back to regex.V0'
            )

        self.max_matches = max_matches
        # Placeholder that stands in for detected values, e.g. '__<entity_name>__'
        self.tag = '__' + self.entity_name + '__'
예제 #2
0
from __future__ import absolute_import

import importlib
import math
import os
from six.moves import zip

from chatbot_ner.config import ner_logger

# Prefer the third-party `regex` package (bound to the name `re`); when it is not installed,
# log a warning and fall back to the standard-library `re` module instead.
try:
    import regex as re

    # Default compile flags used when `regex` is available
    _re_flags = re.UNICODE | re.V1 | re.WORD

except ImportError:
    ner_logger.warning('Error importing `regex` lib, falling back to stdlib re')
    import re

    # V1 and WORD are flags of the `regex` package only, so stdlib `re` gets just UNICODE
    _re_flags = re.UNICODE

from language_utilities.constant import ENGLISH_LANG
from ner_v2.detectors.base_detector import BaseDetector
from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT
from ner_v2.detectors.utils import get_lang_data_path

# Regex-escaped punctuation characters: the ASCII punctuation set without `$ + , - . :`, plus the
# Devanagari danda `।` (U+0964). The danda had been corrupted into the Cyrillic letters `ред`
# (its UTF-8 bytes E0 A5 A4 read as CP866), which put three ordinary letters into the set and
# dropped the danda itself.
COMMON_NON_NUMERIC_PUNCTUATIONS = re.escape('!"#%&\'()*/;<=>?@[\\]^_`{|}~।')


class NumberDetector(BaseDetector):
    """Detects number from the text  and tags them.