def __init__(self, entity_name, pattern, asr_enabled=False, re_flags=DEFAULT_FLAGS, max_matches=50,
             language='en'):
    """
    Store the detector configuration and compile ``pattern`` into a regex object.

    Args:
        entity_name (str): indicator value; wrapped in double underscores it becomes the tag
            stored on ``self.tag``
        pattern (raw str or str or unicode): pattern to be compiled into a regex object
        asr_enabled (bool): True if the message comes from ASR; stored as-is on the instance
        re_flags (int): flags handed to ``re.compile``. Defaults to ``DEFAULT_FLAGS``, which is
            defined outside this block
        max_matches (int): maximum number of matches to consider; stored as-is on the instance
        language (str): source language for the message; stored as-is on the instance

    Raises:
        re.error: if the pattern fails to compile and the V0 fallback does not apply
            (``_regex_available`` is falsy or ``re.V1`` is not set in ``re_flags``), or if the
            fallback compilation fails as well
    """
    self.entity_name = entity_name
    self.language = language
    self.asr_enabled = asr_enabled
    self.uncompiled_pattern = pattern

    # Text buffers start out empty.
    self.text = ''
    self.tagged_text = ''
    self.processed_text = ''

    try:
        compiled_pattern = re.compile(pattern, flags=re_flags)
    except re.error:
        # In very rare cases a pattern is invalid for the V1 engine yet works just fine on the
        # V0 engine / Python's built in re, e.g. nested character sets '[[]]'.
        fallback_possible = _regex_available and (re_flags & re.V1)
        if not fallback_possible:
            raise
        # Clear the V1 bit (known to be set at this point) and request V0 instead.
        v0_flags = (re_flags ^ re.V1) | re.V0
        compiled_pattern = re.compile(pattern, flags=v0_flags)
        # Only reached when the V0 compilation succeeded.
        ner_logger.warning(
            f'Failed to compile `{pattern}` with regex.V1, falling back to regex.V0'
        )

    self.pattern = compiled_pattern
    self.max_matches = max_matches
    self.tag = '__' + self.entity_name + '__'
from __future__ import absolute_import import importlib import math import os from six.moves import zip from chatbot_ner.config import ner_logger try: import regex as re _re_flags = re.UNICODE | re.V1 | re.WORD except ImportError: ner_logger.warning('Error importing `regex` lib, falling back to stdlib re') import re _re_flags = re.UNICODE from language_utilities.constant import ENGLISH_LANG from ner_v2.detectors.base_detector import BaseDetector from ner_v2.detectors.numeral.constant import NUMBER_DETECTION_RETURN_DICT_VALUE, NUMBER_DETECTION_RETURN_DICT_UNIT from ner_v2.detectors.utils import get_lang_data_path COMMON_NON_NUMERIC_PUNCTUATIONS = re.escape('!"#%&\'()*/;<=>?@[\\]^_`{|}~ред') class NumberDetector(BaseDetector): """Detects number from the text and tags them.