コード例 #1
0
        def run_test(self):
            """
            Run number detection for one data-driven test case and compare with its expected outputs.

            Reads ``testcase``, ``language`` and ``failure_string_prefix`` from the enclosing scope.
            The detector's digit limits are overridden only when the test case supplies
            'min_digit' / 'max_digit'; otherwise the detector's current limits are re-applied.
            """
            message = testcase["message"]
            detector = NumberDetector(entity_name="number",
                                      language=language,
                                      unit_type=testcase.get("unit_type"))
            min_digit = testcase.get('min_digit', detector.min_digit)
            max_digit = testcase.get('max_digit', detector.max_digit)
            detector.set_min_max_digits(min_digit=min_digit, max_digit=max_digit)
            detected_numbers, detected_spans = detector.detect_entity(message)

            expected_numbers, expected_spans = parse_expected_outputs(
                testcase["outputs"])
            expected_pairs = list(
                six.moves.zip(expected_numbers, expected_spans))

            failure_prefix = failure_string_prefix.format(message=message,
                                                          language=language)

            # The detector must return one original text per detected number ...
            self.assertEqual(
                len(detected_numbers), len(detected_spans),
                failure_prefix +
                u"Returned numbers and original_texts have different lengths")
            # ... and exactly as many detections as the test case expects.
            self.assertEqual(
                len(detected_spans), len(expected_pairs),
                failure_prefix +
                u"Returned numbers and expected_outputs have different lengths")

            # Order is not checked: every detected pair only has to appear among the expected pairs.
            for detected_pair in six.moves.zip(detected_numbers, detected_spans):
                self.assertIn(
                    detected_pair, expected_pairs,
                    failure_prefix + u"{got} not in {expected_outputs}".format(
                        got=detected_pair, expected_outputs=expected_pairs))
コード例 #2
0
 def test_en_number_detection_for_integer_number(self):
     """
     Number detection for english language for integer number like '100', '2'

     The message holds a single number, so exactly one (value dict, original text) pair is
     expected, with no unit attached.
     """
     message = u'100 got selected for interview'
     number_detector_object = NumberDetector(entity_name=self.entity_name,
                                             language='en')
     number_dicts, original_texts = number_detector_object.detect_entity(
         message)
     # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
     # can be inspected more than once.
     zipped = list(zip(number_dicts, original_texts))
     self.assertEqual(len(zipped), 1)
     self.assertIn(({'value': '100', 'unit': None}, u'100'), zipped)
コード例 #3
0
 def test_en_number_detection_for_integer_number_with_unit(self):
     """
     Number detection for english language for integer number with units like 'Rs100', '2Rs'

     The expected pair carries the unit 'rupees' and the original text includes the
     'rs.' prefix.
     """
     message = u'rs.100 is the application charger'
     number_detector_object = NumberDetector(entity_name=self.entity_name,
                                             language='en')
     number_dicts, original_texts = number_detector_object.detect_entity(
         message)
     # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
     # can be inspected more than once.
     zipped = list(zip(number_dicts, original_texts))
     self.assertEqual(len(zipped), 1)
     self.assertIn(({'value': '100', 'unit': 'rupees'}, 'rs.100'), zipped)
コード例 #4
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit(self):
        """
        English decimal number with a scale suffix ('1.2 thousand', '2.2k') followed by a unit word.

        The assertion expects the scaled value '2300' with unit None and original text '2.3k',
        i.e. the trailing 'rupees' is not part of the expected detection.
        """
        detector = NumberDetector(entity_name=self.entity_name, language='en')
        detected_values, detected_texts = detector.detect_entity(
            'I bought a car toy for 2.3k rupees')

        detections = list(zip(detected_values, detected_texts))
        expected = ({'value': '2300', 'unit': None}, u'2.3k')
        self.assertEqual(len(detections), 1)
        self.assertIn(expected, detections)
コード例 #5
0
    def test_en_number_detection_for_decimal_number_with_scale(self):
        """
        Number detection for english language for decimal number with scale like '1.2 thousand', '2.2k', '1.4m'

        The scale suffix is expected to be expanded into the value ('2.2k' -> '2200') while
        the original text keeps the suffix.
        """
        message = 'my monthly salary is 2.2k'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '2200', 'unit': None}, u'2.2k'), zipped)
コード例 #6
0
    def test_en_number_detection_for_integer_number_with_scale(self):
        """
        Number detection for english language for integer number with scale like '1 thousand', '1k', '1m'

        The scale word is expected to be expanded into the value ('1 thousand' -> '1000')
        while the original text keeps the scale word.
        """
        message = '1 thousand men were killed in war'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '1000', 'unit': None}, u'1 thousand'), zipped)
コード例 #7
0
    def test_en_number_detection_for_decimal_number(self):
        """
        Number detection for english language for decimal number like '100.2'

        The expected pair has unit None, i.e. the following 'degree celsius' is not expected
        to be reported as a unit here.
        """
        message = u'Todays temperature is 11.2 degree celsius'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '11.2', 'unit': None}, u'11.2'), zipped)
コード例 #8
0
def resolve_numerals(text, language) -> str:
    """
    Substitute every numeral occurrence found by NumberDetector with its detected value.

    Detection runs twice on the same text: once with the detector's configured language and
    once with ``language='hi'``; the results of the second run are appended to the first.
    Each detected original text is then replaced, case-insensitively and at every occurrence,
    by the corresponding detected value.

    Args:
        text (str): processed string with numerals and character constants fixed
        language (str): Language for NumberDetector
    Returns:
        str: the text with detected numerals replaced by their values
    """
    detector = NumberDetector('asr_dummy', language=language)
    # FIXME: Detection fails if text starts with '0' since number detector discards it
    numerals, matched_texts = detector.detect_entity(text=text)
    hindi_numerals, hindi_matched_texts = detector.detect_entity(
        text=text, language='hi')
    numerals.extend(hindi_numerals)
    matched_texts.extend(hindi_matched_texts)

    resolved = text
    for numeral, matched_text in zip(numerals, matched_texts):
        # The matched text is escaped so it is treated literally, not as a regex.
        resolved = re.sub(re.escape(matched_text),
                          numeral[NUMBER_DETECTION_RETURN_DICT_VALUE],
                          resolved,
                          flags=re.IGNORECASE)
    return resolved
コード例 #9
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit_and_different_unit_type_given(
            self):
        """
        English decimal number with scale and a unit ('2.3k kg') while the detector is
        restricted to unit_type='currency'.

        No detection is expected for this message.
        """
        detector = NumberDetector(entity_name=self.entity_name,
                                  language='en',
                                  unit_type='currency')
        detected_values, detected_texts = detector.detect_entity(
            'I buys 2.3k kg mango')

        detections = list(zip(detected_values, detected_texts))
        self.assertEqual(len(detections), 0)
コード例 #10
0
    def test_en_number_detection_for_integer_number_with_scale_and_unit(self):
        """
        Number detection for english language for integer number with scale and unit like 'Rs 1 thousand', '1k Rs'

        The expected pair carries the expanded value '1000', the unit 'rupees' and an original
        text that spans number, scale and unit.
        """
        message = 'i need 1 thousand rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '1000',
            'unit': 'rupees'
        }, u'1 thousand rupees'), zipped)
コード例 #11
0
    def test_en_number_detection_for_decimal_number_with_unit(self):
        """
        Number detection for english language for decimal number with unit like '10.2k rupees'

        The expected pair carries the expanded value '10120', the unit 'rupees' and an original
        text that spans number, scale and unit.
        """
        message = u'my monthly salary is 10.12k rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '10120',
            'unit': 'rupees'
        }, u'10.12k rupees'), zipped)
コード例 #12
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit_and_unit_type_given(
            self):
        """
        Number detection for english language for decimal number with scale like '1.2 thousand rupees', 'Rupees 2.2k'

        The detector is restricted to unit_type='currency' and the message uses the unit
        'rupees', so one detection carrying that unit is expected.
        """
        message = 'I bought a car toy for 2.3k rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en',
                                                unit_type='currency')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() is a lazy iterator on Python 3: materialise it so len() works and the pairs
        # can be inspected more than once.
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '2300',
            'unit': 'rupees'
        }, u'2.3k rupees'), zipped)
コード例 #13
0
class BaseNumberRangeDetector(object):
    """
    Detect number ranges (a min value, a max value, or both) in text.

    Numbers are first detected with NumberDetector and replaced in the text by tags built from
    numeral_constant.NUMBER_REPLACE_TEXT plus the detection index. Range keywords loaded from
    the language data directory are then matched around those tags with regular expressions.
    """

    # Regex templates shared by the detector methods. They are written as plain unicode
    # literals with doubled backslashes because the `ur''` prefix is a syntax error on
    # Python 3; the resulting pattern text is the same on Python 2 and Python 3.
    # Group 0 of every findall() tuple is the full match, the following groups are number tags.
    _PREFIX_RANGE_PATTERN = u'((?:{choices})\\s+({number}\\d+))'
    _SUFFIX_RANGE_PATTERN = u'(({number}\\d+)\\s+(?:{choices}))'
    _MIN_MAX_RANGE_PATTERN = u'(({number}\\d+)\\s*(?:{choices})\\s*({number}\\d+))'

    def __init__(self,
                 entity_name,
                 language,
                 data_directory_path,
                 unit_type=None):
        """
        Standard Number detection class, read data from language data path and help to detect number ranges like min
        and max value from given number range text for given languages.
        Args:
            entity_name (str): entity_name: string by which the detected number would be replaced
            language (str): language code of text
            data_directory_path (str): path of data folder for given language
            unit_type (str, optional): number unit types like weight, currency, temperature, used to detect number with
                                       specific unit type only. If None, it will detect all number ranges irrespective
                                       of units. You can see all unit types supported inside number detection
                                       language data with filename unit.csv.

        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.entity_name = entity_name
        self.tag = '__' + entity_name + '__'
        # keyword variant -> NumberRangeVariant(position, range_type), filled from the data file
        self.range_variants_map = {}
        self.unit_type = unit_type

        # Regex-escaped keyword variants, grouped by where they sit relative to the number
        self.min_range_prefix_variants = None
        self.min_range_suffix_variants = None
        self.max_range_prefix_variants = None
        self.max_range_suffix_variants = None
        self.min_max_range_variants = None
        # number tag -> ValueTextPair(entity_value, original_text) for the text being processed
        self.number_detected_map = {}

        # The unit type is not passed to the inner detector; the unit filter is applied in
        # _get_number_range instead.
        self.number_detector = NumberDetector(entity_name=entity_name,
                                              language=language)
        self.number_detector.set_min_max_digits(1, 100)

        # Method to initialise regex params
        self._init_regex_for_range(data_directory_path)

        # Variable to define default order in which detector will work
        self.detector_preferences = [
            self._detect_min_max_num_range,
            self._detect_min_num_range_with_prefix_variants,
            self._detect_min_num_range_with_suffix_variants,
            self._detect_max_num_range_with_prefix_variants,
            self._detect_max_num_range_with_suffix_variants
        ]

    def _init_regex_for_range(self, data_directory_path):
        """
        Load the range keyword file and initialise the attributes holding keyword variants that tell whether a
        number range in text contains a min value, a max value or both.

        Attributes set (each a list of regex-escaped keywords):
             min_range_prefix_variants: keywords which occur before min value in text
             min_range_suffix_variants: keywords which occur after min value in text
             max_range_prefix_variants: keywords which occur before max value in text
             max_range_suffix_variants: keywords which occur after max value in text
             min_max_range_variants: keywords which occur in between min and max value in text
        Args:
            data_directory_path (str): Data directory path
        Returns:
            None
        """
        number_range_df = pd.read_csv(os.path.join(
            data_directory_path,
            numeral_constant.NUMBER_RANGE_KEYWORD_FILE_NAME),
                                      encoding='utf-8')
        for _, row in number_range_df.iterrows():
            range_variants = get_list_from_pipe_sep_string(
                row[numeral_constant.COLUMN_NUMBER_RANGE_VARIANTS])
            for variant in range_variants:
                self.range_variants_map[variant] = \
                    NumberRangeVariant(position=row[numeral_constant.COLUMN_NUMBER_RANGE_POSITION],
                                       range_type=row[numeral_constant.COLUMN_NUMBER_RANGE_RANGE_TYPE])

        def escaped_variants(position, range_type):
            # Keywords are escaped because they are later joined into a regex alternation.
            return [
                re.escape(variant)
                for variant, value in self.range_variants_map.items()
                if value.position == position and value.range_type == range_type
            ]

        # position: -1 => keyword before the number, 1 => after it, 0 => between two numbers
        self.min_range_prefix_variants = escaped_variants(
            -1, numeral_constant.NUMBER_RANGE_MIN_TYPE)
        self.min_range_suffix_variants = escaped_variants(
            1, numeral_constant.NUMBER_RANGE_MIN_TYPE)
        self.max_range_prefix_variants = escaped_variants(
            -1, numeral_constant.NUMBER_RANGE_MAX_TYPE)
        self.max_range_suffix_variants = escaped_variants(
            1, numeral_constant.NUMBER_RANGE_MAX_TYPE)
        self.min_max_range_variants = escaped_variants(
            0, numeral_constant.NUMBER_RANGE_MIN_MAX_TYPE)

    def _tag_number_in_text(self, processed_text):
        """
        replace number in text with number tag from number_detected_map

        The text is rewritten in a single left-to-right pass, trying longer original texts first at every
        position. Each tag is used once, and text that has already been replaced by a tag is never scanned
        again, so a short number such as '0' cannot be replaced inside an inserted tag such as '__number__0'.
        Entries with an empty original text are ignored.
        Args:
            processed_text (str): processed text
        Returns:
            (str): text with number replaced with tag
        Examples:
            >>> text = 'i want to buy 3 apples and more than two bananas'
            >>> number_detected_map = {'__number__0': ({'value': '2', 'unit': None}, 'two'),
                                       '__number__1': ({'value': '3', 'unit': None}, '3')}
            >>> self._tag_number_in_text(text)
            i want to buy __number__1 apples and more than __number__0 bananas
        """
        # original text -> tags still waiting to be placed (the same text may be detected several times)
        pending_tags = {}
        for number_tag, value_text_pair in self.number_detected_map.items():
            if value_text_pair.original_text:
                pending_tags.setdefault(value_text_pair.original_text,
                                        []).append(number_tag)
        if not pending_tags:
            return processed_text

        # Longest alternative first so that '30' wins over '3' at the same position
        alternatives = sorted(pending_tags, key=len, reverse=True)
        original_text_pattern = re.compile(
            u'|'.join(re.escape(alternative) for alternative in alternatives),
            re.UNICODE)

        def _next_tag(match):
            tags = pending_tags.get(match.group(0))
            # More occurrences in the text than detections: leave the extra ones untouched
            return tags.pop(0) if tags else match.group(0)

        return original_text_pattern.sub(_next_tag, processed_text)

    def _get_number_tag_dict(self):
        """
        Method to create number tag dict. It runs number detection on processed_text and creates a dict having
        number tag as key and a ValueTextPair of entity value and original text as value. Tags are
        numeral_constant.NUMBER_REPLACE_TEXT followed by the index of the detection, starting at 0.
        Returns:
            (dict): dict containing number tag and their corresponding value and original text
        """
        entity_value_list, original_text_list = self.number_detector.detect_entity(
            self.processed_text)
        return {
            numeral_constant.NUMBER_REPLACE_TEXT + str(index):
                ValueTextPair(entity_value=entity_value,
                              original_text=original_text)
            for index, (entity_value, original_text) in enumerate(
                zip(entity_value_list, original_text_list))
        }

    def _get_original_text_from_tagged_text(self, number_tag_text):
        """
        Return original text value of number tag from number detected map
        Args:
            number_tag_text (str): text containing tagged numbers
        Returns:
            (str or None): text with every known tag replaced by its original text, None if nothing was replaced
        """
        original = number_tag_text
        # Longest tags first: a tag ending in 1 is a prefix of the tag ending in 10, so replacing
        # the shorter one first would corrupt the longer one.
        for number_tag in sorted(self.number_detected_map, key=len,
                                 reverse=True):
            original = original.replace(
                number_tag, self.number_detected_map[number_tag].original_text)
        if original == number_tag_text:
            return None
        return original

    def detect_number_range(self, text):
        """
        Detect number-range from number range text. Run through list of detectors defined in detector_preferences in
        the preferences.
        Args:
            text(str): text string
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts (min value, max value, unit)
                (list): list containing original text of each detected range

        """
        self.text = text
        self.tagged_text = text
        self.processed_text = text
        # Detection has to run on the untagged text; tagging happens right after.
        self.number_detected_map = self._get_number_tag_dict()
        self.processed_text = self._tag_number_in_text(text)

        number_list, original_list = None, None
        for detector in self.detector_preferences:
            # Every detector extends the lists produced by the previous ones
            number_list, original_list = detector(number_list, original_list)
            self._update_tagged_text(original_list)
        return number_list, original_list

    def _get_number_range(self, min_part_match, max_part_match, full_match):
        """
        Build the number range for one regex match by looking up the entity value of the number tags and the
        original text in number_detected_map. On success the matched text is removed from processed_text so
        later detectors cannot match it again.
        Args:
            min_part_match (str or None): tagged min number
            max_part_match (str or None): tagged max number
            full_match (str): text matching regex
        Returns:
            (tuple): a tuple containing
                (dict or None): number range with min value, max value and unit, None if nothing was detected
                (str or None): original text of the range, None if it could not be recovered
        """
        number_range = None
        original_text = None

        # An earlier match may already have consumed this part of the text
        if full_match not in self.processed_text:
            return number_range, original_text

        entity_value_min, entity_value_max, entity_unit = None, None, None

        if min_part_match and min_part_match in self.number_detected_map:
            entity_dict = self.number_detected_map[min_part_match].entity_value
            entity_value_min = entity_dict[
                numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE]
            entity_unit = entity_dict[
                numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]

        if max_part_match and max_part_match in self.number_detected_map:
            entity_dict = self.number_detected_map[max_part_match].entity_value
            entity_value_max = entity_dict[
                numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE]
            # When both parts are present the unit of the max part is the one reported
            entity_unit = entity_dict[
                numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT]

        # With a unit type configured, ranges without a unit or with another unit type are rejected
        if self.unit_type and (entity_unit is None or
                               self.number_detector.get_unit_type(entity_unit)
                               != self.unit_type):
            return number_range, original_text

        original_text = self._get_original_text_from_tagged_text(full_match)
        if (entity_value_min or entity_value_max) and original_text:
            self.processed_text = self.processed_text.replace(
                full_match.strip(), '', 1)
            original_text = original_text.strip()
            number_range = {
                numeral_constant.NUMBER_RANGE_MIN_VALUE: entity_value_min,
                numeral_constant.NUMBER_RANGE_MAX_VALUE: entity_value_max,
                numeral_constant.NUMBER_RANGE_VALUE_UNIT: entity_unit
            }
        return number_range, original_text

    def _detect_ranges_with_variants(self, variants, pattern_template,
                                     min_group, max_group,
                                     number_range_list, original_list):
        """
        Shared implementation of the detector methods: match one keyword pattern against processed_text and
        collect the ranges it yields.
        Args:
            variants (list or None): regex-escaped keywords to build the alternation from
            pattern_template (unicode): template with {choices} and {number} placeholders
            min_group (int or None): index in the findall tuple of the min number tag, None if absent
            max_group (int or None): index in the findall tuple of the max number tag, None if absent
            number_range_list (list or None): ranges detected so far
            original_list (list or None): original texts detected so far
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        number_range_list = number_range_list or []
        original_list = original_list or []
        if not variants:
            return number_range_list, original_list

        range_pattern = re.compile(
            pattern_template.format(
                number=numeral_constant.NUMBER_REPLACE_TEXT,
                choices=u'|'.join(variants)), re.UNICODE)
        for match in range_pattern.findall(self.processed_text):
            number_range, original_text = self._get_number_range(
                min_part_match=None if min_group is None else match[min_group],
                max_part_match=None if max_group is None else match[max_group],
                full_match=match[0])
            if number_range and original_text:
                number_range_list.append(number_range)
                original_list.append(original_text)
        return number_range_list, original_list

    def _detect_min_num_range_with_prefix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only min value and keywords which identify value as min present
        before them. Example - More than 2 {'more than' => keyword, '2' => min value},
                               At least seven hundred rupees {'At least' => keyword, 'seven hundred rupees'=>min value}
        Args:
            number_range_list (list): ranges detected so far
            original_list (list): original texts detected so far
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        return self._detect_ranges_with_variants(
            variants=self.min_range_prefix_variants,
            pattern_template=self._PREFIX_RANGE_PATTERN,
            min_group=1,
            max_group=None,
            number_range_list=number_range_list,
            original_list=original_list)

    def _detect_min_num_range_with_suffix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only min value and keywords which identify value as min present
        after them.
        Args:
            number_range_list (list): ranges detected so far
            original_list (list): original texts detected so far
        Returns:
           (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        return self._detect_ranges_with_variants(
            variants=self.min_range_suffix_variants,
            pattern_template=self._SUFFIX_RANGE_PATTERN,
            min_group=1,
            max_group=None,
            number_range_list=number_range_list,
            original_list=original_list)

    def _detect_max_num_range_with_prefix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only max value and keywords which identify value as max present
        before them. Example - less than 2 {'less than' => keyword, '2' => max value},
                               At most seven hundred rupees {'At most' => keyword, 'seven hundred rupees'=>max value}
        Args:
            number_range_list (list): ranges detected so far
            original_list (list): original texts detected so far
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        return self._detect_ranges_with_variants(
            variants=self.max_range_prefix_variants,
            pattern_template=self._PREFIX_RANGE_PATTERN,
            min_group=None,
            max_group=1,
            number_range_list=number_range_list,
            original_list=original_list)

    def _detect_max_num_range_with_suffix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only max value and keywords which identify value as max present
        after them.
        Args:
            number_range_list (list): ranges detected so far
            original_list (list): original texts detected so far
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        return self._detect_ranges_with_variants(
            variants=self.max_range_suffix_variants,
            pattern_template=self._SUFFIX_RANGE_PATTERN,
            min_group=None,
            max_group=1,
            number_range_list=number_range_list,
            original_list=original_list)

    def _detect_min_max_num_range(self,
                                  number_range_list=None,
                                  original_list=None):
        """
        Method to detect number range containing both min and max value with the keyword present in between them
        Example - 2000 to 30000 {'to' => keyword, '2000' => min value, '30000' => max value},
                 2k-3k hundred rupees {'-' => keyword, '2k' => min value, '3k' => max value}
        Args:
            number_range_list (list): ranges detected so far
            original_list (list): original texts detected so far
        Returns:
            (tuple): a tuple containing
                (list): list of detected number range dicts
                (list): list containing original text of each detected range
        """
        return self._detect_ranges_with_variants(
            variants=self.min_max_range_variants,
            pattern_template=self._MIN_MAX_RANGE_PATTERN,
            min_group=1,
            max_group=2,
            number_range_list=number_range_list,
            original_list=original_list)

    def _update_tagged_text(self, original_number_list):
        """
        Replaces detected number ranges with tag generated from entity_name used to initialize the object with

        A final string with all detected number ranges replaced will be stored in object's tagged_text attribute

        Args:
            original_number_list (list): list of substrings of original text to be replaced with tag
                                       created from entity_name
        """
        for detected_text in original_number_list:
            self.tagged_text = self.tagged_text.replace(
                detected_text, self.tag)
コード例 #14
0
ファイル: modules_helping.py プロジェクト: shilpajv/bankBOT
def get_currency(text,detected_lang):
    """
    Run currency-restricted number detection on ``text``.

    Args:
        text: text to run the detector on
        detected_lang: language code handed to NumberDetector
    Returns:
        Whatever ``NumberDetector.detect_entity`` returns for ``text``, unchanged.
    """
    # Imported lazily, inside the function, exactly as the original does.
    from ner_v2.detectors.numeral.number.number_detection import NumberDetector
    return NumberDetector(entity_name='number',
                          language=detected_lang,
                          unit_type='currency').detect_entity(text)