def run_test(self):
    """
    Run number detection for one testcase and compare the result with its expected outputs.

    ``testcase``, ``language``, ``failure_string_prefix`` and ``parse_expected_outputs`` are not defined in
    this function; they are looked up from the surrounding scope.
    """
    text = testcase["message"]
    detector = NumberDetector(entity_name="number",
                              language=language,
                              unit_type=testcase.get("unit_type", None))
    # Digit limits stay at whatever the detector was created with unless the testcase overrides them
    min_digit = testcase.get('min_digit', detector.min_digit)
    max_digit = testcase.get('max_digit', detector.max_digit)
    detector.set_min_max_digits(min_digit=min_digit, max_digit=max_digit)
    detected_dicts, detected_spans = detector.detect_entity(text)

    expected_dicts, expected_span_list = parse_expected_outputs(testcase["outputs"])
    expected_pairs = list(six.moves.zip(expected_dicts, expected_span_list))

    message_prefix = failure_string_prefix.format(message=text, language=language)

    self.assertEqual(
        len(detected_dicts), len(detected_spans),
        message_prefix + u"Returned numbers and original_texts have different lengths")
    self.assertEqual(
        len(detected_spans), len(expected_pairs),
        message_prefix + u"Returned numbers and expected_outputs have different lengths")

    # Order is not compared: each detected pair only has to appear somewhere in the expected pairs
    for detected_pair in six.moves.zip(detected_dicts, detected_spans):
        failure_message = message_prefix + u"{got} not in {expected_outputs}".format(
            got=detected_pair, expected_outputs=expected_pairs)
        self.assertIn(detected_pair, expected_pairs, failure_message)
Ejemplo n.º 2
0
 def test_en_number_detection_for_integer_number_with_unit(self):
     """
     Number detection for english language for integer number with units like 'Rs100', '2Rs'
     """
     message = u'rs.100 is the application charger'
     number_detector_object = NumberDetector(entity_name=self.entity_name,
                                             language='en')
     number_dicts, original_texts = number_detector_object.detect_entity(
         message)
     # zip() returns a lazy iterator on Python 3: materialise it so len() works
     # and the pairs are still available for assertIn afterwards
     zipped = list(zip(number_dicts, original_texts))
     self.assertEqual(len(zipped), 1)
     self.assertIn(({'value': '100', 'unit': 'rupees'}, 'rs.100'), zipped)
Ejemplo n.º 3
0
 def test_en_number_detection_for_integer_number(self):
     """
     Number detection for english language for integer number like '100', '2'
     """
     message = u'100 got selected for interview'
     number_detector_object = NumberDetector(entity_name=self.entity_name,
                                             language='en')
     number_dicts, original_texts = number_detector_object.detect_entity(
         message)
     # zip() returns a lazy iterator on Python 3: materialise it so len() works
     # and the pairs are still available for assertIn afterwards
     zipped = list(zip(number_dicts, original_texts))
     self.assertEqual(len(zipped), 1)
     self.assertIn(({'value': '100', 'unit': None}, u'100'), zipped)
Ejemplo n.º 4
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit(self):
        """
        English number detection for a scaled decimal that is followed by a unit word ('2.3k rupees').

        The expected pair carries only the scaled number ('2.3k' -> '2300') with unit None.
        """
        detector = NumberDetector(entity_name=self.entity_name, language='en')
        values, texts = detector.detect_entity('I bought a car toy for 2.3k rupees')

        detected_pairs = list(zip(values, texts))
        expected_pair = ({'value': '2300', 'unit': None}, u'2.3k')
        self.assertEqual(len(detected_pairs), 1)
        self.assertIn(expected_pair, detected_pairs)
Ejemplo n.º 5
0
    def test_en_number_detection_for_decimal_number_with_scale(self):
        """
        Number detection for english language for decimal number with scale like '1.2 thousand', '2.2k', '1.4m'
        """
        message = 'my monthly salary is 2.2k'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '2200', 'unit': None}, u'2.2k'), zipped)
Ejemplo n.º 6
0
    def test_en_number_detection_for_integer_number_with_scale(self):
        """
        Number detection for english language for integer number with scale like '1 thousand', '1k', '1m'
        """
        message = '1 thousand men were killed in war'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '1000', 'unit': None}, u'1 thousand'), zipped)
Ejemplo n.º 7
0
    def test_en_number_detection_for_decimal_number(self):
        """
        Number detection for english language for decimal number like '100.2'
        """
        message = u'Todays temperature is 11.2 degree celsius'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({'value': '11.2', 'unit': None}, u'11.2'), zipped)
    def __init__(self,
                 entity_name,
                 language,
                 data_directory_path,
                 unit_type=None):
        """
        Set up a number range detector (min and/or max values found in text) for one language.

        Args:
            entity_name (str): name used to build the replacement tag '__<entity_name>__'
            language (str): language code of text
            data_directory_path (str): path of data folder for given language, handed to _init_regex_for_range
            unit_type (str, optional): number unit type like weight, currency, temperature. It is stored on this
                                       object and forwarded to the internal NumberDetector. Defaults to None.
        """
        # What to detect
        self.entity_name = entity_name
        self.language = language
        self.unit_type = unit_type
        self.tag = '__' + entity_name + '__'

        # Text being worked on; all three start out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Lookup tables: range keyword variants and the numbers found in the current text
        self.range_variants_map = {}
        self.number_detected_map = {}

        # Keyword variant lists stay None until _init_regex_for_range fills them
        self.min_range_prefix_variants = None
        self.min_range_suffix_variants = None
        self.max_range_prefix_variants = None
        self.max_range_suffix_variants = None
        self.min_max_range_variants = None

        # Plain number detector used underneath range detection, accepting numbers of 1 to 100 digits
        inner_detector = NumberDetector(entity_name=entity_name,
                                        language=language,
                                        unit_type=unit_type,
                                        detect_without_unit=True)
        self.number_detector = inner_detector
        inner_detector.set_min_max_digits(1, 100)

        # Load range keyword variants from the language data folder
        self._init_regex_for_range(data_directory_path)

        # Default order in which the individual detectors are tried
        self.detector_preferences = [
            self._detect_min_max_num_range,
            self._detect_min_num_range_with_prefix_variants,
            self._detect_min_num_range_with_suffix_variants,
            self._detect_max_num_range_with_prefix_variants,
            self._detect_max_num_range_with_suffix_variants,
            self._detect_absolute_number,
        ]
Ejemplo n.º 9
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit_and_different_unit_type_given(
            self):
        """
        English number detection restricted to unit_type 'currency' on a message whose number is followed
        by 'kg': nothing is expected to be detected.
        """
        detector = NumberDetector(entity_name=self.entity_name,
                                  language='en',
                                  unit_type='currency')
        values, texts = detector.detect_entity('I buys 2.3k kg mango')

        detected_pairs = list(zip(values, texts))
        self.assertEqual(len(detected_pairs), 0)
Ejemplo n.º 10
0
    def test_en_number_detection_for_decimal_number_with_unit(self):
        """
        Number detection for english language for decimal number with unit like '10.2k rupees'
        """
        message = u'my monthly salary is 10.12k rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '10120',
            'unit': 'rupees'
        }, u'10.12k rupees'), zipped)
Ejemplo n.º 11
0
    def test_en_number_detection_for_integer_number_with_scale_and_unit(self):
        """
        Number detection for english language for integer number with scale and unit like 'Rs 1 thousand', '1k Rs'
        """
        message = 'i need 1 thousand rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '1000',
            'unit': 'rupees'
        }, u'1 thousand rupees'), zipped)
Ejemplo n.º 12
0
    def test_en_number_detection_for_decimal_number_with_scale_and_unit_and_unit_type_given(
            self):
        """
        Number detection for english language for decimal number with scale like '1.2 thousand rupees', 'Rupees 2.2k'
        when the detector is restricted to unit_type 'currency'
        """
        message = 'I bought a car toy for 2.3k rupees'
        number_detector_object = NumberDetector(entity_name=self.entity_name,
                                                language='en',
                                                unit_type='currency')
        number_dicts, original_texts = number_detector_object.detect_entity(
            message)

        # zip() returns a lazy iterator on Python 3: materialise it so len() works
        # and the pairs are still available for assertIn afterwards
        zipped = list(zip(number_dicts, original_texts))
        self.assertEqual(len(zipped), 1)
        self.assertIn(({
            'value': '2300',
            'unit': 'rupees'
        }, u'2.3k rupees'), zipped)
Ejemplo n.º 13
0
def resolve_numerals(text, language) -> str:
    """
    Replace numeric occurrences in text with the values reported by NumberDetector.

    Detection runs twice on the same text: once with the detector as configured for ``language`` and once
    with ``language='hi'`` passed to detect_entity. Every detected original text is then substituted
    (case-insensitively, all occurrences) by its detected value.

    Args:
        text (str): processed string with numerals and character constants fixed
        language (str): Language for NumberDetector
    Returns:
        processed_text (str): modified text
    """
    detector = NumberDetector('asr_dummy', language=language)
    # FIXME: Detection fails if text starts with '0' since number detector discards it
    numerals, matched_texts = detector.detect_entity(text=text)
    numerals_hi, matched_texts_hi = detector.detect_entity(text=text, language='hi')
    # Merge both passes, first-pass detections first
    numerals.extend(numerals_hi)
    matched_texts.extend(matched_texts_hi)

    result = text
    for numeral, matched in zip(numerals, matched_texts):
        result = re.sub(re.escape(matched),
                        numeral[NUMBER_DETECTION_RETURN_DICT_VALUE],
                        result,
                        flags=re.IGNORECASE)
    return result
Ejemplo n.º 14
0
 def __init__(self, entity_name, language=ENGLISH_LANG, locale=None):
     """
     Set up phone number detection state.

     Args:
         entity_name (str): name used to build the replacement tag '__<entity_name>__'
         language (str, optional): language code of number text, defaults to ENGLISH_LANG
         locale (str, optional): locale of the country from which you are dialing. Ex: 'en-IN'.
                                 'en-IN' is used when no locale is given.
     """
     # Assigned before the parent constructor runs
     # NOTE(review): presumably the parent constructor reads this attribute - confirm before reordering
     self._supported_languages = NumberDetector.get_supported_languages()
     super(PhoneDetector, self).__init__(language, locale)

     self.language = language
     self.locale = locale if locale else 'en-IN'
     self.text = ''
     self.phone = []
     self.original_phone_text = []
     # Looked up only after self.locale has been set
     self.country_code = self.get_country_code_from_locale()
     self.entity_name = entity_name
     self.tag = '__' + self.entity_name + '__'
Ejemplo n.º 15
0
 def __init__(self, entity_name, language=ENGLISH_LANG):
     """
     Set up phone number detection state.

     Args:
         entity_name (str): name used to build the replacement tag '__<entity_name>__'
         language (str, optional): language code of number text, defaults to ENGLISH_LANG
     """
     # Assigned before the parent constructor runs
     # NOTE(review): presumably the parent constructor reads this attribute - confirm before reordering
     self._supported_languages = NumberDetector.get_supported_languages()
     super(PhoneDetector, self).__init__(language)

     self.language = language
     self.entity_name = entity_name
     # Text buffers all start out empty
     self.text = self.tagged_text = self.processed_text = ''
     # Detected phone numbers and the text they were found as
     self.phone, self.original_phone_text = [], []
     self.tag = '__' + self.entity_name + '__'
Ejemplo n.º 16
0
def number(request):
    """Detect numerals for the given request using NumberDetector.

       Args:
        request: incoming request; its parameters are read with get_parameters_dictionary().

        request params:
           message (str): natural text on which detection logic is to be run. Note: if a structured value is
                          given, detection is run on the structured value instead of the message
           entity_name (str): name of the entity. Also acts as elastic-search dictionary name
                              if entity uses elastic-search lookup
           structured_value (str): Value obtained from any structured elements
                                   (For example, UI elements like form, payload, etc)
           fallback_value (str): If the detection logic fails to detect any value either from structured_value
                             or message then we return a fallback_value as an output.
           bot_message (str): previous message from a bot/agent.
           unit_type (str): restrict number range to detect for some unit types like 'currency', 'temperature'
           min_digit (str): min digit
           max_digit (str): max digit
                            (the digit limits are applied only when both min_digit and max_digit are given)

       Returns:
           HttpResponse: JSON response of the form {'data': <detection output>}; a response with status 500
                         is returned instead when a TypeError is raised while handling the request.

       Example:

           message = "I want to purchase 30 units of mobile and 40 units of Television"
           entity_name = 'number_of_unit'
           structured_value = None
           fallback_value = None
           bot_message = None
           unit_type = None
           min_digit = 1
           max_digit = 2

               >> [{'detection': 'message', 'original_text': '30', 'entity_value': {'value': '30', 'unit': None}},
                   {'detection': 'message', 'original_text': '40', 'entity_value': {'value': '40', 'unit': None}}]


           message = "I want to reserve a table for 3 people"
           entity_name = 'number_of_people'
           structured_value = None
           fallback_value = None
           bot_message = None
           unit_type = None
           min_digit = 1
           max_digit = 6

               >> [{'detection': 'message', 'original_text': 'for 3 people', 'entity_value':
                                                                        {'value': '3', 'unit': 'people'}}]

       """
    try:
        params = get_parameters_dictionary(request)
        entity_name = params[PARAMETER_ENTITY_NAME]
        ner_logger.debug('Start: %s ' % entity_name)

        detector = NumberDetector(entity_name=entity_name,
                                  language=params[PARAMETER_SOURCE_LANGUAGE],
                                  unit_type=params[PARAMETER_NUMBER_UNIT_TYPE])

        # Digit limits are overridden only when both bounds were supplied
        if params[PARAMETER_MIN_DIGITS] and params[PARAMETER_MAX_DIGITS]:
            detector.set_min_max_digits(min_digit=int(params[PARAMETER_MIN_DIGITS]),
                                        max_digit=int(params[PARAMETER_MAX_DIGITS]))

        entity_output = detector.detect(message=params[PARAMETER_MESSAGE],
                                        structured_value=params[PARAMETER_STRUCTURED_VALUE],
                                        fallback_value=params[PARAMETER_FALLBACK_VALUE],
                                        bot_message=params[PARAMETER_BOT_MESSAGE])
        ner_logger.debug('Finished %s : %s ' % (entity_name, entity_output))

    except TypeError as e:
        ner_logger.exception('Exception for numeric: %s ' % e)
        return HttpResponse(status=500)

    response_body = json.dumps({'data': entity_output})
    return HttpResponse(response_body, content_type='application/json')
Ejemplo n.º 17
0
class BaseNumberRangeDetector(object):
    def __init__(self,
                 entity_name,
                 language,
                 data_directory_path,
                 unit_type=None):
        """
        Set up a number range detector (min and/or max values found in text) for one language.

        Args:
            entity_name (str): name used to build the replacement tag '__<entity_name>__'
            language (str): language code of text, used to create the internal NumberDetector
            data_directory_path (str): path of data folder for given language, handed to _init_regex_for_range
            unit_type (str, optional): number unit type like weight, currency, temperature, used to keep only
                                       number ranges with that unit type. If None, ranges are kept irrespective
                                       of units.
        """
        # What to detect
        self.entity_name = entity_name
        self.unit_type = unit_type
        self.tag = '__' + entity_name + '__'

        # Text being worked on; all three start out empty
        self.text = self.tagged_text = self.processed_text = ''

        # Lookup tables: range keyword variants and the numbers found in the current text
        self.range_variants_map = {}
        self.number_detected_map = {}

        # Keyword variant lists stay None until _init_regex_for_range fills them
        self.min_range_prefix_variants = None
        self.min_range_suffix_variants = None
        self.max_range_prefix_variants = None
        self.max_range_suffix_variants = None
        self.min_max_range_variants = None

        # Plain number detector used underneath range detection, accepting numbers of 1 to 100 digits
        inner_detector = NumberDetector(entity_name=entity_name,
                                        language=language)
        self.number_detector = inner_detector
        inner_detector.set_min_max_digits(1, 100)

        # Load range keyword variants from the language data folder
        self._init_regex_for_range(data_directory_path)

        # Default order in which the individual detectors are tried
        self.detector_preferences = [
            self._detect_min_max_num_range,
            self._detect_min_num_range_with_prefix_variants,
            self._detect_min_num_range_with_suffix_variants,
            self._detect_max_num_range_with_prefix_variants,
            self._detect_max_num_range_with_suffix_variants,
        ]

    def _init_regex_for_range(self, data_directory_path):
        """
        Read the number range keyword file and fill the keyword variant lists used to build range regexes.

        Every variant read from the file is stored in range_variants_map together with its position and
        range type. The variants are then split (regex-escaped) into:
             min_range_prefix_variants (list): position -1, min range type
             min_range_suffix_variants (list): position 1, min range type
             max_range_prefix_variants (list): position -1, max range type
             max_range_suffix_variants (list): position 1, max range type
             min_max_range_variants (list): position 0, min-max range type
        Args:
            data_directory_path (str): Data directory path
        Returns:
            None
        """
        keyword_file_path = os.path.join(
            data_directory_path,
            numeral_constant.NUMBER_RANGE_KEYWORD_FILE_NAME)
        keyword_df = pd.read_csv(keyword_file_path, encoding='utf-8')

        for _, row in keyword_df.iterrows():
            variants = get_list_from_pipe_sep_string(
                row[numeral_constant.COLUMN_NUMBER_RANGE_VARIANTS])
            for variant in variants:
                self.range_variants_map[variant] = NumberRangeVariant(
                    position=row[numeral_constant.COLUMN_NUMBER_RANGE_POSITION],
                    range_type=row[numeral_constant.COLUMN_NUMBER_RANGE_RANGE_TYPE])

        def _escaped_variants(position, range_type):
            # Regex-escaped keywords registered for the given position and range type
            return [
                re.escape(variant)
                for variant, info in self.range_variants_map.items()
                if info.position == position and info.range_type == range_type
            ]

        min_type = numeral_constant.NUMBER_RANGE_MIN_TYPE
        max_type = numeral_constant.NUMBER_RANGE_MAX_TYPE

        self.min_range_prefix_variants = _escaped_variants(-1, min_type)
        self.min_range_suffix_variants = _escaped_variants(1, min_type)
        self.max_range_prefix_variants = _escaped_variants(-1, max_type)
        self.max_range_suffix_variants = _escaped_variants(1, max_type)
        self.min_max_range_variants = _escaped_variants(
            0, numeral_constant.NUMBER_RANGE_MIN_MAX_TYPE)

    def _tag_number_in_text(self, processed_text):
        """
        replace number in text with number tag from number_detected_map
        Args:
            processed_text (str): processed text
        Returns:
            (str): text with number replaced with tag
        Examples:
            >>> text = 'i want to buy 3 apples and more than two bananas'
            >>> number_detected_map = {'__number__0': ({'value': '2', 'unit': None}, 'two'),
                                       '__number__1': ({'value': '3', 'unit': None}, '3')}
            >>> self._tag_number_in_text(text)
            i want to buy __number__1 apples and more than __number__0 bananas
        """
        tagged_number_text = processed_text
        sorted_number_detected_map = sorted(
            self.number_detected_map.items(),
            key=lambda kv: len(kv[1].original_text),
            reverse=True)
        for number_tag in sorted_number_detected_map:
            tagged_number_text = tagged_number_text.replace(
                number_tag[1].original_text, number_tag[0], 1)
        return tagged_number_text

    def _get_number_tag_dict(self):
        """
        Run number detection on processed_text and index every detection by a number tag.

        The tag of a detection is numeral_constant.NUMBER_REPLACE_TEXT followed by its position in the
        detector output (starting at 0); the value is a ValueTextPair of entity value and original text.
        Returns:
            (dict): dict containing number tag and their corresponding value and original text
        """
        values, texts = self.number_detector.detect_entity(self.processed_text)
        return {
            numeral_constant.NUMBER_REPLACE_TEXT + str(position): ValueTextPair(
                entity_value=value, original_text=matched_text)
            for position, (value, matched_text) in enumerate(zip(values, texts))
        }

    def _get_original_text_from_tagged_text(self, number_tag_text):
        """
        Return original text value of number tag from number detected map
        Args:
            number_tag_text (str): tagged number
        Returns:
            (str or None): Original value of tagged number if found else None
        """
        original = number_tag_text
        for number_tag in self.number_detected_map:
            original = original.replace(
                number_tag, self.number_detected_map[number_tag].original_text)
        if original == number_tag_text:
            return None
        return original

    def detect_number_range(self, text):
        """
        Detect number ranges in text by running every detector listed in detector_preferences, in order.

        Plain numbers are detected first and replaced by tags in processed_text; each detector then receives
        the results accumulated so far and returns the extended lists.
        Args:
            text(str): text string
        Returns:
            (tuple): a tuple containing
                (list): list containing detected numeric text
                (list): list containing original numeral text
            Both are None when detector_preferences is empty.
        """
        self.text = self.tagged_text = self.processed_text = text
        # Tag lookup has to exist before the numbers in the text are swapped for tags
        self.number_detected_map = self._get_number_tag_dict()
        self.processed_text = self._tag_number_in_text(text)

        detected_ranges = None
        original_texts = None
        for run_detector in self.detector_preferences:
            detected_ranges, original_texts = run_detector(detected_ranges, original_texts)
            self._update_tagged_text(original_texts)
        return detected_ranges, original_texts

    def _get_number_range(self, min_part_match, max_part_match, full_match):
        """
        Build the number range dict and original text for one regex match, resolving number tags through
        number_detected_map. On success the matched text is removed from processed_text.
        Args:
            min_part_match (str or None): tagged min number
            max_part_match (str or None): tagged max number
            full_match (str): text matching regex
        Returns:
            (tuple): a tuple containing
                (dict or None): number range with min value, max value and unit, None if no range was built
                (str or None): original text of the match
        """
        # The match may already have been consumed by an earlier detection
        if full_match not in self.processed_text:
            return None, None

        value_key = numeral_constant.NUMBER_DETECTION_RETURN_DICT_VALUE
        unit_key = numeral_constant.NUMBER_DETECTION_RETURN_DICT_UNIT
        min_value = max_value = unit = None

        if min_part_match and min_part_match in self.number_detected_map:
            detected = self.number_detected_map[min_part_match].entity_value
            min_value, unit = detected[value_key], detected[unit_key]

        if max_part_match and max_part_match in self.number_detected_map:
            detected = self.number_detected_map[max_part_match].entity_value
            # When both parts are present the unit of the max part wins
            max_value, unit = detected[value_key], detected[unit_key]

        if self.unit_type:
            # A unit type was requested: drop matches without a unit or with a unit of another type
            if unit is None or self.number_detector.get_unit_type(unit) != self.unit_type:
                return None, None

        original_text = self._get_original_text_from_tagged_text(full_match)
        if not ((min_value or max_value) and original_text):
            return None, original_text

        self.processed_text = self.processed_text.replace(full_match.strip(), '', 1)
        detected_range = {
            numeral_constant.NUMBER_RANGE_MIN_VALUE: min_value,
            numeral_constant.NUMBER_RANGE_MAX_VALUE: max_value,
            numeral_constant.NUMBER_RANGE_VALUE_UNIT: unit
        }
        return detected_range, original_text.strip()

    def _detect_min_num_range_with_prefix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only min value and keywords which identify value as min present
        before them. Example - More than 2 {'more than' => keyword, '2' => min value},
                               At least seven hundred rupees {'At least' => keyword, 'seven hundred rupees'=>min value}
        Args:
            number_range_list (list, optional): number ranges detected so far; a new list is used if None/empty
            original_list (list, optional): original texts detected so far; a new list is used if None/empty
        Returns:
            (tuple): a tuple containing
                (list): list containing detected numeric text
                (list): list containing original numeral text
        """
        number_range_list = number_range_list or []
        original_list = original_list or []

        if self.min_range_prefix_variants:
            min_prefix_choices = '|'.join(self.min_range_prefix_variants)
            # u'' literal with escaped backslashes instead of ur'': the ur prefix is a syntax error on
            # Python 3, while this spelling yields the same pattern text on both Python 2 and 3
            min_range_start_pattern = re.compile(
                u'((?:{min_prefix_choices})\\s+({number}\\d+))'.format(
                    number=numeral_constant.NUMBER_REPLACE_TEXT,
                    min_prefix_choices=min_prefix_choices), re.UNICODE)
            number_range_matches = min_range_start_pattern.findall(
                self.processed_text)
            # match[0] is the whole "<keyword> <number tag>" text, match[1] only the number tag
            for match in number_range_matches:
                number_range, original_text = self._get_number_range(
                    min_part_match=match[1],
                    max_part_match=None,
                    full_match=match[0])
                if number_range and original_text:
                    number_range_list.append(number_range)
                    original_list.append(original_text)
        return number_range_list, original_list

    def _detect_min_num_range_with_suffix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Method to detect number range containing only min value and keywords which identify value as min present
        after them.
        Args:
            number_range_list (list, optional): number ranges detected so far; a new list is used if None/empty
            original_list (list, optional): original texts detected so far; a new list is used if None/empty
        Returns:
           (tuple): a tuple containing
                (list): list containing detected numeric text
                (list): list containing original numeral text
        """
        number_range_list = number_range_list or []
        original_list = original_list or []

        if self.min_range_suffix_variants:
            min_suffix_choices = '|'.join(self.min_range_suffix_variants)
            # u'' literal with escaped backslashes instead of ur'': the ur prefix is a syntax error on
            # Python 3, while this spelling yields the same pattern text on both Python 2 and 3
            min_range_end_pattern = re.compile(
                u'(({number}\\d+)\\s+(?:{min_suffix_choices}))'.format(
                    number=numeral_constant.NUMBER_REPLACE_TEXT,
                    min_suffix_choices=min_suffix_choices), re.UNICODE)
            number_range_matches = min_range_end_pattern.findall(
                self.processed_text)
            # match[0] is the whole "<number tag> <keyword>" text, match[1] only the number tag
            for match in number_range_matches:
                number_range, original_text = self._get_number_range(
                    min_part_match=match[1],
                    max_part_match=None,
                    full_match=match[0])
                if number_range and original_text:
                    number_range_list.append(number_range)
                    original_list.append(original_text)

        return number_range_list, original_list

    def _detect_max_num_range_with_prefix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Detect number ranges that carry only a max value, where the keyword marking the value as a maximum
        comes before the number.
        Example - less than 2 {'less than' => keyword, '2' => max value},
                  At most seven hundred rupees {'At most' => keyword, 'seven hundred rupees' => max value}

        self.processed_text is searched for any of self.max_range_prefix_variants, then one or more whitespace
        characters, then a number placeholder (numeral_constant.NUMBER_REPLACE_TEXT followed by digits).
        Each match is passed to self._get_number_range with min_part_match=None, and the result is kept only
        when both returned values are truthy.
        NOTE(review): the examples above are written in terms of the user's text; presumably numbers have
        already been replaced by placeholders in self.processed_text before this runs - confirm.

        Args:
            number_range_list (list): number ranges detected so far. A non-empty list is appended to in place;
                                      None or an empty list is replaced by a new list.
            original_list (list): original texts corresponding to number_range_list, handled the same way.
        Returns:
            (tuple): a tuple containing
                (list): number_range_list extended with the number ranges detected here
                (list): original_list extended with the original texts of those ranges
        """
        number_range_list = number_range_list or []
        original_list = original_list or []

        # No prefix keywords configured: nothing to search for.
        if not self.max_range_prefix_variants:
            return number_range_list, original_list

        max_prefix_choices = '|'.join(self.max_range_prefix_variants)
        # The pattern is spelled as u'' with doubled backslashes instead of ur'': the ur prefix is a
        # SyntaxError on Python 3, while this form produces the identical pattern text on Python 2 and 3.
        max_range_start_pattern = re.compile(
            u'((?:{max_prefix_choices})\\s+({number}\\d+))'.format(
                number=numeral_constant.NUMBER_REPLACE_TEXT,
                max_prefix_choices=max_prefix_choices), re.UNICODE)

        # findall returns one tuple per match: index 0 is the whole "<keyword> <number>" span,
        # index 1 is the number placeholder alone.
        for match in max_range_start_pattern.findall(self.processed_text):
            number_range, original_text = self._get_number_range(
                min_part_match=None,
                max_part_match=match[1],
                full_match=match[0])
            if number_range and original_text:
                number_range_list.append(number_range)
                original_list.append(original_text)

        return number_range_list, original_list

    def _detect_max_num_range_with_suffix_variants(self,
                                                   number_range_list=None,
                                                   original_list=None):
        """
        Detect number ranges that carry only a max value, where the keyword marking the value as a maximum
        comes after the number.

        self.processed_text is searched for a number placeholder (numeral_constant.NUMBER_REPLACE_TEXT followed
        by digits), then one or more whitespace characters, then any of self.max_range_suffix_variants.
        Each match is passed to self._get_number_range with min_part_match=None, and the result is kept only
        when both returned values are truthy.

        Args:
            number_range_list (list): number ranges detected so far. A non-empty list is appended to in place;
                                      None or an empty list is replaced by a new list.
            original_list (list): original texts corresponding to number_range_list, handled the same way.
        Returns:
            (tuple): a tuple containing
                (list): number_range_list extended with the number ranges detected here
                (list): original_list extended with the original texts of those ranges
        """
        number_range_list = number_range_list or []
        original_list = original_list or []

        # No suffix keywords configured: nothing to search for.
        if not self.max_range_suffix_variants:
            return number_range_list, original_list

        max_suffix_choices = '|'.join(self.max_range_suffix_variants)
        # The pattern is spelled as u'' with doubled backslashes instead of ur'': the ur prefix is a
        # SyntaxError on Python 3, while this form produces the identical pattern text on Python 2 and 3.
        max_range_end_pattern = re.compile(
            u'(({number}\\d+)\\s+(?:{max_suffix_choices}))'.format(
                number=numeral_constant.NUMBER_REPLACE_TEXT,
                max_suffix_choices=max_suffix_choices), re.UNICODE)

        # findall returns one tuple per match: index 0 is the whole "<number> <keyword>" span,
        # index 1 is the number placeholder alone.
        for match in max_range_end_pattern.findall(self.processed_text):
            number_range, original_text = self._get_number_range(
                min_part_match=None,
                max_part_match=match[1],
                full_match=match[0])
            if number_range and original_text:
                number_range_list.append(number_range)
                original_list.append(original_text)

        return number_range_list, original_list

    def _detect_min_max_num_range(self,
                                  number_range_list=None,
                                  original_list=None):
        """
        Detect number ranges that carry both a min and a max value with a range keyword between them.
        Example - 2000 to 30000 {'to' => keyword, '2000' => min value, '30000' => max value},
                  2k-3k rupees {'-' => keyword, '2k' => min value, '3k' => max value}

        self.processed_text is searched for two number placeholders (numeral_constant.NUMBER_REPLACE_TEXT
        followed by digits) separated by any of self.min_max_range_variants, with optional whitespace on either
        side of the keyword. Each match is passed to self._get_number_range, and the result is kept only when
        both returned values are truthy.
        NOTE(review): the examples above are written in terms of the user's text; presumably numbers have
        already been replaced by placeholders in self.processed_text before this runs - confirm.

        Args:
            number_range_list (list): number ranges detected so far. A non-empty list is appended to in place;
                                      None or an empty list is replaced by a new list.
            original_list (list): original texts corresponding to number_range_list, handled the same way.
        Returns:
            (tuple): a tuple containing
                (list): number_range_list extended with the number ranges detected here
                (list): original_list extended with the original texts of those ranges
        """
        number_range_list = number_range_list or []
        original_list = original_list or []

        # No range keywords configured: nothing to search for.
        if not self.min_max_range_variants:
            return number_range_list, original_list

        min_max_choices = '|'.join(self.min_max_range_variants)
        # The pattern is spelled as u'' with doubled backslashes instead of ur'': the ur prefix is a
        # SyntaxError on Python 3, while this form produces the identical pattern text on Python 2 and 3.
        # The two adjacent literals are concatenated before .format() is applied.
        min_max_range_pattern = re.compile(
            u'(({number}\\d+)\\s*(?:{min_max_choices})\\s*'
            u'({number}\\d+))'.format(
                number=numeral_constant.NUMBER_REPLACE_TEXT,
                min_max_choices=min_max_choices), re.UNICODE)

        # findall returns one tuple per match: index 0 is the whole "<min> <keyword> <max>" span,
        # index 1 is the first (min) placeholder and index 2 the second (max) placeholder.
        for match in min_max_range_pattern.findall(self.processed_text):
            number_range, original_text = self._get_number_range(
                min_part_match=match[1],
                max_part_match=match[2],
                full_match=match[0])
            if number_range and original_text:
                number_range_list.append(number_range)
                original_list.append(original_text)

        return number_range_list, original_list

    def _update_tagged_text(self, original_number_list):
        """
        Substitute self.tag for every detected substring inside self.tagged_text.

        The substrings are processed one after another in list order, and each one has all of its
        occurrences in self.tagged_text replaced. Only the tagged_text attribute is written by this method.

        Args:
            original_number_list (list): substrings of the text that should be replaced with self.tag
        """
        for snippet in original_number_list:
            current_text = self.tagged_text
            self.tagged_text = current_text.replace(snippet, self.tag)
Ejemplo n.º 18
0
def get_currency(text, detected_lang):
    """
    Run a NumberDetector restricted to the 'currency' unit type over the given text.

    Args:
        text: text passed to NumberDetector.detect_entity
        detected_lang: value passed to NumberDetector as its `language` argument
    Returns:
        Whatever NumberDetector.detect_entity returns for `text`
        (NOTE(review): presumably a pair of detected values and their original texts - confirm).
    """
    # Imported inside the function so that the dependency is only loaded when the function is called.
    from ner_v2.detectors.numeral.number.number_detection import NumberDetector

    currency_detector = NumberDetector(
        entity_name='number',
        language=detected_lang,
        unit_type='currency')
    return currency_detector.detect_entity(text)