def _detect_number_from_words(self, number_list=None, original_list=None):
    """
    Detect numbers written as number words, for example "two thousand" or "one hundred twenty two".

    How it works:
    The processed text is split on '-' and ':' and every piece is handed to the number word
    detector, which returns the numeric values together with the original words they were
    detected from. When ``self.unit_type`` is set, ``self._get_unit_from_text`` is additionally
    called for every detected text to look for a unit next to it; it returns the unit and the
    (possibly updated) original text.

    Args:
        number_list (list): list containing already detected numeric text (optional)
        original_list (list): list containing already detected original numeral text (optional)

    Returns:
        tuple: ``(number_list, original_list)`` where
            number_list (list): list of dicts holding the detected value (as a string) and unit
            original_list (list): list of the original texts the numbers were detected from

    Examples:
        [In]  >> self.processed_text = "one hundred two"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '102', 'unit': None}], ['one hundred two'])

        [In]  >> self.processed_text = "two hundred - three hundred"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '200', 'unit': None}, {'value': '300', 'unit': None}],
                  ['two hundred', 'three hundred'])

        [In]  >> self.processed_text = "one two three"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '1', 'unit': None}, {'value': '2', 'unit': None}, {'value': '3', 'unit': None}],
                  ['one', 'two', 'three'])

    *Notes*
        Some Limitations:
        i) Cannot detect decimals without the integer part. E.g. .25, .5, etc
        ii) Cannot detect one with "a/an". E.g. I want an apple
        iii) Detects wrong for multiple scales mentioned consecutively E.g. three hundred thousand,
             hundred thousand
    """
    number_list = number_list or []
    original_list = original_list or []

    # Splitting text based on "-" and ":", as in case of text "two thousand-three thousand", simple
    # whitespace splitting gives [two, thousand-three, thousand] and the number word detector returns
    # a wrong result. Separating on '-' or ':' yields [two thousand, three thousand] instead.
    for numeral_text in re.split(r'[\-\:]', self.processed_text):
        numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map)
        for number, original_text in zip(numbers, original_texts):
            unit = None
            if self.unit_type:
                unit, original_text = self._get_unit_from_text(original_text, numeral_text)
            # Tag only ONE occurrence per detection. Replacing every occurrence would also erase
            # later repetitions of the same words (e.g. the second "one kg" in "one kg one kg")
            # before the unit lookup for those repetitions gets to see them.
            numeral_text = numeral_text.replace(original_text, self.tag, 1)
            number_list.append({
                NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
                NUMBER_DETECTION_RETURN_DICT_UNIT: unit
            })
            original_list.append(original_text)
    return number_list, original_list
def test_get_number_with_scale_and_unit_in_number_word(self):
    """
    A number word that mixes units and scales, such as 'one thousand two hundred five',
    must be detected as a single number.
    """
    sentence = 'haptik get one thousand two hundred five messages daily'
    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    # Exactly one (value, original text) pair is expected for this sentence.
    self.assertEqual(len(detected), 1)
    self.assertIn((1205, 'one thousand two hundred five'), detected)
def test_get_number_with_only_scale_in_number_word(self):
    """
    A bare scale word such as 'hundred' or 'thousand' must be detected on its own.
    """
    sentence = 'need hundred change'
    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    # Exactly one (value, original text) pair is expected for this sentence.
    self.assertEqual(len(detected), 1)
    self.assertIn((100, 'hundred'), detected)
def test_get_number_with_only_unit_in_number_word(self):
    """
    A bare unit word such as 'one', 'two' or 'twenty' must be detected on its own.
    """
    sentence = 'I want to book for one passenger'
    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    # Exactly one (value, original text) pair is expected for this sentence.
    self.assertEqual(len(detected), 1)
    self.assertIn((1, 'one'), detected)
def test_get_number_with_multiple_spaces_in_unit_scale_number_word(self):
    """
    A unit/scale number word spread over several words ('one thousand one hundred two')
    must be detected as a single number.
    """
    # NOTE(review): the test name refers to multiple spaces, but the message below only contains
    # single spaces - confirm the literal was not whitespace-normalised at some point.
    sentence = 'there are one thousand one hundred two students attending placement drive'
    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    # Exactly one (value, original text) pair is expected for this sentence.
    self.assertEqual(len(detected), 1)
    self.assertIn((1102, 'one thousand one hundred two'), detected)
def test_get_number_with_multiple_unit_scale_number_word(self):
    """
    Two unit/scale number words in one message ('one thousand to one hundred two')
    must be detected as two separate numbers.
    """
    sentence = 'one thousand to one hundred two'
    expected_pairs = [(1000, 'one thousand'), (102, 'one hundred two')]

    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    self.assertEqual(len(detected), 2)
    for pair in expected_pairs:
        self.assertIn(pair, detected)
def test_get_number_with_multiple_unit_number_word(self):
    """
    Consecutive unit words ('one two three') must be detected as separate numbers,
    one per word.
    """
    sentence = 'one two three'
    expected_pairs = [(1, 'one'), (2, 'two'), (3, 'three')]

    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    self.assertEqual(len(detected), 3)
    for pair in expected_pairs:
        self.assertIn(pair, detected)
def test_get_number_with_combination_of_unit_scale_and_unit_number_word(self):
    """
    A unit/scale number followed by plain unit words ('one thousand one two three')
    must yield the combined number first and the remaining unit words separately.
    """
    sentence = 'one thousand one two three'
    expected_pairs = [(1001, 'one thousand one'), (2, 'two'), (3, 'three')]

    values, words = get_number_from_number_word(sentence, self.number_words_map)
    detected = list(zip(values, words))

    self.assertEqual(len(detected), 3)
    for pair in expected_pairs:
        self.assertIn(pair, detected)
def _detect_number_from_words(self, number_list=None, original_list=None):
    """
    Detect numbers written as number words, for example "two thousand" or "one hundred twenty two",
    and record where each of them was found.

    How it works:
    The processed text is split on '-' and ':' and every piece is handed to the number word
    detector, which returns the numeric values together with the original words they were
    detected from. Each original text is then located in ``self.text`` (scanning left to right)
    to obtain its span. When ``self.unit_type`` is set, ``self._get_unit_from_text`` is
    additionally called to look for a unit next to the detected text; it returns the unit and the
    (possibly updated) original text. A detection is only recorded when its text matches inside
    the piece according to ``self._SPAN_BOUNDARY_TEMPLATE``; the first such match is then replaced
    with ``self.tag``.

    Args:
        number_list (list): list containing already detected numeric text (optional)
        original_list (list): list containing already detected original numeral text (optional)

    Returns:
        tuple: ``(number_list, original_list)`` where
            number_list (list): list of dicts holding the detected value (as a string), the unit
                                and the span (a ``(start, end)`` tuple, or None when the text
                                could not be located in ``self.text``)
            original_list (list): list of the original texts the numbers were detected from

    Examples (span entries omitted for brevity):
        [In]  >> self.processed_text = "one hundred two"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '102', 'unit': None}], ['one hundred two'])

        [In]  >> self.processed_text = "two hundred - three hundred"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '200', 'unit': None}, {'value': '300', 'unit': None}],
                  ['two hundred', 'three hundred'])

        [In]  >> self.processed_text = "one two three"
        [In]  >> _detect_number_from_words()
        [Out] >> ([{'value': '1', 'unit': None}, {'value': '2', 'unit': None}, {'value': '3', 'unit': None}],
                  ['one', 'two', 'three'])

    *Notes*
        Some Limitations:
        i) Cannot detect decimals without the integer part. E.g. .25, .5, etc
        ii) Cannot detect one with "a/an". E.g. I want an apple
        iii) Detects wrong for multiple scales mentioned consecutively E.g. three hundred thousand,
             hundred thousand
    """
    number_list = number_list or []
    original_list = original_list or []

    # Running offset used to turn match positions in the remaining text into absolute positions.
    # It starts at -1, so reported offsets are one less than the index inside self.text.
    # NOTE(review): presumably self.text carries one leading padding character - confirm.
    end_span = -1
    spanned_text = self.text

    # Splitting text based on "-" and ":", as in case of text "two thousand-three thousand", simple
    # whitespace splitting gives [two, thousand-three, thousand] and the number word detector returns
    # a wrong result. Separating on '-' or ':' yields [two thousand, three thousand] instead.
    for numeral_text in re.split(r'[\-\:]', self.processed_text):
        numbers, original_texts = get_number_from_number_word(numeral_text, self.numbers_word_map)

        # Spans are collected per piece so they stay aligned with this piece's numbers and
        # original texts; only the running offset and the remaining text carry over.
        segment_spans = []
        for original in original_texts:
            # The detected words are literal text, so escape them before using them as a pattern.
            # NOTE(review): matching ignores case on the assumption that processed_text may differ
            # from self.text in letter case - confirm.
            match = re.search(re.escape(original), spanned_text, flags=re.IGNORECASE)
            if match is None:
                # Text could not be located: keep the alignment but report no span.
                segment_spans.append(None)
                continue
            relative_start, relative_end = match.span()
            start_span = end_span + relative_start
            end_span += relative_end
            spanned_text = spanned_text[relative_end:]
            segment_spans.append((start_span, end_span))

        # Detections are processed in the order returned by the number word detector.
        for number, original_text, span in zip(numbers, original_texts, segment_spans):
            unit = None
            if self.unit_type:
                unit, original_text = self._get_unit_from_text(original_text, numeral_text)
            _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(original_text)),
                                  flags=_re_flags)
            if _pattern.search(numeral_text):
                # Tag only the first match so repeated number words remain available for the
                # detections that follow.
                numeral_text = _pattern.sub(self.tag, numeral_text, 1)
                number_list.append({
                    NUMBER_DETECTION_RETURN_DICT_VALUE: str(number),
                    NUMBER_DETECTION_RETURN_DICT_UNIT: unit,
                    NUMBER_DETECTION_RETURN_DICT_SPAN: span
                })
                original_list.append(original_text)
    return number_list, original_list