def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern):
    """Run RegexDetector with ``pattern`` and wrap whatever it finds.

    Detection runs on ``structured_value`` when it is truthy, otherwise on
    ``message``. The source the result came from is recorded by the
    detection constant handed to ``output_entity_dict_list``.

    Args:
        message (str): natural text to run detection on; only used when
            ``structured_value`` is falsy.
        entity_name (str): name of the entity, forwarded to RegexDetector.
        structured_value (str): value obtained from structured elements
            (for example UI elements like form, payload, etc). When truthy,
            detection runs on it instead of ``message``; if nothing in it
            matches, it is returned as-is, marked as not verified.
        fallback_value (str): value returned when ``structured_value`` is
            falsy and nothing is detected in ``message``.
        bot_message (str): previous message from a bot/agent. Not used by
            this function.
        pattern (str): regular expression pattern forwarded to RegexDetector.

    Returns:
        The result of ``output_entity_dict_list`` for the branch taken (the
        example below shows a list of dicts with ``detection``,
        ``original_text`` and ``entity_value`` keys), or None when
        ``structured_value`` is falsy, nothing is detected in ``message``
        and ``fallback_value`` is falsy.

    Example:
        message = 'abc123'
        entity_name = 'numerals'
        pattern = '\\d+'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name, structured_value=structured_value,
                           fallback_value=fallback_value, bot_message=bot_message, pattern=pattern)
        print output

            >> [{'detection': 'message', 'original_text': '123', 'entity_value': {'value': '123'}}]
    """
    detector = RegexDetector(entity_name=entity_name, pattern=pattern)

    # Structured input takes precedence: the message is never inspected when it is present.
    if structured_value:
        found_values, found_texts = detector.detect_entity(text=structured_value)
        if not found_values:
            # Nothing matched: hand back the raw structured value, flagged as unverified.
            return output_entity_dict_list([structured_value], [structured_value],
                                           FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(found_values, found_texts, FROM_STRUCTURE_VALUE_VERIFIED)

    found_values, found_texts = detector.detect_entity(text=message)
    if found_values:
        return output_entity_dict_list(found_values, found_texts, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
    return None
def get_regex(message, entity_name, structured_value, fallback_value, bot_message, regex):
    """Use RegexDetector to detect text that abides by the specified regex.

    Detection runs on ``structured_value`` when it is truthy, otherwise on
    ``message``.

    Args:
        message (str): natural text to run detection on; only used when
            ``structured_value`` is falsy.
        entity_name (str): name of the entity, forwarded to RegexDetector.
        structured_value (str): value obtained from structured elements.
            When truthy, detection runs on it instead of ``message``; if
            nothing in it matches, it is returned as-is, marked as not
            verified.
        fallback_value (str): value returned when ``structured_value`` is
            falsy and nothing is detected in ``message``.
        bot_message (str): previous message from a bot/agent. Not used by
            this function.
        regex (str): regular expression forwarded to RegexDetector.

    Returns:
        The result of ``output_entity_dict_list`` when something was
        detected, the result of ``output_entity_dict_value`` when the raw
        structured value or the fallback value is returned, or None when
        ``structured_value`` is falsy, nothing is detected in ``message``
        and ``fallback_value`` is falsy.

    Example:
        message = 'abc123'
        entity_name = 'regex'
        regex = '\\d'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name, structured_value=structured_value,
                           fallback_value=fallback_value, bot_message=bot_message, regex=regex)
        print output

            # Output as recorded in the original documentation (not re-verified here):
            >> [{'detection': 'message', 'original_text': '1', 'entity_value': {'value': '1'}}]
    """
    # Pass the value as a logger argument instead of pre-formatting with `%`:
    # formatting is deferred until the record is emitted and cannot raise
    # here if `regex` is ever a tuple.
    ner_logger.debug("BEFORE AST LITERAL REGEX>>>>>>%s", regex)
    ner_logger.debug("REGEX>>>>>>%s", regex)
    regex_detection = RegexDetector(entity_name=entity_name, regex=regex)

    # Structured input takes precedence: the message is never inspected when it is present.
    if structured_value:
        entity_list, original_text_list = regex_detection.detect_entity(text=structured_value)
        if entity_list:
            return output_entity_dict_list(entity_list, original_text_list, FROM_STRUCTURE_VALUE_VERIFIED)
        # Nothing matched: hand back the raw structured value, flagged as unverified.
        return output_entity_dict_value(structured_value, structured_value, FROM_STRUCTURE_VALUE_NOT_VERIFIED)

    entity_list, original_text_list = regex_detection.detect_entity(text=message)
    if entity_list:
        return output_entity_dict_list(entity_list, original_text_list, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value, FROM_FALLBACK_VALUE)
    return None
def test_non_empty_matches(self):
    """Test if RegexDetector returns only non empty matches.

    The text contains no digits, so the only thing the pattern can match is
    the empty string. The detector is expected to report no values, no
    original texts, and to leave the tagged text equal to the input.
    """
    entity_name = 'test'
    # The empty alternative in the group makes this pattern match the empty string.
    pattern = '\\b(\\d+|)\\b'
    text = 'there are no numbers in this text! but the pattern is bad too, it matches empty string'
    regex_detector = RegexDetector(entity_name=entity_name, pattern=pattern)

    values, original_texts = regex_detector.detect_entity(text)

    self.assertEqual(regex_detector.tagged_text, text)
    self.assertEqual(values, [])
    self.assertEqual(original_texts, [])
def test_recursive_replace(self):
    """Test protection against MemoryError when replacing in RegexDetector.

    The entity name ('abab') contains the matched token ('ab'), so the tag
    written into the text itself contains text resembling a match. Every
    one of the repeated tokens is expected to be replaced by the tag
    exactly once.
    """
    repeats = 30
    entity_name = 'abab'
    tag = '__{}__'.format(entity_name)
    token = 'ab'
    text = ' '.join([token] * repeats)
    expected_matches = [token] * repeats
    expected_tagged_text = ' '.join([tag] * repeats)

    detector = RegexDetector(entity_name=entity_name, pattern='\\bab\\b')
    values, original_texts = detector.detect_entity(text)

    self.assertEqual(detector.tagged_text, expected_tagged_text)
    self.assertEqual(values, expected_matches)
    self.assertEqual(original_texts, expected_matches)
def test_dot_star(self):
    """Test .* pattern for RegexDetector.

    Covers two configurations: the default flags, where each line of the
    text is expected as a separate match, and the default flags combined
    with DOTALL, where the whole text is expected as a single match.
    """
    entity_name = 'test'
    tag = '__{}__'.format(entity_name)
    pattern = '.*'
    text = 'hello world\nlorem ipsum dolor sit amet\ntest with new lines and stuff .^!@"#$%^&*(){}[]:?><\n'

    # Default flags: one match per line, with the newlines left in place in the tagged text.
    expected_lines = [
        'hello world',
        'lorem ipsum dolor sit amet',
        'test with new lines and stuff .^!@"#$%^&*(){}[]:?><',
    ]
    line_detector = RegexDetector(entity_name=entity_name, pattern=pattern)
    values, original_texts = line_detector.detect_entity(text)
    self.assertEqual(line_detector.tagged_text, (tag + '\n') * 3)
    self.assertEqual(values, expected_lines)
    self.assertEqual(original_texts, expected_lines)

    # DOTALL: the whole text, newlines included, is a single match.
    dotall_detector = RegexDetector(entity_name=entity_name,
                                    re_flags=RegexDetector.DEFAULT_FLAGS | re.DOTALL,
                                    pattern=pattern)
    values, original_texts = dotall_detector.detect_entity(text)
    self.assertEqual(dotall_detector.tagged_text, tag)
    self.assertEqual(values, [text])
    self.assertEqual(original_texts, [text])
def test_nested_character_group_compile(self):
    """Test compiling patterns that fail with regex.V1 but work with regex.V0.

    For each pattern the compiled pattern must carry the V0 flag and not
    the V1 flag, and detection must return the expected bracket matches.
    """
    entity_name = 'test'
    # (pattern, text, expected matches)
    cases = [
        (
            '[[\\]]',
            'this pattern should extract box brackets [] [][[[ ]]]]]',
            # Every individual bracket in the text, in order of appearance.
            list('[][][[[]]]]]'),
        ),
        (
            '[[]]',
            'this pattern should extract box brackets pairs [] [][[[ ]]]]]',
            ['[]', '[]'],
        ),
    ]

    for pattern, text, expected_matches in cases:
        detector = RegexDetector(entity_name=entity_name, re_flags=RegexDetector.DEFAULT_FLAGS, pattern=pattern)
        compiled_flags = detector.pattern.flags
        self.assertEqual(compiled_flags & re.V1, 0)
        self.assertNotEqual(compiled_flags & re.V0, 0)

        values, original_texts = detector.detect_entity(text)
        self.assertEqual(values, expected_matches)
        self.assertEqual(original_texts, expected_matches)
def test_max_matches(self):
    """Test max_matches argument for RegexDetector.

    Runs the same text through a detector limited to 3 matches and through
    one limited to 50 (more than the text contains). Only the first
    ``max_matches`` numbers are expected to be returned and tagged.
    """
    entity_name = 'num'
    tag = '__{}__'.format(entity_name)
    # The empty alternative in the group also matches the empty string;
    # the expectations below contain numbers only.
    pattern = '\\b(\\d+|)\\b'
    text = ('there are some numbers like 345 and 2342, but the pattern is bad too it matches empty string! We '
            'will now sprinkle this text with numbers 34634653 42342345234 12433345325 to test 17293847 345 2342')
    numbers_in_text = ['345', '2342', '34634653', '42342345234', '12433345325', '17293847', '345', '2342']

    # (max_matches, expected matches, expected tagged text)
    cases = [
        (
            3,
            numbers_in_text[:3],
            ('there are some numbers like {t} and {t}, but the pattern is bad too '
             'it matches empty string! We will now sprinkle this text with'
             ' numbers {t} 42342345234 12433345325 to test 17293847 345 2342').format(t=tag),
        ),
        (
            50,
            numbers_in_text,
            ('there are some numbers like {t} and {t}, but the pattern is bad too '
             'it matches empty string! We will now sprinkle this text with'
             ' numbers {t} {t} {t} to test {t} {t} {t}').format(t=tag),
        ),
    ]

    for limit, expected_matches, expected_tagged_text in cases:
        detector = RegexDetector(entity_name=entity_name, pattern=pattern, max_matches=limit)
        values, original_texts = detector.detect_entity(text)
        self.assertEqual(detector.tagged_text, expected_tagged_text)
        self.assertEqual(values, expected_matches)
        self.assertEqual(original_texts, expected_matches)
def test_invalid_pattern_compile(self):
    """Test compiling invalid pattern raises re.error.

    The pattern has an unbalanced opening parenthesis, so constructing the
    detector is expected to fail.
    """
    self.assertRaises(re.error, RegexDetector, entity_name='test', pattern='(invalid!')