def get_regex(message, entity_name, structured_value, fallback_value, bot_message, pattern):
    """Detect text matching ``pattern`` with RegexDetector and wrap the result.

    Detection runs on ``structured_value`` when it is truthy; otherwise it runs on
    ``message``.

    Args:
        message (str): natural text on which detection is run. Only used when
                       ``structured_value`` is falsy.
        entity_name (str): name of the entity; passed to RegexDetector.
        structured_value (str): value obtained from a structured element (for example
                                UI elements like form, payload, etc). When truthy,
                                detection runs on it instead of ``message``. If nothing
                                matches, the value itself is returned, flagged with
                                FROM_STRUCTURE_VALUE_NOT_VERIFIED.
        fallback_value (str): value returned (flagged with FROM_FALLBACK_VALUE) when
                              detection on ``message`` finds nothing.
        bot_message (str): previous message from a bot/agent. Not used by this function.
        pattern (str): regular expression pattern handed to RegexDetector.

    Returns:
        Whatever ``output_entity_dict_list`` builds for the selected values, or None
        when ``structured_value`` is falsy, nothing is detected in ``message`` and
        ``fallback_value`` is falsy. Judging by the example below this is a list of
        dicts with the keys 'detection', 'original_text' and 'entity_value'
        (NOTE(review): confirm against output_entity_dict_list).

    Example:

        message = 'abc123'
        entity_name = 'numerals'
        pattern = '\\d+'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name, structured_value=structured_value,
                           fallback_value=fallback_value, bot_message=bot_message, pattern=pattern)
        print(output)

            >> [{'detection': 'message', 'original_text': '123', 'entity_value': {'value': '123'}}]

    """
    detector = RegexDetector(entity_name=entity_name, pattern=pattern)

    # A structured value always takes precedence over the free-text message.
    if structured_value:
        values, texts = detector.detect_entity(text=structured_value)
        if not values:
            # No match: hand the raw structured value back, marked as unverified.
            return output_entity_dict_list([structured_value], [structured_value], FROM_STRUCTURE_VALUE_NOT_VERIFIED)
        return output_entity_dict_list(values, texts, FROM_STRUCTURE_VALUE_VERIFIED)

    values, texts = detector.detect_entity(text=message)
    if values:
        return output_entity_dict_list(values, texts, FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_list([fallback_value], [fallback_value], FROM_FALLBACK_VALUE)
    return None
Example #2
0
def get_regex(message, entity_name, structured_value, fallback_value,
              bot_message, regex):
    """Detect text matching ``regex`` with RegexDetector and wrap the result.

    Detection runs on ``structured_value`` when it is truthy; otherwise it runs on
    ``message``.

    Args:
        message (str): natural text on which detection is run. Only used when
                       ``structured_value`` is falsy.
        entity_name (str): name of the entity; passed to RegexDetector.
        structured_value (str): value obtained from a structured element. When truthy,
                                detection runs on it instead of ``message``. If nothing
                                matches, the value itself is returned through
                                ``output_entity_dict_value``, flagged with
                                FROM_STRUCTURE_VALUE_NOT_VERIFIED.
        fallback_value (str): value returned through ``output_entity_dict_value``
                              (flagged with FROM_FALLBACK_VALUE) when detection on
                              ``message`` finds nothing.
        bot_message (str): previous message from a bot/agent. Not used by this function.
        regex (str): regular expression handed to RegexDetector.

    Returns:
        The result of ``output_entity_dict_list`` when something is detected, the
        result of ``output_entity_dict_value`` for the unverified structured value or
        the fallback value, or None when ``structured_value`` is falsy, nothing is
        detected and ``fallback_value`` is falsy.

    Example:

        message = 'abc123'
        entity_name = 'regex'
        regex = '\\d'
        structured_value = None
        fallback_value = None
        bot_message = None
        output = get_regex(message=message, entity_name=entity_name, structured_value=structured_value,
                           fallback_value=fallback_value, bot_message=bot_message, regex=regex)
        print(output)

        Each detected element is expected to look like
        {'detection': 'message', 'original_text': '1', 'entity_value': {'value': '1'}}
        (NOTE(review): confirm the exact shape against output_entity_dict_list).

    """
    # Pass the value as a logger argument so formatting only happens when DEBUG
    # logging is enabled; the emitted messages are unchanged.
    ner_logger.debug("BEFORE AST LITERAL REGEX>>>>>>%s", regex)
    ner_logger.debug("REGEX>>>>>>%s", regex)
    regex_detection = RegexDetector(entity_name=entity_name, regex=regex)

    # A structured value always takes precedence over the free-text message.
    if structured_value:
        entity_list, original_text_list = regex_detection.detect_entity(
            text=structured_value)
        if entity_list:
            return output_entity_dict_list(entity_list, original_text_list,
                                           FROM_STRUCTURE_VALUE_VERIFIED)
        # No match: hand the raw structured value back, marked as unverified.
        return output_entity_dict_value(structured_value, structured_value,
                                        FROM_STRUCTURE_VALUE_NOT_VERIFIED)

    entity_list, original_text_list = regex_detection.detect_entity(
        text=message)
    if entity_list:
        return output_entity_dict_list(entity_list, original_text_list,
                                       FROM_MESSAGE)
    if fallback_value:
        return output_entity_dict_value(fallback_value, fallback_value,
                                        FROM_FALLBACK_VALUE)

    return None
    def test_non_empty_matches(self):
        """Test if RegexDetector returns only non empty matches.

        The pattern has an empty alternative and the text contains no digits, so the
        test expects no values, no original texts, and a tagged text identical to
        the input.
        """
        entity_name = 'test'
        pattern = '\\b(\\d+|)\\b'
        text = 'there are no numbers in this text! but the pattern is bad too, it matches empty string'

        regex_detector = RegexDetector(entity_name=entity_name,
                                       pattern=pattern)
        values, original_texts = regex_detector.detect_entity(text)

        # Nothing should have been tagged, so the text must come back unchanged.
        self.assertEqual(regex_detector.tagged_text, text)
        self.assertEqual(values, [])
        self.assertEqual(original_texts, [])
    def test_recursive_replace(self):
        """Test protection against MemoryError when replacing in RegexDetector.

        The entity name ('abab') contains the text being matched ('ab'), so the tag
        written into the tagged text itself contains the matched substring.
        """
        repeat = 30
        entity_name = 'abab'
        tag = '__{}__'.format(entity_name)
        matches = ['ab'] * repeat

        detector = RegexDetector(entity_name=entity_name, pattern='\\bab\\b')
        values, original_texts = detector.detect_entity(' '.join(matches))

        # Every occurrence is expected to be replaced by exactly one tag.
        self.assertEqual(detector.tagged_text, ' '.join([tag] * repeat))
        self.assertEqual(values, matches)
        self.assertEqual(original_texts, matches)
    def test_dot_star(self):
        """Test .* pattern for RegexDetector, with and without re.DOTALL."""
        entity_name = 'test'
        tag = '__{}__'.format(entity_name)
        pattern = '.*'
        text = 'hello world\nlorem ipsum dolor sit amet\ntest with new lines and stuff .^!@"#$%^&*(){}[]:?><\n'

        # Default flags: the test expects one match per line, newlines left untagged.
        per_line = [
            'hello world',
            'lorem ipsum dolor sit amet',
            'test with new lines and stuff .^!@"#$%^&*(){}[]:?><',
        ]
        detector = RegexDetector(entity_name=entity_name, pattern=pattern)
        values, original_texts = detector.detect_entity(text)
        self.assertEqual(detector.tagged_text, '{t}\n{t}\n{t}\n'.format(t=tag))
        self.assertEqual(values, per_line)
        self.assertEqual(original_texts, per_line)

        # With DOTALL added: the test expects the whole text as a single match.
        dotall_flags = RegexDetector.DEFAULT_FLAGS | re.DOTALL
        detector = RegexDetector(entity_name=entity_name, re_flags=dotall_flags, pattern=pattern)
        values, original_texts = detector.detect_entity(text)
        self.assertEqual(detector.tagged_text, '{t}'.format(t=tag))
        self.assertEqual(values, [text])
        self.assertEqual(original_texts, [text])
    def test_nested_character_group_compile(self):
        """Test compiling patterns that fail with regex.V1 but work with regex.V0.

        Each case checks that the compiled pattern carries the V0 flag and not V1,
        then compares the detected values and original texts with the expectation.
        """
        entity_name = 'test'
        # (pattern, text, expected matches) for each scenario, run in order.
        cases = [
            (
                '[[\\]]',
                'this pattern should extract box brackets [] [][[[ ]]]]]',
                ['[', ']', '[', ']', '[', '[', '[', ']', ']', ']', ']', ']'],
            ),
            (
                '[[]]',
                'this pattern should extract box brackets pairs [] [][[[ ]]]]]',
                ['[]', '[]'],
            ),
        ]

        for pattern, text, expected in cases:
            detector = RegexDetector(entity_name=entity_name,
                                     re_flags=RegexDetector.DEFAULT_FLAGS,
                                     pattern=pattern)
            self.assertTrue((detector.pattern.flags & re.V1) == 0)
            self.assertTrue((detector.pattern.flags & re.V0) != 0)
            values, original_texts = detector.detect_entity(text)
            self.assertEqual(values, expected)
            self.assertEqual(original_texts, expected)
    def test_max_matches(self):
        """Test max_matches argument for RegexDetector.

        The same text is run with a limit of 3 and a limit of 50; the text holds
        eight numbers, so the second limit is never reached.
        """
        entity_name = 'num'
        tag = '__{}__'.format(entity_name)
        pattern = '\\b(\\d+|)\\b'
        text = ('there are some numbers like 345 and 2342, but the pattern is bad too it matches empty string! We '
                'will now sprinkle this text with numbers 34634653 42342345234 12433345325 to test 17293847 345 2342')
        numbers = [
            '345', '2342', '34634653', '42342345234', '12433345325',
            '17293847', '345', '2342',
        ]

        # (max_matches, expected matches, expected tagged text), run in order.
        cases = [
            (
                3,
                numbers[:3],
                ('there are some numbers like {t} and {t}, but the pattern is bad too '
                 'it matches empty string! We will now sprinkle this text with'
                 ' numbers {t} 42342345234 12433345325 to test 17293847 345 2342').format(t=tag),
            ),
            (
                50,
                numbers,
                ('there are some numbers like {t} and {t}, but the pattern is bad too '
                 'it matches empty string! We will now sprinkle this text with'
                 ' numbers {t} {t} {t} to test {t} {t} {t}').format(t=tag),
            ),
        ]

        for limit, expected, expected_tagged_text in cases:
            detector = RegexDetector(entity_name=entity_name,
                                     pattern=pattern,
                                     max_matches=limit)
            values, original_texts = detector.detect_entity(text)
            self.assertEqual(detector.tagged_text, expected_tagged_text)
            self.assertEqual(values, expected)
            self.assertEqual(original_texts, expected)
 def test_invalid_pattern_compile(self):
     """Test compiling invalid pattern raises re.error"""
     # '(invalid!' opens a group that is never closed; constructing the detector
     # with it is expected to raise.
     self.assertRaises(re.error, RegexDetector, entity_name='test', pattern='(invalid!')