def combine_output_of_detection_logic_and_tag(entity_data, text):
    """NER is often used to tag the chat so it can be used in disambiguation process. Also, many times one entity may
    overlap with another.
    For example: "I want to order from Delhi Dhaba" and we want to detect two entities i.e. restaurant and city.
    So, first we will run individual detection logic of restaurant and city and from this we are able to derive two
    entity values i.e. Delhi Dhaba (restaurant) and Delhi (city) but we see that entity city is irrelevant in above
    case because message is about ordering from restaurant. So it necessary to process the output which is obtained by
    running individual detection logic and keep the relevant entities.

    Args:
        entity_data: dictionary with entity name as key and the output of that entity's detection logic as value.
        For example:
            {
                "restaurant":
                    [
                        {
                            "detection": "chat",
                            "original_text": "delhi dhaba",
                            "entity_value": "Delhi Dhaba"
                        }
                    ],
                "city":
                    [
                        {
                            "detection": "chat",
                            "original_text": "delhi",
                            "entity_value": "New Delhi"
                        }
                    ]
            }
        text: a message on which detection logic needs to run. For example "i want to order from delhi dhaba"

    Output:
        A dictionary of the form
        {
            'entity_data': PROCESSED_ENTITY_DICTIONARY,
            'tag': TAGGED_TEXT
        }

        entity_data is the processed dictionary of entities, containing only valid entity values with the ambiguity
        removed
        tag is the tagged text
        For example:
          {
            "entity_data":
                {
                    "restaurant":
                        [
                            {
                                "detection": "chat",
                                "original_text": "delhi dhaba",
                                "entity_value": "Delhi Dhaba"
                            }
                        ],
                    "city": None
                },
            "tag": "i want to order from __restaurant__"
          }

    """
    regex = Regex([(r'[\'\/]', r'')])
    text = regex.text_substitute(text)
    final_entity_data = defaultdict(list)
    tagged_text = text.lower()
    processed_text = text.lower()
    tag_preprocess_dict = defaultdict(list)
    for entity, entity_list in entity_data.iteritems():
        if entity_list:
            for entity_identified in entity_list:
                if entity_identified[ORIGINAL_TEXT] and \
                                entity_identified[DETECTION_METHOD] in [FROM_MESSAGE, FROM_MODEL_VERIFIED,
                                                                        FROM_MODEL_NOT_VERIFIED]:
                    tag_preprocess_dict[
                        entity_identified[ORIGINAL_TEXT].lower()].append(
                            [entity_identified, entity])
                else:
                    tag_preprocess_dict['NA'].append(
                        [entity_identified, entity])
        else:
            final_entity_data[entity] = None

    original_text_list = tag_preprocess_dict.keys()
    original_text_list = sort_original_text(original_text_list)
    for original_text in original_text_list:
        tag = ''
        if original_text in processed_text:
            processed_text = processed_text.replace(original_text, '')
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                tag += '_' + entity
                if final_entity_data[entity]:
                    final_entity_data[entity].append(entity_dict)
                else:
                    final_entity_data[entity] = [entity_dict]
            if tag != '':
                tag = '_' + tag + '__'
            tagged_text = tagged_text.replace(original_text, tag)
        else:
            for entity_dict, entity in tag_preprocess_dict[original_text]:
                if not final_entity_data[entity]:
                    final_entity_data[entity] = None

    if tag_preprocess_dict.get('NA'):
        for entity_dict, entity in tag_preprocess_dict['NA']:
            if final_entity_data[entity]:
                final_entity_data[entity].append(entity_dict)
            else:
                final_entity_data[entity] = [entity_dict]

    return {'entity_data': final_entity_data, 'tag': tagged_text}
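
# A minimal usage sketch (hypothetical; assumes Regex, the detection-method
# constants and sort_original_text used above are importable, and that
# sort_original_text orders longer substrings first). The sample entity_data
# mirrors the docstring example:
#
#     entity_data = {
#         'restaurant': [{'detection': 'chat',
#                         'original_text': 'delhi dhaba',
#                         'entity_value': 'Delhi Dhaba'}],
#         'city': [{'detection': 'chat',
#                   'original_text': 'delhi',
#                   'entity_value': 'New Delhi'}],
#     }
#     result = combine_output_of_detection_logic_and_tag(
#         entity_data, 'I want to order from Delhi Dhaba')
#     print result['tag']          # 'i want to order from __restaurant__'
#     print result['entity_data']  # 'city' resolves to None; 'restaurant' is kept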
class DateAdvanceDetector(object):
    """
    Detects dates subject to conditions like "departure date" and "return date". These dates are returned in a
    dictionary with keys 'date_departure' and 'date_return'. This class uses DateDetector to detect the date values.

    This class can be used to detect dates in scenarios involving departure and arrival dates, for example in
    travel-related text.

    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected date entities would be replaced with on calling detect_entity()
        tagged_text: string with date entities replaced with tag defined by entity name
        processed_text: string with detected date entities removed
        date: list of date entities detected
        original_date_text: list to store substrings of the text detected as date entities
        tag: entity_name prepended and appended with '__'
        date_detector_object: DateDetector object used to detect dates in the given text
        bot_message: string, set as the outgoing bot text/message
    """
    def __init__(self, entity_name, timezone=pytz.timezone('UTC')):
        """
        Initializes the DateAdvanceDetector object with given entity_name and pytz timezone object

        Args:
            entity_name: A string by which the detected date entity substrings would be replaced with on calling
                        detect_entity()
            timezone: Optional, pytz.timezone object used for getting current time, default is pytz.timezone('UTC')
        """
        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.date = []
        self.original_date_text = []
        self.regx_to_process = Regex([(r'[\/]', r'')])
        self.regx_to_process_text = Regex([(r'[\,]', r'')])
        self.entity_name = entity_name
        self.tag = '__' + entity_name + '__'
        self.date_detector_object = DateDetector(entity_name=self.entity_name,
                                                 timezone=timezone)
        self.bot_message = None

    def detect_entity(self, text):
        """
        Detects all date strings in text and returns two lists of detected date entities and their corresponding
        original substrings in text respectively.

        Args:
            text: string to extract date entities from

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text

        Examples:
            date_adv_detector = DateAdvanceDetector("date_advance")
            text = 'find me a flight departing on 20/2/17 and one returning on 21/2/17'
            date_adv_detector.detect_entity(text)

                Output:
                    ([
                        {
                            'date_departure': {'dd': 20, 'mm': 2, 'type': 'date', 'yy': 2017},
                            'date_return': None
                        },
                        {
                            'date_departure': None,
                            'date_return': {'dd': 21, 'mm': 2, 'type': 'date', 'yy': 2017}
                        }
                    ],
                    ['departing on 20/2/17', 'returning on 21/2/17'])

            date_adv_detector.tagged_text

                Output:
                    ' find me a flight __date_advance__ and one __date_advance__ '


            text = 'I am not available from 3rd April, 2017 to 9/05/17'
            date_adv_detector.detect_entity(text)

                Output:
                    ([
                        {
                            'date_departure': {'dd': 3, 'mm': 4, 'type': 'date', 'yy': 2017},
                            'date_return': {'dd': 9, 'mm': 5, 'type': 'date', 'yy': 2017}
                        }
                    ],
                    ['3rd april 2017', '9/05/17'])

            date_adv_detector.tagged_text

                Output:
                    ' i am not available from __date_advance__ to __date_advance__ '


        Additionally this function assigns these lists to self.date and self.original_date_text attributes
        respectively.

        """

        self.text = ' ' + text.lower() + ' '
        self.text = self.regx_to_process_text.text_substitute(self.text)
        self.processed_text = self.text
        self.tagged_text = self.text
        date_data = self._detect_date()
        self.date = date_data[0]
        self.original_date_text = date_data[1]
        return date_data

    def _detect_date(self):
        """
        Detects "departure" and "return" from the object's text attribute

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text

        """
        # print 'detection for default task'
        date_list = []
        original_list = []

        date_list, original_list = self._detect_departure_return_date(
            date_list, original_list)
        self._update_processed_text(original_list)
        date_list, original_list = self._detect_departure_date(
            date_list, original_list)
        self._update_processed_text(original_list)
        date_list, original_list = self._detect_return_date(
            date_list, original_list)
        self._update_processed_text(original_list)
        date_list, original_list = self._detect_any_date(
            date_list, original_list)
        self._update_processed_text(original_list)

        return date_list, original_list

    def _detect_departure_return_date(self,
                                      date_list=None,
                                      original_list=None):
        """
        Finds <any text><space(s)><'-' or 'to' or '2'><space(s)><any text> in the given text.
        It splits the text into two parts on '-', 'to' or '2',
        detects the departure date in the first (left) part and the return date in the second (right) part

        Args:
            date_list: Optional, list to store dictionaries of detected dates
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure and return type date entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text
        """
        if date_list is None:
            date_list = []
        if original_list is None:
            original_list = []
        patterns = re.findall(r'\b((.+)\s*(\-|to|2)\s*(.+))\b',
                              self.processed_text.lower())

        for pattern in patterns:
            date = {'date_departure': None, 'date_return': None}
            date_departure = None
            date_return = None
            date_detect = self.date_detector_object.detect_entity(pattern[1])
            if date_detect[0]:
                date_departure = date_detect[0][0]
                original = date_detect[1][0] if date_detect[1] else None
                original_list.append(original)

            date_detect = self.date_detector_object.detect_entity(pattern[3])
            if date_detect[0]:
                date_return = date_detect[0][0]
                original = date_detect[1][0] if date_detect[1] else None
                original_list.append(original)

            if date_departure and date_return:
                date['date_departure'] = date_departure
                date['date_return'] = date_return

                date_list.append(date)
                # original = self.regx_to_process.text_substitute(original)

        return date_list, original_list

    def _detect_departure_date(self, date_list=None, original_list=None):
        """
        Finds departure type dates in the given text by matching a few keywords like 'onward date', 'departure date',
        'leaving on', 'starting from', 'departing', 'going on'. It then detects dates in the part of the text to the
        right of these keywords.

        Args:
            date_list: Optional, list to store dictionaries of detected dates
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            departure type date entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text
        """
        if date_list is None:
            date_list = []
        if original_list is None:
            original_list = []
        regex_string = r'\b((onward date\:|onward date -|on|departure date|leaving on|starting from|' + \
                       r'departing on|departing|going on|for|departs on)\s+(.+))\b'
        patterns = re.findall(regex_string, self.processed_text.lower())

        for pattern in patterns:
            date_departure = None
            date = {'date_departure': None, 'date_return': None}

            date_detect = self.date_detector_object.detect_entity(pattern[2])
            if date_detect[0]:
                date_departure = date_detect[0][0]

            if date_departure:
                if date_detect[1] and date_detect[1][0] in pattern[0]:
                    end_idx = pattern[0].find(date_detect[1][0]) + len(
                        date_detect[1][0])
                    original = pattern[0][:end_idx]
                else:
                    original = date_detect[1][0] if date_detect[1] else None
                date['date_departure'] = date_departure

                date_list.append(date)
                # original = self.regx_to_process.text_substitute(original)
                original_list.append(original)

        return date_list, original_list

    def _detect_return_date(self, date_list=None, original_list=None):
        """
        Finds return type dates in the given text by matching a few keywords like 'coming back', 'return date',
        'returning on', 'arriving', 'arrive'. It then detects dates in the part of the text to the right of these
        keywords.

        Args:
            date_list: Optional, list to store dictionaries of detected dates
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            return type date entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text
        """
        if original_list is None:
            original_list = []
        if date_list is None:
            date_list = []
        regex_string = r'\b((coming back|back|return date\:?|return date -|returning on|' + \
                       r'arriving|arrive|return|returning|returns on|at)\s+(.+))\b'
        patterns = re.findall(regex_string, self.processed_text.lower())

        for pattern in patterns:
            date_return = None
            original = None
            date = {'date_departure': None, 'date_return': None}

            date_detect = self.date_detector_object.detect_entity(pattern[2])
            if date_detect[0]:
                date_return = date_detect[0][0]

            if date_return:
                if date_detect[1] and date_detect[1][0] in pattern[0]:
                    end_idx = pattern[0].find(date_detect[1][0]) + len(
                        date_detect[1][0])
                    original = pattern[0][:end_idx]
                else:
                    original = date_detect[1][0] if date_detect[1] else None
                date['date_return'] = date_return

                date_list.append(date)
                original_list.append(original)

        return date_list, original_list

    def _detect_any_date(self, date_list=None, original_list=None):
        """
        Finds departure and return type dates in the given text. It checks the last bot message for 'departure' and
        'return' synonyms and tags the detected dates accordingly. If more than one date is detected, the first date
        is marked as departure type and the last as return type. If only one date is found, it is marked as return
        type if return synonyms were found in the bot message, and as departure type otherwise.


        Args:
            date_list: Optional, list to store dictionaries of detected dates
            original_list: Optional, list to store corresponding original substrings of text which were detected as
                            return type date entities

        Returns:
            Tuple containing two lists, first containing dictionaries, each containing 'date_return'
            and 'date_departure' keys and dictionaries returned from DateDetector as their values,
            for each detected date, and second list containing corresponding original substrings in text
        """
        if date_list is None:
            date_list = []
        if original_list is None:
            original_list = []
        departure_date_flag = False
        return_date_flag = False
        if self.bot_message:
            departure_regex_string = r'traveling on|going on|starting on|departure date|date of travel|' + \
                                     r'check in date|check-in date|date of check-in|date of departure\.'
            arrival_regex_string = r'traveling back|coming back|returning back|returning on|return date' + \
                                   r'|arrival date|check out date|check-out date|date of check-out|check out'
            departure_regexp = re.compile(departure_regex_string)
            arrival_regexp = re.compile(arrival_regex_string)
            if departure_regexp.search(self.bot_message) is not None:
                departure_date_flag = True
            elif arrival_regexp.search(self.bot_message) is not None:
                return_date_flag = True

        patterns = re.findall(r'\s((.+))\.?\b', self.processed_text.lower())

        for pattern in patterns:
            pattern = list(pattern)
            date = {'date_departure': None, 'date_return': None}
            date_detect = self.date_detector_object.detect_entity(pattern[1])
            if date_detect[0]:
                original = date_detect[1][0]
                if len(date_detect[0]) > 1:
                    sort_date_detect = self._sort_date_list(
                        date_list=date_detect[0], original_list=date_detect[1])
                    date['date_departure'] = sort_date_detect[0][0]
                    date['date_return'] = sort_date_detect[0][-1]
                else:
                    if departure_date_flag and not return_date_flag:
                        date['date_departure'] = date_detect[0][0]
                        date['date_return'] = None
                    elif not departure_date_flag and return_date_flag:
                        date['date_departure'] = None
                        date['date_return'] = date_detect[0][0]
                    else:
                        date['date_departure'] = date_detect[0][0]
                        date['date_return'] = None

                date_list.append(date)
                # original = self.regx_to_process.text_substitute(original)
                original_list.append(original)

        return date_list, original_list

    def _update_processed_text(self, original_date_strings):
        """
        Replaces detected date entities with tag generated from entity_name used to initialize the object with

        A final string with all date entities replaced will be stored in object's tagged_text attribute
        A string with all date entities removed will be stored in object's processed_text attribute

        Args:
            original_date_strings: list of substrings of original text to be replaced with tag created from entity_name
        """
        for detected_text in original_date_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(
                    detected_text, self.tag)
                self.processed_text = self.processed_text.replace(
                    detected_text, '')

    def set_bot_message(self, bot_message):
        """
        Sets the object's bot_message attribute

        Args:
            bot_message: string
        """
        self.bot_message = bot_message

    def _sort_date_list(self, date_list, original_list):
        """
        Sorts the date_list and original_list according to date value in chronological order

        Args:
            date_list: List of dictionaries of date values for detected dates in the text
            original_list: List of substrings of the text given to DateAdvanceDetector that correspond to the
                            detected dates in date_list

        Returns:
            Tuple containing two lists, first containing dictionaries of detected dates sorted in chronological order
            and second list containing their corresponding substrings of text

        Example:
            date_list = [{'dd': 21, 'mm': 2, 'type': 'date', 'yy': 2017},
                         {'dd': 20, 'mm': 2, 'type': 'date', 'yy': 2017}]
            original_list = ['21st feb', '20th feb']
            self._sort_date_list(date_list, original_list)

            Output:
                ([{'dd': 20, 'mm': 2, 'type': 'date', 'yy': 2017},
                  {'dd': 21, 'mm': 2, 'type': 'date', 'yy': 2017}],
                 ['20th feb', '21st feb'])

        """
        sorted_original_list = []
        if len(date_list) > 1:
            dates_zip = zip(date_list, original_list)
            # sort chronologically by (year, month, day) instead of relying on
            # the dict's arbitrary .values() ordering
            sorted_dates_zip = sorted(
                dates_zip, key=lambda d: (d[0]['yy'], d[0]['mm'], d[0]['dd']))
            sorted_date_list, sorted_original_list = map(
                list, zip(*sorted_dates_zip))
        else:
            sorted_date_list = date_list
            sorted_original_list = original_list
        return sorted_date_list, sorted_original_list
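
# A minimal usage sketch (hypothetical; assumes a working DateDetector that
# resolves '21st february'). set_bot_message() lets _detect_any_date()
# classify a lone, keyword-free date using the question the bot just asked:
#
#     detector = DateAdvanceDetector('date_advance')
#     detector.set_bot_message('What is your return date?')
#     dates, originals = detector.detect_entity('21st february')
#     # 'return date' in the bot message sets return_date_flag, so the single
#     # detected date is filed under 'date_return', with 'date_departure' None.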
class TextDetector(BaseDetector):
    """
    TextDetector detects custom entities in text string by performing similarity searches against a list fetched from
    datastore (elasticsearch) and tags them.

    TextDetector detects text type custom entities that do not adhere to any strict/weak format, unlike other entities
    such as date, time, and email. Examples of such entities are city, food dish name, brand names, etc.


    Attributes:
        text (str): string to extract entities from
        entity_name (str): string by which the detected text entities would be replaced with on calling detect_entity()
        regx_to_process (lib.nlp.Regex): list of regex patterns to match and remove text that matches
                                         these patterns before starting to detect entities
        text_dict (dict): dictionary to store lemmas, stems, ngrams used during detection process
        _fuzziness (str or int): If this parameter is str, elasticsearch's
                                 auto is used with low and high term distances. Default low and high term distances
                                 are 3 and 6 for elasticsearch. For this module they are set to 4 and 7 respectively.
                                 In auto mode, if length of term is less than low it must match exactly, if it is
                                 between [low, high) one insert/delete/substitution is allowed, for anything higher
                                 than equal to high, two inserts/deletes/substitutions are allowed
        _min_token_size_for_fuzziness (int): minimum number of letters a word must have to be considered
                                             for calculating edit distance with similar ngrams from the datastore
        tagged_text (str): string with text entities replaced with tag defined by entity_name
        text_entity (list): list to store detected entities from the text
        original_text_entity (list): list of substrings of the text detected as entities
        processed_text (str): string with detected text entities removed
        tag (str): entity_name prepended and appended with '__'
    """
    def __init__(self,
                 entity_name=None,
                 source_language_script=ENGLISH_LANG,
                 translation_enabled=False):
        """
        Initializes a TextDetector object with given entity_name

        Args:
            entity_name: A string by which the detected substrings that correspond to text entities would be replaced
                         with on calling detect_entity()
            source_language_script: ISO 639 code for language of entities to be detected by the instance of this class
            translation_enabled: True if messages need to be translated in case the detector does not support a
                                 particular language, else False
        """
        # assigning values to superclass attributes
        self._supported_languages = [ENGLISH_LANG, HINDI_LANG]
        super(TextDetector, self).__init__(source_language_script,
                                           translation_enabled)

        self.text = None
        self.regx_to_process = Regex([(r'[\'\/]', r'')])
        self.text_dict = {}
        self.tagged_text = None
        self.text_entity = []
        self.original_text_entity = []
        self.processed_text = None
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'

        # defaults for auto mode
        self._fuzziness = "auto:4,7"
        self._fuzziness_lo, self._fuzziness_hi = 4, 7
        self._min_token_size_for_fuzziness = self._fuzziness_lo
        # self.set_fuzziness_threshold(fuzziness=(self._fuzziness_lo, self._fuzziness_hi))

        # defaults for non-auto mode
        self.set_fuzziness_threshold(fuzziness=1)
        self._min_token_size_for_fuzziness = 4

        self.db = DataStore()

    @property
    def supported_languages(self):
        return self._supported_languages

    def set_fuzziness_threshold(self, fuzziness):
        """
        Sets the fuzziness thresholds for similarity searches. The fuzziness threshold corresponds to the
        maximum Levenshtein's distance allowed during similarity matching

        Args:
            fuzziness (iterable or int): If this parameter is an iterable, it must contain exactly
                                         two integers like (4, 7); elasticsearch's auto mode is then
                                         used with those low and high term distances.
                                         This will generate "auto:4,7"

                                         Note that this also sets
                                         _min_token_size_for_fuzziness to the first value of the iterable

                                         Default low and high term distances are 3 and 6 for elasticsearch.
                                         See [1] for more details.

                                         If this argument is an int, elasticsearch will set fuzziness as
                                         min(2, fuzziness)

        [1] https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness

        """
        try:
            iter(fuzziness)
            if len(fuzziness) == 2:
                lo, hi = fuzziness
                self._fuzziness_lo, self._fuzziness_hi = int(lo), int(hi)
                self._fuzziness = "auto:" + str(
                    self._fuzziness_lo) + "," + str(self._fuzziness_hi)
                self._min_token_size_for_fuzziness = lo
            else:
                self._fuzziness = "auto"
        except TypeError:
            if type(fuzziness) == int or type(fuzziness) == float:
                self._fuzziness = int(
                    fuzziness
                )  # Note that elasticsearch would take min(2, self._fuzziness)
            else:
                raise TypeError(
                    'fuzziness has to be either an iterable of length 2 or an int'
                )

    def _get_fuzziness_threshold_for_token(self, token):
        """
        Return dynamic fuzziness threshold for the Damerau-Levenshtein check based on the length of the token, if
        elasticsearch fuzziness was set to auto mode

        Args:
            token (str or unicode): the string to calculate fuzziness threshold for

        Returns:
            int: fuzziness threshold for ngram matching on elastic search results
        """
        if type(self._fuzziness) == int:
            return self._fuzziness
        else:
            if len(token) < self._fuzziness_lo:
                return 0  # strict match, no edits allowed
            elif len(token) >= self._fuzziness_hi:
                return 2  # allow up to two edits (inserts/deletes/substitutions)
            else:
                return 1  # lo <= len < hi: allow one edit
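
    # Illustrative mapping when auto mode is active with the defaults above
    # (lo=4, hi=7); these values follow directly from the branches in
    # _get_fuzziness_threshold_for_token:
    #     len(token) < 4       -> 0 (exact match required)
    #     4 <= len(token) < 7  -> 1 (one edit allowed)
    #     len(token) >= 7      -> 2 (up to two edits allowed)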

    def set_min_token_size_for_levenshtein(self, min_size):
        """
        Sets the minimum number of letters a word must have to be considered for calculating edit distance with similar
        ngrams from the datastore

        Args:
            min_size: integer, minimum number of letters a word must have to be considered for calculating
            edit distance with similar ngrams from the datastore
        """
        self._min_token_size_for_fuzziness = min_size

    def detect_entity(self, text, **kwargs):
        """
        Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and
        returns two lists of detected text entities and their corresponding original substrings in text respectively.
        Note that the datastore stores a number of values under an entity_name, and each entity_value has its own list
        of variants. Whenever a variant is matched successfully, the entity_value whose list the variant belongs to
        is returned. For more information on how data is stored, see Datastore docs.

        Args:
            text (unicode): string to extract textual entities from
            **kwargs: can be used to send specific arguments in the future, for example fuzziness or previous context.
        Returns:
            Tuple containing two lists, first containing entity value as defined into datastore
            and second list containing corresponding original substrings in text

        Example:
            DataStore().get_entity_dictionary('city')

                Output:
                    {
                        u'Agartala': [u'', u'Agartala'],
                        u'Barnala': [u'', u'Barnala'],
                        ...
                        u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'],
                        u'hyderabad': [u'hyderabad'],
                        u'koramangala': [u'koramangala']
                    }

            text_detection = TextDetector('city')
            text_detection.detect_entity('Come to Chennai, TamilNadu,  I will visit Delhi next year')

                Output:
                    ([u'Chennai', u'New Delhi', u'chennai'], ['chennai', 'delhi', 'tamilnadu'])

            text_detection.tagged_text

                Output:
                    ' come to __city__, __city__,  i will visit __city__ next year '

        Additionally this function assigns these lists to self.text_entity and self.original_text_entity attributes
        respectively.
        """
        self.text = text
        self.text = self.regx_to_process.text_substitute(self.text)
        self.text = ' ' + self.text.lower() + ' '
        self.processed_text = self.text
        self.tagged_text = self.processed_text

        text_entity_data = self._text_detection_with_variants()

        self.text_entity = text_entity_data[0]
        self.original_text_entity = text_entity_data[1]
        return text_entity_data

    def _text_detection_with_variants(self):
        """
        This function normalises the message by breaking it into trigrams, bigrams and unigrams. The generated
        ngrams are used to build a query to retrieve search results from the datastore. The results contain a
        dictionary where the key is a variant and the value is the entity value; this is further processed to get
        the original text that was identified, and the results are returned

        Returns:
             A tuple of two lists with first list containing the detected text entities and second list containing
             their corresponding substrings in the original message.
        """
        original_final_list = []
        value_final_list = []
        variant_dictionary = {}

        tokens = tokenizer.tokenize(self.processed_text)
        message = u' '.join(tokens)
        variants = self.db.get_similar_dictionary(
            self.entity_name,
            message,
            self._fuzziness,
            search_language_script=self._target_language_script)
        variant_dictionary.update(variants)
        variant_list = variant_dictionary.keys()

        exact_matches, fuzzy_variants = [], []
        for variant in variant_list:
            if variant.lower() in self.processed_text.lower():
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(tokenizer.tokenize(s)),
                           reverse=True)
        fuzzy_variants.sort(key=lambda s: len(tokenizer.tokenize(s)),
                            reverse=True)
        variant_list = exact_matches + fuzzy_variants

        for variant in variant_list:
            original_text = self._get_entity_from_text(
                variant, self.processed_text.lower())
            if original_text:
                value_final_list.append(variant_dictionary[variant])
                original_final_list.append(original_text)
                _pattern = re.compile(r'\b%s\b' % original_text, re.UNICODE)
                self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
                # Instead of dropping completely like in other entities,
                # we replace with tag to avoid matching non contiguous segments
                self.processed_text = _pattern.sub(self.tag,
                                                   self.processed_text)
        return value_final_list, original_final_list

    def _get_entity_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (which can itself be an ngram) using Levenshtein
        distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            part of the given text that was detected as an entity for the given variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            text_detection._get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection._get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_tokens = tokenizer.tokenize(variant.lower())
        text_tokens = tokenizer.tokenize(text.lower())
        original_text = []
        variant_count = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_count]

            utext_token = text_token
            if isinstance(utext_token, str):  # Python 2: decode byte strings to unicode
                utext_token = utext_token.decode('utf-8')

            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(utext_token)
            if same or (len(utext_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text.append(text_token)
                variant_count += 1
                if variant_count == len(variant_tokens):
                    return ' '.join(original_text)
            else:
                original_text = []
                variant_count = 0
        return None
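
# A minimal usage sketch (hypothetical; detection requires a populated
# DataStore / elasticsearch index for the given entity):
#
#     text_detection = TextDetector('city')
#     # auto mode with low/high term distances, generates "auto:4,7"
#     text_detection.set_fuzziness_threshold((4, 7))
#     # or a fixed edit distance (elasticsearch caps it at min(2, value))
#     text_detection.set_fuzziness_threshold(1)
#     values, originals = text_detection.detect_entity('come to chennai')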
class BudgetDetector(object):
    """Detects budget from the text  and tags them.

    Detects the budget in the text and replaces it with entity_name.
    This detection logic first checks for a budget using regular expressions and also uses the TextDetector class to
    extract budgets expressed in textual format (e.g. Hundred, Thousand, etc.).

    This detector also captures additional attributes like max_budget and min_budget, and whether the budget is a
    normal_budget (detected through regex) or a text_budget (detected through text detection)

    For Example:

        budget_detection = BudgetDetector('budget')
        message = "shirts between 2000 to 3000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['2000 to 3000']
            Tagged text:  shirts between __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts less than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['less than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "tshirts greater than 2k"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}]  --  ['greater than 2k']
            Tagged text:  tshirts __budget__

        budget_detection = BudgetDetector('budget')
        message = "jeans of Rs. 1000"
        budget_list, original_text = budget_detection.detect_entity(message)
        tagged_text = budget_detection.tagged_text
        print budget_list, ' -- ', original_text
        print 'Tagged text: ', tagged_text

         >> [{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}]  --  ['rs. 1000']
            Tagged text:  ' jeans of __budget__ '


    Attributes:
        min_digit: minimum number of digits a budget can have; by default it is set to 2, so the NER will detect a
        number as budget only if it is greater than 9
        max_digit: maximum number of digits a budget can have; by default it is set to 5, so the NER will detect a
        number as budget only if it is at most 99999
        text: string to extract entities from
        entity_name: string by which the detected budget would be replaced with on calling detect_entity()
        dictionary_name: name of a dictionary that stores the textual data, for example: Hundred, Thousand, lakh, etc.
        It's a constant and its value is ES_BUDGET_LIST
        tagged_text: string with budgets replaced with tag defined by entity name
        processed_text: string with detected budgets removed
        budget: list of budgets detected
        original_budget_text: list to store substrings of the text detected as budget
        tag: entity_name prepended and appended with '__'
        regex_object: regex object that is used to substitute 'k' with '000', i.e. if the text contains 2k then
        it will be substituted with 2000
        text_detection_object: text detection object to detect budgets in textual format
        
    Note:
        text and tagged_text will have an extra space prepended and appended after calling detect_entity(text)

    """
    def __init__(self, entity_name):
        """Initializes a BudgetDetector object

        Args:
            entity_name: A string by which the detected budget would be replaced with on calling detect_entity()
        """

        self.min_digit = 2
        self.max_digit = 5
        self.entity_name = entity_name

        self.text = ''
        self.tagged_text = ''
        self.processed_text = ''
        self.budget = []
        self.original_budget_text = []

        regex_for_thousand = [(r'(\d+)k', r'\g<1>000')]
        self.regex_object = Regex(regex_for_thousand)
        self.tag = '__' + self.entity_name + '__'
        self.text_detection_object = TextDetector(entity_name=ES_BUDGET_LIST)

    def detect_entity(self, text):
        """Detects budget in the text string

        Args:
            text: string to extract entities from

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:

                ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

            Additionally this function assigns these lists to self.budget and self.original_budget_text attributes
            respectively.

        """
        self.text = ' ' + text + ' '
        self.processed_text = self.text.lower()
        self.tagged_text = self.text
        budget_data = self._detect_budget()
        self.budget = budget_data[0]
        self.original_budget_text = budget_data[1]
        return budget_data

    def _detect_budget(self):
        """Detects budget in the self.text

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "shirts between 2000 to 3000"
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """

        budget_list = []
        original_list = []
        budget_list, original_list = self._detect_min_max_budget(
            budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_min_budget(
            budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_max_budget(
            budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_any_budget(
            budget_list, original_list)
        self._update_processed_text(original_list)
        budget_list, original_list = self._detect_text_budget(
            budget_list, original_list)
        self._update_processed_text(original_list)

        return budget_list, original_list

    def _detect_min_budget(self, budget_list=None, original_list=None):
        """Detects minimum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts greater than 2k"
                output: ([{'max_budget': 0, 'type': 'normal_budget', 'min_budget': 2000}], ['greater than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []
        patterns = re.findall(
            r'(\s(above|more? than|more?|greater than|greater|abv|abov|more? den|\>\s*\=?)\s+'
            r'(rs.|rs|rupees|rupee)*\s*(\d{' + str(self.min_digit) + ',' +
            str(self.max_digit) + '}|\d{1,' +
            str(self.max_digit - 3) + '}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }

            if 'k' in pattern[3]:
                budget['min_budget'] = int(
                    self.regex_object.text_substitute(pattern[3]))
            else:
                budget['min_budget'] = int(pattern[3])

            budget_list.append(budget)
            original_list.append(original)
        return budget_list, original_list

    def _detect_max_budget(self, budget_list=None, original_list=None):
        """Detects maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "tshirts less than 2k"
                output: ([{'max_budget': 2000, 'type': 'normal_budget', 'min_budget': 0}], ['less than 2k'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s(max|upto|o?nly|around|below|less than|less|less den|\<\s*\=?)\s+(rs.|rs|rupees|rupee)?\s*(\d{'
            + str(self.min_digit) + ',' + str(self.max_digit) + '}|\d{1,' +
            str(self.max_digit - 3) + '}\s*k)\s*(rs.|rs|rupees|rupee|\.)?\s)',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()

            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }

            if 'k' in pattern[3]:
                budget['max_budget'] = int(
                    self.regex_object.text_substitute(pattern[3]))
            else:
                budget['max_budget'] = int(pattern[3])

            budget_list.append(budget)
            original_list.append(original)
        return budget_list, original_list

    def _detect_min_max_budget(self, budget_list=None, original_list=None):
        """Detects both minimum and maximum budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: shirts between 2000 to 3000
                output: ([{'max_budget': 3000, 'type': 'normal_budget', 'min_budget': 2000}], ['2000 to 3000'])

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'(\s((\d{1,' + str(self.max_digit - 3) + '}\s*k?)|(\d{' +
            str(self.min_digit) + ',' + str(self.max_digit) +
            '}))\s*(\-|to|and)\s*((\d{1,' + str(self.max_digit - 3) +
            '}\s*k?)|(\d{' + str(self.min_digit) + ',' + str(self.max_digit) +
            '}))\.?\s)', self.processed_text.lower())
        for pattern in patterns:
            original = None
            pattern = list(pattern)
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }

            flag_contains_k = False
            max_budget = 0
            if pattern[6]:
                flag_contains_k = True if 'k' in pattern[6] else False
                max_budget = int(self.regex_object.text_substitute(pattern[6]))
            elif pattern[7]:
                max_budget = int(pattern[7])
            min_budget = 0
            if pattern[2]:
                if flag_contains_k and 'k' not in pattern[2]:
                    pattern[2] = str(pattern[2]).strip() + 'k'
                min_budget = int(self.regex_object.text_substitute(pattern[2]))
            elif pattern[3]:
                min_budget = int(pattern[3])
            min_budget = min_budget if self.min_digit <= len(str(min_budget)) <= self.max_digit else 0
            max_budget = max_budget if self.min_digit <= len(str(max_budget)) <= self.max_digit else 0
            if min_budget != 0 and max_budget != 0 and min_budget <= max_budget:
                original = pattern[0].strip()
                budget['min_budget'] = min_budget
                budget['max_budget'] = max_budget

                budget_list.append(budget)
                original_list.append(original)
        return budget_list, original_list

    def _detect_any_budget(self, budget_list=None, original_list=None):
        """Detects a budget from text using regex
        This is a function which will be called when we want to detect the budget using regex from the text

        Returns:
            A tuple of two lists with first list containing the detected budget and second list containing their
            corresponding substrings in the original message.

            For example:
                input: "jeans of rs. 1000"
                output: ([{'max_budget': 1000, 'type': 'normal_budget', 'min_budget': 0}], ['rs. 1000'])

        """

        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        patterns = re.findall(
            r'\s((rs.|rs|rupees|rupee)?\s?(\d{' + str(self.min_digit) + ',' +
            str(self.max_digit) + '}|\d{1,' +
            str(self.max_digit - 3) + '}\s*k)\s?(rs.|rs|rupees|rupee)?\.?)\s',
            self.processed_text.lower())
        for pattern in patterns:
            original = pattern[0].strip()
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_NORMAL
            }
            if 'k' in pattern[2]:
                budget['max_budget'] = int(
                    self.regex_object.text_substitute(pattern[2]))
            else:
                budget['max_budget'] = int(pattern[2])
            budget_list.append(budget)
            original_list.append(original)
        return budget_list, original_list

    def _detect_text_budget(self, budget_list=None, original_list=None):
        """Detects budget  from text using text detection logic i.e.TextDetector
        This is a function which will be called when we want to detect the budget using text

        Returns:
            A tuple of two lists with first list containing the detected budgets and second list containing their
            corresponding substrings in the original message.

        """
        if budget_list is None:
            budget_list = []
        if original_list is None:
            original_list = []

        budget_text_list, original_text_list = self.text_detection_object.detect_entity(
            self.text)
        self.tagged_text = self.text_detection_object.tagged_text
        self.processed_text = self.text_detection_object.processed_text
        count = 0
        while count < len(original_text_list):
            budget = {
                'min_budget': 0,
                'max_budget': 0,
                'type': BUDGET_TYPE_TEXT
            }

            budget_list.append(budget)
            count += 1
        if original_text_list:
            original_list.extend(original_text_list)
        return budget_list, original_list

    def _update_processed_text(self, original_budget_strings):
        """
        Replaces detected budgets with self.tag generated from entity_name used to initialize the object with

        A final string with all budgets replaced will be stored in self.tagged_text attribute
        A string with all budgets removed will be stored in self.processed_text attribute

        Args:
            original_budget_strings: list of substrings of original text to be replaced with self.tag
        """
        for detected_text in original_budget_strings:
            if detected_text:
                self.tagged_text = self.tagged_text.replace(
                    detected_text, self.tag)
                self.processed_text = self.processed_text.replace(
                    detected_text, '')

    def min_max_digit(self, min_digit, max_digit):
        """Sets the minimum and maximum number of digits a detected budget amount can have"""
        self.min_digit = min_digit
        self.max_digit = max_digit
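
# A minimal usage sketch (hypothetical; the text-budget branch additionally
# needs a populated DataStore for ES_BUDGET_LIST):
#
#     budget_detection = BudgetDetector('budget')
#     # accept amounts with 3 to 7 digits instead of the default 2 to 5
#     budget_detection.min_max_digit(min_digit=3, max_digit=7)
#     budget_list, original_text = budget_detection.detect_entity(
#         'phones between 10000 and 15000')
#     # budget_list ~ [{'min_budget': 10000, 'max_budget': 15000,
#     #                 'type': 'normal_budget'}]
#     # original_text ~ ['10000 and 15000']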
class TextDetector(object):
    """
    TextDetector detects custom entities in text string by performing similarity searches against a list fetched from
    datastore (elasticsearch) and tags them.

    TextDetector detects text type custom entities that do not adhere to any strict/weak format, unlike other entities
    such as date, time, and email. Examples of such entities are city, food dish name, brand names, etc.


    Attributes:
        text: string to extract entities from
        entity_name: string by which the detected text entities would be replaced with on calling detect_entity()
        regx_to_process: list of regex patterns to match and remove text that matches these patterns before starting to
                         detect entities
        text_dict: dictionary to store lemmas, stems, ngrams used during detection process
        fuzziness_threshold: maximum Levenshtein's distance allowed during similarity matching
        min_size_token_for_levenshtein: minimum number of words a phrase must have to be considered for calculating
                                        edit distance with similar ngrams from the datastore
        tagged_text: string with text entities replaced with tag defined by entity_name
        text_entity: list to store detected entities from the text
        original_text_entity: list of substrings of the text detected as entities
        processed_text: string with detected text entities removed
        tag: entity_name prepended and appended with '__'
    """

    def __init__(self, entity_name=None):
        """
        Initializes a TextDetector object with given entity_name

        Args:
            entity_name: A string by which the detected substrings that correspond to text entities would be replaced
                         with on calling detect_entity()
        """
        self.text = None
        self.regx_to_process = Regex([(r'[\'\/]', r'')])
        self.text_dict = {}
        self.fuzziness_threshold = 1
        self.min_size_token_for_levenshtein = 4
        self.tagged_text = None
        self.text_entity = []
        self.original_text_entity = []
        self.processed_text = None
        self.entity_name = entity_name
        self.tag = '__' + self.entity_name + '__'
        self.db = DataStore()

    def set_fuzziness_threshold(self, fuzziness):
        """
        Sets the fuzziness threshold for similarity searches. The fuzziness threshold corresponds to the
        maximum Levenshtein's distance allowed during similarity matching

        Args:
            fuzziness: integer, maximum allowed Levenshtein's distance from the word/phrase being tested for
                       entity match
        """
        self.fuzziness_threshold = fuzziness
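        # e.g. with fuzziness 1, the text token 'delehi' (one insertion away) would still
        # match the variant 'delhi', while a token two or more edits away would be rejected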

    def set_min_size_for_levenshtein(self, min_size):
        """
        Sets the minimum number of characters a token must have to be considered for calculating edit distance
        against similar ngrams from the datastore

        Args:
            min_size: integer, minimum number of characters a token must have to be eligible for fuzzy
                      (edit distance based) matching
        """
        self.min_size_token_for_levenshtein = min_size

    def detect_entity(self, text):
        """
        Detects all textual entities in text that are similar to variants of 'entity_name' stored in the datastore and
        returns two lists of detected text entities and their corresponding original substrings in text respectively.
        Note that the datastore stores a number of values under an entity_name, and each entity_value has its own
        list of variants. Whenever a variant is matched successfully, the entity_value whose list the variant belongs
        to is returned. For more information on how data is stored, see the Datastore docs.

        Args:
            text: string to extract textual entities from

        Returns:
            Tuple containing two lists, the first containing entity values as defined in the datastore
            and the second containing the corresponding original substrings in the text

        Example:
            DataStore().get_entity_dictionary('city')

                Output:
                    {
                        u'Agartala': [u'', u'Agartala'],
                        u'Barnala': [u'', u'Barnala'],
                        ...
                        u'chennai': [u'', u'chennai', u'tamilnadu', u'madras'],
                        u'hyderabad': [u'hyderabad'],
                        u'koramangala': [u'koramangala']
                    }

            text_detection = TextDetector('city')
            text_detection.detect_entity('Come to Chennai, TamilNadu,  I will visit Delhi next year')

                Output:
                    ([u'Chennai', u'New Delhi', u'chennai'], ['chennai', 'delhi', 'tamilnadu'])

            text_detection.tagged_text

                Output:
                    ' come to __city__, __city__,  i will visit __city__ next year '

        Additionally, this function assigns these lists to the self.text_entity and self.original_text_entity
        attributes respectively.
        """
        self.text = text
        self.text = self.regx_to_process.text_substitute(self.text)
        self.text = ' ' + self.text.lower() + ' '
        self.processed_text = self.text
        text_entity_data = self._text_detection_with_variants()
        self.tagged_text = self.processed_text
        self.text_entity = text_entity_data[0]
        self.original_text_entity = text_entity_data[1]
        return text_entity_data

    def _text_detection_with_variants(self):
        """
        This function normalises the message by breaking it into trigrams, bigrams and unigrams. The generated
        ngrams are used to build a query that retrieves search results from the datastore. These results contain a
        dictionary in which each key is a variant and the corresponding value is an entity value; this dictionary is
        further processed to recover the original substrings of the text that were identified, and the results are
        returned.

        Returns:
             A tuple of two lists, the first containing the detected text entities and the second containing
             their corresponding substrings in the original message.
        """
        original_final_list = []
        value_final_list = []
        normalization = Normalization()
        self.text_dict = normalization.ngram_data(self.processed_text.lower(), flag_punctuation_removal=False,
                                                  stem_unigram=False, stem_bigram=False, stem_trigram=False,
                                                  stop_words_unigram=True, stop_words_bigram=True,
                                                  stop_words_trigram=True).copy()
        variant_dictionary = {}

        trigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['trigram'],
                                                                 self.fuzziness_threshold)
        bigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['bigram'],
                                                                self.fuzziness_threshold)
        unigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['unigram'],
                                                                 self.fuzziness_threshold)
        variant_dictionary.update(trigram_variants)
        variant_dictionary.update(bigram_variants)
        variant_dictionary.update(unigram_variants)
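        # dict.update() gives later calls precedence on key collisions, so a variant found at
        # both, say, bigram and unigram level keeps the entity value from the unigram lookup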
        # sort longer variants first so that multi-word matches take precedence over their sub-ngrams
        variant_list = sorted(variant_dictionary.keys(), key=lambda s: len(tokenizer.tokenize(s)), reverse=True)

        for variant in variant_list:
            original_text = self._get_entity_from_text(variant, self.processed_text.lower())
            if original_text:
                value_final_list.append(variant_dictionary[variant])
                original_final_list.append(original_text)
                self.processed_text = self.processed_text.replace(original_text, self.tag)

        return value_final_list, original_final_list
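
    # A minimal sketch (not the project's Normalization class) of the ngram expansion
    # used above, with plain whitespace tokenization:
    #
    #     def ngrams(tokens, n):
    #         return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    #
    #     tokens = 'come to chennai tamil nadu'.split()
    #     ngrams(tokens, 2)  # ['come to', 'to chennai', 'chennai tamil', 'tamil nadu']
    #     ngrams(tokens, 3)  # ['come to chennai', 'to chennai tamil', 'chennai tamil nadu']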

    def _get_entity_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            text_detection._get_entity_from_text('chennai', text)

            Output:
                'chennai'

            text_detection._get_entity_from_text('Delhi', text)

            Output:
                'delehi'
        """
        variant_token_list = tokenizer.tokenize(variant.lower())
        text_token_list = tokenizer.tokenize(text.lower())
        original_text = []
        variant_count = 0
        token_count = 0
        while token_count < len(text_token_list):
            levenshtein = Levenshtein(variant_token_list[variant_count], text_token_list[token_count],
                                      self.fuzziness_threshold + 1)
            if (variant_token_list[variant_count] == text_token_list[token_count]
                    or (len(text_token_list[token_count]) > self.min_size_token_for_levenshtein
                        and levenshtein.edit_distance() <= self.fuzziness_threshold)):
                original_text.append(text_token_list[token_count])
                variant_count += 1
                if variant_count == len(variant_token_list):
                    return ' '.join(original_text)
            elif variant_count > 0:
                # a partial match just failed; restart matching from the current token instead of
                # skipping it, so a candidate match that starts at this token is not missed
                original_text = []
                variant_count = 0
                continue

            token_count += 1
        return None
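
# A standalone sketch of the token-by-token fuzzy matching implemented in
# _get_entity_from_text above. The project's `tokenizer` and `Levenshtein`
# helpers are not shown in this file, so this sketch substitutes str.split()
# and a plain dynamic-programming edit distance; the threshold defaults mirror
# the values set in __init__ (fuzziness 1, minimum token size 4).
def _edit_distance(a, b):
    # classic Wagner-Fischer dynamic programming over two strings
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]


def find_variant(variant, text, fuzziness=1, min_token_size=4):
    variant_tokens = variant.lower().split()
    text_tokens = text.lower().split()
    matched, v, t = [], 0, 0
    while t < len(text_tokens):
        token = text_tokens[t]
        if (token == variant_tokens[v]
                or (len(token) > min_token_size
                    and _edit_distance(variant_tokens[v], token) <= fuzziness)):
            matched.append(token)
            v += 1
            if v == len(variant_tokens):
                return ' '.join(matched)
        elif v > 0:
            # restart matching from the current token, as in the method above
            matched, v = [], 0
            continue
        t += 1
    return None


if __name__ == '__main__':
    print(find_variant('delhi', 'i will visit delehi next year'))    # delehi
    print(find_variant('tamil nadu', 'come to chennai tamil nadu'))  # tamil nadu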