Example #1
    def _get_entity_substring_from_text(self, text, variant, entity_name):
        """
            Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
            and return the closest substring in the text that matches the variant.
            For each entity fuziness and min_token_size_for_fuzziness is used from the entity details.
            Args:
              variant(str or unicode): string, ngram of variant to fuzzy detect in the text using
                                       Levenshtein distance
              text(str or unicode): sentence from self.processed on which detection is being done
              entity_name (str): name of the entity to get fuzziness and min_token_lenght value
            Returns:
              str or unicode or None: part of the given text that was detected as entity given the variant,
                                      None otherwise
            Example:
              >>> text_detector = TextDetector(entity_dict={'city':{})
              >>> text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
              >>> text_detector._get_entity_substring_from_text(variant='chennai')
              'chennai'
              >>> text_detector._get_entity_substring_from_text(variant='delhi')
              'delehi'
        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]
            same = variant_token == text_token

            # get fuzziness and min_token_size_for_fuzziness values from entity dict
            entity_dict = self.entities_dict.get(entity_name, {})

            # get fuzziness from the entity; fall back to the default if not set
            fuzziness = entity_dict.get('fuzziness') or self._fuzziness

            self.set_fuzziness_low_high_threshold(fuzziness)

            min_token_size_for_fuzziness = entity_dict.get(
                'min_token_len_fuzziness')

            if not min_token_size_for_fuzziness:
                min_token_size_for_fuzziness = self._min_token_size_for_fuzziness

            ft = self._get_fuzziness_threshold_for_token(token=text_token)

            # set substitution cost to one
            if same or (len(text_token) > min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          substitution_cost=1,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        text, original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
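
The loop above consumes variant tokens one by one as it scans the text tokens, resetting whenever a token fails both the exact and the fuzzy check. Below is a minimal standalone sketch of the same idea, with a plain Levenshtein implementation and whitespace splitting standing in for the module's `edit_distance` and `TOKENIZER` (the thresholds are illustrative, not the library's defaults):

def levenshtein(a, b):
    # Classic dynamic-programming edit distance with substitution cost 1.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]


def fuzzy_substring(text, variant, fuzziness=1, min_token_size=4):
    # Scan text tokens, consuming variant tokens on exact or fuzzy matches;
    # reset the partial match on any miss, as in the method above.
    variant_tokens = variant.split()
    text_tokens = text.split()
    matched, vi = [], 0
    for token in text_tokens:
        target = variant_tokens[vi]
        fuzzy_ok = (len(token) > min_token_size
                    and levenshtein(target, token) <= fuzziness)
        if token == target or fuzzy_ok:
            matched.append(token)
            vi += 1
            if vi == len(variant_tokens):
                return ' '.join(matched)
        else:
            matched, vi = [], 0
    return None


print(fuzzy_substring('come to chenai tamil nadu', 'chennai'))  # 'chenai'
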
Example #2
    def _text_detection_with_variants(self):
        """
        This function will normalise the message by breaking it into trigrams, bigrams and unigrams. The generated
        ngrams will be used to create query to retrieve search results from datastore. These results will contain a
        dictionary where key will be variant and value will be entity value this will be further processed to get the
        original text which has been identified and will return the results

        Returns:
             tuple:
                list: containing the detected text entities
                list: containing their corresponding substrings in the original message.
        """
        original_final_list = []
        value_final_list = []
        variants_to_values = {}

        _variants_to_values = self.db.get_similar_dictionary(
            entity_name=self.entity_name,
            text=u' '.join(TOKENIZER.tokenize(self.processed_text)),
            fuzziness_threshold=self._fuzziness,
            search_language_script=self._target_language_script)
        for variant, value in iteritems(_variants_to_values):
            variant = variant.lower()
            if isinstance(variant, bytes):
                variant = variant.decode('utf-8')

            variants_to_values[variant] = value

        variants = variants_to_values.keys()

        exact_matches, fuzzy_variants = [], []
        for variant in variants:
            if variant in self.processed_text:
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                           reverse=True)
        fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                            reverse=True)
        variant_list = exact_matches + fuzzy_variants

        for variant in variant_list:
            original_text = self._get_entity_substring_from_text(
                variant, self.processed_text)
            if original_text:
                value_final_list.append(variants_to_values[variant])
                original_final_list.append(original_text)
                _pattern = re.compile(r'\b%s\b' % re.escape(original_text),
                                      re.UNICODE)
                self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
                # Instead of dropping completely like in other entities,
                # we replace with tag to avoid matching non contiguous segments
                self.processed_text = _pattern.sub(self.tag,
                                                   self.processed_text)
        return value_final_list, original_final_list
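
The tag substitution at the end is what stops shorter variants from re-matching inside an already detected span: every detected substring is replaced by the entity tag in the processed text before the next (shorter) variant is tried. A tiny runnable illustration of that step (the `__city__` tag name is hypothetical):

import re

processed_text = 'come to chennai tamil nadu next year'
tag = '__city__'
for detected in ['tamil nadu', 'chennai']:   # longest variants first
    _pattern = re.compile(r'\b%s\b' % re.escape(detected), re.UNICODE)
    processed_text = _pattern.sub(tag, processed_text)

print(processed_text)  # 'come to __city__ __city__ next year'
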
Example #3
    def _get_entity_substring_from_text(self, variant, text):
        """
        Checks ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance

        Args:
            variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text: text to detect entities from

        Returns:
            str or unicode: part of the given text that was detected as entity given the variant, None otherwise

        Example:
            text_detection = TextDetector('city')
            ...
            text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            text_detection._get_entity_substring_from_text('chennai', text)

            Output:
                'chennai'

            text_detection._get_entity_substring_from_text('delhi', text)

            Output:
                'delehi'
        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]

            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(text_token)
            if same or (len(text_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
Example #4
        def _get_tokens_and_indices(text):
            """
            Args:
                text (str or unicode): text to get tokens from and indices of those tokens in the given text

            Returns:
                tuple:
                    list: containing tokens, direct results from tokenizer.tokenize
                    list: containing (int, int) indicating start and end position of ith token (of first list)
                          in given text

            E.g.
            In: text = u'i want to order 1 pc hot & crispy'
            Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'],
                  [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)])

            """
            processed_text_tokens = TOKENIZER.tokenize(text)
            processed_text_tokens_indices = []

            offset = 0
            txt = text
            for token in processed_text_tokens:
                st = txt.index(token)
                en = st + len(token)
                txt = txt[en:]
                processed_text_tokens_indices.append((offset + st, offset + en))
                offset += en

            return processed_text_tokens, processed_text_tokens_indices
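
The offset bookkeeping works because the text is consumed from the left after each token is located, so `txt.index` can never rediscover an earlier occurrence. A self-contained sketch, with a simple word regex standing in for `TOKENIZER`:

import re

def tokens_and_indices(text):
    # Locate each token left to right, recording absolute (start, end) spans.
    tokens = re.findall(r'\w+', text)
    spans = []
    offset, rest = 0, text
    for token in tokens:
        st = rest.index(token)
        en = st + len(token)
        spans.append((offset + st, offset + en))
        rest = rest[en:]      # consume the text up to the end of this token
        offset += en
    return tokens, spans

print(tokens_and_indices('order 1 pc hot & crispy'))
# (['order', '1', 'pc', 'hot', 'crispy'],
#  [(0, 5), (6, 7), (8, 10), (11, 14), (17, 23)])
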
Example #5
def sort_original_text(original_text_list):
    """
    Sorts the original text list based on tokens and length of string
    :param original_text_list:
    :return:
    """
    final_original_text = []
    sort_original_text_dict = defaultdict(list)
    original_text_list.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
    for original in original_text_list:
        length_of_token = len(TOKENIZER.tokenize(original))
        sort_original_text_dict[length_of_token].append(original)
    for token_length in reversed(sorted(sort_original_text_dict.keys())):
        list_of_tokens = sort_original_text_dict[token_length]
        list_of_tokens.sort(key=lambda s: len(s), reverse=True)
        final_original_text.extend(list_of_tokens)
    return final_original_text
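
A usage sketch for the function above; it runs standalone if the function is pasted alongside this hypothetical whitespace-style `TOKENIZER` stub:

import re
from collections import defaultdict   # needed by sort_original_text above

class _WordTokenizer(object):          # hypothetical stand-in for TOKENIZER
    def tokenize(self, text):
        return re.findall(r'\w+', text)

TOKENIZER = _WordTokenizer()

print(sort_original_text(['goa', 'navi mumbai', 'delhi', 'tamil nadu']))
# ['navi mumbai', 'tamil nadu', 'delhi', 'goa']
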
Example #6
    def _get_entity_substring_from_text(self, text, variant):
        """
        Check ngrams of the text for similarity against the variant (can be a ngram) using Levenshtein distance
        and return the closest substring in the text that matches the variant

        Args:
            variant(str or unicode): string, ngram of variant to fuzzy detect in the text using Levenshtein distance
            text(str or unicode): sentence from self.processed on which detection is being done

        Returns:
            str or unicode or None: part of the given text that was detected as entity given the variant,
                                    None otherwise

        Example:
            >>> text_detector = TextDetector('city')
            >>> text = 'Come to Chennai, Tamil Nadu,  I will visit Delehi next year'.lower()
            >>> text_detector.detect_entity(text)
            >>> text_detector._get_entity_substring_from_text(text, variant='chennai')
            'chennai'
            >>> text_detector._get_entity_substring_from_text(text, variant='delhi')
            'delehi'

        """
        variant_tokens = TOKENIZER.tokenize(variant)
        text_tokens = TOKENIZER.tokenize(text)
        original_text_tokens = []
        variant_token_i = 0
        for text_token in text_tokens:
            variant_token = variant_tokens[variant_token_i]
            same = variant_token == text_token
            ft = self._get_fuzziness_threshold_for_token(text_token)
            if same or (len(text_token) > self._min_token_size_for_fuzziness
                        and edit_distance(string1=variant_token,
                                          string2=text_token,
                                          max_distance=ft + 1) <= ft):
                original_text_tokens.append(text_token)
                variant_token_i += 1
                if variant_token_i == len(variant_tokens):
                    return self._get_substring_from_processed_text(
                        text, original_text_tokens)
            else:
                original_text_tokens = []
                variant_token_i = 0
        return None
Example #7
    def _get_bulk_text_detection_with_variants(self, messages):
        """
        This function will normalise the message by breaking it into trigrams, bigrams and unigrams.
        The generated ngrams will be used to create query to retrieve search results from datastore.
        These results will contain list of dictionary where for each item key will be variant and
        value will be entity value this will be further processed to get the original text which has
        been identified and will return the results

        Args:
            messages (list of str): list of message for which detection needs to be perform

        Returns:
         tuple:
            list of lists: list of dict for each message with key as entity name
                            containing the detected text entities and original message.
        """

        self._process_text(messages)

        texts = [
            u' '.join(TOKENIZER.tokenize(processed_text))
            for processed_text in self.__processed_texts
        ]

        entity_list = list(self.entities_dict)

        # entity list for ES search should be list of entities
        # for all list of texts
        es_entity_list = [entity_list]
        es_texts = [texts]

        # fetch ES datastore search result
        es_results = self.esdb.get_multi_entity_results(
            entities=es_entity_list,
            texts=es_texts,
            fuzziness_threshold=self._es_fuzziness,
            search_language_script=self._target_language_script)

        final_list = []

        for index, entity_result in enumerate(es_results):
            processed_text = self.__processed_texts[index]
            text = texts[index]
            result_list = self._process_es_result(
                entity_result=entity_result,
                entity_list=entity_list,
                text=text,
                processed_text=processed_text)
            final_list.append(result_list)

        return final_list
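
Note the extra nesting before the datastore call: the search API takes parallel lists with one entry per search, and this bulk path issues a single search covering every message against the full entity list. Shapes only, with hypothetical values:

entity_list = ['city', 'restaurant']
texts = ['book a cab to mumbai', 'order from some restaurant']

# one search entry covering all texts against all entities
es_entity_list = [entity_list]   # [['city', 'restaurant']]
es_texts = [texts]               # [['book a cab to mumbai', 'order from some restaurant']]
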
Example #8
        def _get_tokens_and_indices(txt):
            """
            Args:
                txt (str or unicode): text to get tokens from and indices of those tokens in the given text

            Returns:
                tuple:
                    list: containing tokens, direct results from tokenizer.tokenize
                    list: containing (int, int) indicating start and end position of ith token (of first list)
                          in given text

            E.g.
            In: text = u'i want to order 1 pc hot & crispy'
            Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'],
                  [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)])

            """
            txt = txt.rstrip() + ' __eos__'
            processed_text_tokens = TOKENIZER.tokenize(txt)
            processed_text_tokens_indices = []

            offset = 0
            for token in processed_text_tokens:
                st = txt.index(token)
                en = st + len(token)

                # Small block to handle tricky cases like '(A B) C'
                # It extends the previous token's end boundary if there are special characters except whitespace
                # towards the end of previous token
                prefix = txt[:en]
                prefix_tokens = whitespace_tokenizer.tokenize(prefix)
                if prefix and len(prefix_tokens) > 1 and prefix_tokens[0]:
                    if processed_text_tokens_indices:
                        s, e = processed_text_tokens_indices.pop()
                        e += len(prefix_tokens[0])
                        processed_text_tokens_indices.append((s, e))

                txt = txt[en:]
                processed_text_tokens_indices.append(
                    (offset + st, offset + en))
                offset += en

            # remove eos parts
            processed_text_tokens.pop()
            processed_text_tokens_indices.pop()

            return processed_text_tokens, processed_text_tokens_indices
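
The `__eos__` sentinel exists so the boundary-extension block, which only fixes up the *previous* token's span, also runs once after the last real token; the sentinel's own token and span are then popped before returning. A reduced sketch of the sentinel mechanics, using a whitespace split instead of the real tokenizers:

# Sentinel pattern in isolation: append a throwaway end-of-sequence token so
# per-token "fix up the previous span" logic also covers the real last token,
# then drop the sentinel's results.
txt = 'hot & crispy)'.rstrip() + ' __eos__'
tokens = txt.split()              # ['hot', '&', 'crispy)', '__eos__']
spans = []
offset, rest = 0, txt
for token in tokens:
    st = rest.index(token)
    en = st + len(token)
    spans.append((offset + st, offset + en))
    rest = rest[en:]
    offset += en
tokens.pop()                      # remove '__eos__'
spans.pop()
print(tokens, spans)
# ['hot', '&', 'crispy)'] [(0, 3), (4, 5), (6, 13)]
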
Example #9
    def _get_single_text_detection_with_variants(self, message):
        """
        This function will normalise the message by breaking it into trigrams,
        bigrams and unigrams.

        The generated ngrams are used to build a query that retrieves search results from the datastore.

        These results contain a list of dictionaries where, for each item, the key is a variant and
        the value is an entity value; this is processed further to recover the original text that was
        identified, and the results are returned.

        Returns:
            list of dict: a list with one dict per message, keyed by entity name,
                          containing the detected text entities and original substrings.
        """

        entities_dict = self.entities_dict
        es_entity_list = []
        structured_value_entities_list = []
        text_value_entities_list = []
        texts = []

        for each_entity, value in entities_dict.items():
            structured_value = value.get('structured_value')

            if structured_value:
                # add entity list and text for each structured entity
                # for ES query
                es_entity_list.append([each_entity])
                structured_value_entities_list.append(each_entity)
                texts.append(structured_value)
            else:
                text_value_entities_list.append(each_entity)

        if text_value_entities_list:
            # add entity list and text for all other textual
            # entity for ES query
            es_entity_list.append(text_value_entities_list)
            texts.append(message)

        # pre-process text
        self._process_text(texts)
        texts = [
            u' '.join(TOKENIZER.tokenize(processed_text))
            for processed_text in self.__processed_texts
        ]

        # fetch ES datastore search result
        es_results = self.esdb.get_multi_entity_results(
            entities=es_entity_list,
            texts=texts,
            fuzziness_threshold=self._es_fuzziness,
            search_language_script=self._target_language_script)

        final_list = []
        result_dict = {}

        for index, entity_result in enumerate(es_results):
            processed_text = self.__processed_texts[index]
            text = texts[index]
            entity_list = es_entity_list[index]
            result_dict.update(
                self._process_es_result(entity_result=entity_result,
                                        entity_list=entity_list,
                                        text=text,
                                        processed_text=processed_text))

        final_list.append(result_dict)

        return final_list
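
The routing above means each entity carrying a `structured_value` gets its own (entities, text) pair that searches only within that value, while all remaining entities share one pair that searches the whole message. A toy illustration of the resulting parallel lists (values hypothetical):

message = 'suggest a good restaurant'
entities_dict = {
    'city': {'structured_value': 'mumbai'},   # search only inside 'mumbai'
    'restaurant': {},                         # search inside the message
    'cuisine': {},
}

# resulting parallel query lists (shapes only):
# es_entity_list = [['city'], ['restaurant', 'cuisine']]
# texts          = ['mumbai', 'suggest a good restaurant']
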
Example #10
    def _process_es_result(self, entity_result, entity_list, text,
                           processed_text):
        """
        Process ElasticSearch results which will contain list of dictionary where for
        each item key will be variant and value will be entity value this will be
        processed to get the original text which has been identified and will
        return the results dictionary for each entity detected

        Args:
            entity_result: ES result for entity
            entity_list: List of entity for which ES query ran
            text: original text message
            processed_text: processed text on which detection ran

        Returns:
            result_dict: dictionary with detected text and original text for
                         each entity

        """
        result_dict = {}

        for each_key in entity_list:
            original_final_list = []
            value_final_list = []
            variants_to_values = collections.OrderedDict()
            original_final_list_ = []
            value_final_list_ = []
            _processed_text = processed_text

            _variants_to_values = entity_result.get(each_key, [])

            if not _variants_to_values:
                result_dict[each_key] = ([], [])
                continue

            for variant, value in iteritems(_variants_to_values):
                variant = variant.lower()
                if isinstance(variant, bytes):
                    variant = variant.decode('utf-8')

                variants_to_values[variant] = value
            variants_list = list(variants_to_values.keys())

            exact_matches, fuzzy_variants = [], []

            for variant in variants_list:
                if u' '.join(TOKENIZER.tokenize(variant)) in text:
                    exact_matches.append(variant)
                else:
                    fuzzy_variants.append(variant)

            exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                               reverse=True)
            fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                                reverse=True)

            variants_list = exact_matches + fuzzy_variants
            for variant in variants_list:

                original_text = self._get_entity_substring_from_text(
                    _processed_text, variant, each_key)
                if original_text:
                    value_final_list.append(variants_to_values[variant])
                    original_final_list.append(original_text)
                    boundary_punct_pattern = re.compile(
                        r'(^[{0}]+)|([{0}]+$)'.format(
                            re.escape(string.punctuation)))
                    original_text_ = boundary_punct_pattern.sub(
                        "", original_text)

                    _pattern = re.compile(r'\b%s\b' %
                                          re.escape(original_text_),
                                          flags=_re_flags)
                    tag = '__' + each_key + '__'
                    _processed_text = _pattern.sub(tag, _processed_text)

            value_final_list_.append(value_final_list)
            original_final_list_.append(original_final_list)

            result_dict[each_key] = (value_final_list_, original_final_list_)

        return result_dict
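
Stripping boundary punctuation before building the `\b`-anchored pattern matters: `\b` asserts a word boundary, so a pattern like `\b\(chennai,\b` would never match after a space. A quick runnable check (the `__city__` tag is illustrative):

import re
import string

boundary_punct_pattern = re.compile(
    r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation)))

original_text = '(chennai,'
original_text_ = boundary_punct_pattern.sub('', original_text)  # 'chennai'

_pattern = re.compile(r'\b%s\b' % re.escape(original_text_))
print(_pattern.sub('__city__', 'come to (chennai, now'))
# 'come to (__city__, now'
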
Example #11
def _parse_es_search_results(results_list):
    """
    Parse highlighted results returned from elasticsearch query and generate a variants to values dictionary

    Args:
        results_list (list of dict): search results list of dictionaries from elasticsearch including highlights
                                    and scores

    Returns:
        list of collections.OrderedDict: list containing dicts mapping matching variants to their entity values based
                                  on the parsed results from highlighted search query results

    Example:
        Parameter results_list has highlighted search results as follows:

        [
            {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
            u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn',
            u'_index': u'doc_type_name',
            u'_score': 11.501145,
            u'_source': {u'dict_type': u'variants',
            u'entity_data': u'city',
            u'value': u'goa',
            u'variants': [u'', u'goa']},
            u'_type': u'data_dictionary',
            u'highlight': {u'variants': [u'<em>goa</em>']}},
            {u'_id': u'AVrW02W99WNuMIY9vmcf',
            u'_index': u'entity_data',
            u'_score': 11.210829,
            u'_source': {u'dict_type': u'variants',
            u'entity_data': u'city',
            u'value': u'Mumbai',
            u'variants': [u'', u'Mumbai']},
            u'_type': u'data_dictionary',
            u'highlight': {u'variants': [u'<em>Mumbai</em>']}},
            ...
            u'max_score': 11.501145,
            u'total': 17},
            u'timed_out': False,
            u'took': 96}
        ]

        After parsing highlighted results, this function returns

        [
            {...
             u'Mumbai': u'Mumbai',
             ...
             u'goa': u'goa',
             u'mumbai': u'mumbai',
             ...
            }
        ]

    """
    variants_to_values_list = []
    if results_list:
        for results in results_list:
            entity_values, entity_variants = [], []
            variants_to_values = collections.OrderedDict()
            if results and results['hits']['total'] > 0:
                for hit in results['hits']['hits']:
                    if 'highlight' not in hit:
                        continue

                    value = hit['_source']['value']
                    for variant in hit['highlight']['variants']:
                        entity_values.append(value)
                        entity_variants.append(variant)

                for value, variant in zip(entity_values, entity_variants):
                    variant = re.sub(r'\s+', ' ', variant.strip())
                    variant_no_highlight_tags = variant.replace(
                        '<em>', '').replace('</em>', '').strip()
                    if variant.count('<em>') == len(
                            TOKENIZER.tokenize(variant_no_highlight_tags)):
                        variant = variant_no_highlight_tags
                        if variant not in variants_to_values:
                            variants_to_values[variant] = value
            variants_to_values_list.append(variants_to_values)

    return variants_to_values_list
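
The `<em>` count check is how partial matches are filtered out: a variant survives only if every one of its tokens was highlighted. A standalone sketch of that test (whitespace splitting assumed in place of `TOKENIZER`):

def full_match_variant(highlighted):
    # Keep the variant only if each of its tokens was wrapped in <em> tags.
    plain = highlighted.replace('<em>', '').replace('</em>', '').strip()
    if highlighted.count('<em>') == len(plain.split()):
        return plain
    return None

print(full_match_variant('<em>navi</em> <em>mumbai</em>'))  # 'navi mumbai'
print(full_match_variant('<em>navi</em> mumbai'))           # None
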
Example #12
    def _text_detection_with_variants(self):
        """
        This function will normalise the message by breaking it into trigrams, bigrams and unigrams. The generated
        ngrams will be used to create query to retrieve search results from datastore. These results will contain a
        dictionary where key will be variant and value will be entity value this will be further processed to get the
        original text which has been identified and will return the results

        Returns:
             tuple:
                list of lists: list of lists containing the detected text entities
                list of lists: list of lists containing their corresponding substrings in the original message.
        """

        original_final_list_ = []
        value_final_list_ = []
        texts = [
            u' '.join(TOKENIZER.tokenize(processed_text))
            for processed_text in self.__processed_texts
        ]

        _variants_to_values_list = self.db.get_similar_dictionary(
            entity_name=self.entity_name,
            texts=texts,
            fuzziness_threshold=self._fuzziness,
            search_language_script=self._target_language_script)
        for index, _variants_to_values in enumerate(_variants_to_values_list):
            original_final_list = []
            value_final_list = []
            variants_to_values = collections.OrderedDict()
            for variant, value in iteritems(_variants_to_values):
                variant = variant.lower()
                if isinstance(variant, bytes):
                    variant = variant.decode('utf-8')

                variants_to_values[variant] = value
            variants_list = list(variants_to_values.keys())

            # Length based ordering, this reorders the results from datastore
            # that are already sorted by some relevance scoring

            exact_matches, fuzzy_variants = [], []
            for variant in variants_list:
                if u' '.join(TOKENIZER.tokenize(variant)) in texts[index]:
                    exact_matches.append(variant)
                else:
                    fuzzy_variants.append(variant)
            exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                               reverse=True)
            fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)),
                                reverse=True)
            variants_list = exact_matches + fuzzy_variants

            for variant in variants_list:

                original_text = self._get_entity_substring_from_text(
                    self.__processed_texts[index], variant)
                if original_text:
                    value_final_list.append(variants_to_values[variant])
                    original_final_list.append(original_text)

                    boundary_punct_pattern = re.compile(
                        r'(^[{0}]+)|([{0}]+$)'.format(
                            re.escape(string.punctuation)))
                    original_text_ = boundary_punct_pattern.sub(
                        "", original_text)

                    _pattern = re.compile(r'\b%s\b' %
                                          re.escape(original_text_),
                                          flags=_re_flags)
                    self.__tagged_texts[index] = _pattern.sub(
                        self.tag, self.__tagged_texts[index])
                    # Instead of dropping completely like in other entities,
                    # we replace with tag to avoid matching non contiguous segments
                    self.__processed_texts[index] = _pattern.sub(
                        self.tag, self.__processed_texts[index])
            value_final_list_.append(value_final_list)
            original_final_list_.append(original_final_list)

        return value_final_list_, original_final_list_
Example #13
def _parse_multi_entity_es_results(results_list):
    """
    This will parse highlighted results returned from elasticsearch query and
    generate a variants to values dictionary mapped to each entity for each
    search text terms.

    Args:
        results_list (list of dict):
            search results list of dictionaries from elasticsearch including highlights
             and scores

    Returns:
        list of dict of collections.OrderedDict:
            list containing dicts mapping each entity to matching variants to their entity
            values based on the parsed results from highlighted search query results

    Example:
        Parameter results_list has highlighted search results as follows:

        [
            {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
            u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn',
            u'_index': u'doc_type_name',
            u'_score': 11.501145,
            u'_source': {u'dict_type': u'variants',
            u'entity_data': u'city',
            u'value': u'goa',
            u'variants': [u'', u'goa']},
            u'_type': u'data_dictionary',
            u'highlight': {u'variants': [u'<em>goa</em>']}},
            {u'_id': u'AVrW02W99WNuMIY9vmcf',
            u'_index': u'entity_data',
            u'_score': 11.210829,
            u'_source': {u'dict_type': u'variants',
            u'entity_data': u'city',
            u'value': u'Mumbai',
            u'variants': [u'', u'Mumbai']},
            u'_type': u'data_dictionary',
            u'highlight': {u'variants': [u'<em>Mumbai</em>']}},
            ...
            u'max_score': 11.501145,
            u'total': 17},
            u'timed_out': False,
            u'took': 96}
        ]

        After parsing highlighted results, this function returns

       [
            {
            'city': OrderedDict([
                                ('Mumbai', 'Mumbai'),
                                ('mumbai', 'mumbai'),
                                ('goa', 'goa')
                                ])
            },
            {
            'city': OrderedDict([
                                ('Jabalpur', 'Jabalpur'),
                                ('Jamalpur', 'Jamalpur'),
                                ('goa', 'goa')
                                ])
            }
       ]

    """
    entity_variants_to_values_list = []

    if results_list:
        for results in results_list:
            entity_dict = {}
            entity_variants_to_values_dict = {}
            if results['hits']['total'] > 0:
                for hit in results['hits']['hits']:
                    if 'highlight' not in hit:
                        continue

                    value = hit['_source']['value']
                    entity_name = hit['_source']['entity_data']

                    if entity_name not in entity_dict:
                        entity_dict[entity_name] = {'value': [], 'variant': []}

                    entity_dict[entity_name]['value'].extend(
                        [value for _ in hit['highlight']['variants']])
                    entity_dict[entity_name]['variant'].extend(
                        hit['highlight']['variants'])

                for each_entity in entity_dict.keys():
                    entity_values = entity_dict[each_entity]['value']
                    entity_variants = entity_dict[each_entity]['variant']
                    entity_variants_to_values = collections.OrderedDict()

                    for value, variant in zip(entity_values, entity_variants):
                        variant = re.sub(r'\s+', ' ', variant.strip())
                        variant_no_highlight_tags = variant.replace('<em>', '').replace('</em>', '').strip()
                        if variant.count('<em>') == len(TOKENIZER.tokenize(variant_no_highlight_tags)):
                            variant = variant_no_highlight_tags
                            if variant not in entity_variants_to_values:
                                entity_variants_to_values[variant] = value
                    entity_variants_to_values_dict[each_entity] = entity_variants_to_values
            entity_variants_to_values_list.append(entity_variants_to_values_dict)
    return entity_variants_to_values_list