def _get_entity_substring_from_text(self, text, variant, entity_name):
    """
    Check ngrams of the text for similarity against the variant (can be a ngram) using
    Levenshtein distance and return the closest substring in the text that matches the variant.
    For each entity, fuzziness and min_token_size_for_fuzziness are read from the entity details.

    Args:
        variant (str or unicode): string, ngram of variant to fuzzy detect in the text
            using Levenshtein distance
        text (str or unicode): sentence from self.processed_text on which detection is being done
        entity_name (str): name of the entity, used to look up its fuzziness and
            min_token_len_fuzziness values

    Returns:
        str or unicode or None: part of the given text that was detected as entity
        given the variant, None otherwise

    Example:
        >>> text_detector = TextDetector(entity_dict={'city': {}})
        >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        >>> text_detector._get_entity_substring_from_text(text, variant='chennai', entity_name='city')
        'chennai'
        >>> text_detector._get_entity_substring_from_text(text, variant='delhi', entity_name='city')
        'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]
        same = variant_token == text_token

        # get fuzziness and min_token_size_for_fuzziness values from the entity dict
        entity_dict = self.entities_dict.get(entity_name, {})

        # get fuzziness from the entity, fall back to the detector default if not set
        fuzziness = entity_dict.get('fuzziness') or self._fuzziness
        self.set_fuzziness_low_high_threshold(fuzziness)

        min_token_size_for_fuzziness = entity_dict.get('min_token_len_fuzziness')
        if not min_token_size_for_fuzziness:
            min_token_size_for_fuzziness = self._min_token_size_for_fuzziness

        ft = self._get_fuzziness_threshold_for_token(token=text_token)

        # substitution cost is set to 1
        if same or (len(text_token) > min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      substitution_cost=1,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(text, original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
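# The loop above walks the text token by token, matching each variant token either exactly or
# within a per-token Levenshtein threshold, and resets whenever the streak breaks. Below is a
# minimal, self-contained sketch of that matching idea. The names `_levenshtein` and `fuzzy_find`
# are illustrative only (not part of this module), plain str.split() stands in for
# TOKENIZER.tokenize, and a fixed fuzziness replaces the per-entity configuration.
def _levenshtein(a, b):
    # classic dynamic-programming edit distance (insert/delete/substitute each cost 1)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                    # deletion
                            curr[j - 1] + 1,                # insertion
                            prev[j - 1] + (ca != cb)))      # substitution
        prev = curr
    return prev[-1]


def fuzzy_find(text, variant, fuzziness=1, min_token_len=4):
    # return the run of tokens in `text` matching `variant` within `fuzziness` edits per token
    variant_tokens = variant.split()
    matched, vi = [], 0
    for token in text.split():
        target = variant_tokens[vi]
        close_enough = (token == target or
                        (len(token) > min_token_len and _levenshtein(token, target) <= fuzziness))
        if close_enough:
            matched.append(token)
            vi += 1
            if vi == len(variant_tokens):
                return ' '.join(matched)
        else:
            matched, vi = [], 0
    return None


print(fuzzy_find('come to chennai tamil nadu i will visit delehi next year', 'delhi'))  # 'delehi'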
def _text_detection_with_variants(self):
    """
    Normalise the message by breaking it into trigrams, bigrams and unigrams. The generated
    ngrams are used to build a query that retrieves search results from the datastore. The
    results contain a dictionary mapping each variant to its entity value; this is further
    processed to recover the original text that was identified, and the results are returned.

    Returns:
        tuple:
            list: containing the detected text entities
            list: containing their corresponding substrings in the original message
    """
    original_final_list = []
    value_final_list = []
    variants_to_values = {}

    _variants_to_values = self.db.get_similar_dictionary(
        entity_name=self.entity_name,
        text=u' '.join(TOKENIZER.tokenize(self.processed_text)),
        fuzziness_threshold=self._fuzziness,
        search_language_script=self._target_language_script)
    for variant, value in iteritems(_variants_to_values):
        variant = variant.lower()
        if isinstance(variant, bytes):
            variant = variant.decode('utf-8')
        variants_to_values[variant] = value

    variants = variants_to_values.keys()
    exact_matches, fuzzy_variants = [], []
    for variant in variants:
        if variant in self.processed_text:
            exact_matches.append(variant)
        else:
            fuzzy_variants.append(variant)

    exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
    fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)

    variant_list = exact_matches + fuzzy_variants
    for variant in variant_list:
        original_text = self._get_entity_substring_from_text(variant, self.processed_text)
        if original_text:
            value_final_list.append(variants_to_values[variant])
            original_final_list.append(original_text)
            _pattern = re.compile(r'\b%s\b' % re.escape(original_text), re.UNICODE)
            self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
            # Instead of dropping completely like in other entities,
            # we replace with tag to avoid matching non contiguous segments
            self.processed_text = _pattern.sub(self.tag, self.processed_text)
    return value_final_list, original_final_list
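# The ordering step above is what makes longer, exact variants win before their substrings
# (e.g. "new delhi" is consumed before "delhi"). A small illustrative sketch of that ordering,
# with plain str.split() standing in for TOKENIZER.tokenize; `order_variants` is a hypothetical
# helper name, not part of this module.
def order_variants(variants, processed_text):
    exact = [v for v in variants if v in processed_text]
    fuzzy = [v for v in variants if v not in processed_text]
    # variants with more tokens come first; exact matches are tried before fuzzy ones
    exact.sort(key=lambda s: len(s.split()), reverse=True)
    fuzzy.sort(key=lambda s: len(s.split()), reverse=True)
    return exact + fuzzy


print(order_variants(['delhi', 'new delhi', 'mumbai'], 'i am flying to new delhi'))
# ['new delhi', 'delhi', 'mumbai']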
def _get_entity_substring_from_text(self, variant, text):
    """
    Check ngrams of the text for similarity against the variant (can be a ngram)
    using Levenshtein distance.

    Args:
        variant (str or unicode): ngram of variant to fuzzy detect in the text
            using Levenshtein distance
        text (str or unicode): text to detect entities from

    Returns:
        str or unicode or None: part of the given text that was detected as entity
        given the variant, None otherwise

    Example:
        text_detection = TextDetector('city')
        text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()

        text_detection._get_entity_substring_from_text('chennai', text)
        Output:
            'chennai'

        text_detection._get_entity_substring_from_text('delhi', text)
        Output:
            'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]
        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(text_token)
        if same or (len(text_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
def _get_tokens_and_indices(text):
    """
    Args:
        text (str or unicode): text to get tokens from, along with the indices of
            those tokens in the given text

    Returns:
        tuple:
            list: containing tokens, direct results from tokenizer.tokenize
            list: containing (int, int) indicating start and end position of ith token
                (of first list) in given text

    E.g.
        In: text = u'i want to order 1 pc hot & crispy'
        Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'],
              [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)])
    """
    processed_text_tokens = TOKENIZER.tokenize(text)
    processed_text_tokens_indices = []

    offset = 0
    txt = text
    for token in processed_text_tokens:
        st = txt.index(token)
        en = st + len(token)
        txt = txt[en:]
        processed_text_tokens_indices.append((offset + st, offset + en))
        offset += en

    return processed_text_tokens, processed_text_tokens_indices
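# The scan above trims the text after each match so repeated tokens map to the correct
# occurrence. A minimal standalone sketch of the same span computation, using a simple \w+ regex
# as a stand-in for the project's TOKENIZER (so the exact tokens and offsets can differ from the
# docstring example above); `token_spans` is an illustrative name, not part of this module.
import re

def token_spans(text):
    # yield (token, start, end) for every word-like token in the text
    return [(m.group(), m.start(), m.end()) for m in re.finditer(r'\w+', text)]


print(token_spans(u'i want to order 1 pc hot & crispy'))
# [('i', 0, 1), ('want', 2, 6), ('to', 7, 9), ('order', 10, 15), ('1', 16, 17),
#  ('pc', 18, 20), ('hot', 21, 24), ('crispy', 27, 33)]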
def sort_original_text(original_text_list):
    """
    Sorts the original text list based on number of tokens and length of string.

    :param original_text_list: list of str, substrings detected in the original text
    :return: list of str, the same substrings ordered by token count and then by
        string length, longest first
    """
    final_original_text = []
    sort_original_text_dict = defaultdict(list)
    original_text_list.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
    for original in original_text_list:
        length_of_token = len(TOKENIZER.tokenize(original))
        sort_original_text_dict[length_of_token].append(original)
    for token_length in reversed(sorted(sort_original_text_dict.keys())):
        list_of_tokens = sort_original_text_dict[token_length]
        list_of_tokens.sort(key=lambda s: len(s), reverse=True)
        final_original_text.extend(list_of_tokens)
    return final_original_text
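# Illustrative call (assuming a whitespace-like tokenizer): strings with more tokens come first,
# and ties on token count break by character length, longest first.
print(sort_original_text(['goa', 'new delhi', 'navi mumbai', 'pune']))
# ['navi mumbai', 'new delhi', 'pune', 'goa']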
def _get_entity_substring_from_text(self, text, variant):
    """
    Check ngrams of the text for similarity against the variant (can be a ngram) using
    Levenshtein distance and return the closest substring in the text that matches the variant.

    Args:
        variant (str or unicode): string, ngram of variant to fuzzy detect in the text
            using Levenshtein distance
        text (str or unicode): sentence from self.processed_text on which detection is being done

    Returns:
        str or unicode or None: part of the given text that was detected as entity
        given the variant, None otherwise

    Example:
        >>> text_detector = TextDetector('city')
        >>> text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        >>> text_detector.detect_entity(text)
        >>> text_detector._get_entity_substring_from_text(text, variant='chennai')
        'chennai'
        >>> text_detector._get_entity_substring_from_text(text, variant='delhi')
        'delehi'
    """
    variant_tokens = TOKENIZER.tokenize(variant)
    text_tokens = TOKENIZER.tokenize(text)
    original_text_tokens = []
    variant_token_i = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_token_i]
        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(text_token)
        if same or (len(text_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text_tokens.append(text_token)
            variant_token_i += 1
            if variant_token_i == len(variant_tokens):
                return self._get_substring_from_processed_text(text, original_text_tokens)
        else:
            original_text_tokens = []
            variant_token_i = 0
    return None
def _get_bulk_text_detection_with_variants(self, messages):
    """
    Normalise each message by breaking it into trigrams, bigrams and unigrams. The generated
    ngrams are used to build a query that retrieves search results from the datastore. The
    results contain a list of dictionaries where, for each item, the key is the variant and
    the value is the entity value; this is further processed to recover the original text
    that was identified, and the results are returned.

    Args:
        messages (list of str): list of messages for which detection needs to be performed

    Returns:
        list of dict: one dict per message, keyed by entity name, mapping to the detected
        entity values and the corresponding original substrings
    """
    self._process_text(messages)

    texts = [
        u' '.join(TOKENIZER.tokenize(processed_text))
        for processed_text in self.__processed_texts
    ]

    entity_list = list(self.entities_dict)

    # the entity list for the ES search is a list of entity lists,
    # one for each list of texts
    es_entity_list = [entity_list]
    es_texts = [texts]

    # fetch ES datastore search result
    es_results = self.esdb.get_multi_entity_results(
        entities=es_entity_list,
        texts=es_texts,
        fuzziness_threshold=self._es_fuzziness,
        search_language_script=self._target_language_script)

    final_list = []
    for index, entity_result in enumerate(es_results):
        processed_text = self.__processed_texts[index]
        text = texts[index]
        result_list = self._process_es_result(entity_result=entity_result,
                                              entity_list=entity_list,
                                              text=text,
                                              processed_text=processed_text)
        final_list.append(result_list)

    return final_list
def _get_tokens_and_indices(txt):
    """
    Args:
        txt (str or unicode): text to get tokens from, along with the indices of
            those tokens in the given text

    Returns:
        tuple:
            list: containing tokens, direct results from tokenizer.tokenize
            list: containing (int, int) indicating start and end position of ith token
                (of first list) in given text

    E.g.
        In: text = u'i want to order 1 pc hot & crispy'
        Out: ([u'i', u'want', u'to', u'order', u'1', u'pc', u'hot', u'crispy'],
              [(1, 2), (3, 7), (8, 10), (11, 16), (17, 18), (19, 21), (22, 25), (28, 34)])
    """
    txt = txt.rstrip() + ' __eos__'
    processed_text_tokens = TOKENIZER.tokenize(txt)
    processed_text_tokens_indices = []

    offset = 0
    for token in processed_text_tokens:
        st = txt.index(token)
        en = st + len(token)

        # Small block to handle tricky cases like '(A B) C'
        # It extends the previous token's end boundary if there are special characters,
        # except whitespace, towards the end of the previous token
        prefix = txt[:en]
        prefix_tokens = whitespace_tokenizer.tokenize(prefix)
        if prefix and len(prefix_tokens) > 1 and prefix_tokens[0]:
            if processed_text_tokens_indices:
                s, e = processed_text_tokens_indices.pop()
                e += len(prefix_tokens[0])
                processed_text_tokens_indices.append((s, e))

        txt = txt[en:]
        processed_text_tokens_indices.append((offset + st, offset + en))
        offset += en

    # remove eos parts
    processed_text_tokens.pop()
    processed_text_tokens_indices.pop()

    return processed_text_tokens, processed_text_tokens_indices
def _get_single_text_detection_with_variants(self, message):
    """
    Normalise the message by breaking it into trigrams, bigrams and unigrams. The generated
    ngrams are used to build a query that retrieves search results from the datastore. The
    results contain a list of dictionaries where, for each item, the key is the variant and
    the value is the entity value; this is further processed to recover the original text
    that was identified, and the results are returned.

    Args:
        message (str): message on which detection needs to be performed

    Returns:
        list of dict: a single-element list containing a dict keyed by entity name, mapping
        to the detected entity values and the corresponding original substrings
    """
    entities_dict = self.entities_dict
    es_entity_list = []
    structured_value_entities_list = []
    text_value_entities_list = []
    texts = []

    for each_entity, value in entities_dict.items():
        structured_value = value.get('structured_value')
        if structured_value:
            # add entity list and text for each structured entity for the ES query
            es_entity_list.append([each_entity])
            structured_value_entities_list.append(each_entity)
            texts.append(structured_value)
        else:
            text_value_entities_list.append(each_entity)

    if text_value_entities_list:
        # add entity list and text for all other textual entities for the ES query
        es_entity_list.append(text_value_entities_list)
        texts.append(message)

    # pre-process text
    self._process_text(texts)
    texts = [
        u' '.join(TOKENIZER.tokenize(processed_text))
        for processed_text in self.__processed_texts
    ]

    # fetch ES datastore search result
    es_results = self.esdb.get_multi_entity_results(
        entities=es_entity_list,
        texts=texts,
        fuzziness_threshold=self._es_fuzziness,
        search_language_script=self._target_language_script)

    final_list = []
    result_dict = {}
    for index, entity_result in enumerate(es_results):
        processed_text = self.__processed_texts[index]
        text = texts[index]
        entity_list = es_entity_list[index]
        result_dict.update(self._process_es_result(entity_result=entity_result,
                                                   entity_list=entity_list,
                                                   text=text,
                                                   processed_text=processed_text))
    final_list.append(result_dict)

    return final_list
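# A minimal sketch of how the query inputs above are assembled: every entity carrying a
# structured_value gets its own (entities, text) pair, and all remaining entities share one pair
# built from the user message. `build_query_inputs` is a hypothetical helper name used only for
# illustration; the real method keeps this logic inline.
def build_query_inputs(entities_dict, message):
    es_entity_list, texts = [], []
    free_text_entities = []
    for entity_name, config in entities_dict.items():
        structured_value = config.get('structured_value')
        if structured_value:
            # structured entities are searched against their own value
            es_entity_list.append([entity_name])
            texts.append(structured_value)
        else:
            free_text_entities.append(entity_name)
    if free_text_entities:
        # all remaining entities are searched against the full message
        es_entity_list.append(free_text_entities)
        texts.append(message)
    return es_entity_list, texts


print(build_query_inputs(
    {'city': {'structured_value': 'mumbai'}, 'restaurant': {}, 'dish': {}},
    'book a table for two'))
# ([['city'], ['restaurant', 'dish']], ['mumbai', 'book a table for two'])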
def _process_es_result(self, entity_result, entity_list, text, processed_text):
    """
    Process ElasticSearch results, which contain a list of dictionaries where, for each item,
    the key is the variant and the value is the entity value. This is processed to recover the
    original text that was identified, and a results dictionary is returned for each entity
    detected.

    Args:
        entity_result (dict): ES result for the queried entities
        entity_list (list of str): list of entities for which the ES query ran
        text (str or unicode): original text message
        processed_text (str or unicode): processed text on which detection ran

    Returns:
        dict: mapping from each entity name to a tuple of
        (list of detected entity values, list of original substrings)
    """
    result_dict = {}

    for each_key in entity_list:
        original_final_list = []
        value_final_list = []
        variants_to_values = collections.OrderedDict()
        original_final_list_ = []
        value_final_list_ = []

        _processed_text = processed_text
        _variants_to_values = entity_result.get(each_key, [])

        if not _variants_to_values:
            result_dict[each_key] = ([], [])
            continue

        for variant, value in iteritems(_variants_to_values):
            variant = variant.lower()
            if isinstance(variant, bytes):
                variant = variant.decode('utf-8')
            variants_to_values[variant] = value

        variants_list = list(variants_to_values.keys())

        exact_matches, fuzzy_variants = [], []
        for variant in variants_list:
            if u' '.join(TOKENIZER.tokenize(variant)) in text:
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
        fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
        variants_list = exact_matches + fuzzy_variants

        for variant in variants_list:
            original_text = self._get_entity_substring_from_text(_processed_text, variant, each_key)
            if original_text:
                value_final_list.append(variants_to_values[variant])
                original_final_list.append(original_text)
                boundary_punct_pattern = re.compile(
                    r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation)))
                original_text_ = boundary_punct_pattern.sub("", original_text)
                _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags)
                tag = '__' + each_key + '__'
                _processed_text = _pattern.sub(tag, _processed_text)

        value_final_list_.append(value_final_list)
        original_final_list_.append(original_final_list)
        result_dict[each_key] = (value_final_list_, original_final_list_)

    return result_dict
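# The tagging step above strips boundary punctuation from the detected substring and then
# replaces it, on word boundaries, with the entity tag so the same span cannot match again.
# A small standalone demonstration of those two regex operations (values are illustrative):
import re
import string

boundary_punct_pattern = re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation)))

processed_text = 'come to chennai, tamil nadu'
detected = 'chennai,'
cleaned = boundary_punct_pattern.sub('', detected)            # 'chennai'
tagged = re.sub(r'\b%s\b' % re.escape(cleaned), '__city__', processed_text)
print(tagged)  # 'come to __city__, tamil nadu'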
def _parse_es_search_results(results_list):
    """
    Parse highlighted results returned from an elasticsearch query and generate a
    variants-to-values dictionary.

    Args:
        results_list (list of dict): search results list of dictionaries from elasticsearch,
            including highlights and scores

    Returns:
        list of collections.OrderedDict: list containing dicts mapping matching variants to
        their entity values, based on the parsed results from the highlighted search query
        results

    Example:
        Parameter results_list has highlighted search results as follows:

        [
            {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
             u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn',
                                  u'_index': u'doc_type_name',
                                  u'_score': 11.501145,
                                  u'_source': {u'dict_type': u'variants',
                                               u'entity_data': u'city',
                                               u'value': u'goa',
                                               u'variants': [u'', u'goa']},
                                  u'_type': u'data_dictionary',
                                  u'highlight': {u'variants': [u'<em>goa</em>']}},
                                 {u'_id': u'AVrW02W99WNuMIY9vmcf',
                                  u'_index': u'entity_data',
                                  u'_score': 11.210829,
                                  u'_source': {u'dict_type': u'variants',
                                               u'entity_data': u'city',
                                               u'value': u'Mumbai',
                                               u'variants': [u'', u'Mumbai']},
                                  u'_type': u'data_dictionary',
                                  u'highlight': {u'variants': [u'<em>Mumbai</em>']}},
                                 ...
                       u'max_score': 11.501145,
                       u'total': 17},
             u'timed_out': False,
             u'took': 96}
        ]

        After parsing the highlighted results, this function returns:

        [
            {...
             u'Mumbai': u'Mumbai',
             ...
             u'goa': u'goa',
             u'mumbai': u'mumbai',
             ...
            }
        ]
    """
    variants_to_values_list = []
    if results_list:
        for results in results_list:
            entity_values, entity_variants = [], []
            variants_to_values = collections.OrderedDict()
            if results and results['hits']['total'] > 0:
                for hit in results['hits']['hits']:
                    if 'highlight' not in hit:
                        continue
                    value = hit['_source']['value']
                    for variant in hit['highlight']['variants']:
                        entity_values.append(value)
                        entity_variants.append(variant)

                for value, variant in zip(entity_values, entity_variants):
                    variant = re.sub(r'\s+', ' ', variant.strip())
                    variant_no_highlight_tags = variant.replace('<em>', '').replace('</em>', '').strip()
                    if variant.count('<em>') == len(TOKENIZER.tokenize(variant_no_highlight_tags)):
                        variant = variant_no_highlight_tags
                    if variant not in variants_to_values:
                        variants_to_values[variant] = value

            variants_to_values_list.append(variants_to_values)

    return variants_to_values_list
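# The highlight clean-up above collapses whitespace and drops the <em> markers only when every
# token of the variant was highlighted by elasticsearch. A minimal standalone sketch of that
# rule; `clean_highlight` is an illustrative name and str.split stands in for TOKENIZER.tokenize.
import re

def clean_highlight(variant, tokenize=str.split):
    variant = re.sub(r'\s+', ' ', variant.strip())
    stripped = variant.replace('<em>', '').replace('</em>', '').strip()
    if variant.count('<em>') == len(tokenize(stripped)):
        return stripped          # fully highlighted: safe to treat as a plain variant
    return variant               # partially highlighted: keep the markers


print(clean_highlight(u'<em>new</em> <em>delhi</em>'))   # 'new delhi'
print(clean_highlight(u'<em>new</em> delhi'))            # '<em>new</em> delhi'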
def _text_detection_with_variants(self):
    """
    Normalise each message by breaking it into trigrams, bigrams and unigrams. The generated
    ngrams are used to build a query that retrieves search results from the datastore. The
    results contain a dictionary mapping each variant to its entity value; this is further
    processed to recover the original text that was identified, and the results are returned.

    Returns:
        tuple:
            list of lists: list of lists containing the detected text entities
            list of lists: list of lists containing their corresponding substrings
                in the original message
    """
    original_final_list_ = []
    value_final_list_ = []

    texts = [
        u' '.join(TOKENIZER.tokenize(processed_text))
        for processed_text in self.__processed_texts
    ]
    _variants_to_values_list = self.db.get_similar_dictionary(
        entity_name=self.entity_name,
        texts=texts,
        fuzziness_threshold=self._fuzziness,
        search_language_script=self._target_language_script)

    for index, _variants_to_values in enumerate(_variants_to_values_list):
        original_final_list = []
        value_final_list = []
        variants_to_values = collections.OrderedDict()
        for variant, value in iteritems(_variants_to_values):
            variant = variant.lower()
            if isinstance(variant, bytes):
                variant = variant.decode('utf-8')
            variants_to_values[variant] = value

        variants_list = list(variants_to_values.keys())

        # Length based ordering, this reorders the results from the datastore
        # that are already sorted by some relevance scoring
        exact_matches, fuzzy_variants = [], []
        _text = texts
        for variant in variants_list:
            if u' '.join(TOKENIZER.tokenize(variant)) in _text[index]:
                exact_matches.append(variant)
            else:
                fuzzy_variants.append(variant)

        exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
        fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
        variants_list = exact_matches + fuzzy_variants

        for variant in variants_list:
            original_text = self._get_entity_substring_from_text(self.__processed_texts[index], variant)
            if original_text:
                value_final_list.append(variants_to_values[variant])
                original_final_list.append(original_text)
                boundary_punct_pattern = re.compile(
                    r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation)))
                original_text_ = boundary_punct_pattern.sub("", original_text)
                _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags)
                self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index])
                # Instead of dropping completely like in other entities,
                # we replace with tag to avoid matching non contiguous segments
                self.__processed_texts[index] = _pattern.sub(self.tag, self.__processed_texts[index])

        value_final_list_.append(value_final_list)
        original_final_list_.append(original_final_list)

    return value_final_list_, original_final_list_
def _parse_multi_entity_es_results(results_list):
    """
    Parse highlighted results returned from an elasticsearch query and generate a
    variants-to-values dictionary mapped to each entity, for each search text term.

    Args:
        results_list (list of dict): search results list of dictionaries from elasticsearch,
            including highlights and scores

    Returns:
        list of dict of collections.OrderedDict: list containing dicts mapping each entity to
        matching variants and their entity values, based on the parsed results from the
        highlighted search query results

    Example:
        Parameter results_list has highlighted search results as follows:

        [
            {u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
             u'hits': {u'hits': [{u'_id': u'AVrW02UE9WNuMIY9vmWn',
                                  u'_index': u'doc_type_name',
                                  u'_score': 11.501145,
                                  u'_source': {u'dict_type': u'variants',
                                               u'entity_data': u'city',
                                               u'value': u'goa',
                                               u'variants': [u'', u'goa']},
                                  u'_type': u'data_dictionary',
                                  u'highlight': {u'variants': [u'<em>goa</em>']}},
                                 {u'_id': u'AVrW02W99WNuMIY9vmcf',
                                  u'_index': u'entity_data',
                                  u'_score': 11.210829,
                                  u'_source': {u'dict_type': u'variants',
                                               u'entity_data': u'city',
                                               u'value': u'Mumbai',
                                               u'variants': [u'', u'Mumbai']},
                                  u'_type': u'data_dictionary',
                                  u'highlight': {u'variants': [u'<em>Mumbai</em>']}},
                                 ...
                       u'max_score': 11.501145,
                       u'total': 17},
             u'timed_out': False,
             u'took': 96}
        ]

        After parsing the highlighted results, this function returns:

        [
            {
                'city': OrderedDict([
                    ('Mumbai', 'Mumbai'),
                    ('mumbai', 'mumbai'),
                    ('goa', 'goa')
                ])
            },
            {
                'city': OrderedDict([
                    ('Jabalpur', 'Jabalpur'),
                    ('Jamalpur', 'Jamalpur'),
                    ('goa', 'goa')
                ])
            }
        ]
    """
    entity_variants_to_values_list = []
    if results_list:
        for results in results_list:
            entity_dict = {}
            entity_variants_to_values_dict = {}

            if results['hits']['total'] > 0:
                for hit in results['hits']['hits']:
                    if 'highlight' not in hit:
                        continue
                    value = hit['_source']['value']
                    entity_name = hit['_source']['entity_data']
                    if entity_name not in entity_dict:
                        entity_dict[entity_name] = {'value': [], 'variant': []}
                    entity_dict[entity_name]['value'].extend(
                        [value for _ in hit['highlight']['variants']])
                    entity_dict[entity_name]['variant'].extend(
                        [variant for variant in hit['highlight']['variants']])

                for each_entity in entity_dict.keys():
                    entity_values = entity_dict[each_entity]['value']
                    entity_variants = entity_dict[each_entity]['variant']
                    entity_variants_to_values = collections.OrderedDict()

                    for value, variant in zip(entity_values, entity_variants):
                        variant = re.sub(r'\s+', ' ', variant.strip())
                        variant_no_highlight_tags = variant.replace('<em>', '').replace('</em>', '').strip()
                        if variant.count('<em>') == len(TOKENIZER.tokenize(variant_no_highlight_tags)):
                            variant = variant_no_highlight_tags
                        if variant not in entity_variants_to_values:
                            entity_variants_to_values[variant] = value
                    entity_variants_to_values_dict[each_entity] = entity_variants_to_values

            entity_variants_to_values_list.append(entity_variants_to_values_dict)

    return entity_variants_to_values_list