def _text_detection_with_variants(self):
    """
    Normalises the message by breaking it into trigrams, bigrams and unigrams. The generated
    ngrams are used to query the datastore for search results. The results contain a dictionary
    whose keys are variants and whose values are entity values; these are further processed to
    recover the original text that was identified, and the results are returned.

    Returns:
        A tuple of two lists, the first containing the detected text entities and the second
        containing their corresponding substrings in the original message.
    """
    original_final_list = []
    value_final_list = []
    normalization = Normalization()
    self.text_dict = normalization.ngram_data(self.processed_text.lower(),
                                              flag_punctuation_removal=False,
                                              stem_unigram=False,
                                              stem_bigram=False,
                                              stem_trigram=False,
                                              stop_words_unigram=True,
                                              stop_words_bigram=True,
                                              stop_words_trigram=True).copy()
    variant_dictionary = {}

    trigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['trigram'],
                                                             self._fuzziness)
    bigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['bigram'],
                                                            self._fuzziness)
    unigram_variants = self.db.get_similar_ngrams_dictionary(self.entity_name, self.text_dict['unigram'],
                                                             self._fuzziness)
    variant_dictionary.update(trigram_variants)
    variant_dictionary.update(bigram_variants)
    variant_dictionary.update(unigram_variants)
    variant_list = variant_dictionary.keys()

    # Try variants that appear verbatim in the text first, longer variants before shorter ones
    exact_matches, fuzzy_variants = [], []
    for variant in variant_list:
        if variant.lower() in self.processed_text.lower():
            exact_matches.append(variant)
        else:
            fuzzy_variants.append(variant)
    exact_matches.sort(key=lambda s: len(tokenizer.tokenize(s)), reverse=True)
    fuzzy_variants.sort(key=lambda s: len(tokenizer.tokenize(s)), reverse=True)
    variant_list = exact_matches + fuzzy_variants

    for variant in variant_list:
        original_text = self._get_entity_from_text(variant, self.processed_text.lower())
        if original_text:
            value_final_list.append(variant_dictionary[variant])
            original_final_list.append(original_text)
            # Replace the detected span with the tag so it is not matched again
            self.processed_text = re.sub(r'\b' + original_text + r'\b', self.tag, self.processed_text)
    return value_final_list, original_final_list
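# A minimal sketch of the unigram/bigram/trigram generation that ngram_data above is expected to
# perform. The real Normalization.ngram_data also handles stemming, stopword and punctuation
# options; this stand-in (an illustration only, not the library's implementation) just shows the
# sliding-window idea over whitespace tokens.
def build_ngrams(text):
    tokens = text.lower().split()

    def grams(n):
        return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    return {'unigram': grams(1), 'bigram': grams(2), 'trigram': grams(3)}

# build_ngrams('flights from new delhi')
# -> {'unigram': ['flights', 'from', 'new', 'delhi'],
#     'bigram': ['flights from', 'from new', 'new delhi'],
#     'trigram': ['flights from new', 'from new delhi']}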
def _get_entity_from_text(self, variant, text):
    """
    Checks ngrams of the text for similarity against the variant (can be a ngram) using
    Levenshtein distance

    Args:
        variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text: text to detect entities from

    Returns:
        part of the given text that was detected as entity given the variant, None otherwise

    Example:
        text_detection = TextDetector('city')
        ...
        text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        text_detection._get_entity_from_text('chennai', text)
        Output: 'chennai'
        text_detection._get_entity_from_text('Delhi', text)
        Output: 'delehi'
    """
    variant_tokens = tokenizer.tokenize(variant.lower())
    text_tokens = tokenizer.tokenize(text.lower())
    original_text = []
    variant_count = 0
    for text_token in text_tokens:
        variant_token = variant_tokens[variant_count]
        utext_token = text_token
        if isinstance(utext_token, bytes):
            utext_token = utext_token.decode('utf-8')

        same = variant_token == text_token
        ft = self._get_fuzziness_threshold_for_token(utext_token)
        if same or (len(utext_token) > self._min_token_size_for_fuzziness
                    and edit_distance(string1=variant_token,
                                      string2=text_token,
                                      max_distance=ft + 1) <= ft):
            original_text.append(text_token)
            variant_count += 1
            if variant_count == len(variant_tokens):
                return ' '.join(original_text)
        else:
            # Miss: reset so only a contiguous run of matching tokens is returned
            original_text = []
            variant_count = 0
    return None
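# For reference, a minimal sketch of the bounded edit-distance primitive the token comparison
# above relies on. `bounded_edit_distance` is a hypothetical stand-in written for illustration;
# it is NOT the library's actual `edit_distance` helper, only a plain dynamic-programming
# Levenshtein distance capped at `max_distance`.
def bounded_edit_distance(string1, string2, max_distance):
    """Levenshtein distance between string1 and string2, capped at max_distance."""
    if string1 == string2:
        return 0
    previous = list(range(len(string2) + 1))
    for i, char1 in enumerate(string1, start=1):
        current = [i]
        for j, char2 in enumerate(string2, start=1):
            current.append(min(previous[j] + 1,                      # deletion
                               current[j - 1] + 1,                   # insertion
                               previous[j - 1] + (char1 != char2)))  # substitution
        if min(current) >= max_distance:
            return max_distance  # already at or past the cutoff, no need to finish the table
        previous = current
    return min(previous[-1], max_distance)

# Example: bounded_edit_distance('delhi', 'delehi', 2) == 1, so 'delehi' would pass a fuzziness
# threshold of 1, whereas an unrelated token like 'mumbai' would not.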
def _text_detection_with_variants(self):
    """
    Tokenises the message and queries the datastore for variants similar to it. The results
    contain a dictionary whose keys are variants and whose values are entity values; these are
    further processed to recover the original text that was identified, and the results are
    returned.

    Returns:
        A tuple of two lists, the first containing the detected text entities and the second
        containing their corresponding substrings in the original message.
    """
    original_final_list = []
    value_final_list = []
    variant_dictionary = {}

    tokens = tokenizer.tokenize(self.processed_text)
    message = u' '.join(tokens)
    variants = self.db.get_similar_dictionary(self.entity_name,
                                              message,
                                              self._fuzziness,
                                              search_language_script=self._target_language_script)
    variant_dictionary.update(variants)
    variant_list = variant_dictionary.keys()

    # Try variants that appear verbatim in the text first, longer variants before shorter ones
    exact_matches, fuzzy_variants = [], []
    for variant in variant_list:
        if variant.lower() in self.processed_text.lower():
            exact_matches.append(variant)
        else:
            fuzzy_variants.append(variant)
    exact_matches.sort(key=lambda s: len(tokenizer.tokenize(s)), reverse=True)
    fuzzy_variants.sort(key=lambda s: len(tokenizer.tokenize(s)), reverse=True)
    variant_list = exact_matches + fuzzy_variants

    for variant in variant_list:
        original_text = self._get_entity_from_text(variant, self.processed_text.lower())
        if original_text:
            value_final_list.append(variant_dictionary[variant])
            original_final_list.append(original_text)
            _pattern = re.compile(r'\b%s\b' % original_text, re.UNICODE)
            self.tagged_text = _pattern.sub(self.tag, self.tagged_text)
            # Instead of dropping completely like in other entities,
            # we replace with tag to avoid matching non contiguous segments
            self.processed_text = _pattern.sub(self.tag, self.processed_text)
    return value_final_list, original_final_list
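# The ordering step above matters: exact substring matches are tried before fuzzy candidates,
# and within each group variants with more tokens come first, so that e.g. 'new delhi' is
# consumed before 'delhi'. A self-contained sketch of that ranking, using str.split() as a
# stand-in for the project's tokenizer (an assumption made only for illustration):
def rank_variants(variant_list, processed_text):
    """Order variants: exact substring matches first, both groups by decreasing token count."""
    exact_matches, fuzzy_variants = [], []
    for variant in variant_list:
        if variant.lower() in processed_text.lower():
            exact_matches.append(variant)
        else:
            fuzzy_variants.append(variant)
    exact_matches.sort(key=lambda s: len(s.split()), reverse=True)
    fuzzy_variants.sort(key=lambda s: len(s.split()), reverse=True)
    return exact_matches + fuzzy_variants

# rank_variants(['delhi', 'new delhi', 'mumbai'], 'flights from new delhi')
# -> ['new delhi', 'delhi', 'mumbai']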
def _get_entity_from_text(self, variant, text):
    """
    Checks ngrams of the text for similarity against the variant (can be a ngram) using
    Levenshtein distance

    Args:
        variant: string, ngram of variant to fuzzy detect in the text using Levenshtein distance
        text: text to detect entities from

    Returns:
        part of the given text that was detected as entity given the variant, None otherwise

    Example:
        text_detection = TextDetector('city')
        ...
        text = 'Come to Chennai, Tamil Nadu, I will visit Delehi next year'.lower()
        text_detection._get_entity_from_text('chennai', text)
        Output: 'chennai'
        text_detection._get_entity_from_text('Delhi', text)
        Output: 'delehi'
    """
    variant_token_list = tokenizer.tokenize(variant.lower())
    text_token_list = tokenizer.tokenize(text.lower())
    original_text = []
    variant_count = 0
    token_count = 0
    while token_count < len(text_token_list):
        levenshtein = Levenshtein(variant_token_list[variant_count], text_token_list[token_count],
                                  self.fuzziness_threshold + 1)
        if variant_token_list[variant_count] == text_token_list[token_count] or \
                (len(text_token_list[token_count]) > self.min_size_token_for_levenshtein and
                 levenshtein.levenshtein_distance() <= self.fuzziness_threshold):
            original_text.append(text_token_list[token_count])
            variant_count += 1
            if variant_count == len(variant_token_list):
                return ' '.join(original_text)
        else:
            # Miss: reset so only a contiguous run of matching tokens is returned
            original_text = []
            variant_count = 0
        token_count += 1
    return None
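# A compact sketch of the alignment loop above: walk the text tokens, advance through the variant
# tokens on each (exact or fuzzy) hit, and reset on a miss so that only a contiguous run of
# matching tokens is returned. `tokens_match` is a hypothetical predicate standing in for the
# exact/Levenshtein check used in the real method.
def find_contiguous_match(variant_tokens, text_tokens, tokens_match):
    matched, needed = [], 0
    for token in text_tokens:
        if tokens_match(variant_tokens[needed], token):
            matched.append(token)
            needed += 1
            if needed == len(variant_tokens):
                return ' '.join(matched)
        else:
            matched, needed = [], 0
    return None

# find_contiguous_match(['tamil', 'nadu'],
#                       'come to chennai tamil nadu'.split(),
#                       lambda a, b: a == b)
# -> 'tamil nadu'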
def sort_original_text(original_text_list):
    """
    Sorts the original text list based on number of tokens and length of string

    :param original_text_list: list of substrings of the original text detected as entity values
    :return: list sorted by decreasing token count and, within the same token count,
             decreasing string length
    """
    final_original_text = []
    sort_original_text_dict = defaultdict(list)
    original_text_list.sort(key=lambda s: len(tokenizer.tokenize(s)), reverse=True)
    # Bucket the substrings by their token count
    for original in original_text_list:
        length_of_token = len(tokenizer.tokenize(original))
        sort_original_text_dict[length_of_token].append(original)
    # Emit buckets from the largest token count down, longest strings first within each bucket
    for token_length in reversed(sorted(sort_original_text_dict.keys())):
        list_of_tokens = sort_original_text_dict[token_length]
        list_of_tokens.sort(key=lambda s: len(s), reverse=True)
        final_original_text.extend(list_of_tokens)
    return final_original_text
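# Usage sketch for sort_original_text, assuming the project's tokenizer splits roughly on
# whitespace: substrings with more tokens come first, and ties on token count are broken by
# raw string length.
# sort_original_text(['goa', 'new delhi', 'delhi', 'tamil nadu'])
# -> ['tamil nadu', 'new delhi', 'delhi', 'goa']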
def add_data_to_tagger(self, bot_message, user_message):
    """
    As explained, CRF needs data in a particular format; this function converts the bot_message
    and user_message into that format and adds it to the tagger.

    Args:
        bot_message: message from bot
        user_message: message from user

    Example:
        bot_message = 'none'
        user_message = 'flights from delhi to goa'

        This function tokenizes the bot and user messages, gets the POS tags, tags each token
        as outbound or inbound as per the sender and adds it to the tagger object.

        tokens_bot_message = ['none']
        tokens_user_message = ['flights', 'from', 'delhi', 'to', 'goa']

        pos_bot_message = [['none', 'NN']]
        pos_user_message = [['flights', 'NNS'], ['from', 'VBP'], ['delhi', 'NN'], ['to', 'TO'],
                            ['goa', 'VB']]

        none NN o
        flights NNS i
        from VBP i
        delhi NN i
        to TO i
        goa VB i
    """
    if bot_message is None:
        bot_message = ''

    tokens_bot_message = tokenizer.tokenize(bot_message)
    tokens_user_message = tokenizer.tokenize(user_message)

    pos_bot_message = self.pos_tagger.tag(tokens_bot_message)
    pos_user_message = self.pos_tagger.tag(tokens_user_message)

    # Each line fed to the tagger is "<token> <POS tag> <direction>", where the direction is
    # OUTBOUND for bot tokens and INBOUND for user tokens
    for token in pos_bot_message:
        self.tagger.add(str(token[0]) + ' ' + str(token[1]) + ' ' + OUTBOUND)
    for token in pos_user_message:
        self.tagger.add(str(token[0]) + ' ' + str(token[1]) + ' ' + INBOUND)
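# A minimal sketch of the line format add_data_to_tagger feeds to the CRF tagger, hard-coding the
# POS tags from the docstring example rather than calling a real POS tagger; the 'o'/'i'
# direction markers assume OUTBOUND == 'o' and INBOUND == 'i' (assumptions made for illustration).
_example_pos = [('none', 'NN', 'o'),      # bot token -> outbound
                ('flights', 'NNS', 'i'),  # user tokens -> inbound
                ('from', 'VBP', 'i'),
                ('delhi', 'NN', 'i'),
                ('to', 'TO', 'i'),
                ('goa', 'VB', 'i')]
_example_lines = [' '.join(triple) for triple in _example_pos]
# _example_lines == ['none NN o', 'flights NNS i', 'from VBP i', 'delhi NN i', 'to TO i', 'goa VB i']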