def jaccard_sim(sentence_1, sentence_2):
    """
    Compute the IoU (Jaccard) measure on the tokens of two strings.

    Both sentences are passed through ``normalizeString`` and split on
    whitespace. The score is the number of distinct tokens the two sentences
    share divided by the number of distinct tokens that appear in either.

    Args:
        sentence_1: first sentence to compare.
        sentence_2: second sentence to compare.

    Returns:
        A float in [0, 1]; 0.0 when neither sentence yields any token.
    """
    tokens_1 = set(normalizeString(sentence_1).split())
    tokens_2 = set(normalizeString(sentence_2).split())

    union = tokens_1 | tokens_2
    if not union:
        # Nothing to compare: avoid a division by zero.
        return 0.0
    return len(tokens_1 & tokens_2) / len(union)
def answer_question(question, s2v_model, w2v_model, data, similarity, k=1):
    """
    Uses vector representation and similarity function to provide K possible
    answers to the question.

    Args:
        question: raw question text; it is cleaned with ``normalizeString``.
        s2v_model: object providing ``get_embedding_matrix()`` and
            ``seq_vec_sent(text)``.
        w2v_model: unused by this function; kept for interface compatibility.
        data: table indexed positionally with ``.iloc``; column 1 is read as
            the responses (presumably a pandas DataFrame - confirm with caller).
        similarity: callable ``similarity(query_vec, embed_matrix)`` whose
            first element holds one score per row of ``data``.
        k: number of distinct responses to report.

    Returns:
        A string with one ``"Similarity : <score> - <response>"`` line per
        selected response, listed from lowest to highest score, or a message
        describing the shape mismatch when the query vector and the embedding
        matrix disagree on their first dimension.
    """
    # Clean question
    question = normalizeString(question)
    embed_matrix = s2v_model.get_embedding_matrix()
    query_vec = s2v_model.seq_vec_sent(question)

    if query_vec.shape[0] != embed_matrix.shape[0]:
        return ('seqvec embedding dimensions {} \n'
                'model embedding dimensions {}'.format(query_vec.shape,
                                                       embed_matrix.shape))

    # Compute similarity and rank rows from least to most similar
    scores = similarity(query_vec, embed_matrix)[0]
    order = np.asarray(np.argsort(scores))

    # Extract responses; for duplicated responses keep the best-scoring
    # occurrence (the last one in ascending order).
    ranked = data.iloc[order, 1]
    keep = ~ranked.duplicated(keep='last').to_numpy()

    # Work with row positions (not index labels) so scores stay aligned with
    # responses whatever index `data` carries.
    best_positions = order[keep][-k:]
    best_responses = ranked.to_numpy()[keep][-k:]

    return ''.join(
        "Similarity : {} - {}\n".format(float(scores[pos]), rep)
        for pos, rep in zip(best_positions, best_responses)
    )
def tf_idf_vector(self, sentence, emb_size=40):
    """
    Create vectors using tf_idf score.

    Position ``i`` of the result holds ``self.tf_dict[token]`` for the i-th
    token of the normalized sentence, or 0 when the token is not in
    ``self.tf_dict``.

    Args:
        sentence: raw sentence; it is cleaned with ``normalizeString``.
        emb_size: length of the returned vector. Tokens beyond this length
            are ignored.

    Returns:
        A 1-D numpy array of length ``emb_size``.
    """
    vec = np.zeros(emb_size)
    tokens = normalizeString(sentence).split()
    # Truncate so that long sentences cannot index past the end of `vec`.
    for i, token in enumerate(tokens[:emb_size]):
        if token in self.tf_dict:
            vec[i] = self.tf_dict[token]
    return vec
def bow(self, sentence):
    """
    Create vectors using bag of words created from the list of words
    in dataset.

    The result has one slot per entry of ``self.words``; a slot is set to 1
    when the corresponding word occurs in the normalized sentence.

    Args:
        sentence: raw sentence; it is cleaned with ``normalizeString``.

    Returns:
        A 1-D numpy array of length ``self.n_words`` containing 0s and 1s.
    """
    vec = np.zeros(self.n_words)
    tokens = normalizeString(sentence).split()
    # Each distinct token only needs to be looked up once.
    for token in set(tokens):
        try:
            # Index by vocabulary position, not by position in the sentence.
            vec[self.words.index(token)] = 1
        except ValueError:
            # Out-of-vocabulary token: leave the vector untouched.
            continue
    return vec
def reply(self, input_text):
    """
    Generate an answer for the given input text.

    The text is split on ``.``, ``,``, ``?`` and ``!``; every non-empty
    fragment is normalized, trimmed and passed to ``self.model``. When the
    split yields no usable fragment the whole input is used as one sentence.

    Args:
        input_text: raw user text.

    Returns:
        The words produced for the last processed sentence, joined with
        spaces. Answers for earlier sentences are computed but not returned.
    """
    with torch.no_grad():
        fragments = (s.strip() for s in re.split(r'[.,?!]', input_text))
        # Drop empty fragments (trailing or repeated punctuation) instead of
        # blindly discarding the last one, which may be a real sentence.
        sentences = [s for s in fragments if s] or [input_text]

        for sentence in sentences:
            trimmed_sentence = TrimWordsSentence(normalizeString(sentence))
            print(trimmed_sentence)
            answer_words, _ = self.model(trimmed_sentence,
                                         self.train_input_lang,
                                         self.train_output_lang)
            answer = ' '.join(answer_words)
    return answer
def w_seq2vec_fun(self, sentence):
    """
    Computes sequence vector using input model.

    For every token of the normalized sentence that appears in
    ``self.words``, the matching column of ``self.model`` is scaled by
    ``self.tf_dict[token]`` and accumulated. The sum is divided by the total
    number of tokens, including tokens that are not in ``self.words``.

    Args:
        sentence: raw sentence; it is cleaned with ``normalizeString``.

    Returns:
        A float numpy array shaped like one column of ``self.model``; all
        zeros when the sentence has no tokens.
    """
    tokens = normalizeString(sentence).split()
    seq_vec = np.zeros_like(self.model[:, 0], dtype=float)
    if not tokens:
        # Avoid dividing by zero (which would return NaNs).
        return seq_vec

    for token in tokens:
        if token in self.words:
            index = self.words.index(token)
            seq_vec += self.tf_dict[token] * self.model[:, index]
    return seq_vec / len(tokens)