def W2V_Vec(sent_A, sent_B, vec):
    # Pad empty/near-empty inputs with a placeholder token (note the leading
    # space so it survives tokenization as its own token). The original used
    # `elif`, which skipped the sent_B check whenever sent_A was padded.
    if len(sent_A) <= 1:
        sent_A += ' none'
    if len(sent_B) <= 1:
        sent_B += ' none'
    vec1 = 0
    vec2 = 0
    sent_A = tokenize(sent_A)
    sent_B = tokenize(sent_B)
    punctuation = ", . ? ! # $ % ^ & * ( ) { } [ ]".split()
    # Sum the vectors of every non-punctuation, in-vocabulary token.
    for word in sent_A:
        if word not in punctuation:
            try:
                vec1 += vec[word]
            except KeyError:
                continue
    for word in sent_B:
        if word not in punctuation:
            try:
                vec2 += vec[word]
            except KeyError:
                continue
    # Cosine similarity of the summed vectors; fall back to 0.0 when a
    # sentence had no in-vocabulary tokens (vec1/vec2 still the int 0).
    try:
        result = cos(vec1, vec2)
    except Exception:
        result = 0.0
    return 0.0 if np.isnan(result) else result
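# --- Usage sketch (assumptions, not from the source): `tokenize` is taken to
# be a whitespace splitter, `cos` a cosine *similarity* on numpy arrays (the
# 0.0 fallback above only makes sense for a similarity), and `vec` any
# dict-like word -> vector map such as gensim KeyedVectors.
import numpy as np

def tokenize(text):
    return text.split()

def cos(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

toy_vec = {'cats': np.array([1.0, 0.0]), 'dogs': np.array([0.9, 0.1])}
print(W2V_Vec('cats', 'dogs', toy_vec))  # ~0.99 for near-parallel vectors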
def get_negative_data(self, numpy_data, check_sim, limit):
    # Build negatives by shuffling a copy, pairing each row with a random one.
    neg_data = numpy_data.copy()
    np.random.shuffle(neg_data)
    if check_sim:
        c = 0
        for i in range(len(neg_data)):
            # cos() is a cosine distance here, so 1 - cos() is similarity.
            sim = 1 - cos(neg_data[i], numpy_data[i])
            # Re-draw a random row until the pair is dissimilar enough.
            while sim > limit:
                c += 1
                neg_data[i] = neg_data[int(random.random() * np.shape(neg_data)[0])]
                sim = 1 - cos(neg_data[i], numpy_data[i])
        print("Negative samples built. #Changes:", c)
    return neg_data
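# --- Usage sketch (assumptions, not from the source): `cos` is taken to be
# scipy.spatial.distance.cosine, i.e. a distance, which is why the method
# computes 1 - cos(...) as similarity. `self` is unused, so the method can be
# exercised standalone; zero-mean data keeps the re-draw loop from spinning
# on an unreachable `limit`.
import random
import numpy as np
from scipy.spatial.distance import cosine as cos

data = np.random.randn(100, 16)
neg = get_negative_data(None, data, check_sim=True, limit=0.9)
print(neg.shape)  # (100, 16), with too-similar pairs re-drawn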
def words_score(sentence, words_infos, best_score=0.3):
    """Compare a new sentence against stored data and classify its intent.

    best_score: cosine-distance threshold below which an entry counts as a
    match (the default here is an assumption; the name was unbound in the
    original snippet).
    """
    s2v = AvgWord2vec()
    words_vec = s2v.transfrom_sentence_to_vec(sentence)
    for words_info in words_infos:
        score = cos(words_vec, words_info.get("words_vec"))
        print(score)
        # The smaller the angle (cosine distance), the more similar.
        if score < best_score:
            return words_info.get("intent")
    # Report failure only after every stored entry missed the threshold.
    return "match failed"
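# --- Usage sketch: AvgWord2vec and the words_infos schema are not shown in
# the source; the stub below is a toy stand-in that hashes each token to a
# deterministic vector and averages, and assumes `cos` is
# scipy.spatial.distance.cosine (a distance, so smaller means more similar).
import numpy as np
from scipy.spatial.distance import cosine as cos

class AvgWord2vec:
    def transfrom_sentence_to_vec(self, sentence):  # name kept as in source
        vecs = [np.random.RandomState(abs(hash(w)) % 2**32).rand(8)
                for w in sentence.split()]
        return np.mean(vecs, axis=0)

s2v = AvgWord2vec()
words_infos = [
    {"intent": "greeting", "words_vec": s2v.transfrom_sentence_to_vec("hello there")},
    {"intent": "weather", "words_vec": s2v.transfrom_sentence_to_vec("rain today")},
]
print(words_score("hello there", words_infos))  # -> "greeting" (distance 0)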
def is_word_embed_match(self, mention_x: MentionDataLight, mention_y: MentionDataLight):
    """
    Check if the mentions' word-embedding cosine similarity is above 0.65

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        bool
    """
    match_result = False
    x_embed = self.embedding.get_feature_vector(mention_x)
    y_embed = self.embedding.get_feature_vector(mention_y)
    # make sure words are not 'unk/None/0'
    if x_embed is not None and y_embed is not None:
        dist = cos(x_embed, y_embed)
        if not math.isnan(dist):
            sim = 1 - dist  # cos() returns a distance; convert to similarity
            if sim > 0.65:
                match_result = True
    return match_result
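# --- The threshold check above reduces to this standalone predicate; a
# minimal sketch assuming scipy.spatial.distance.cosine for `cos`:
import math
from scipy.spatial.distance import cosine as cos

def embeddings_match(x_embed, y_embed, threshold=0.65):
    # Cosine distance -> similarity; NaN (e.g. zero vectors) is no match.
    dist = cos(x_embed, y_embed)
    return (not math.isnan(dist)) and (1 - dist) > threshold

print(embeddings_match([1.0, 0.0], [0.9, 0.1]))  # True, similarity ~0.99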
with torch.no_grad():
    # Encode the query and keep only the last BERT layer.
    query_bert_outputs, _ = model(
        query,
        attention_mask=(query > 0).long(),
        token_type_ids=None,
        output_all_encoded_layers=True)
    query_bert_outputs = torch.cat(query_bert_outputs[-1:], dim=-1)
    pred = span_extractor(query_bert_outputs, pos).squeeze(0)

    # Encode the candidate abstract the same way and extract the answer span.
    candidate_abstract_output, _ = model(
        candidate_abstract,
        attention_mask=(candidate_abstract > 0).long(),
        token_type_ids=None,
        output_all_encoded_layers=True)
    abstract_bert_outputs = torch.cat(candidate_abstract_output[-1:], dim=-1)
    label = span_extractor(abstract_bert_outputs, pos_answer).squeeze(0)

    pred = pred.cpu().numpy()
    label = label.cpu().numpy()
    # Accumulate three span-pair metrics: MSE, dot product, cosine distance.
    for i in range(query.size()[0]):
        mse_distance.append(mse(pred[i], label[i]))
        point_distance.append(sum(pred[i] * label[i]))
        cos_distance.append(cos(pred[i], label[i]))

bert_dist = pd.DataFrame()
bert_dist['bert_cos_distance'] = cos_distance
bert_dist['bert_point_distance'] = point_distance
bert_dist['bert_mse_distance'] = mse_distance
bert_dist.to_pickle('data/bert_dis_test.pkl')
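# --- The loop above assumes `mse` and `cos` helpers; minimal stand-ins
# consistent with how they are called on 1-D numpy arrays (assumptions, not
# from the source):
import numpy as np
from scipy.spatial.distance import cosine as cos

def mse(a, b):
    # Mean squared error between two span representations.
    return float(np.mean((a - b) ** 2))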
import json

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine as cos

# Each line of gensim.json holds one sentence's token features; average the
# first listed layer across tokens to get a sentence embedding.
data = [json.loads(line) for line in open('gensim.json', 'r')]

xx = []
for parte in range(len(data)):
    xx.append(
        np.mean([
            data[parte]['features'][i]['layers'][0]['values']
            for i in range(len(data[parte]['features']))
        ], axis=0))

df = pd.read_csv('gensim.csv', encoding="latin-1", header=None)
print(df.shape)
print(len(data))

# Cosine distance of every sentence embedding to that of sentence 3.
for i in range(len(xx)):
    print(np.array(df)[i], cos(xx[3], xx[i]))
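# --- Hedged schema note: gensim.json is assumed to be JSONL in the shape
# produced by BERT's extract_features.py (one record per sentence), and
# gensim.csv the matching sentence texts. The per-sentence averaging then
# reduces to this, shown on a fabricated two-token record:
import numpy as np

record = {"features": [
    {"token": "hello", "layers": [{"index": -1, "values": [0.1, 0.2]}]},
    {"token": "world", "layers": [{"index": -1, "values": [0.3, 0.4]}]},
]}
sent_vec = np.mean([f['layers'][0]['values'] for f in record['features']], axis=0)
print(sent_vec)  # [0.2 0.3]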