def get_char_by_vector(self, vector: np.ndarray):
    """
    Look up the Char whose encoding is closest to the given vector.
    Works for both feature encodings and one-hot embeddings.

    Parameters
    ----------
    vector
        The numpy array representing the features of the char

    Returns
    -------
        A Char object corresponding to the feature vector
    """
    cos_sims = {}
    if not self.__ortho:
        for c, v in self.__dict.items():
            cos_sims[c] = cos_sim(vector, v)
    else:
        for c, v in self.__char_embeddings.items():
            # only the second dimension is used; the first one is missing in the
            # feature encodings for unknown reasons
            cos_sims[c] = cos_sim(v, vector[0])
    return max(cos_sims, key=cos_sims.get)
def validate(valid_dataset, model, pool_size, pad_index):
    """
    Simple validation in a code pool.
    @param pool_size: size of the code pool; if -1, load the whole test set
    """
    model.eval()
    processed_num = 0  # number of processed examples
    accs, mrrs, maps, ndcgs = [], [], [], []
    code_reprs, desc_reprs = [], []
    # batch_size is assumed to be a module-level constant
    while processed_num < len(valid_dataset) - batch_size:
        # batch: code_tokens, code_tokens_len, ast_seq, ast_seq_len,
        #        desc_pos, desc_pos_len, desc_neg, desc_neg_len
        batch = get_batch(valid_dataset, processed_num, batch_size, pad_index)
        processed_num += batch_size
        code_batch = batch[:4]
        desc_batch = batch[4:6]
        with torch.no_grad():
            code_repr = model.code_encode(*code_batch).data.cpu().numpy().astype(np.float32)
            desc_repr = model.desc_encode(*desc_batch).data.cpu().numpy().astype(np.float32)
        code_reprs.append(code_repr)
        desc_reprs.append(desc_repr)
    code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
    assert len(code_reprs) == len(desc_reprs)

    bar = tqdm(range(0, len(code_reprs), pool_size))
    bar.set_description("start valid")
    for k in bar:
        if k + pool_size > len(code_reprs):
            break
        code_matrix = code_reprs[k:k + pool_size]
        desc_matrix = desc_reprs[k:k + pool_size]
        real = list(range(pool_size))
        sims = cos_sim(desc_matrix, code_matrix)  # use description to search code
        negsim = np.negative(sims)
        predict = np.argpartition(negsim, kth=pool_size - 1)
        predict = predict[:pool_size]
        for i in range(len(real)):
            accs.append(ACC([real[i]], list(predict[i])))
            mrrs.append(MRR([real[i]], list(predict[i])))
            maps.append(MAP([real[i]], list(predict[i])))
            ndcgs.append(NDCG([real[i]], list(predict[i])))
    return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
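# Note: cos_sim above is called on two (pool_size, dim) matrices. A minimal
# sketch of what such a matrix-form cosine similarity could look like is given
# below; the name cos_sim_matrix and its signature are illustrative assumptions,
# not the repository's actual helper.
import numpy as np

def cos_sim_matrix(a, b, eps=1e-8):
    """Row-wise cosine similarity: returns an (len(a), len(b)) matrix."""
    a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + eps)
    b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + eps)
    return a_norm @ b_norm.T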
def get_char_by_embedding(self, vec: np.ndarray):
    """
    Does the same as the method above, but for the one-hot encoding.

    Parameters
    ----------
    vec
        The numpy array representing the localist encoding of the char

    Returns
    -------
        A Char object corresponding to the embedding
    """
    cos_sims = {}
    for c, feature_vector in self.__char_embeddings.items():
        cos_sims[c] = cos_sim(vec, feature_vector)
    return max(cos_sims, key=cos_sims.get)
def get_char_by_feature_vector(self, vec: np.ndarray):
    """
    Finds the character whose feature vector/embedding is closest to the input
    vector. At the moment we use cosine similarity to do the matching.

    Parameters
    ----------
    vec
        The numpy array representing the features of the char

    Returns
    -------
        A Char object corresponding to the feature vector
    """
    cos_sims = {}
    for c, feature_vector in self.__dict.items():
        cos_sims[c] = cos_sim(vec, feature_vector)
    return max(cos_sims, key=cos_sims.get)
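# The lookup methods above assume a cos_sim helper that compares two 1-D
# vectors. A minimal sketch under that assumption (cos_sim_vectors is an
# illustrative name; the codebase's own cos_sim may differ):
import numpy as np

def cos_sim_vectors(u, v, eps=1e-8):
    """Cosine similarity between two 1-D numpy vectors."""
    u, v = np.ravel(u), np.ravel(v)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + eps))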
def _init_similarity(self, user_id, another_user_id):
    """
    Description
        A function which computes and returns the similarity between two users.

    Arguments
        :param user_id: The first user.
        :type user_id: int
        :param another_user_id: The second user.
        :type another_user_id: int
    """
    number_rated_items_user = len(self.co_rated_between(user_id, user_id))
    number_rated_items_another_user = len(
        self.co_rated_between(another_user_id, another_user_id))
    number_of_co_rated_items = len(
        self.co_rated_between(user_id, another_user_id))
    return cos_sim(number_of_co_rated_items,
                   number_rated_items_user,
                   number_rated_items_another_user)
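# _init_similarity above passes three counts to cos_sim, which suggests a
# set-based cosine over binary rating vectors: |I_u ∩ I_v| / sqrt(|I_u| * |I_v|).
# A hedged sketch of such a helper (illustrative name and signature; the
# project's actual cos_sim may be defined differently):
import math

def cos_sim_from_counts(co_rated, rated_u, rated_v):
    """Cosine similarity of two binary rating vectors, given only their counts."""
    if rated_u == 0 or rated_v == 0:
        return 0.0
    return co_rated / math.sqrt(rated_u * rated_v)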
def time_augmented_evaluate_model(mode, model, db_gen, l_utt, save_dir, epoch,
                                  l_trial, args, device):
    if mode not in ['val', 'eval']:
        raise ValueError('mode should be either "val" or "eval"')
    model.eval()
    with torch.set_grad_enabled(False):
        # 1st, extract speaker embeddings.
        l_embeddings = []
        with tqdm(total=len(db_gen), ncols=70) as pbar:
            for m_batch in db_gen:
                l_code = []
                for batch in m_batch:
                    batch = batch.to(device)
                    code = model(x=batch, is_test=True)
                    l_code.extend(code.cpu().numpy())
                l_embeddings.append(np.mean(l_code, axis=0))
                pbar.update(1)
        d_embeddings = {}
        if not len(l_utt) == len(l_embeddings):
            print(len(l_utt), len(l_embeddings))
            exit()
        for k, v in zip(l_utt, l_embeddings):
            d_embeddings[k] = v

        # 2nd, calculate EER
        y_score = []  # score for each sample
        y = []        # label for each sample
        f_res = open(save_dir + 'results/{}_epoch{}.txt'.format(mode, epoch), 'w')
        for line in l_trial:
            trg, utt_a, utt_b = line.strip().split(' ')
            y.append(int(trg))
            y_score.append(cos_sim(d_embeddings[utt_a], d_embeddings[utt_b]))
            f_res.write('{score} {target}\n'.format(score=y_score[-1], target=y[-1]))
        f_res.close()
        fpr, tpr, _ = roc_curve(y, y_score, pos_label=1)
        eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    return eer
def forward(self, src_context_output, src_word_output, src_word_len,
            KG_word_output, KG_word_len, KG_word_seq, tgt_word_input,
            combine_knowledge=False):
    """Compute decoder scores from context_output.

    Args:
        src_context_output (FloatTensor) : (batch_size, context_size)
        src_word_output (FloatTensor) : (batch_size, src_max_word_len, word_size)
        src_word_len (LongTensor) : (batch_size)
        KG_word_output (FloatTensor) : (batch_size, KG_max_word_len, KG_word_size)
        KG_word_len (LongTensor) : (batch_size)
        KG_word_seq (LongTensor) : (batch_size, src_max_word_len)
        tgt_word_input (LongTensor) : (batch_size, tgt_word_len)

    Returns:
        logit (FloatTensor) : (batch_size, tgt_word_len, num_vocab)
        coverage (FloatTensor) : (batch_size, tgt_word_len)
    """
    assert src_context_output.size(0) == src_word_output.size(0)
    assert src_context_output.size(0) == tgt_word_input.size(0)
    assert src_context_output.size(0) == KG_word_output.size(0)

    batch_size = src_context_output.size(0)
    max_src_len = src_word_output.size(1)
    max_KG_len = KG_word_output.size(1)
    max_tgt_len = tgt_word_input.size(1)

    # prepare the source word and KG word outputs
    # src_word_output : [src_word_len, batch_size, src_word_size]
    src_word_output = src_word_output.permute(1, 0, 2)
    # src_word_mask : [max_src_len, batch_size]
    src_word_mask = generate_mask_by_length(src_word_len, max_src_len)
    # KG_word_output : [KG_word_len, batch_size, KG_word_size]
    KG_word_output = KG_word_output.permute(1, 0, 2)
    # KG_word_mask : [max_KG_len, batch_size]
    KG_word_mask = generate_mask_by_length(KG_word_len, max_KG_len)

    # obtain word embedding and initial hidden states
    # tgt_word_emb : [batch_size, tgt_word_len, emb_size]
    tgt_word_emb = self.word_embedding(tgt_word_input)
    # hidden : [batch_size, num_layer, rnn_size]
    hidden = self.context2hidden(src_context_output).view(batch_size, self.num_layers, self.rnn_size)
    # hidden : [num_layer, batch_size, rnn_size]
    hidden = hidden.permute(1, 0, 2)

    logit_word_list = []
    coverage_list = []
    for word_index in range(max_tgt_len):
        # recurrence
        # last_hidden : [batch_size, rnn_size]
        last_hidden = hidden[-1]
        # attn_src_word : [batch_size, src_word_size]
        attn_src_word, _ = self.attention(last_hidden, src_word_output, src_word_output, src_word_mask)
        # attn_KG_word : [batch_size, src_KG_size]
        # attn_KG_scores : [max_KG_len, batch_size]
        KG_attention_query = last_hidden + attn_src_word
        attn_KG_word, attn_KG_scores = self.attention(KG_attention_query, KG_word_output, KG_word_output, KG_word_mask)
        # rnn_inputs : [batch_size, emb_size + src_word_size + KG_word_size]
        rnn_inputs = torch.cat([tgt_word_emb[:, word_index], attn_src_word, attn_KG_word], dim=1)
        # rnn_output : [batch_size, rnn_size] ; hidden : [num_layer, batch_size, rnn_size]
        rnn_output, hidden = self.rnn_cell(rnn_inputs, hidden)
        # prob_word : [batch_size, num_vocab]
        prob_word = self.output(rnn_output)

        if combine_knowledge:
            # copy_prob : [batch_size, num_vocab]
            copy_prob = torch.zeros(batch_size, self.num_vocab, device=src_context_output.device)
            copy_prob = torch.scatter_add(input=copy_prob, dim=1, index=KG_word_seq,
                                          src=attn_KG_scores.permute(1, 0))
            # gen_dist_trans_input : [batch_size, emb_size + KG_rnn_size + rnn_size]
            gen_dist_trans_input = torch.cat([tgt_word_emb[:, word_index], attn_KG_word, rnn_output], dim=1)
            gen_dist = self.gen_dist_trans(gen_dist_trans_input)
            gen_dist = torch.sigmoid(gen_dist)
            # combined_prob_word : [batch_size, num_vocab]
            combined_prob_word = prob_word * gen_dist + (1 - gen_dist) * copy_prob
        else:
            combined_prob_word = prob_word

        # logit_word : [batch_size, num_vocab]
        logit_word = combined_prob_word.log()
        # coverage_score : [batch_size]
        coverage_score = cos_sim(last_hidden, hidden[-1])
        coverage_score = F.relu(coverage_score)
        logit_word_list.append(logit_word)
        coverage_list.append(coverage_score)

    # logit : [batch_size, max_tgt_len, num_vocab]
    logit = torch.stack(logit_word_list, dim=1)
    # coverage : [batch_size, max_tgt_len]
    coverage = torch.stack(coverage_list, dim=1)
    return logit, coverage
def __init__(self, batch_size, seq_len, embeddings, char_embeddings, embedding_size,
             filter_size, num_filters, num_features, num_layers, rnn_size=100,
             unknown_id=7447, num_classes=2, l2_reg_lambda=4e-4, model_type="ABCNN3",
             adjust_weight=False, label_weight=[], is_training=True):
    # define input variables
    self.batch_size = batch_size
    self.seq_len = seq_len
    self.embeddings = embeddings
    self.char_embeddings = char_embeddings
    self.embedding_size = embedding_size
    self.filter_size = filter_size
    self.num_filters = num_filters
    self.num_features = num_features
    self.num_layers = num_layers
    self.num_classes = num_classes
    self.l2_reg_lambda = l2_reg_lambda
    self.model_type = model_type
    self.adjust_weight = adjust_weight
    self.label_weight = label_weight
    self.is_training = is_training
    self.rnn_size = rnn_size

    self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="ori_input")
    self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="cand_input")
    self.ori_input_quests_char = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="ori_input_var")
    self.cand_input_quests_char = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="cand_input_var")
    #self.ori_input_quests_var = self.ori_input_quests
    #self.cand_input_quests_var = self.cand_input_quests
    self.labels = tf.placeholder(tf.int32, shape=[None], name="labels")
    self.features = tf.placeholder(tf.float32, shape=[None, num_features], name="features")
    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")
    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self.lr = tf.Variable(0.0, trainable=False)
    self._lr_update = tf.assign(self.lr, self.new_lr)

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
        char_W = tf.Variable(tf.to_float(self.char_embeddings), trainable=True, name="char_W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        # char inputs are looked up in the char embedding matrix
        ori_quests_char = tf.nn.embedding_lookup(char_W, self.ori_input_quests_char)
        cand_quests_char = tf.nn.embedding_lookup(char_W, self.cand_input_quests_char)

    ori_emb = tf.concat(2, [ori_quests, ori_quests_char])
    cand_emb = tf.concat(2, [cand_quests, cand_quests_char])

    # shape [batch_size, embedding_size, seq_len, 1]
    # shape [batch_size, embedding_size]
    #LO_0 = all_pool("input-left", x1_expanded, self.seq_len, self.filter_size, self.num_filters, self.embedding_size)
    #RO_0 = all_pool("input-right", x2_expanded, self.seq_len, self.filter_size, self.num_filters, self.embedding_size)

    # LI_1, RI_1 shape [batch, num_filters, seq_len, 1]
    # LO_1, RO_1 shape [batch, num_filters]
    x1_expanded = tf.expand_dims(ori_emb, -1)
    x2_expanded = tf.expand_dims(cand_emb, -1)
    #LO_1, RO_1 = CNN_layer("CNN-1", x1_expanded, x2_expanded, self.seq_len, self.embedding_size, self.num_filters, self.filter_size, self.l2_reg_lambda, self.model_type)
    with tf.variable_scope("cnn", reuse=None) as scope:
        LO_1 = CNN(x1_expanded, self.seq_len, self.embedding_size * 2, self.filter_size, self.num_filters)
    with tf.variable_scope("cnn", reuse=True) as scope:
        RO_1 = CNN(x2_expanded, self.seq_len, self.embedding_size * 2, self.filter_size, self.num_filters)
    #with tf.variable_scope("LSTM_scope", reuse=None):
    #    ori_q = BILSTM(ori_emb, self.rnn_size)
    #    LO_1 = max_pooling(ori_q)
    #with tf.variable_scope("LSTM_scope", reuse=True):
    #    cand_a = BILSTM(cand_emb, self.rnn_size)
    #    RO_1 = max_pooling(cand_a)

    #self.sims = [cos_sim(LO_0, RO_0), cos_sim(LO_1, RO_1)]
    #self.sims = [cos_sim(LO_1, RO_1), cos_sim(LO_2, RO_2)]
    self.sims = [cos_sim(LO_1, RO_1)]
    #if self.num_layers > 1:
    #    with tf.variable_scope("cnn", reuse=None) as scope:
    #        LO_2 = CNN(tf.expand_dims(ori_q, -1), self.seq_len, self.rnn_size * 2, self.filter_size, self.num_filters)
    #    with tf.variable_scope("cnn", reuse=True) as scope:
    #        RO_2 = CNN(tf.expand_dims(cand_a, -1), self.seq_len, self.rnn_size * 2, self.filter_size, self.num_filters)
    #    self.sims.append(cos_sim(LO_2, RO_2))

    with tf.variable_scope("output_layer") as scope:
        self.output_features = tf.concat(1, [self.features, tf.pack(self.sims, axis=1)], name="output_features")
        #self.lstm_features = tf.concat(1, [LO_1, RO_1])
        self.lstm_features = tf.concat(1, [tf.concat(1, [LO_1, RO_1]), self.features])
        #self.lstm_features = self.output_features

    with tf.variable_scope("fully_connected"):
        #feature_len = int(self.output_features.get_shape()[1])
        feature_len = int(self.lstm_features.get_shape()[1])
        softmax_w = tf.get_variable("softmax_w", initializer=tf.truncated_normal([feature_len, self.num_classes], stddev=0.1))
        softmax_b = tf.get_variable("softmax_b", initializer=tf.constant(0., shape=[self.num_classes]))
        self.estimation = tf.matmul(self.lstm_features, softmax_w) + softmax_b
        #self.estimation_sigmoid = tf.nn.sigmoid(self.estimation)
        self.output_features = self.estimation
        #self.output_features = tf.concat(1, [self.features, self.estimation_sigmoid])

    with tf.name_scope("loss"):
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(self.estimation, self.labels)
        self.cost = tf.reduce_mean(self.loss)
    ppl = 1
    for i in range(1, length - 1):
        input_seq = torch.LongTensor(sentence2id[:i]).view(1, -1)
        target = torch.LongTensor([sentence2id[i + 1]])
        with torch.no_grad():
            _, output = model(input_seq)  # 1*5262
            prob = F.softmax(output.squeeze(), dim=-1)
            prob = torch.index_select(prob, 0, target)
        ppl *= (1 / prob.item())
    ppl = pow(ppl, 1 / (length - 1))
    return ppl


if __name__ == '__main__':
    # sentence = '我 喜欢 吃 火'
    # print(sentence_complement(sentence))
    s1 = '你 把 衣服 脱 光 了'
    s2 = '你 把 衣服 脱 了'
    begin = time.time()
    v1 = get_hidden_state(s1)
    print(time.time() - begin)
    v2 = get_hidden_state(s2)
    sim = cos_sim(v1, v2)
    print('Similarity between {} and {}: {}'.format(s1.replace(' ', ''), s2.replace(' ', ''), sim))
for q_id, q_text in total_q_dics.items():
    a = collect_a(str(q_id))
    if a is not None:
        QA.append((q_text, a))

'''
Compute the similarity between the search results for Q and the search
results for A, and pair the score with Q as a tuple.
'''
Q_score = []
for q_text, a_text in QA:
    # build tf dictionaries for Q and A
    q_dic = tfdic_by_bing(q_text, 50, ["Description"])
    a_dic = tfdic_by_bing(a_text, 50, ["Description"])
    # measure the similarity between the Q and A dictionaries
    sim_score = cos_sim(q_dic, a_dic)
    Q_score.append((q_text, sim_score))

'''
Sort Q_score in descending order.
'''
sorted_Q = sorted(Q_score, key=lambda x: x[1], reverse=True)
for q in sorted_Q:
    print(q)
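# Here cos_sim is applied to two term-frequency dictionaries rather than dense
# vectors. A minimal sketch of a dict-based cosine similarity under that
# assumption (illustrative name; the project's own helper may differ):
import math

def cos_sim_tf_dicts(tf_a, tf_b):
    """Cosine similarity between two sparse {term: count} dictionaries."""
    dot = sum(tf_a[t] * tf_b[t] for t in set(tf_a) & set(tf_b))
    norm_a = math.sqrt(sum(v * v for v in tf_a.values()))
    norm_b = math.sqrt(sum(v * v for v in tf_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)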