def get_self_critical_semantics_reward(id_word, infersent_model, model, feat0, feat1,
                                       feat_mask, pos_feat, groundtruth, video_id,
                                       total_embeddings, probability_sample, kwargs=None):
    """Compute a self-critical (SCST-style) semantic reward for sampled captions.

    The reward for each item is the best cosine similarity between the sampled
    caption's sentence embedding and its precomputed ground-truth embeddings,
    minus the same score for the greedy (baseline) caption, broadcast over all
    time steps.

    Args:
        id_word: index-to-word mapping used by ``decode_idx``.
        infersent_model: sentence encoder exposing ``encode(sentences, ...)``.
        model: captioning model exposing ``sample(feat0, feat1, feat_mask, pos_feat)``.
        feat0, feat1, feat_mask, pos_feat: video feature tensors (batch-first).
        groundtruth: unused here; kept for interface compatibility.
        video_id: per-item keys into ``total_embeddings``.
        total_embeddings: mapping video_id -> list of ground-truth embeddings.
        probability_sample: (batch, seq_length) tensor of sampled token indices.
        kwargs: unused; kept for interface compatibility.

    Returns:
        np.ndarray of shape (batch_size, seq_length) with the per-step reward.
    """
    batch_size = feat0.size(0)
    double_batch_size = batch_size * 2
    seq_length = probability_sample.size(1)
    semantics_score = np.zeros(double_batch_size)

    # Greedy decoding supplies the self-critical baseline.
    greedy_sample, _ = model.sample(feat0, feat1, feat_mask, pos_feat)
    greedy_sample = greedy_sample.cpu().numpy()
    probability_sample = probability_sample.cpu().numpy()

    # First half of `res`: sampled captions; second half: greedy captions.
    res = []
    for i in range(batch_size):
        res.append(decode_idx(probability_sample[i], id_word))
    for i in range(batch_size, double_batch_size):
        res.append(decode_idx(greedy_sample[i - batch_size], id_word))
    res_embeddings = infersent_model.encode(res, bsize=128, tokenize=True, verbose=True)

    # Ground-truth embeddings, repeated once per half so indices line up with `res`.
    gts_embeddings = []
    for key in video_id:
        gts_embeddings.append(total_embeddings[key])
    for key in video_id:
        gts_embeddings.append(total_embeddings[key])

    # Score each caption by its best match among its ground-truth embeddings.
    for i in range(double_batch_size):
        hypothesis_embedding = res_embeddings[i]
        for j in range(len(gts_embeddings[i])):
            premise_embedding = gts_embeddings[i][j]
            semantics_score[i] = max(semantics_score[i],
                                     cosine(hypothesis_embedding, premise_embedding))

    # SCST reward: sampled score minus greedy baseline, broadcast over time steps.
    reward = semantics_score[:batch_size] - semantics_score[batch_size:]
    reward = np.repeat(reward[:, np.newaxis], seq_length, axis=1)
    return reward
def test(opt, infermodel, embed):
    """Sanity-check that batch encoding equals per-sentence encoding.

    Loads the extracted test predictions, encodes them all in one batch, then
    re-encodes each sentence individually and prints any sentence whose two
    embeddings differ by more than EPS in cosine similarity.

    Args:
        opt: options object; only ``opt.data_path`` is read.
        infermodel: sentence encoder exposing ``encode(sentences, ...)``.
        embed: unused; kept for interface compatibility with callers.
    """
    import math  # NOTE(review): local import kept — top-of-file imports are outside this chunk
    EPS = 1e-4
    path1 = os.path.join(opt.data_path, 'extracted_test_prediction.pkl')
    with open(path1, 'rb') as f:
        content1 = pickle.load(f)

    # Batch-encode every prediction; dict iteration order is reused below,
    # so `embeddings[idx]` lines up with `content1[key]`.
    store = [content1[key] for key in content1]
    embeddings = infermodel.encode(store, bsize=128, tokenize=True)

    for idx, key in enumerate(content1):
        sent = content1[key]
        # Encode the sentence on its own and compare with its batch embedding.
        temp = infermodel.encode([sent], bsize=128, tokenize=True)[0]
        if math.fabs(1 - cosine(temp, embeddings[idx])) > EPS:
            print(key)
            print(sent)
            print(cosine(temp, embeddings[idx]))
embeddings1 = infersent_model.encode(sentences1, bsize=128, tokenize=True)
embeddings2 = infersent_model.encode(sentences2, bsize=128, tokenize=True)
embeddings3 = infersent_model.encode(sentences3, bsize=128, tokenize=True)
embeddings4 = infersent_model.encode(sentences4, bsize=128, tokenize=True)

test(opt=opt, infermodel=infersent_model, embed=embeddings0)
print(len(embeddings0))

# Pair each embedding set with its score accumulator; compare sets 1-4
# against set 0 and record which videos set 0 wins on.
embedding_sets = [embeddings0, embeddings1, embeddings2, embeddings3, embeddings4]
score_lists = [scores0, scores1, scores2, scores3, scores4]
compare_lists = [compare01, compare02, compare03, compare04]

for i in range(7010, len(embeddings0)):
    vid = 'vid' + str(i + 1)
    answers = total_embeddings[vid.encode()]
    # Best cosine similarity against any ground-truth answer, per embedding set.
    best = []
    for emb in embedding_sets:
        top = -1.0
        for item in answers:
            top = max(top, cosine(item, emb[i]))
        best.append(top)
    for bucket, value in zip(score_lists, best):
        bucket.append(value)
    # Record the video id whenever set 0 beats a rival set.
    for rival_score, winners in zip(best[1:], compare_lists):
        if best[0] > rival_score:
            winners.append(vid)
# Tail of the params_model dict literal (opened above this chunk).
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version
}
# Build the InferSent encoder, load pretrained weights, and move it to `device`.
infersent_model = InferSent(params_model)
infersent_model.load_state_dict(torch.load(MODEL_PATH))
infersent_model = infersent_model.to(device)
W2V_PATH = opt.w2v_path
assert W2V_PATH is not None, '--w2v_path is None!'
infersent_model.set_w2v_path(W2V_PATH)
# Restrict the vocabulary to the 100k most frequent words.
infersent_model.build_vocab_k_words(K=100000)
# Sample sentences used to sanity-check the encoder below.
store = [
    'a man is talking about a movie pictures of a movie pictures',
    'a person is folding paper',
    'a man is singing',
    'people are dancing and dancing',
    'a man and woman are talking about something',
    'a woman is applying makeup',
    'a person is cooking a dish and adding ingredients into a pot',
    'a man is talking',
    'a man is talking about the weather on the screen',
    'cartoon characters are interacting'
]
# Encode the whole list in one batch.
embeddings = infersent_model.encode(store, bsize=128, tokenize=True)
for i in range(len(store)):
    # Encode sentence i on its own.
    temp = infersent_model.encode([store[i]], bsize=128, tokenize=True)[0]
    # Compare the individually-encoded and batch-encoded embeddings of
    # sentence i via cosine similarity; print when they diverge beyond EPS.
    if math.fabs(1 - cosine(temp, embeddings[i])) > EPS:
        print(cosine(temp, embeddings[i]))
# Verify that the cached sentence embeddings match freshly-computed ones
# for every caption of every video across all three data splits.
data_loaders = [train_loader, valid_loader, test_loader]
embeddings_path = os.path.join(opt.data_path, 'sentence_embeddings.pkl')
sentence_embeddings = load_pkl(embeddings_path)
caption_set = get_caps(opt.data_path)
cnt = 0
for dataloader in data_loaders:
    for i, batch in enumerate(dataloader):
        (data, caps, caps_mask, cap_classes, class_masks, feats0, feats1,
         feat_mask, pos_feat, lens, gts, video_id) = batch
        for t in range(feats0.size(0)):
            vid_t = video_id[t]
            number = len(caption_set[vid_t])
            # Re-tokenize and re-encode every caption of this video.
            temp = [caption_set[vid_t][j][b'tokenized'].decode() for j in range(number)]
            temp_embeddings = model.encode(temp, bsize=128, tokenize=True)
            keeped_embeddings = sentence_embeddings[vid_t]
            for j in range(number):
                cosine_value = cosine(temp_embeddings[j], keeped_embeddings[j])
                # Report (and count) any embedding that drifted beyond EPS.
                if math.fabs(1.0 - cosine_value) >= EPS:
                    print(cosine_value)
                    cnt += 1
                    print('now cnt == ', cnt)
print('cnt == ', cnt)