def forward(self, iter_num, batch_sentence, batch_verb, vocab):
    sen_embed = self.sen_embed(batch_sentence)
    verb_embed = self.verb_embed(batch_verb)

    sen_embed = self.sen_embed_linear(sen_embed)
    # sen_embed = self.sen_embed_dropout(sen_embed)
    verb_embed = self.verb_embed_linear(verb_embed)
    # verb_embed = self.verb_embed_dropout(verb_embed)

    embed = torch.cat((sen_embed, verb_embed), dim=1)
    embed = self.cat_embed_linear(embed)
    embed = self.cat_embed_dropout(embed)
    ### embed.size() -> [batch_size, max_seq_len, common_size] ###

    src = embed.permute(1, 0, 2)
    ### src.size() -> [max_seq_len, batch_size, common_size] ###
    tgt = embed.permute(1, 0, 2)
    ### tgt.size() -> [max_seq_len, batch_size, common_size] ###
    output = self.transformer(src, tgt)
    ### output.size() -> [max_seq_len, batch_size, common_size] ###

    sen2vec = self.output_linear(output.permute(1, 2, 0)).permute(2, 0, 1)
    sen2vec = l2norm(sen2vec)
    ### sen2vec.size() -> [1, batch_size, common_size] ###
    sen2vec = self.sen2vec_linear(sen2vec)
    sen2vec = self.sen2vec_dropout(sen2vec)
    sen2vec = l2norm(sen2vec)
    sen2vec = sen2vec.squeeze(dim=0)
    # print('sen2vec:', sen2vec.size(), src.size(), tgt.size(),
    #       output.size(), output.permute(1, 0, 2).size())
    # if iter_num % 100 == 0:
    #     print('sen2vec:', sen2vec)
    return sen2vec, output.permute(1, 0, 2)

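# Note: every snippet in this file calls an l2norm helper that is defined
# elsewhere. For the PyTorch snippets, a minimal sketch, assuming row-wise
# normalization along the last dimension (the epsilon and the exact dim are
# assumptions, not confirmed by the source):
import torch

def l2norm_sketch(x, eps=1e-8):
    """L2-normalize a tensor along its last dimension."""
    return x / (x.norm(p=2, dim=-1, keepdim=True) + eps)

# Usage: rows of the result have (approximately) unit L2 norm, e.g.
# l2norm_sketch(torch.randn(4, 8)).norm(dim=-1) is close to all ones.
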
def forward(self, x_id, im, x):
    x_id_emb = self.embedding(x_id)
    im = self.linear(im)

    x_w2v = torch.zeros(*x_id_emb.size())
    x_cat = None
    if self.model_options['concat']:
        # Look up pretrained word2vec vectors for every word; words missing
        # from the vocabulary keep their zero initialization.
        for i, text in enumerate(x):
            for j, word in enumerate(text.split()):
                try:
                    x_w2v[j, i] = torch.from_numpy(
                        wvModel[word.decode('utf8')])
                except KeyError:
                    pass
        x_w2v = Variable(x_w2v.cuda())
        x_cat = torch.cat([x_id_emb, x_w2v])
    else:
        x_cat = x_id_emb

    if self.model_options['encoder'] == 'bow':
        x_cat = x_cat.sum(0).squeeze(0)
    else:
        _, (x_cat, _) = self.lstm(x_cat)
        x_cat = x_cat.squeeze(0)

    return l2norm(x_cat), l2norm(im)

def forward(self, iter_num, batch_sentence, batch_verb, vocab):
    words_num = batch_sentence.ne(vocab.padidx).sum(dim=1).to(device)
    verbs_num = batch_verb.ne(vocab.padidx).sum(dim=1).to(device)

    sen_embed = self.sen_embed(batch_sentence)
    verb_embed = self.verb_embed(batch_verb)

    # sen_embed, sen_embed_attn = self.sen_embed_attention(
    #     sen_embed, sen_embed)  # embed_attention #
    # verb_embed, verb_embed_attn = self.verb_embed_attention(
    #     verb_embed, sen_embed)  # embed_attention #

    # Assuming sen_lstm/verb_lstm are torch.nn.LSTM modules, the initial
    # states must be passed as a single (h0, c0) tuple, not as two
    # separate positional arguments as in the original snippet.
    sen_output, (sen_h_n, sen_c_n) = self.sen_lstm(
        sen_embed, (self.sen_h0.permute(1, 0, 2).contiguous(),
                    self.sen_c0.permute(1, 0, 2).contiguous()))
    verb_output, (verb_h_n, verb_c_n) = self.verb_lstm(
        verb_embed, (self.verb_h0.permute(1, 0, 2).contiguous(),
                     self.verb_c0.permute(1, 0, 2).contiguous()))
    ### sen_output.size() -> [batch_size, max_seq_len, hidden_size*2] ###
    ### sen_output_cat.size() -> [batch_size, max_seq_len+max_verb_len, hidden_size*2] ###
    # output_cat = torch.cat((sen_output, verb_output), dim=1)

    # self_attention = True
    self_attention = False
    if self_attention:
        output, output_attn = self.lstm_attention(
            sen_output, verb_output)  # lstm_attention #
        sen2vec = output.permute(0, 2, 1)
        sen2vec = self.len_linear(sen2vec).squeeze(dim=2)
    else:
        # Take the last non-padded hidden state of every sequence.
        sen2vec = torch.stack(
            [sen_output[i, j - 1, :] for i, j in enumerate(words_num)],
            dim=0)
        verb2vec = torch.stack(
            [verb_output[i, j - 1, :] for i, j in enumerate(verbs_num)],
            dim=0)
        sen2vec = torch.cat((sen2vec, verb2vec), dim=1)
        sen2vec = self.cat_linear(sen2vec)

    sen2vec = self.sen2vec_linear(sen2vec)
    # sen2vec, sen2vec_attn = self.sen2vec_attention(sen2vec, sen2vec)
    sen2vec = self.sen2vec_dropout(sen2vec)
    sen2vec = l2norm(sen2vec)

    sen_out = torch.cat((sen_output, verb_output), dim=1)
    sen_out = self.sen_out_linear(sen_out)
    sen_out = self.sen_out_dropout(sen_out)
    sen_out = l2norm(sen_out)

    sen_h = torch.cat((sen_h_n, verb_h_n), dim=2)
    sen_h = self.sen_h_linear(sen_h)
    sen_h = self.sen_h_dropout(sen_h)
    sen_h = l2norm(sen_h)

    # print('sen2vec:', sen2vec.size(), sen_out.size(), sen_h.size())
    # if iter_num % 100 == 0:
    #     print('sen2vec:', sen2vec)
    return sen2vec, sen_out, sen_h

def forward(self, x, im):
    x_emb = self.embedding(x)
    im = self.linear(im)
    _, (x_emb, _) = self.lstm(x_emb)
    x_emb = x_emb.squeeze(0)
    return l2norm(x_emb), l2norm(im)

def forward(self, en, cn):
    en_embed = self.embedding(en)
    cn = self.linear(cn)
    _, (en_embed, _) = self.lstm(en_embed)
    en_embed = en_embed.squeeze(0)
    return l2norm(en_embed), l2norm(cn)

def forward(self, x, im):
    x = self.embedding(x)
    im = self.linear(im)
    if self.model_options['encoder'] == 'bow':
        x = x.sum(0).squeeze(0)
    else:
        _, (x, _) = self.lstm(x)
        x = x.squeeze(0)
    return l2norm(x), l2norm(im)

def evaluate_i2t(image_mha, image_encoder, bert_model, text_encoder,
                 image_dataloader, text_dataloader, ks):
    with torch.no_grad():
        all_text_features = []
        text_index = 0
        res_dict = dict()
        for filenames, input_ids, attention_masks in text_dataloader:
            for filename in filenames:
                image_id = int(re.findall(r'\d{12}', filename)[0])
                if image_id not in res_dict:
                    res_dict[image_id] = []
                res_dict[image_id].append(text_index)
                text_index += 1

            # Get text features
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            text_features = bert_model(input_ids,
                                       attention_mask=attention_masks)
            text_features = l2norm(text_features)
            text_features = text_encoder(text_features)
            all_text_features.append(text_features)
        all_text_features = torch.cat(all_text_features, dim=0)

        recall = np.zeros(len(ks))
        max_k = max(ks)
        total_query = 0
        pbar = tqdm(enumerate(image_dataloader),
                    total=len(image_dataloader),
                    leave=False,
                    position=0,
                    file=sys.stdout)
        for i, (image_ids, features) in pbar:
            mha_features = []
            for feature in features:
                feature = l2norm(feature.to(device))
                feature = l2norm(image_mha(feature))
                feature = torch.mean(feature, dim=0, keepdim=True)
                mha_features.append(feature)
            mha_features = torch.cat(mha_features, dim=0)
            image_features = image_encoder(mha_features)

            all_indices = get_top_k_eval(image_features, all_text_features,
                                         max_k)
            for idx, indices in enumerate(all_indices):
                total_query += 1
                image_id = image_ids[idx].item()
                true_text_indices = torch.tensor(res_dict[image_id])
                # renamed from i to avoid shadowing the outer loop index
                for k_idx, k in enumerate(ks):
                    top_k_text = indices[:k].to('cpu')
                    relevant_text = np.intersect1d(top_k_text,
                                                   true_text_indices)
                    if relevant_text.shape[0] > 0:
                        recall[k_idx] += 1
        recall = recall / total_query
    return recall

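# get_top_k_eval is not defined in this file. A minimal sketch consistent
# with the call sites in the evaluation snippets (the name, signature, and
# dot-product similarity are inferred, not confirmed by the source):
def get_top_k_eval_sketch(query_features, gallery_features, k):
    """Return, per query row, the indices of the k most similar gallery rows."""
    sims = torch.matmul(query_features, gallery_features.t())  # [n_query, n_gallery]
    return sims.topk(k, dim=1).indices
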
def forward(self, image, verb_id):
    '''print('testing 123')
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print('original', x.size())
    x = x.repeat(1, 2)
    print('xxxxxx', x, x.view(-1, 3), x.size())'''
    conv = self.conv(image)

    # verb pred
    verb_rep = self.verb(conv)
    verb_embedding = self.verb_transform(self.verb_lookup(verb_id))

    return utils.l2norm(verb_rep), utils.l2norm(verb_embedding)

def forward(self, en, en_lengths, en_index, cn, cn_lengths, cn_index):
    """
    Input Variables:
        input_var: A variable of size (B, W), where B is the batch size
            and W is the longest sequence length in the batch.
        input_lengths: The lengths of each element in the batch.
        hidden: The hidden state variable of size
            (num_layers * num_directions, batch_size, hidden_size).
    Output:
        output: A variable with tensor size (W, B, N), where W is the
            maximum length of the batch, B is the batch size, and N is
            the hidden size.
        hidden: The hidden state variable with tensor size
            (num_layers * num_directions, B, N).
    """
    en = self.sorted_forward(en, en_lengths, en_index)
    cn = self.sorted_forward(cn, cn_lengths, cn_index)
    return l2norm(en), l2norm(cn)

def forward_loss(self, img_span_features, cap_span_features, img_lengths,
                 txt_lengths, img_span_bounds, txt_span_bounds,
                 img_span_margs, txt_span_margs):
    b = img_span_features.size(0)
    N_txt = txt_lengths.max(0)[0]
    mstep_txt = (txt_lengths * 2).int()
    # focus on only short spans
    nstep_txt = int(mstep_txt.float().mean().item())

    N_img = img_lengths.max(0)[0]
    mstep_img = (img_lengths * 2).int()
    # focus on only short spans
    nstep_img = int(mstep_img.float().mean().item())

    matching_loss_matrix = torch.zeros(b, nstep_img, nstep_txt,
                                       device=img_span_features.device)
    similarity_matrix = torch.zeros(b, b, nstep_img, nstep_txt,
                                    device=img_span_features.device)
    for j in range(nstep_img):
        for k in range(nstep_txt):
            cap_emb = cap_span_features[:, k]
            img_emb = img_span_features[:, j]

            cap_marg = txt_span_margs[:, k].softmax(-1).unsqueeze(-2)
            cap_emb = torch.matmul(cap_marg, cap_emb).squeeze(-2)

            img_marg = img_span_margs[:, j].softmax(-1).unsqueeze(-2)
            img_emb = torch.matmul(img_marg, img_emb).squeeze(-2)

            cap_emb = utils.l2norm(cap_emb)
            img_emb = utils.l2norm(img_emb)
            similarity_matrix[:, :, j, k] = self.similarity(img_emb, cap_emb)

    img_span_margs = img_span_margs.sum(-1).unsqueeze(2).unsqueeze(1)
    txt_span_margs = txt_span_margs.sum(-1).unsqueeze(1).unsqueeze(0)
    expected_similarity = (img_span_margs[:, :, :nstep_img, :] *
                           txt_span_margs[:, :, :, :nstep_txt] *
                           similarity_matrix)
    expected_similarity = expected_similarity.sum([-2, -1])
    expected_loss = self.contrastive(expected_similarity)
    return expected_loss

def evaluate_t2i(image_mha, image_encoder, bert_model, text_encoder,
                 image_dataloader, text_dataloader, ks):
    # Load image features
    with torch.no_grad():
        image_features = []
        image_ids = []
        for ids, features in image_dataloader:
            image_ids.append(torch.stack(ids))
            mha_features = []
            for feature in features:
                feature = l2norm(feature.to(device))
                feature = l2norm(image_mha(feature))
                feature = torch.mean(feature, dim=0, keepdim=True)
                mha_features.append(feature)
            mha_features = torch.cat(mha_features, dim=0)
            image_features.append(image_encoder(mha_features))
        image_features = torch.cat(image_features, dim=0)
        image_ids = torch.cat(image_ids, dim=0).to(device)

        # Evaluate
        max_k = max(ks)
        recall = np.zeros(len(ks))
        total_query = 0
        pbar = tqdm(enumerate(text_dataloader),
                    total=len(text_dataloader),
                    leave=False,
                    position=0,
                    file=sys.stdout)
        for i, (image_files, input_ids, attention_mask) in pbar:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            text_features = bert_model(input_ids,
                                       attention_mask=attention_mask)
            text_features = l2norm(text_features)
            text_features = text_encoder(text_features)

            image_files = torch.tensor(
                list(map(lambda x: int(re.findall(r'\d{12}', x)[0]),
                         image_files))).to(device)

            top_k = get_top_k_eval(text_features, image_features, max_k)
            for idx, indices in enumerate(top_k):
                total_query += 1
                true_image_id = image_files[idx]
                sorted_image_ids = torch.gather(image_ids, 0, indices)
                # renamed from i to avoid shadowing the outer loop index
                for k_idx, k in enumerate(ks):
                    top_k_images = sorted_image_ids[:k]
                    if (top_k_images == true_image_id).nonzero().numel() > 0:
                        recall[k_idx] += 1
        recall = recall / total_query
    return recall

def build_model(tparams, options):
    """
    Computation graph for the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    im = tensor.matrix('im', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    if options['encoder'] == 'bow':
        sents = (emb * mask[:, :, None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder', mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    # Encode images (source)
    images = get_layer('ff')[1](tparams, im, options,
                                prefix='ff_image', activ='linear')

    # Compute loss
    cost = contrastive_loss(options['margin'], images, sents)

    return trng, [x, mask, im], cost

def build_sentence_encoder(tparams, options):
    """
    Encoder only, for sentences
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('x_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])

    # Encode sentences
    if options['encoder'] == 'bow':
        sents = (emb * mask[:, :, None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder', mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    return trng, [x, mask], sents

def forward(self, features, img_lengths):
    b, N, _ = features.size()
    # b, N, _, _, _ = input.size()
    # input = input.reshape(-1, input.size(-3), input.size(-2), input.size(-1))
    # features = self.conv(input)
    # features = F.relu(features)
    # features = features.reshape(features.size(0), -1)
    features = self.fc1(features)
    dim = features.size(-1)
    assert N == img_lengths.max()

    feats = torch.zeros(b, int(N * (N - 1) / 2), self.NT, self.sem_dim,
                        device=features.device)
    beg_idx = 0
    for k in range(1, N):
        inc = torch.arange(N - k, device=features.device).view(
            N - k, 1)  # .expand(N - k, k + 1)
        idx = torch.arange(k + 1, device=features.device).view(
            1, k + 1).repeat(N - k, 1)
        idx = (idx + inc).view(-1)
        idx = idx.unsqueeze(0).unsqueeze(-1).expand(b, -1, dim)
        feat = torch.gather(features, 1, idx)
        feat = feat.view(b, N - k, k + 1, dim)
        feat = feat.unsqueeze(3).expand(b, N - k, k + 1, self.NT,
                                        self.sem_dim)
        feat = feat.view(b, N - k, k + 1, self.NT, self.sem_dim)
        feat = l2norm(feat.sum(2))
        end_idx = beg_idx + N - k
        feats[:, beg_idx:end_idx] = feat
        beg_idx = end_idx
    return feats

def encode_images(tparams, options, im):
    im_emb = get_layer('ff')[1](tparams, im, options,
                                prefix='ff_image', activ='linear')
    im_emb = l2norm(im_emb)
    if options['abs']:
        im_emb = abs(im_emb)
    return im_emb

def _read(self, feat_path, meta_path, proposal_folders):
    fn_node_pattern = '*_node.npz'
    fn_edge_pattern = '*_edge.npz'

    with Timer('read meta and feature'):
        self.lb2idxs, self.idx2lb = read_meta(meta_path)
        inst_num = len(self.idx2lb)
        if not self.featureless:
            features = read_probs(feat_path, inst_num, self.feature_dim)
            self.features = l2norm(features)
        else:
            self.feature_dim = 1
            self.features = np.ones(inst_num).reshape(-1, 1)

    with Timer('read proposal list'):
        self.lst = []
        for proposal_folder in proposal_folders:
            print('read proposals from folder: ', proposal_folder)
            fn_nodes = sorted(
                glob.glob(os.path.join(proposal_folder, fn_node_pattern)))
            fn_edges = sorted(
                glob.glob(os.path.join(proposal_folder, fn_edge_pattern)))
            assert len(fn_nodes) == len(fn_edges), \
                "node files({}) vs edge files({})".format(
                    len(fn_nodes), len(fn_edges))
            assert len(fn_nodes) > 0, \
                'number of files under {} is 0'.format(proposal_folder)
            for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                assert fn_node[:fn_node.rfind('_')] == \
                    fn_edge[:fn_edge.rfind('_')], \
                    "{} vs {}".format(fn_node, fn_edge)
                self.lst.append([fn_node, fn_edge])
        self.size = len(self.lst)

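# The clustering _read snippets apply l2norm to a NumPy feature matrix.
# A minimal NumPy sketch, assuming row-wise normalization (an assumption,
# not the repository's confirmed helper):
import numpy as np

def l2norm_np_sketch(x, eps=1e-8):
    """L2-normalize each row of a 2-D array."""
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)
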
def forward(self, input):
    output = self.linear(input)
    if self.l2_norm:
        output = l2norm(output)
    return output

def image_template_feature(img_feats, template, media, choose_templates,
                           choose_ids):
    # np.int is deprecated; the builtin int dtype is equivalent here
    template = np.array(template, int)
    media = np.array(media, int)
    unique_templates, indices = np.unique(choose_templates, return_index=True)
    unique_subjectids = choose_ids[indices]
    template_feats = []
    for uqt in tqdm(unique_templates):
        ind_t = np.where(template == uqt)
        face_norm_feats = img_feats[ind_t]
        face_medias = media[ind_t]
        unique_medias, unique_media_counts = np.unique(face_medias,
                                                       return_counts=True)
        # Average the features within each medium, then sum over media to
        # get one feature per template.
        media_norm_feats = []
        for u, ct in zip(unique_medias, unique_media_counts):
            ind_m = np.where(face_medias == u)
            if ct == 1:
                media_norm_feats.append(face_norm_feats[ind_m])
            else:
                media_norm_feats.append(
                    np.mean(face_norm_feats[ind_m], axis=0, keepdims=True))
        media_norm_feats = np.array(media_norm_feats)
        template_feats.append(np.sum(media_norm_feats, axis=0))
    template_feats = np.concatenate(template_feats, axis=0)
    template_norm_feats = l2norm(template_feats)
    return template_norm_feats, unique_templates, unique_subjectids

def build_image_encoder(tparams, options):
    """
    Encoder only, for images
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # image features
    im = tensor.matrix('im', dtype='float32')
    n_samples = im.shape[0]  # was undefined in the original snippet

    # Encode images
    images_mm = get_layer('ff')[1](tparams, im, options,
                                   prefix='ff_image_mm', activ='linear')
    if 'attention_type' not in options or options['attention_type'] == 'dot':
        images_mm = l2norm(images_mm)

    if options['use_dropout']:
        # use_noise and retain_probability_hidden are assumed to come from
        # the enclosing module scope.
        images_mm *= shared_dropout_layer(
            (n_samples, options['dim_multimodal']), use_noise, trng,
            retain_probability_hidden)

    return trng, [im], images_mm

def get_approx_min_longest_edge(simplex, L):
    '''n->1 l.b.m. approx.: max(f(A) - L*|AB|, f(B) - L*|AB|)'''
    A = simplex[0]
    B = simplex[1]
    AB_dist = l2norm(A[:-1], B[:-1])
    return max([A[-1]['obj'] - L * AB_dist, B[-1]['obj'] - L * AB_dist])

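# In the simplex snippets (get_approx_min_longest_edge, get_tolerance,
# get_approx_min_max_angle, sort_vertexes_longest_edge_first, find_mins_AB),
# l2norm is called with two arguments and acts as a Euclidean distance.
# A minimal sketch of that two-argument variant, inferred from the call
# sites (an assumption, not the confirmed helper):
import numpy as np

def l2norm_dist_sketch(p, q):
    """Euclidean distance between two points given as coordinate sequences."""
    return float(np.linalg.norm(np.asarray(p, dtype=float) -
                                np.asarray(q, dtype=float)))
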
def _read(self, feat_path, label_path, proposal_folders):
    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.labels = intdict2ndarray(self.idx2lb)
            self.inst_num = len(self.idx2lb)
            self.ignore_label = False
        else:
            self.lb2idxs, self.idx2lb = None, None
            self.labels = None
            self.inst_num = -1
            self.ignore_label = True
        if not self.featureless:
            features = read_probs(feat_path, self.inst_num,
                                  self.feature_dim)
            self.features = l2norm(features)
            if self.inst_num == -1:
                self.inst_num = features.shape[0]
        else:
            assert self.inst_num > 0
            self.feature_dim = 1
            self.features = np.ones(self.inst_num).reshape(-1, 1)

    with Timer('read proposal list'):
        self.lst = []
        self.tot_lst = []
        if callable(proposal_folders):
            proposal_folders = proposal_folders()
        for proposal_folder in proposal_folders:
            print('read proposals from folder: ', proposal_folder)
            fn_nodes = sorted(
                glob.glob(osp.join(proposal_folder, self.fn_node_pattern)))
            fn_edges = sorted(
                glob.glob(osp.join(proposal_folder, self.fn_edge_pattern)))
            assert len(fn_nodes) == len(fn_edges), \
                "node files({}) vs edge files({})".format(
                    len(fn_nodes), len(fn_edges))
            assert len(fn_nodes) > 0, \
                'number of files under {} is 0'.format(proposal_folder)
            for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                # sanity check
                assert fn_node[:fn_node.rfind('_')] == \
                    fn_edge[:fn_edge.rfind('_')], \
                    "{} vs {}".format(fn_node, fn_edge)
                if self._check_iop(fn_node):
                    self.lst.append([fn_node, fn_edge])
                self.tot_lst.append([fn_node, fn_edge])
        self.size = len(self.lst)
        self.tot_size = len(self.tot_lst)
        assert self.size <= self.tot_size
        if self.size < self.tot_size:
            print('select {} / {} = {:.2f} proposals '
                  'with iop between ({:.2f}, {:.2f})'.format(
                      self.size, self.tot_size,
                      1. * self.size / self.tot_size,
                      self.th_iop_min, self.th_iop_max))

def forward_sens(self, x):
    x = self.embedding(x)
    if self.model_options['encoder'] == 'bow':
        x = x.sum(0).squeeze(0)
    else:
        _, (x, _) = self.lstm(x)
        x = x.squeeze(0)
    return l2norm(x)

def evaluate(self, val_image_dataloader, val_text_dataloader, k):
    self.switch_to_eval()
    # Load image features
    with torch.no_grad():
        image_features = []
        image_ids = []
        for ids, features, image_attention_mask in val_image_dataloader:
            image_ids.append(torch.stack(ids))
            features = torch.stack(features).to(self.device)
            image_attention_mask = torch.stack(image_attention_mask).to(
                self.device)
            features = l2norm(features).detach()
            mha_features = l2norm(
                self.image_mha(features, image_attention_mask))
            image_features.append(self.image_encoder(mha_features))
            # image_features.append(mha_features)
        image_features = torch.cat(image_features, dim=0)
        image_ids = torch.cat(image_ids, dim=0).to(self.device)

        # Evaluate
        recall = 0
        total_query = 0
        pbar = tqdm(enumerate(val_text_dataloader),
                    total=len(val_text_dataloader),
                    leave=False,
                    position=0,
                    file=sys.stdout)
        for i, (image_files, input_ids, attention_mask) in pbar:
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            text_features = self.bert_model(input_ids,
                                            attention_mask=attention_mask)
            text_features = l2norm(text_features)
            text_features = self.text_encoder(text_features)

            # was .to(device); use self.device like the rest of the method
            image_files = torch.tensor(
                list(map(lambda x: int(re.findall(r'\d{12}', x)[0]),
                         image_files))).to(self.device)

            top_k = get_top_k_eval(text_features, image_features, k)
            for idx, indices in enumerate(top_k):
                total_query += 1
                true_image_id = image_files[idx]
                top_k_images = torch.gather(image_ids, 0, indices)
                if (top_k_images == true_image_id).nonzero().numel() > 0:
                    recall += 1
        recall = recall / total_query
    return recall

def matmul_loss_function(batch_size, matmul_sim):
    # ones - eye broadcasts to a matrix with zeros on the diagonal, so the
    # matched (diagonal) pairs do not contribute to the loss.
    loss_filter = torch.ones(batch_size, dtype=torch.float32) - \
        torch.eye(batch_size, dtype=torch.float32)
    loss_filter = loss_filter.to(device)
    matmul_loss = torch.mul(matmul_sim, loss_filter).to(device)
    matmul_loss = torch.abs(matmul_loss).to(device)
    matmul_loss = l2norm(matmul_loss)
    matmul_loss = torch.mean(matmul_loss).to(device)
    return matmul_loss

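# A hypothetical usage sketch for matmul_loss_function (the toy tensors and
# the module-level `device` placeholder are assumptions; the function also
# expects an l2norm helper in scope, as sketched near the top of this file):
device = torch.device('cpu')  # placeholder for the module-level device
sen2vec = torch.randn(4, 8)
vid2vec = torch.randn(4, 8)
matmul_sim = torch.matmul(sen2vec, vid2vec.t()).to(device)  # [4, 4]
loss = matmul_loss_function(4, matmul_sim)  # penalizes off-diagonal pairs only
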
def forward(self, input):
    features = self.feature_extractor(input)
    output = self.linear(features.squeeze())
    if self.l2_norm:
        output = l2norm(output)
    return output

def encode_topic_vector2(tparams, options, topics):
    t_emb = get_layer('ff')[1](tparams, topics, options,
                               prefix='ff_topic_vector2', activ='linear')
    t_emb = l2norm(t_emb)
    # t_emb = maxnorm2(t_emb)
    if options['abs']:
        # im_emb = abs(im_emb)
        t_emb = tensor.maximum(t_emb, 0)
    return t_emb

def encode_images(tparams, options, im):
    im_emb = get_layer('ff')[1](tparams, im, options,
                                prefix='ff_image', activ='linear')
    # if options['v_norm'] == 'l2':
    im_emb = l2norm(im_emb)
    # im_emb = maxnorm2(im_emb)
    if options['abs']:
        # im_emb = abs(im_emb)
        im_emb = tensor.maximum(im_emb, 0)
    return im_emb

def forward(self, images):
    """Extract image feature vectors."""
    # assuming that the precomputed features are already l2-normalized
    features = self.fc(images.float())

    # normalize in the joint embedding space
    if not self.no_imgnorm:
        features = l2norm(features)

    return features

def forward(self, image_feature, image_attention_mask, input_ids,
            attention_mask, epoch):
    if epoch > 1 and self.frozen:
        self.frozen = False
        del self.lr_scheduler_0
        torch.cuda.empty_cache()

    image_feature = l2norm(image_feature).detach()
    final_image_features = l2norm(
        self.image_mha(image_feature, image_attention_mask))

    text_feature = self.bert_model(input_ids, attention_mask=attention_mask)
    text_feature = l2norm(text_feature)
    if epoch == 1:
        text_feature = text_feature.detach()
        self.frozen = True

    image_to_common = self.image_encoder(final_image_features)
    # image_to_common = final_image_features
    text_to_common = self.text_encoder(text_feature)
    return image_to_common, text_to_common

def get_tolerance(simplex, L):
    # has_key() is Python 2 only; the `in` operator works everywhere
    if type(simplex[-1]) == dict and 'approx_min_ABC' in simplex[-1]:
        lbm = simplex[-1].get('approx_min_ABC')
    else:
        lbm = get_approx_lb_min(simplex, L)
    min_dist = None
    for v in simplex[:-1]:
        obj_dist = l2norm(v[-1]['obj'], lbm)
        if min_dist is None or obj_dist < min_dist:
            min_dist = obj_dist
    return min_dist

def forward(self, sen2vec, vid2vec):
    # print('sen2vec.size: {}, vid2vec.size: {}'.format(
    #     sen2vec.size(), vid2vec.size()))
    if self.mode == 'simple':
        matmul_sim = self._matmul_similarity(sen2vec, vid2vec)
        cos_sim = self._cos_similarity(sen2vec, vid2vec)
    elif self.mode == 'multi':
        if sen2vec.size() != vid2vec.size():
            sen2vec = sen2vec.repeat_interleave(vid2vec.size(0), dim=0)

        multi_vec = torch.cat((sen2vec, vid2vec), dim=1)
        multi_vec = self.multi_linear(multi_vec)
        multi_vec = self.multi_dropout(multi_vec)
        multi_vec = l2norm(multi_vec)

        multi_sen2vec = torch.cat((sen2vec, multi_vec), dim=1)
        multi_sen2vec = self.sen_linear(multi_sen2vec)
        multi_sen2vec = self.sen_dropout(multi_sen2vec)
        multi_sen2vec = l2norm(multi_sen2vec)

        multi_vid2vec = torch.cat((vid2vec, multi_vec), dim=1)
        multi_vid2vec = self.vid_linear(multi_vid2vec)
        multi_vid2vec = self.vid_dropout(multi_vid2vec)
        multi_vid2vec = l2norm(multi_vid2vec)

        matmul_sim = torch.stack(
            (self._matmul_similarity(sen2vec, vid2vec),
             self._matmul_similarity(multi_sen2vec, multi_vid2vec)),
            dim=0)
        matmul_sim = torch.mean(matmul_sim, dim=0)

        cos_sim = torch.stack(
            (self._cos_similarity(sen2vec, vid2vec),
             self._cos_similarity(multi_sen2vec, multi_vid2vec)),
            dim=0)
        cos_sim = torch.mean(cos_sim, dim=0)
    else:
        matmul_sim = self._matmul_similarity(sen2vec, vid2vec)
        cos_sim = self._cos_similarity(sen2vec, vid2vec)
    # print('matmul, cos', matmul_sim.size(), cos_sim.size())
    return matmul_sim, cos_sim

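# _matmul_similarity and _cos_similarity are not shown in this file.
# Minimal sketches consistent with the call sites above (the names and exact
# semantics are assumptions): a full pairwise similarity matrix and an
# aligned-pair cosine similarity.
import torch.nn.functional as F

def _matmul_similarity_sketch(a, b):
    """Pairwise dot-product similarities between row vectors: [n, m]."""
    return torch.matmul(a, b.t())

def _cos_similarity_sketch(a, b):
    """Cosine similarity of aligned rows: [n]."""
    return F.cosine_similarity(a, b, dim=1)
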
def get_approx_min_max_angle(simplex, L):
    '''nD->nD Approximates an nD simplex's lower bound minimum by extending
    the longest edge by 1/cos(max angle).'''
    def get_angle(A, B, C):
        '''Finds the angle in radians between the AB and BC vectors.'''
        vec1 = a(A) - a(B)
        vec2 = a(C) - a(B)
        # radians to degrees: * 180/np.pi
        return np.arccos(np.dot(vec1, vec2) / (enorm(vec1) * enorm(vec2)))

    # Choose longest edge vertexes
    A = simplex[0]
    B = simplex[1]

    # Find maximum angles for each vertex
    A_angles = []
    B_angles = []
    for V in nm(simplex):
        if V != A and V != B:
            A_angles.append(get_angle(B[:-1], A[:-1], V[:-1]))
            B_angles.append(get_angle(A[:-1], B[:-1], V[:-1]))
    max_A_angle = max(A_angles)
    max_B_angle = max(B_angles)

    v1 = simplex[0]
    v2 = simplex[1]
    # was simplex[-1].has_key['mins_AB'], which raises a TypeError even in
    # Python 2; the `in` operator is the intended membership test
    if type(simplex[-1]) == dict and 'mins_AB' in simplex[-1]:
        mins_AB = simplex[-1]['mins_AB']
    else:
        mins_AB = find_mins_AB(simplex, L)
    return min([
        v1[-1]['obj'][0] -
        L * l2norm(nm(v1), mins_AB[0][:-1]) / np.cos(max_A_angle),
        v2[-1]['obj'][0] -
        L * l2norm(nm(v2), mins_AB[0][:-1]) / np.cos(max_B_angle),
    ])

def _read(self, feat_path, label_path, proposal_folders):
    fn_node_pattern = '*_node.npz'
    fn_edge_pattern = '*_edge.npz'

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.labels = intdict2ndarray(self.idx2lb)
            self.inst_num = len(self.idx2lb)
            self.ignore_label = False
        else:
            self.lb2idxs, self.idx2lb = None, None
            self.labels = None
            self.inst_num = -1
            self.ignore_label = True
        if not self.featureless:
            features = read_probs(feat_path, self.inst_num,
                                  self.feature_dim)
            self.features = l2norm(features)
            if self.inst_num == -1:
                self.inst_num = features.shape[0]
        else:
            assert self.inst_num > 0
            self.feature_dim = 1
            self.features = np.ones(self.inst_num).reshape(-1, 1)

    with Timer('read proposal list'):
        self.lst = []
        if callable(proposal_folders):
            proposal_folders = proposal_folders()
        for proposal_folder in proposal_folders:
            print('read proposals from folder: ', proposal_folder)
            fn_nodes = sorted(
                glob.glob(os.path.join(proposal_folder, fn_node_pattern)))
            fn_edges = sorted(
                glob.glob(os.path.join(proposal_folder, fn_edge_pattern)))
            assert len(fn_nodes) == len(fn_edges), \
                "node files({}) vs edge files({})".format(
                    len(fn_nodes), len(fn_edges))
            assert len(fn_nodes) > 0, \
                'number of files under {} is 0'.format(proposal_folder)
            for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                assert fn_node[:fn_node.rfind('_')] == \
                    fn_edge[:fn_edge.rfind('_')], \
                    "{} vs {}".format(fn_node, fn_edge)
                self.lst.append([fn_node, fn_edge])
        self.size = len(self.lst)

def encode_sentences(tparams, options, x, mask):
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                            prefix='encoder', mask=mask)
    s = l2norm(proj[0][-1])
    if options['abs']:
        s = abs(s)
    return s

def build_image_encoder(tparams, options):
    """
    Encoder only, for images
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # image features
    im = tensor.matrix('im', dtype='float32')

    # Encode images
    images = get_layer('ff')[1](tparams, im, options,
                                prefix='ff_image', activ='linear')
    images = l2norm(images)

    return trng, [im], images

def sort_vertexes_longest_edge_first(simplex):
    '''nD->nD Moves longest edge vertexes to the simplex vertex list
    beginning.'''
    # Find simplex edges lengths
    edge_lengths = []  # [(vertex_index, vertex_index, edge_length), ]
    for i, j in permutations(range(len(simplex[:-1]) + 1), 2):
        if j > i:
            edge_lengths.append(
                (i, j, l2norm(simplex[i][:-1], simplex[j][:-1])))

    # Get longest edge vertexes ids
    le_i, le_j, le_length = max(edge_lengths, key=lambda x: x[-1])

    # Move longest edge vertexes to simplex vertex list beginning
    vi = simplex[le_i]
    vj = simplex[le_j]
    simplex.remove(vi)
    simplex.remove(vj)
    simplex.insert(0, vj)
    simplex.insert(0, vi)
    return simplex

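# A hypothetical usage sketch for sort_vertexes_longest_edge_first, assuming
# the vertex layout used throughout the simplex snippets: coordinates
# followed by a trailing dict of cached values (the toy numbers below are
# illustrative only).
simplex = [
    [0.0, 0.0, {'obj': 1.0}],
    [3.0, 0.0, {'obj': 2.0}],
    [0.0, 1.0, {'obj': 0.5}],
]
simplex = sort_vertexes_longest_edge_first(simplex)
# The two vertexes spanning the longest edge (length 3.0) now come first.
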
def build_model(tparams, options):
    """
    Computation graph for the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    im = tensor.matrix('im', dtype='float32')
    con = tensor.matrix('con', dtype='int64')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                            prefix='encoder', mask=mask)
    sents = proj[0][-1]
    sents = l2norm(sents)

    # Encode images (source)
    images = get_layer('ff')[1](tparams, im, options,
                                prefix='ff_image', activ='linear')

    # Compute loss by scanning _step (defined in the enclosing module)
    # over the contrastive examples in con
    cost, updates = theano.scan(_step,
                                sequences=con,
                                outputs_info=tensor.alloc(0.),
                                non_sequences=[sents, images,
                                               options['margin']],
                                n_steps=con.shape[0],
                                profile=False,
                                strict=True)
    cost = cost[-1]

    return trng, [x, mask, im, con], cost

def find_mins_AB(simplex, L):
    '''nD->nD Finds the intersection of AB' and B'A, where A, B are the
    longest edge's vertexes.
    t - triangle (simplex). y - objective values for each vertex.
    Returns the lower Lipschitz bound minimum for the first edge (made
    from the first and second vertexes).
    '''
    dist = l2norm(nm(simplex[0]), nm(simplex[1]))
    x1 = a((0, simplex[0][-1]['obj'][0]))
    x2 = a((dist, simplex[0][-1]['obj'][0] - L * dist))
    x3 = a((dist, simplex[1][-1]['obj'][0]))
    x4 = a((0, simplex[1][-1]['obj'][0] - L * dist))

    # 2D line intersection based on
    # http://mathworld.wolfram.com/Line-LineIntersection.html
    av = x2 - x1
    bv = x4 - x3
    cv = x3 - x1
    s = x1 + av * (np.cross(cv, bv) * np.cross(av, bv) /
                   (enorm(np.cross(av, bv)) ** 2))

    X = a(simplex[0][:-1]) + s[0] / float(dist) * (a(simplex[1][:-1]) -
                                                   a(simplex[0][:-1]))
    return [list(X) + [s[1]]]

def trainencoder(sources=("image_vects", "word_vects"),
                 sources_k=("image_vects_k", "word_vects_k"),
                 batch_size=128,
                 embedding_dim=300,
                 n_captions=5,
                 n_sbu=None,
                 separate_emb=False,
                 test_size=1000,  # per dataset
                 mode='dev'):
    if mode == "coco120k+flickr38k":
        XYsplit_cum = ([], [], [], [])
        xyloaders = [
            "cocoXYFilenames(dataType='train2014')",
            "cocoXYFilenames(dataType='val2014')",
            "flickrXYFilenames(dataType='8k')",
            "flickrXYFilenames(dataType='30k')",
        ]
        ntrains = [80000, 40000, 8000, 30000]
        for xyloader, ntrain in zip(xyloaders, ntrains):
            X, Y, _ = eval(xyloader)
            XYsplit = train_test_split(X, Y, train_size=ntrain)
            for i in range(len(XYsplit)):
                XYsplit_cum[i].extend(XYsplit[i])
        trX, teX, trY, teY = XYsplit_cum
    else:
        trX, teX, trY, teY = coco(mode=mode, n_captions=n_captions,
                                  test_size=test_size)

    if n_sbu:
        sbutrX, sbuteX, sbutrY, sbuteY = sbu(mode=mode, test_size=test_size)
        pairs = (
            (trX, sbutrX),
            (teX, sbuteX),
            (trY, sbutrY),
            (teY, sbuteY),
        )
        for coco_data, sbu_data in pairs:
            if isinstance(coco_data, list):
                coco_data.extend(sbu_data)

    print("n_train: %d" % len(trX))
    print("n_test: %d" % len(teX))

    # # # # # # # # # #
    # Model Building  #
    # # # # # # # # # #
    s = Encoder(
        image_feature_dim=4096,
        embedding_dim=embedding_dim,
        biases_init=Constant(0.),
        weights_init=Uniform(width=0.08),
    )
    s.initialize()

    image_vects = tensor.matrix(sources[0])  # named to match the source name
    word_vects = tensor.tensor3(sources[1])  # named to match the source name
    image_vects_k = tensor.matrix(sources_k[0])  # named to match the contrastive source name
    word_vects_k = tensor.tensor3(sources_k[1])  # named to match the contrastive source name

    # image_vects.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects.tag.test_value = np.zeros((2, 15, 50), dtype='float32')
    # image_vects_k.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects_k.tag.test_value = np.zeros((2, 15, 50), dtype='float32')

    # learned image embedding, learned sentence embedding
    lim, ls = s.apply(image_vects, word_vects)

    # learned contrastive im embedding, learned contrastive s embedding
    lcim, lcs = s.apply(image_vects_k, word_vects_k)

    # identical cost code thanks to Ryan Kiros
    # https://github.com/youralien/skip-thoughts/blob/master/eval_rank.py
    lim = l2norm(lim)
    lcim = l2norm(lcim)
    ls = l2norm(ls)
    lcs = l2norm(lcs)

    margin = 0.2  # alpha term should not be more than 1

    cost_im = margin - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1)
    cost_im = cost_im * (cost_im > 0.)  # this is like max(0, pairwise-ranking-loss)
    cost_im = cost_im.sum(0)

    cost_s = margin - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1)
    cost_s = cost_s * (cost_s > 0.)  # this is like max(0, pairwise-ranking-loss)
    cost_s = cost_s.sum(0)

    cost = cost_im + cost_s
    cost.name = "pairwise_ranking_loss"

    # function(s) to produce embedding
    if separate_emb:
        img_encoder = theano.function([image_vects], lim)
        txt_encoder = theano.function([word_vects], ls)
    f_emb = theano.function([image_vects, word_vects], [lim, ls])

    if n_sbu:
        sbuname = "sbu%d+" % n_sbu
    else:
        sbuname = ''
    name = "%sproject1.%s.jointembedder" % (sbuname, mode)
    savename = MODEL_FILES_DIR + name

    def save_function(self):
        if separate_emb:
            ModelIO.save(img_encoder, savename + "_Img")
            ModelIO.save(txt_encoder, savename + "_Txt")
        ModelIO.save(f_emb, savename)
        print("Similarity Embedding function(s) saved while training")

    def rank_function(stream):
        images, captions, _0, _1 = next(stream.get_epoch_iterator())
        image_embs, caption_embs = f_emb(images, captions)
        ModelEval.ImageSentenceRanking(image_embs, caption_embs)

    def rank_coco(self=None):
        # Get 1000 images / captions to test rank
        stream = DataETL.getFinalStream(teX, teY, sources=sources,
                                        sources_k=sources_k,
                                        batch_size=test_size,
                                        shuffle=True)
        print("COCO test")
        rank_function(stream)

    def rank_sbu(self=None):
        stream = DataETL.getFinalStream(sbuteX, sbuteY, sources=sources,
                                        sources_k=sources_k,
                                        batch_size=test_size,
                                        shuffle=True)
        print("SBU test")
        rank_function(stream)

    def rank_em(self=None):
        rank_coco()
        if n_sbu:
            rank_sbu()

    cg = ComputationGraph(cost)

    # # # # # # # # # #
    # Model Training  #
    # # # # # # # # # #
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=Adam(learning_rate=0.0002),
    )
    main_loop = MainLoop(
        model=Model(cost),
        data_stream=DataETL.getFinalStream(trX, trY, sources=sources,
                                           sources_k=sources_k,
                                           batch_size=batch_size),
        algorithm=algorithm,
        extensions=[
            DataStreamMonitoring(
                [cost],
                DataETL.getFinalStream(trX, trY, sources=sources,
                                       sources_k=sources_k,
                                       batch_size=batch_size,
                                       shuffle=True),
                prefix='train'),
            DataStreamMonitoring(
                [cost],
                DataETL.getFinalStream(teX, teY, sources=sources,
                                       sources_k=sources_k,
                                       batch_size=batch_size,
                                       shuffle=True),
                prefix='test'),
            UserFunc(save_function, after_epoch=True),
            UserFunc(rank_em, after_epoch=True),
            Printing(),
            LogToFile('logs/%s.csv' % name),
        ],
    )
    main_loop.run()