def forward(self, v, q):
    w_emb = self.w_emb(q)
    q_emb = self.q_emb(w_emb)
    v_emb = self.v_fc(v)
    # Relation embedding conditioned on the question.
    r_emb = self.r_emb(v_emb, q_emb)
    # Question-guided attention over the video features.
    w_att = self.v_att(v_emb, q_emb)
    vatt = torch.squeeze(torch.sum(v_emb * w_att, 1, keepdim=True))
    # Question-guided attention over the relation features.
    wr_att = self.r_att(r_emb, q_emb)
    ratt = torch.squeeze(torch.sum(r_emb * wr_att, 1, keepdim=True))
    # Fuse attended video, question, and attended relation embeddings.
    joint_proj = vatt + q_emb + ratt
    logits = self.classifier(joint_proj)
    return logits
def __init__(self):
    self.opt = config.parse_opt()
    self.CSV_TYPE = {
        'FrameQA': '_frameqa_question.csv',
        'Count': '_count_question.csv',
        'Trans': '_transition_question.csv',
        'Action': '_action_question.csv',
    }
def load_dataset(mode, num=None):
    if mode == 'Train':
        textset, _ = _read_from_csv()
    else:
        _, textset = _read_from_csv()
    entries = []
    count = 0
    idx = list(textset.index)
    if num is None:
        num = len(idx) + 10  # large enough to keep every row
    for _, row in textset.iterrows():
        if count > num:
            break
        entry = {
            'question': row['question'],
            'answer': row['answer'],
            'gif_name': row['gif_name'],
            'index': idx[count],
            'vid_id': str(row['key']),
        }
        entries.append(entry)
        count += 1
    return entries
def __init__(self):
    super(MFB, self).__init__()
    self.opt = config.parse_opt()
    self.dropout = nn.Dropout(self.opt.MFH_DROPOUT)
    # First-order projections for the video and question features.
    self.v_proj = nn.Linear(self.opt.NUM_GLIMPSE * self.opt.NUM_HIDDEN,
                            self.opt.MFB_DIM * self.opt.POOLING_SIZE)
    self.q_proj = nn.Linear(self.opt.NUM_HIDDEN,
                            self.opt.MFB_DIM * self.opt.POOLING_SIZE)
    # Second-order projections for the MFH cascade.
    self.v_proj1 = nn.Linear(self.opt.NUM_GLIMPSE * self.opt.NUM_HIDDEN,
                             self.opt.MFB_DIM * self.opt.POOLING_SIZE)
    self.q_proj1 = nn.Linear(self.opt.NUM_HIDDEN,
                             self.opt.MFB_DIM * self.opt.POOLING_SIZE)
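# A minimal forward sketch (an assumption, not code from this repo) of how
# these layers are typically wired in two-order MFH pooling: the first-order
# elementwise product is reused as a prior for the second order, and each
# order is sum-pooled over POOLING_SIZE, signed-sqrt'd, L2-normalized, and
# concatenated. Assumes `import torch.nn.functional as F`.
def forward(self, v, q):
    def pool(x):
        x = x.view(-1, self.opt.MFB_DIM, self.opt.POOLING_SIZE).sum(2)
        x = torch.sqrt(F.relu(x)) - torch.sqrt(F.relu(-x))  # signed square root
        return F.normalize(x)
    exp1 = self.dropout(self.v_proj(v) * self.q_proj(q))
    exp2 = self.dropout(self.v_proj1(v) * self.q_proj1(q) * exp1)
    return torch.cat([pool(exp1), pool(exp2)], dim=1)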
def __init__(self, hidden, mid, dropout):
    super(Attention, self).__init__()
    self.opt = config.parse_opt()
    self.v_proj = nn.Linear(hidden, mid)
    self.q_proj = nn.Linear(hidden, mid)
    self.att = nn.Linear(mid, 1)
    self.softmax = nn.Softmax(dim=1)  # normalize over the temporal axis
    self.dropout = nn.Dropout(dropout)
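# A minimal forward sketch for this module (an assumption, matching the
# standard additive attention these layers suggest): project both inputs
# into the shared `mid` space, add, score every time step, and softmax over
# time. Assumes v is (batch, T, hidden), q is (batch, hidden), and
# `import torch.nn.functional as F`.
def forward(self, v, q):
    v_proj = self.v_proj(v)                      # (batch, T, mid)
    q_proj = torch.unsqueeze(self.q_proj(q), 1)  # (batch, 1, mid)
    joint = self.dropout(F.relu(v_proj + q_proj))
    scores = self.att(joint).squeeze(2)          # (batch, T)
    return torch.unsqueeze(self.softmax(scores), 2)  # (batch, T, 1) weights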
def train(model, train_loader, test_loader, opt):
    optim = optimizer.get_std_opt(model, opt)
    num_epochs = opt.EPOCHS
    logger = utils.Logger(
        os.path.join('./save_models', 'log' + str(opt.SAVE_NUM) + '.txt'))
    dict_file = load_pkl(opt.ANET_LABEL2ANS)
    log_hyperpara(logger, opt)
    best_eval_score = 0
    train_file = load_pkl(opt.ANET_TRAIN_DICT)
    for epoch in range(num_epochs):
        total_loss = 0
        train_score = 0.0
        t = time.time()
        t_time = 0.0
        for i, (conct, q, l, ques_id) in enumerate(train_loader):
            starttime = datetime.datetime.now()
            v = conct.float().cuda()
            q = q.cuda()
            l = l.float().cuda()
            pred = model(v, q)
            loss = instance_bce_with_logits(pred, l)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            optim.step()
            optim.zero_grad()
            batch_score = compute_score(pred, ques_id, train_file, dict_file)
            total_loss += loss.item() * v.size(0)
            train_score += batch_score
            print('Epoch:', epoch, 'batch:', i + 1,
                  'batch_score:', batch_score, 'loss:', loss.item())
            endtime = datetime.datetime.now()
            t_time += (endtime - starttime).total_seconds()
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('time cost: %.2f' % t_time)
        total_loss /= len(train_loader.dataset)
        train_score = 100 * train_score / len(train_loader.dataset)
        model.train(False)
        evaluate_score, test_loss, w0, w9 = evaluate(model, test_loader, opt, epoch)
        w0 *= 100
        w9 *= 100
        print('Epoch:', epoch, 'evaluation w0:', w0, 'w9:', w9)
        logger.write('\twup0: %.2f, wup9: %.2f' % (w0, w9))
        print('Epoch:', epoch, 'evaluation score:', 100 * evaluate_score,
              'loss:', test_loss)
        logger.write('\ttrain_loss: %.2f, accuracy: %.2f' % (total_loss, train_score))
        logger.write('\teval accuracy: %.2f' % (100 * evaluate_score))
        logger.write('\teval loss: %.2f' % test_loss)
        if evaluate_score > best_eval_score:
            best_eval_score = evaluate_score
        model.train(True)
    logger.write('best accuracy %.2f' % (100 * best_eval_score))
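# `instance_bce_with_logits` is not shown in these snippets; a common
# definition in VQA codebases (an assumption here) is mean binary
# cross-entropy rescaled so the loss sums over answer classes per instance.
# Assumes `import torch.nn.functional as F`.
def instance_bce_with_logits(logits, labels):
    assert logits.dim() == 2
    loss = F.binary_cross_entropy_with_logits(logits, labels)
    loss *= labels.size(1)  # rescale the per-element mean by the class count
    return loss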
def _read_from_csv():
    opt = config.parse_opt()
    train_path = os.path.join(opt.TEXT_DIR, 'Train' + CSV_TYPE[opt.QUESTION_TYPE])
    test_path = os.path.join(opt.TEXT_DIR, 'Test' + CSV_TYPE[opt.QUESTION_TYPE])
    text_train = pd.read_csv(train_path, sep='\t')
    text_test = pd.read_csv(test_path, sep='\t')
    text_train = text_train.set_index('vid_id')
    text_test = text_test.set_index('vid_id')
    return text_train, text_test
def build_baseline(dataset):
    opt = config.parse_opt()
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                              opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, 1, opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_att, classifier, v_emb)
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                              opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE,
                   opt.NUM_HIDDEN, opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                            opt.BIDIRECT, opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM,
                                  dataset.num_ans, opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc,
                     rela_emb, r_att, classifier, opt)
def main():
    opt = config.parse_opt()
    folder = os.path.join(config.OUTPUT_DIR, opt.ID + '_pred')
    log_file = os.path.join(config.LOG_DIR, opt.ID)
    logger = get_logger(log_file)
    check_mkdir(folder)
    pred(opt, folder, logger)
    visualize_pred(opt, folder, 'val', logger)
def __init__(self, hidden, mid, dropout):
    super(Gate_combine, self).__init__()
    self.opt = config.parse_opt()
    # One projection plus scoring head per input stream (a, f, q),
    # combined through a sigmoid gate.
    self.a_proj = nn.Linear(hidden, mid)
    self.a_att = nn.Linear(mid, 1)
    self.f_proj = nn.Linear(hidden, mid)
    self.f_att = nn.Linear(mid, 1)
    self.q_proj = nn.Linear(hidden, mid)
    self.q_att = nn.Linear(mid, 1)
    self.sig = nn.Sigmoid()
    self.dropout = nn.Dropout(dropout)
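# A plausible forward for this module (an assumption, not taken from the
# repo): compute a scalar sigmoid gate for each input from its own
# projection-plus-scoring head, then mix the three streams.
def forward(self, a, f, q):
    g_a = self.sig(self.a_att(self.dropout(self.a_proj(a))))  # (batch, 1)
    g_f = self.sig(self.f_att(self.dropout(self.f_proj(f))))
    g_q = self.sig(self.q_att(self.dropout(self.q_proj(q))))
    return g_a * a + g_f * f + g_q * q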
def forward(self, v, q):
    w_emb = self.w_emb(q)
    v_embedding = self.v_emb(v)
    q_emb = self.q_emb(w_emb)
    # Question-guided additive attention over the video time steps
    # (att(relu(v_proj(v) + q_proj(q))), softmaxed over time).
    w_att = self.v_att(v_embedding, q_emb)
    v_att = v_embedding * w_att
    vatt = torch.squeeze(torch.sum(v_att, 1, keepdim=True))
    joint_proj = vatt + q_emb
    logits = torch.squeeze(self.classifier(joint_proj))
    return logits
def __init__(self, question_type, dictionary, mode):
    super(FeatureDataset, self).__init__()
    self.opt = config.parse_opt()
    utils.assert_in_type(question_type)
    if question_type == 'FrameQA':
        # FrameQA is open-ended, so it needs the answer vocabulary.
        self.ans2label = pkl.load(open('./data/ans2label.pkl', 'rb'))
        self.label2ans = pkl.load(open('./data/label2ans.pkl', 'rb'))
        self.num_ans = len(self.ans2label)
    self.dictionary = dictionary
    entry_path = './data/entries_' + str(mode) + '.pkl'
    print('Load Dataset')
    self.entries = load_dataset(mode)
    print("Dataset's length is %d" % len(self.entries))
    self.tokenize()
    self.read_from_h5py()
    self.tensorize()
def load_dataset(mode, num=None):
    if mode == 'Train':
        textset, _ = _read_from_csv()
    else:
        _, textset = _read_from_csv()
    entries = []
    opt = config.parse_opt()
    hdf5_result = json.load(open(opt.HDF5_JSON, 'r'))['results']
    count = 0
    idx = list(textset.index)
    if num is None:
        num = len(idx) + 10  # large enough to keep every row
    for _, row in textset.iterrows():
        if count > num:
            break
        # Each multiple-choice candidate pairs the question with one of
        # the five answer options.
        a1 = row['question'] + row['a1']
        a2 = row['question'] + row['a2']
        a3 = row['question'] + row['a3']
        a4 = row['question'] + row['a4']
        a5 = row['question'] + row['a5']
        gif_name = row['gif_name']
        proposal_info = hdf5_result[gif_name[2:]]
        entry = {
            'a1': a1, 'a2': a2, 'a3': a3, 'a4': a4, 'a5': a5,
            'question': row['question'],
            'answer': row['answer'],
            'gif_name': gif_name,
            'index': idx[count],
            'vid_id': str(row['key']),
            'proposal_info': proposal_info,
        }
        entries.append(entry)
        count += 1
    return entries
def forward(self, v, q, a1, a2, a3, a4, a5):
    w_emb = self.w_emb(q)
    emb1 = self.w_emb(a1)
    emb2 = self.w_emb(a2)
    emb3 = self.w_emb(a3)
    emb4 = self.w_emb(a4)
    emb5 = self.w_emb(a5)
    q_emb = self.q_emb(w_emb)
    v_emb = self.v_fc(v)
    r_emb = self.r_emb(v_emb, q_emb)
    # Question-guided attention over video and relation features.
    w_att = self.v_att(v_emb, q_emb)
    vatt = torch.squeeze(torch.sum(v_emb * w_att, 1, keepdim=True))
    wr_att = self.r_att(r_emb, q_emb)
    ratt = torch.squeeze(torch.sum(r_emb * wr_att, 1, keepdim=True))
    joint_proj = vatt + q_emb + ratt
    # Score each answer candidate against the fused representation;
    # each candidate score has shape (batch, 1).
    cands = [self.score(self.a_emb(emb, joint_proj))
             for emb in (emb1, emb2, emb3, emb4, emb5)]
    score = torch.cat(cands, dim=1)  # (batch, 5), stays on the input device
    return score
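# Usage sketch: at inference the five candidate scores pick the answer by
# argmax, e.g.
#   score = model(v, q, a1, a2, a3, a4, a5)  # (batch, 5)
#   pred = score.argmax(dim=1)               # index of the chosen option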
def label_img_with_ques_etm():
    opt = config.parse_opt()
    q_i_a_path = os.path.join(root_path, 'data/train/All_QA_Pairs_train.txt')
    # Group questions by image id.
    img_ques_dict = {}
    with open(q_i_a_path, 'r') as f:
        for row in f:
            q_i_a = row.strip().split('|')
            img, ques = q_i_a[0], q_i_a[1]
            if img in img_ques_dict:
                img_ques_dict[img].append(ques)
            else:
                img_ques_dict[img] = [ques]
    # Label each image with the ETM topic its questions score highest on.
    img_topic_dict = {}
    for img, qs in img_ques_dict.items():
        img_topic_vector = np.zeros(opt.ETM_TOP_NUM)
        for q in qs:
            words = VQADataProvider.text_to_list(q)
            q_t_v = etm_topic_distrib(words)
            img_topic_vector = np.add(img_topic_vector, q_t_v)
        img_topic_dict[img] = np.argmax(img_topic_vector).item()
    return img_topic_dict
for w in sent_words[:sen_len]:
    # Map each character to its id (1 = unknown), then pad or truncate the
    # word to max_word_len characters.
    tokens = [self.char2id.get(token, 1) for token in list(w)]
    word_len = min(len(tokens), self.opt.max_word_len)
    for _ in range(self.opt.max_word_len - word_len):
        tokens.append(0)
    sent_chars.append(tokens[:self.opt.max_word_len])
# Pad the sentence-level sequences up to max_len.
for _ in range(sen_len, self.opt.max_len):
    sent_ids.append(0)
    sent_pos_ids.append(0)
    sent_chars.append([0] * self.opt.max_word_len)
return sent_words[:sen_len], sent_ids, sent_pos_ids, sent_chars, sen_len


opt = config.parse_opt()
Prepare = DataPrepare(opt)
Prepare.prepare()
img = images[ann['image_id']]
x1, y1, w, h = ann['box']
image = Image.open(os.path.join(image_root, img['file_name'])).convert('RGB')
# Resize so the longer side is 224 while keeping the box's aspect ratio.
if h <= w:
    nh, nw = int(224 / w * h), 224
else:
    nh, nw = 224, int(224 / h * w)
image = image.crop((x1, y1, x1 + w, y1 + h)).resize((nw, nh), Image.ANTIALIAS)
# Convert RGB -> BGR and subtract the ImageNet channel means (Caffe-style).
image = np.array(image).astype(np.float32)[:, :, ::-1]
image -= np.array([103.939, 116.779, 123.68], dtype=np.float32)
image = image.transpose((2, 0, 1))
# Zero-pad to a square 224x224 input, centering the resized crop.
pad_image = np.zeros((3, 224, 224), dtype=np.float32)
if nh <= nw:
    pad_image[:, (224 - nh) // 2:(224 - nh) // 2 + nh, :] = image
else:
    pad_image[:, :, (224 - nw) // 2:(224 - nw) // 2 + nw] = image
batch.append(pad_image)

batch = Variable(xp.array(batch, dtype=xp.float32))
feature = res(batch, layers=['pool5'])
feature = cuda.to_cpu(feature['pool5'].data)
ann_feats.extend(feature)
np.save(os.path.join(target_save_dir, params['ann_feats']), ann_feats)


if __name__ == '__main__':
    args = config.parse_opt()
    params = vars(args)  # convert the Namespace to an ordinary dict
    extract_feature(params)
def __init__(self, word2idx=None, idx2word=None):
    self.opt = config.parse_opt()
    # Fall back to empty mappings so a fresh Dictionary can be built up.
    self.word2idx = word2idx if word2idx is not None else {}
    self.idx2word = idx2word if idx2word is not None else []
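# A plausible companion method (an assumption, mirroring common VQA
# Dictionary implementations): grow the vocabulary one word at a time.
def add_word(self, word):
    if word not in self.word2idx:
        self.idx2word.append(word)
        self.word2idx[word] = len(self.idx2word) - 1
    return self.word2idx[word]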
    ])
    valid_transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        # no random flipping at evaluation time
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    train_dataloader = get_loader(opt, mode='train', transform=train_transform)
    valid_dataloader = get_loader(opt, mode='val', transform=valid_transform)
    print('load the dataset into memory...')
    print('total iterations in training phase: {}\n'
          'total iterations in validation phase: {}'.format(
              len(train_dataloader), len(valid_dataloader)))
    trainer = Trainer(opt, train_dataloader, valid_dataloader)
    trainer.train()
    print('done')


if __name__ == '__main__':
    args = parse_opt()
    setup_logging('log.txt')
    logging.info('\nrun arguments: %s',
                 json.dumps(vars(args), indent=4, sort_keys=True))
    main(args)
    print('done')