def create_dictionary(dataroot):
    """Creates a dictionary object for future usage."""
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def main_run(dataroot, pkl_filename, glove_filename, filenames_dict,
             image_filenames_dict, emb_dim=300):
    # Build the dictionary and initialize GloVe embeddings for its vocabulary
    dictionary = create_dictionary(dataroot)
    dictionary.dump_to_file(os.path.join('data', pkl_filename))
    d = Dictionary.load_from_file(os.path.join('data', pkl_filename))
    print(d.idx2word)
    weights, word2emb = create_glove_embedding_init(d.idx2word, glove_filename)
    np.save('data/glove6b_init_%dd.npy' % emb_dim, weights)

    # Extract the raw data from the json files
    with open(filenames_dict['train_question_file']) as f:
        train_questions = json.load(f)['questions']
    with open(filenames_dict['train_answer_file']) as f:
        train_answers = json.load(f)['annotations']
    with open(filenames_dict['validation_question_file']) as f:
        validation_questions = json.load(f)['questions']
    with open(filenames_dict['validation_answer_file']) as f:
        validation_answers = json.load(f)['annotations']

    # Generate the answer labels and the id maps
    answers = train_answers + validation_answers
    occurrence = filter_answers(answers, 9)
    ans2label = create_ans2label(occurrence, 'trainval')
    train_target = compute_target(train_answers, ans2label, 'train')
    validation_target = compute_target(validation_answers, ans2label, 'val')

    # Image feature extraction based on functions in image_feature_extractor
    image_feats_converter(image_filenames_dict)
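# Example invocation (a sketch: the dict keys follow what main_run accesses
# above, but the concrete file paths and the layout of image_filenames_dict
# are assumptions, not taken from the original code):
if __name__ == '__main__':
    filenames_dict = {
        'train_question_file': 'data/v2_OpenEnded_mscoco_train2014_questions.json',
        'train_answer_file': 'data/v2_mscoco_train2014_annotations.json',
        'validation_question_file': 'data/v2_OpenEnded_mscoco_val2014_questions.json',
        'validation_answer_file': 'data/v2_mscoco_val2014_annotations.json',
    }
    main_run('data', 'dictionary.pkl', 'glove/glove.6B.300d.txt',
             filenames_dict, image_filenames_dict={})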
def __init__(self, args):
    torch.cuda.manual_seed_all(args.seed)
    super(VQA_Model_combined, self).__init__()

    # Frozen ResNet-152 backbone for image features (last two layers dropped)
    model = models.resnet152(pretrained=True)
    self.img_model = nn.Sequential(*list(model.children())[:-2])
    self.img_model.train(False)

    # Load the attention model checkpoint, stripping the 'module.' prefix
    # that nn.DataParallel adds to parameter names
    attention_model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in attention_model_checkpoint.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v
    print('Model checkpoint loaded')
    print(new_state_dict.keys())

    dictionary = Dictionary.load_from_file(args.pickle_path)
    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)

    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid,
                                         dropout=args.dropout, norm=args.norm,
                                         activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    attention_model.load_state_dict(new_state_dict)
    self.vqa_model = attention_model
    self.vqa_model.train(False)
    self.gradients = None
def load_model(args):
    torch.cuda.manual_seed_all(args.seed)

    # Load the checkpoint and strip the 'module.' prefix added by nn.DataParallel
    model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in model_checkpoint.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v
    print('Model checkpoint loaded')

    dictionary = Dictionary.load_from_file(args.pickle_path)
    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)

    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid,
                                         dropout=args.dropout, norm=args.norm,
                                         activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    attention_model.load_state_dict(new_state_dict)
    attention_model.train(False)

    torch.cuda.set_device(args.device)
    attention_model.to(args.device)
    return attention_model
def model_regen(args):
    print('Loading model checkpoint')
    attention_model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in attention_model_checkpoint.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v
    print('Model checkpoint loaded')
    print(new_state_dict.keys())

    dictionary = Dictionary.load_from_file(args.pickle_path)
    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)

    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid,
                                         dropout=args.dropout, norm=args.norm,
                                         activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    attention_model.load_state_dict(new_state_dict)
    attention_model.eval()
    return attention_model
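# The 'module.' prefix-stripping loop above is duplicated across load_model,
# model_regen, and the other checkpoint loaders. A small helper (a suggested
# sketch, not part of the original code) that performs the same conversion and
# also passes through checkpoints saved without nn.DataParallel:
from collections import OrderedDict

def strip_dataparallel_prefix(state_dict):
    """Remove the 'module.' prefix that nn.DataParallel adds to parameter names."""
    return OrderedDict(
        (k[len('module.'):] if k.startswith('module.') else k, v)
        for k, v in state_dict.items())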
VQA_model_exp = VQA_Model_combined(args)
VQA_model_exp.to(args.device)

# Class label names for mapping predicted indices back to answer strings
class_meta_data = pd.read_csv('/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data/Train_Class_Distribution.csv')
class_label_map = class_meta_data['Label_names'].tolist()

print('Load the validation json file')
valid_questions = json.load(open('/proj/digbose92/VQA/VisualQuestion_VQA/common_resources/v2_OpenEnded_mscoco_val2014_yes_no_questions.json'))['questions']
valid_entry = valid_questions[77]
dictionary = Dictionary.load_from_file('../Visual_All/data/dictionary.pkl')
print(valid_entry['question'])
tokens = preproc_question(valid_entry['question'], 14, dictionary)
print(tokens)

# Look up the image id for the sampled question in the target pickle
pkl_data = pickle.load(open('/proj/digbose92/VQA/VisualQuestion_VQA/common_resources/val_target_yes_no_ans.pkl', 'rb'))
question_ids = [entry['question_id'] for entry in pkl_data]
idx = question_ids.index(valid_entry['question_id'])
image_id = pkl_data[idx]['image_id']
choice = 'val'
def main(args):
    # Torch configuration
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

    # Load the pretrained embedding weight matrix
    weights = np.load(args.file_name)

    # CUDA device for PyTorch
    device = 2
    torch.cuda.set_device(device)

    # Dictionary and VQAFeatureDataset
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQAFeatureDataset(
        'train', dictionary,
        dataroot='/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data')
    eval_dataset = VQAFeatureDataset(
        'val', dictionary,
        dataroot='/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data')

    # Model definition
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(fuse_embed_size=args.q_embed,
                                  fc_size=args.fuse_embed,
                                  class_size=args.num_class).to(device)
    print(question_encoder)
    print(fusion_network)

    # Dataloader initialization
    train_loader = DataLoader(train_dataset, args.batch_size,
                              shuffle=True, num_workers=1)
    eval_loader = DataLoader(eval_dataset, args.batch_size,
                             shuffle=True, num_workers=1)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(question_encoder.parameters()) + list(fusion_network.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Training loop
    total_step = len(train_loader)
    step = 0
    for epoch in range(args.epochs):
        for i, (image_features, spatials, question_tokens,
                labels) in enumerate(train_loader):
            class_indices = convert_one_hot2int(labels.numpy())
            image_feats = torch.mean(image_features, dim=1).to(device)
            class_indices = torch.from_numpy(class_indices).long().to(device)

            # Preprocess the tokens after converting from tensor to numpy,
            # then back to tensor before the forward pass
            question_array = preproc_question_tokens(
                question_tokens.cpu().numpy())
            question_tokens = torch.from_numpy(question_array).to(device)

            # Forward, backward and optimize
            optimizer.zero_grad()
            question_features = question_encoder(question_tokens)
            class_outputs = fusion_network(question_features, image_feats)
            loss = criterion(class_outputs, class_indices)
            loss.backward()
            optimizer.step()

            if step % 1000 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, i, total_step, loss.item()))
            step += 1
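# convert_one_hot2int is called in the training loop above but not defined in
# this file. A minimal sketch of what it is assumed to do (map one-hot label
# rows to the integer class indices that nn.CrossEntropyLoss expects); the
# real implementation may differ:
import numpy as np

def convert_one_hot2int(one_hot_labels):
    """Convert a (batch, num_classes) one-hot array to integer class indices."""
    return np.argmax(one_hot_labels, axis=1)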
def evaluate_attention_model(args):
    torch.cuda.manual_seed_all(args.seed)

    # Class label names for mapping predicted indices back to answer strings
    class_data = pd.read_csv(args.class_metadata_file)
    class_label_map = class_data['Label_names'].tolist()

    print('Loading model checkpoint')
    attention_model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in attention_model_checkpoint.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v
    print('Model checkpoint loaded')
    print(new_state_dict.keys())

    print('Loading Dictionary')
    dictionary = Dictionary.load_from_file(args.pickle_path)
    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)

    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid,
                                         dropout=args.dropout, norm=args.norm,
                                         activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    attention_model.load_state_dict(new_state_dict)
    attention_model.eval()
    torch.cuda.set_device(args.device)
    attention_model.to(args.device)

    if args.image_model is None:
        # Use extracted features as a Dataset and DataLoader
        print('Using validation features')
        dataset_temp = Dataset_VQA(img_root_dir=args.image_root_dir,
                                   feats_data_path=args.feats_data_path,
                                   dictionary=dictionary,
                                   bert_option=args.bert_option,
                                   rcnn_pkl_path=None,
                                   choice=args.choice,
                                   dataroot=args.data_root,
                                   arch_choice=args.arch_choice,
                                   layer_option=args.layer_option)
        loader = DataLoader(dataset_temp, batch_size=args.batch_size,
                            shuffle=False, num_workers=1)
        print('Length of validation dataloader:', len(loader))

        upper_bound = 0
        num_data = 0
        V_loss = 0
        score = 0
        actual_class_labels = []
        predicted_class_labels = []
        question_set = []
        question_id = []

        print('Validation data loading starting')
        for data in tqdm(loader):
            feat, quest, quest_sent, quest_id, target = data
            feat = feat.to(args.device)
            quest = quest.to(args.device)
            target = target.to(args.device)
            question_id = question_id + quest_id.tolist()

            pred = attention_model(feat, quest)
            question_set = question_set + list(quest_sent)
            loss = instance_bce_with_logits(pred, target)
            V_loss += loss.item() * feat.size(0)

            score_temp, logits, class_labels = compute_score_with_logits(
                pred, target.data)
            actual_class_labels += list(class_labels.cpu().numpy())
            predicted_class_labels += list(logits.cpu().numpy())
            batch_score = score_temp.sum()
            score += batch_score
            upper_bound += (target.max(1)[0]).sum()
            num_data += pred.size(0)

        class_predicted_name = [class_label_map[idx] for idx in predicted_class_labels]
        class_actual_name = [class_label_map[idx] for idx in actual_class_labels]
        print(class_predicted_name)

        # Dump the predictions in the VQA evaluation-server format
        list_set = []
        for index, val in tqdm(enumerate(question_id)):
            temp = {"answer": class_predicted_name[index], "question_id": val}
            list_set.append(temp)
        with open('validation_results_resnet_152_attention_baseline_num_hid_512_batch_size_512.json', 'w') as fout:
            json.dump(list_set, fout)

        predicted_df = pd.DataFrame({'Question_id': question_id,
                                     'Questions': question_set,
                                     'Actual_Answers': class_actual_name,
                                     'Predicted_Answers': class_predicted_name})
        predicted_df.to_csv('Validation_Stats_resnet_152_attention_baseline_num_hid_512_batch_size_512.csv')

        score = score / len(loader.dataset)
        V_loss /= len(loader.dataset)
        upper_bound = upper_bound / len(loader.dataset)
        print(score, V_loss)
    else:
        print("Extract features and then come back")
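# instance_bce_with_logits and compute_score_with_logits are imported from the
# training utilities and not defined in this file. A sketch of what they are
# assumed to compute (the three-value return of compute_score_with_logits
# matches how it is unpacked above); the real implementations may differ:
import torch
import torch.nn.functional as F

def instance_bce_with_logits(logits, labels):
    """Binary cross-entropy over soft answer scores, scaled by the class count."""
    loss = F.binary_cross_entropy_with_logits(logits, labels)
    return loss * labels.size(1)

def compute_score_with_logits(logits, labels):
    """Return per-sample VQA soft scores plus predicted/actual class indices."""
    pred = torch.max(logits, 1)[1]      # predicted class per sample
    actual = torch.max(labels, 1)[1]    # highest-scoring ground-truth class
    one_hots = torch.zeros_like(labels)
    one_hots.scatter_(1, pred.view(-1, 1), 1)
    scores = (one_hots * labels).sum(1)  # soft accuracy per sample
    return scores, pred, actual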
def main(args):
    # Load the pretrained embedding weight matrix
    weights = np.load(args.file_name)

    # CUDA device for PyTorch
    device = 3
    torch.cuda.set_device(device)

    # Transforms for the pretrained network (ResNet normalization statistics)
    train_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    validate_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Dictionary and VQADataset
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQADataset(image_root_dir=args.img_root_dir,
                               dictionary=dictionary,
                               dataroot=args.data_root_dir,
                               choice='train',
                               transform_set=train_transform)

    # Model definition
    print('Loading the models')
    image_encoder = EncoderCNN(embed_size=args.img_feats).to(device)
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(qnetwork=question_encoder,
                                  img_network=image_encoder,
                                  fuse_embed_size=args.fuse_embed,
                                  input_fc_size=args.img_feats,
                                  class_size=args.num_class).to(device)
    print(fusion_network)

    # Dataloader initialization
    train_loader = DataLoader(train_dataset, args.batch_size,
                              shuffle=True, num_workers=12)

    # Loss and optimizer
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(fusion_network.parameters(),
                                 lr=args.learning_rate)

    total_step = len(train_loader)

    def evaluate_val(model, loader, criterion, device):
        # Note: called with train_loader below, since the validation dataset
        # is not instantiated in this script
        loss = 0
        accuracy = 0
        with torch.no_grad():
            for image_sample, question_token, labels in iter(loader):
                image_sample = image_sample.to(device)
                question_token = question_token.to(device)
                labels = labels.to(device)
                output = model(question_token, image_sample)
                loss += criterion(output, labels).item()
                ps = torch.exp(output)
                equality = (labels.data == ps.max(dim=1)[1])
                accuracy += equality.type(torch.FloatTensor).mean()
        return loss, accuracy

    # Training loop
    file_train = open('train_loss_log.txt', 'a+')
    for epoch in range(args.epochs):
        running_loss = 0.0
        running_corrects = 0
        step = 0
        for data in tqdm(train_loader):
            image_samp, question_toks, labels = data
            image_samp = image_samp.to(device)
            question_toks = question_toks.to(device)
            labels = labels.to(device)

            class_outputs = fusion_network(question_toks, image_samp)
            _, preds = torch.max(class_outputs, 1)
            loss = criterion(class_outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item() * image_samp.size(0)
            running_corrects += torch.sum(preds == labels.data)
            if step % 300 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, step, total_step, loss.item()))
            step += 1

        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        print(epoch_loss)

        val_loss, accuracy = evaluate_val(fusion_network, train_loader,
                                          criterion, device)
        string = 'Epoch {}/{} loss: {} \t'.format(epoch, args.epochs, running_loss)
        string += 'Accuracy: {}\n'.format(accuracy)
        file_train.write(string)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('train', epoch_loss, epoch_acc))
    file_train.close()
        # (continuation of VQADataset.__getitem__; the opening `if` of the
        # else branch below, likely a file-existence check, is not shown in
        # this excerpt)
            im = im.convert('RGB')
            if self.transform is not None:
                image = self.transform(im)
                question = torch.from_numpy(np.array(question))
                return (image, question, label)
        else:
            print(filename)
            print('Filepath not found')

    def __len__(self):
        return len(self.entries)


if __name__ == "__main__":
    image_root_dir = "/data/digbose92/VQA/COCO"
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    dataroot = '/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data'
    transform_list = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()])
    train_dataset = VQADataset(image_root_dir=image_root_dir,
                               dictionary=dictionary,
                               dataroot=dataroot,
                               transform_set=transform_list)
    train_loader = DataLoader(train_dataset,
                              batch_size=2,
                              shuffle=True,
                              num_workers=1)
    image, question, label = next(iter(train_loader))