Example 1
    def __init__(self, args):
        super(VQA_Model_combined, self).__init__()
        torch.cuda.manual_seed_all(args.seed)
        # ResNet-152 backbone without the final avgpool/fc layers, frozen as the image feature extractor
        model = models.resnet152(pretrained=True)
        self.img_model = nn.Sequential(*list(model.children())[:-2])
        self.img_model.train(False)
        attention_model_checkpoint = torch.load(args.model_path)
        new_state_dict = OrderedDict()
        for k, v in attention_model_checkpoint.items():
            name = k[7:]  # strip the `module.` prefix added by nn.DataParallel
            new_state_dict[name] = v
        print('Model checkpoint loaded')
    
        print(new_state_dict.keys())
        dictionary = Dictionary.load_from_file(args.pickle_path)

        train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                    feats_data_path=args.feats_data_path,
                                    dictionary=dictionary,
                                    choice='train',
                                    dataroot=args.data_root,
                                    arch_choice=args.arch_choice,
                                    layer_option=args.layer_option)
        print('Loading the attention model')
        attention_model = attention_baseline(train_dataset, num_hid=args.num_hid, dropout=args.dropout,
                                             norm=args.norm, activation=args.activation,
                                             drop_L=args.dropout_L, drop_G=args.dropout_G,
                                             drop_W=args.dropout_W, drop_C=args.dropout_C)
        attention_model.load_state_dict(new_state_dict)
        self.vqa_model = attention_model
        self.vqa_model.train(False)
        #self.attention_model.train(False)
        
        # self.vqa_model = model_regen(args)
        self.gradients = None
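The `k[7:]` slice above assumes every checkpoint key carries the `module.` prefix that nn.DataParallel adds when a model is saved from a wrapped module. A minimal, hypothetical helper (not part of the original code) that only strips the prefix when it is actually present:

from collections import OrderedDict

def strip_dataparallel_prefix(state_dict, prefix='module.'):
    """Return a copy of state_dict with a leading DataParallel prefix removed, if present."""
    cleaned = OrderedDict()
    for key, value in state_dict.items():
        cleaned[key[len(prefix):] if key.startswith(prefix) else key] = value
    return cleaned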
Example 2
def main_run(dataroot,
             pkl_filename,
             glove_filename,
             filenames_dict,
             image_filenames_dict,
             emb_dim=300):

    dictionary = create_dictionary(dataroot)
    dictionary.dump_to_file(os.path.join('data', pkl_filename))
    d = Dictionary.load_from_file(os.path.join('data', pkl_filename))
    print(d.idx2word)
    weights, word2emb = create_glove_embedding_init(d.idx2word, glove_filename)
    np.save('data/glove6b_init_%dd.npy' % emb_dim, weights)

    #extract the raw data from json
    train_questions = json.load(open(
        filenames_dict['train_question_file']))['questions']
    train_answers = json.load(open(
        filenames_dict['train_answer_file']))['annotations']
    validation_questions = json.load(
        open(filenames_dict['validation_question_file']))['questions']
    validation_answers = json.load(
        open(filenames_dict['validation_answer_file']))['annotations']

    #generate the question labels and the id maps
    answers = train_answers + validation_answers
    occurrence = filter_answers(answers, 9)
    ans2label = create_ans2label(occurrence, 'trainval')
    train_target = compute_target(train_answers, ans2label, 'train')
    validation_target = compute_target(validation_answers, ans2label, 'val')

    #image feature extraction here based on functions in image_feature_extractor
    image_feats_converter(image_filenames_dict)
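For context, `main_run` expects `filenames_dict` to provide the four JSON paths read above. A hedged usage sketch; every path below is a placeholder, not a location taken from the original project:

filenames_dict = {
    'train_question_file': 'data/train_questions.json',       # placeholder path
    'train_answer_file': 'data/train_annotations.json',       # placeholder path
    'validation_question_file': 'data/val_questions.json',    # placeholder path
    'validation_answer_file': 'data/val_annotations.json',    # placeholder path
}
main_run(dataroot='data',
         pkl_filename='dictionary.pkl',
         glove_filename='data/glove.6B.300d.txt',             # placeholder path
         filenames_dict=filenames_dict,
         image_filenames_dict={},                             # structure depends on image_feats_converter
         emb_dim=300)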
Example 3
def load_model(args):
    torch.cuda.manual_seed_all(args.seed)
    model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in model_checkpoint.items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v

    #new_state_dict["classifier.main.2.bias"]=new_state_dict.pop("classifier.main.3.bias")
    #new_state_dict["classifier.main.2.weight_g"]=new_state_dict.pop("classifier.main.3.weight_g")
    #new_state_dict["classifier.main.2.weight_v"]=new_state_dict.pop("classifier.main.3.weight_v")

    print('Model checkpoint loaded')
    dictionary = Dictionary.load_from_file(args.pickle_path)
    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)

    print('Loading the attention model')

    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid, dropout=args.dropout,
                                         norm=args.norm, activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    #attention_model=attention_mfh(train_dataset, num_hid=args.num_hid, dropout= args.dropout, norm=args.norm,\
    #activation=args.activation, drop_L=args.dropout_L, drop_G=args.dropout_G,\
    #drop_W=args.dropout_W, drop_C=args.dropout_C,mfb_out_dim=args.mfb_out_dim)
    attention_model.load_state_dict(new_state_dict)
    attention_model.train(False)

    #torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.device)
    attention_model.to(args.device)

    return attention_model
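A hedged sketch of driving `load_model` from a script; the attribute names mirror the `args` fields used above, while every value is a placeholder chosen for illustration:

from argparse import Namespace

args = Namespace(seed=42, model_path='checkpoints/attention_baseline.pth',  # placeholder values
                 pickle_path='data/dictionary.pkl', image_root_dir='data/images',
                 feats_data_path='data/feats', data_root='data',
                 arch_choice='resnet152', layer_option='pool',
                 num_hid=512, dropout=0.3, norm='weight', activation='ReLU',
                 dropout_L=0.1, dropout_G=0.2, dropout_W=0.4, dropout_C=0.5,
                 device=0)
attention_model = load_model(args)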
Example 4
def model_regen(args):
    print('Loading model checkpoint')
    attention_model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in attention_model_checkpoint.items():
        name = k[7:]  # strip the `module.` prefix added by nn.DataParallel
        new_state_dict[name] = v
    print('Model checkpoint loaded')
    
    print(new_state_dict.keys())
    dictionary = Dictionary.load_from_file(args.pickle_path)

    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)
    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid, dropout=args.dropout,
                                         norm=args.norm, activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)
    attention_model.load_state_dict(new_state_dict)
    attention_model.eval()
    
    #print('Saving the entire model')
    #torch.save(attention_model,'gradcam_models/resnet_152_attention_baseline_model.pth')
    return attention_model
Example 5
    VQA_model_exp = VQA_Model_combined(args)
    VQA_model_exp.to(args.device)
    #VQA_model_exp.img_model.eval()
    #VQA_model_exp.vqa_model.eval()
    #print(type(VQA_model_exp))
    #VQA_model_exp.eval()
    #VQA_model_exp.to(0)
    class_meta_data = pd.read_csv('/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data/Train_Class_Distribution.csv')
    class_label_map = class_meta_data['Label_names'].tolist()

    print('Load the validation json file')
    valid_questions = json.load(open('/proj/digbose92/VQA/VisualQuestion_VQA/common_resources/v2_OpenEnded_mscoco_val2014_yes_no_questions.json'))['questions']

    valid_entry = valid_questions[77]
    dictionary = Dictionary.load_from_file('../Visual_All/data/dictionary.pkl')
    print(valid_entry['question'])
    tokens = preproc_question(valid_entry['question'], 14, dictionary)
    print(tokens)
    #print(valid_entry)
    
    pkl_data = pickle.load(open('/proj/digbose92/VQA/VisualQuestion_VQA/common_resources/val_target_yes_no_ans.pkl', 'rb'))

    question_ids = [entry['question_id'] for entry in pkl_data]
    #print(question_ids)
    
    idx = question_ids.index(valid_entry['question_id'])
    #print(pkl_data[idx])
    image_id = pkl_data[idx]['image_id']
    
    choice = 'val'
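`question_ids.index(...)` performs a linear scan per query; if many questions are resolved this way, a small hypothetical alternative (not in the original) builds a lookup table once:

    # hypothetical alternative: O(1) lookups instead of list.index
    qid_to_index = {entry['question_id']: i for i, entry in enumerate(pkl_data)}
    idx = qid_to_index[valid_entry['question_id']]
    image_id = pkl_data[idx]['image_id']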
Example 6
def main(args):

    #defining torch configurations
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

    #extract weights from the weight matrices
    weights = np.load(args.file_name)

    # CUDA for PyTorch (hardcoded GPU index)
    device = 2
    torch.cuda.set_device(device)
    #use_cuda = torch.cuda.is_available()
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #defining dictionary and VQAFeatureDataset
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQAFeatureDataset(
        'train',
        dictionary,
        dataroot='/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data')
    eval_dataset = VQAFeatureDataset(
        'val',
        dictionary,
        dataroot='/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data')

    #model definition
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(fuse_embed_size=args.q_embed,
                                  fc_size=args.fuse_embed,
                                  class_size=args.num_class).to(device)

    print(question_encoder)
    print(fusion_network)
    input()  # pause so the printed model definitions can be inspected before training starts

    #Dataloader initialization
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              num_workers=1)
    eval_loader = DataLoader(eval_dataset,
                             args.batch_size,
                             shuffle=True,
                             num_workers=1)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(question_encoder.parameters()) + list(
        fusion_network.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    step = 0
    #Training starts
    for epoch in range(args.epochs):
        for i, (image_features, spatials, question_tokens,
                labels) in enumerate(train_loader):
            class_indices = convert_one_hot2int(labels.numpy())
            image_feats = torch.mean(image_features, dim=1)
            image_feats = image_feats.to(device)
            class_indices = torch.from_numpy(class_indices).long().to(device)
            #labels=labels.to(device)

            #preproc the tokens after converting from tensor to numpy. Then numpy to tensor before passing to loss fn
            question_array = preproc_question_tokens(
                question_tokens.cpu().numpy())
            question_tokens = torch.from_numpy(question_array).to(device)

            #fusion_network.zero_grad()
            optimizer.zero_grad()
            #Forward, Backward and Optimize
            question_features = question_encoder(question_tokens)
            class_outputs = fusion_network(question_features, image_feats)

            loss = criterion(class_outputs, class_indices)
            #question_encoder.zero_grad()
            loss.backward()
            optimizer.step()
            if (step % 1000 == 0):
                #optimizer.zero_grad()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, i, total_step, loss.item()))
            step = step + 1
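`eval_dataset` and `eval_loader` are built above but never consumed in the training loop. A hedged sketch of a per-epoch validation pass, assuming batches unpack exactly as they do in the training loop:

    question_encoder.eval()
    fusion_network.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for image_features, spatials, question_tokens, labels in eval_loader:
            class_indices = torch.from_numpy(convert_one_hot2int(labels.numpy())).long().to(device)
            image_feats = torch.mean(image_features, dim=1).to(device)
            question_tokens = torch.from_numpy(
                preproc_question_tokens(question_tokens.cpu().numpy())).to(device)
            outputs = fusion_network(question_encoder(question_tokens), image_feats)
            correct += (outputs.argmax(dim=1) == class_indices).sum().item()
            total += class_indices.size(0)
    print('Validation accuracy: {:.4f}'.format(correct / total))
    question_encoder.train()
    fusion_network.train()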
Example 7
def evaluate_attention_model(args):
    #torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    class_data = pd.read_csv(args.class_metadata_file)
    #class_label_map={0:"no",1:"yes"}

    class_label_map = class_data['Label_names'].tolist()

    print('Loading model checkpoint')
    attention_model_checkpoint = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in attention_model_checkpoint.items():
        name = k[7:]  # strip the `module.` prefix added by nn.DataParallel
        new_state_dict[name] = v
    print('Model checkpoint loaded')
    #new_state_dict["classifier.main.2.bias"]=new_state_dict.pop("classifier.main.3.bias")
    #new_state_dict["classifier.main.2.weight_g"]=new_state_dict.pop("classifier.main.3.weight_g")
    #new_state_dict["classifier.main.2.weight_v"]=new_state_dict.pop("classifier.main.3.weight_v")
    
    print(new_state_dict.keys())
    print('Loading Dictionary')
    dictionary = Dictionary.load_from_file(args.pickle_path)

    train_dataset = Dataset_VQA(img_root_dir=args.image_root_dir,
                                feats_data_path=args.feats_data_path,
                                dictionary=dictionary,
                                choice='train',
                                dataroot=args.data_root,
                                arch_choice=args.arch_choice,
                                layer_option=args.layer_option)
    print('Loading the attention model')
    attention_model = attention_baseline(train_dataset, num_hid=args.num_hid, dropout=args.dropout,
                                         norm=args.norm, activation=args.activation,
                                         drop_L=args.dropout_L, drop_G=args.dropout_G,
                                         drop_W=args.dropout_W, drop_C=args.dropout_C)

    #attention_model=attention_mfh(train_dataset, num_hid=args.num_hid, dropout= args.dropout, norm=args.norm,\
                               #activation=args.activation, drop_L=args.dropout_L, drop_G=args.dropout_G,\
                               #drop_W=args.dropout_W, drop_C=args.dropout_C,mfb_out_dim=args.mfb_out_dim)
    attention_model.load_state_dict(new_state_dict)
    attention_model.eval()

    
    #torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.device)
    attention_model.to(args.device)
    if args.image_model is None:
        # use the pre-extracted features through a Dataset and DataLoader
        print('Using validation features')
        dataset_temp = Dataset_VQA(img_root_dir=args.image_root_dir,
                                   feats_data_path=args.feats_data_path,
                                   dictionary=dictionary,
                                   bert_option=args.bert_option,
                                   rcnn_pkl_path=None,
                                   choice=args.choice,
                                   dataroot=args.data_root,
                                   arch_choice=args.arch_choice,
                                   layer_option=args.layer_option)
        loader = DataLoader(dataset_temp, batch_size=args.batch_size, shuffle=False, num_workers=1)
        print('Length of validation dataloader:', len(loader))
        upper_bound = 0
        num_data = 0
        V_loss = 0
        score = 0
        print('Validation data loading starting')
        actual_class_labels = []
        predicted_class_labels = []
        question_set = []
        question_id = []
        count = 0
        for data in tqdm(loader):
            feat, quest, quest_sent, quest_id, target = data
            feat = feat.to(args.device)
            quest = quest.to(args.device)
            target = target.to(args.device)

            question_id = question_id + quest_id.tolist()
            pred = attention_model(feat, quest)
            question_set = question_set + list(quest_sent)
            loss = instance_bce_with_logits(pred, target)
            V_loss += loss.item() * feat.size(0)
            score_temp, logits, class_labels = compute_score_with_logits(pred, target.data)
            actual_class_labels = actual_class_labels + list(class_labels.cpu().numpy())
            predicted_class_labels = predicted_class_labels + list(logits.cpu().numpy())
            batch_score = score_temp.sum()
            score += batch_score
            upper_bound += (target.max(1)[0]).sum()
            num_data += pred.size(0)

        class_predicted_name = [class_label_map[label_id] for label_id in predicted_class_labels]
        class_actual_name = [class_label_map[label_id] for label_id in actual_class_labels]

        print(class_predicted_name)
        list_set = []
        for index, val in tqdm(enumerate(question_id)):
            temp = {"answer": class_predicted_name[index], "question_id": val}
            list_set.append(temp)
        with open('validation_results_resnet_152_attention_baseline_num_hid_512_batch_size_512.json', 'w') as fout:
            json.dump(list_set, fout)
        predicted_df = pd.DataFrame({'Question_id': question_id,
                                     'Questions': question_set,
                                     'Actual_Answers': class_actual_name,
                                     'Predicted_Answers': class_predicted_name})
        predicted_df.to_csv('Validation_Stats_resnet_152_attention_baseline_num_hid_512_batch_size_512.csv')
        score = score / len(loader.dataset)
        V_loss /= len(loader.dataset)
        upper_bound = upper_bound / len(loader.dataset)
        print(score,V_loss)
        #print(pred)
    else:   
        print("Extract features and then come back")
Example 8
def main(args):

    #defining torch configurations
    #torch.manual_seed(args.seed)
    #torch.cuda.manual_seed(args.seed)
    #torch.backends.cudnn.benchmark = True

    #extract weights from the weight matrices
    weights = np.load(args.file_name)

    # CUDA for PyTorch (hardcoded GPU index)
    device = 3
    torch.cuda.set_device(device)

    #device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
    #use_cuda = torch.cuda.is_available()
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #defining dictionary and VQAFeatureDataset
    #transforms for pretrained network(transform for resnet now)
    train_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    validate_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQADataset(image_root_dir=args.img_root_dir,
                               dictionary=dictionary,
                               dataroot=args.data_root_dir,
                               choice='train',
                               transform_set=train_transform)
    # eval_dataset = VQADataset(image_root_dir=args.img_root_dir,dictionary=dictionary,dataroot=args.data_root_dir,choice='val',transform_set=validate_transform)

    #model definition
    print('Loading the models')
    image_encoder = EncoderCNN(embed_size=args.img_feats).to(device)
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(qnetwork=question_encoder,
                                  img_network=image_encoder,
                                  fuse_embed_size=args.fuse_embed,
                                  input_fc_size=args.img_feats,
                                  class_size=args.num_class).to(device)
    #print(list(fusion_network.parameters()))
    print(fusion_network)
    #input()

    #Dataloader initialization
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              num_workers=12)
    # eval_loader =  DataLoader(eval_dataset, args.batch_size, shuffle=True, num_workers=1)

    # Loss and optimizer
    criterion = nn.NLLLoss()
    #params=lis
    #params = list(image_encoder.linear.parameters())+list(image_encoder.bn.parameters())+list(question_encoder.parameters()) + list(fusion_network.parameters())
    optimizer = torch.optim.Adam(fusion_network.parameters(),
                                 lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    step = 0

    #Training starts
    #print('Training Starting ......................')

    def evaluate_val(model, loader, criterion, device):
        loss = 0
        accuracy = 0
        with torch.no_grad():
            for image_sample, question_token, labels in iter(loader):
                image_sample, question_token, labels = image_sample.to(
                    device), question_token.to(device), labels.to(device)
                output = model(question_token, image_sample)
                loss += criterion(output, labels).item()
                ps = torch.exp(output)  # model emits log-probabilities (NLLLoss is used)
                equality = (labels.data == ps.max(dim=1)[1])
                accuracy += equality.type(torch.FloatTensor).mean()
        # average the per-batch loss and accuracy over the number of batches
        return loss / len(loader), accuracy / len(loader)

    file_train = open('train_loss_log.txt', 'a+')
    loss_save = []

    for epoch in range(args.epochs):

        running_loss = 0.0
        running_corrects = 0
        step = 0
        for data in tqdm(train_loader):
            image_samp, question_toks, labels = data
            image_samp = image_samp.to(device)
            question_toks = question_toks.to(device)
            labels = labels.to(device)

            class_outputs = fusion_network(question_toks, image_samp)
            _, preds = torch.max(class_outputs, 1)
            loss = criterion(class_outputs, labels)
            #question_encoder.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('Enter some key')
            #input()
            # statistics
            running_loss += loss.item() * image_samp.size(0)
            running_corrects += torch.sum(preds == labels.data)
            if (step % 300 == 0):
                #optimizer.zero_grad()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, step, total_step, loss.item()))
            step = step + 1
        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        print(epoch_loss)
        #loss_save.append(val_loss)

        val_loss, accuracy = evaluate_val(fusion_network, train_loader,
                                          criterion, device)
        string = 'Epoch {}/{} loss: {} \t'.format(epoch, args.epochs,
                                                  running_loss)
        string += 'Accuracy: {}\n'.format(accuracy)
        file_train.write(string)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('train', epoch_loss,
                                                   epoch_acc))
    file_train.close()
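The training loop above never persists the learned weights. A hedged sketch of a per-epoch checkpoint that could be placed at the end of the epoch loop; the output path is a placeholder, not taken from the original project:

    torch.save({'epoch': epoch,
                'fusion_network': fusion_network.state_dict(),
                'optimizer': optimizer.state_dict()},
               'checkpoints/fusion_epoch_{}.pth'.format(epoch))  # placeholder path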
Example 9
            im = im.convert('RGB')
            if (self.transform is not None):
                image = self.transform(im)
            question = torch.from_numpy(np.array(question))
            return (image, question, label)
        else:
            # missing file: nothing is returned here, which breaks the DataLoader's default collate step
            print(filename)
            print('Filepath not found')

    def __len__(self):
        return len(self.entries)


if __name__ == "__main__":
    image_root_dir = "/data/digbose92/VQA/COCO"
    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    dataroot = '/proj/digbose92/VQA/VisualQuestion_VQA/Visual_All/data'

    transform_list = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor()])
    train_dataset = VQADataset(image_root_dir=image_root_dir,
                               dictionary=dictionary,
                               dataroot=dataroot,
                               transform_set=transform_list)
    train_loader = DataLoader(train_dataset,
                              batch_size=2,
                              shuffle=True,
                              num_workers=1)
    image, question, label = next(iter(train_loader))
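A small follow-up check on the sampled batch; the image shape follows from the Resize transform and batch size above, while the question and label shapes depend on how VQADataset tokenizes and encodes its entries:

# expected: image of shape [2, 3, 224, 224]; question/label shapes depend on the dataset implementation
print(image.shape, question.shape, label.shape)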