def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    """Build the combined model: frozen pretrained verb/role modules plus a
    trainable 'updated' verb module and small fusion layers.

    Args:
        encoder: project imsitu encoder (vocabulary / verb bookkeeping).
        gpu_mode: GPU id (>= 0) or -1 for CPU; forwarded to sub-modules.
        embed_hidden: word-embedding width (kept for interface compatibility;
            unused here after dead code removal).
        mlp_hidden: hidden width of the fusion MLP layers.
    """
    super(BaseModel, self).__init__()

    # Standard ImageNet normalization and train/dev augmentation pipelines.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    self.n_verbs = self.encoder.get_num_verbs()

    # Pretrained sub-modules switched to eval mode.
    # NOTE(review): eval() only fixes dropout/batch-norm behavior; it does
    # NOT stop gradients — confirm freezing is handled by the optimizer setup.
    self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
    self.role_module = model_roles_recqa_noself.BaseModel(self.encoder, self.gpu_mode)
    self.verb_module.eval()
    self.role_module.eval()

    # Trainable copy of the verb module, refined using role information.
    self.updated_verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)

    # Fusion layers: project role features, then combine two feature vectors.
    self.role_maker = nn.Linear(mlp_hidden, mlp_hidden)
    self.real_comb_concat = nn.Linear(mlp_hidden * 2, mlp_hidden)
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    """Build the joint role/verb VQA model: a VGG feature extractor, a
    role top-down attention head, and a verb-question head.

    Args:
        encoder: project imsitu encoder (role/verb/label vocabularies).
        gpu_mode: GPU id (>= 0) or -1 for CPU; forwarded to sub-modules.
        embed_hidden: word-embedding width for the question embeddings.
        mlp_hidden: hidden width of the classifier/fusion layers.
    """
    super(BaseModel, self).__init__()

    # Standard ImageNet normalization and train/dev augmentation pipelines.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode

    # Dataset statistics pulled from the encoder.
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.verbq_word_count = len(self.encoder.verb_question_words)

    # Image feature backbone and pretrained verb module.
    self.conv = vgg16_modified()
    self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)

    # Role branch: question embedding (extra index is the padding token),
    # top-down attention, and the noun classifier.
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.roles = TopDown(self.vocab_size)
    self.role_classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                            self.vocab_size, 0.5)

    # Verb branch: mirrors the pretrained verb module's architecture.
    self.verb_vqa = model_verbq_0.TopDown(self.n_verbs)
    self.verb_q_emb = nn.Embedding(self.verbq_word_count + 1, embed_hidden,
                                   padding_idx=self.verbq_word_count)
    self.verb_last_class = nn.Linear(mlp_hidden * 8, self.n_verbs)
    self.verb_role_maker = nn.Linear(mlp_hidden, mlp_hidden)
    self.verb_real_comb_concat = nn.Linear(mlp_hidden * 2, mlp_hidden)

    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    """Build the model: frozen pretrained verb/role modules, a trainable
    verb module being updated, and a label-projection layer.

    Args:
        encoder: project imsitu encoder (vocabulary / verb bookkeeping).
        gpu_mode: GPU id (>= 0) or -1 for CPU; forwarded to sub-modules.
        embed_hidden: output width of the label projection layer.
        mlp_hidden: hidden width of the MLP feature space.
    """
    super(BaseModel, self).__init__()

    # Standard ImageNet normalization and train/dev augmentation pipelines.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    self.n_verbs = self.encoder.get_num_verbs()

    # Pretrained sub-modules switched to eval mode.
    # NOTE(review): eval() only fixes dropout/batch-norm behavior; it does
    # NOT stop gradients — confirm freezing is handled by the optimizer setup.
    self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
    self.role_module = model_roles_recqa_noself.BaseModel(self.encoder, self.gpu_mode)
    self.verb_module.eval()
    self.role_module.eval()

    # Projects mlp-width label features down to the embedding width.
    self.label_small = nn.Linear(mlp_hidden, embed_hidden)

    # Trainable copy of the verb module that gets updated during training.
    self.updating_verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)

    self.dropout = nn.Dropout(0.5)
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    """Build a verb-question VQA head on top of pretrained verb/role modules
    and a VGG image feature backbone.

    Args:
        encoder: project imsitu encoder (vocabulary / verb bookkeeping).
        gpu_mode: GPU id (>= 0) or -1 for CPU; forwarded to sub-modules.
        embed_hidden: word-embedding width for the verb-question embedding.
        mlp_hidden: hidden width of the classifier layers.
    """
    super(BaseModel, self).__init__()

    # Standard ImageNet normalization and train/dev augmentation pipelines.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    self.verbq_word_count = len(self.encoder.verb_q_words)
    self.n_verbs = self.encoder.get_num_verbs()

    # Image feature backbone.
    self.conv = vgg16_modified()

    # Pretrained sub-modules switched to eval mode.
    # NOTE(review): eval() only fixes dropout/batch-norm behavior; it does
    # NOT freeze parameters. A disabled requires_grad loop was removed here —
    # confirm the optimizer excludes these modules if they must stay frozen.
    self.verb_module = model_verbq_0.BaseModel(self.encoder, self.gpu_mode)
    self.role_module = model_roles_recqa_noself.BaseModel(self.encoder, self.gpu_mode)
    self.verb_module.eval()
    self.role_module.eval()

    # Verb-question head: top-down attention, question embedding (extra
    # index is the padding token), and the final verb classifier.
    self.verb_vqa = TopDown(self.n_verbs)
    self.verb_q_emb = nn.Embedding(self.verbq_word_count + 1, embed_hidden,
                                   padding_idx=self.verbq_word_count)
    self.last_class = nn.Linear(self.mlp_hidden * 8, self.n_verbs)
def main():
    """Entry point: parse CLI arguments, build the model and data loaders,
    then run training, dev evaluation, or test evaluation.

    Side effects: reads the imSitu json annotation files, creates
    ``args.output_dir`` if missing, and (in evaluate mode) writes
    prediction json files to the working directory.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid", default=-1,
                        help="put GPU id > -1 in GPU mode", type=int)
    parser.add_argument('--resume_training', action='store_true',
                        help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model', type=str, default='',
                        help='The model we resume')
    parser.add_argument('--verb_module', type=str, default='',
                        help='pretrained verb module')
    parser.add_argument('--role_module', type=str, default='',
                        help='pretrained role module')
    parser.add_argument('--train_role', action='store_true',
                        help='cnn fix, verb fix, role train from the scratch')
    parser.add_argument('--finetune_verb', action='store_true',
                        help='cnn fix, verb finetune, role train from the scratch')
    parser.add_argument('--finetune_cnn', action='store_true',
                        help='cnn finetune, verb finetune, role train from the scratch')
    parser.add_argument('--output_dir', type=str, default='./trained_models',
                        help='Location to output the model')
    parser.add_argument('--evaluate', action='store_true',
                        help='Only use the testing mode')
    parser.add_argument('--test', action='store_true',
                        help='Only use the testing mode')
    parser.add_argument('--dataset_folder', type=str, default='./imSitu',
                        help='Location of annotations')
    parser.add_argument('--imgset_dir', type=str, default='./resized_256',
                        help='Location of original images')
    parser.add_argument('--frcnn_feat_dir', type=str,
                        help='Location of output from detectron')
    # TODO: train role module separately with gt verbs
    args = parser.parse_args()

    # Hyper-parameters actually consumed below. Dead locals from the
    # original (batch_size, lr, lr_gamma, lr_step, weight_decay) removed;
    # loader batch sizes and optimizer LRs are set inline where used.
    lr_max = 5e-4
    clip_norm = 0.5
    n_epoch = 500
    n_worker = 3

    dataset_folder = args.dataset_folder
    imgset_folder = args.imgset_dir

    print('model spec :, top down att with role q ')

    # Load annotations with context managers so file handles are closed.
    with open(dataset_folder + "/updated_train_new.json") as f:
        train_set = json.load(f)
    with open("imsitu_data/imsitu_questions_prev.json") as f:
        imsitu_roleq = json.load(f)
    encoder = imsitu_encoder(train_set, imsitu_roleq)

    model = model_verbq_0.BaseModel(encoder, args.gpuid)

    # Group parameters so the CNN backbone can get its own learning rate.
    cnn_features, role_features = utils.group_features_noun(model)

    train_set = imsitu_loader_roleq_updated(imgset_folder, train_set, encoder,
                                            model.train_preprocess())
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=64,
                                               shuffle=True, num_workers=n_worker)

    with open(dataset_folder + "/dev.json") as f:
        dev_set = json.load(f)
    dev_set = imsitu_loader_roleq_updated(imgset_folder, dev_set, encoder,
                                          model.dev_preprocess())
    dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=64,
                                             shuffle=True, num_workers=n_worker)

    with open(dataset_folder + "/test.json") as f:
        test_set = json.load(f)
    test_set = imsitu_loader_roleq_updated(imgset_folder, test_set, encoder,
                                           model.dev_preprocess())
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=64,
                                              shuffle=True, num_workers=n_worker)

    # Small-batch loader over dev, used for periodic checks during training.
    with open(dataset_folder + "/dev.json") as f:
        traindev_set = json.load(f)
    traindev_set = imsitu_loader_roleq_updated(imgset_folder, traindev_set,
                                               encoder, model.dev_preprocess())
    traindev_loader = torch.utils.data.DataLoader(traindev_set, batch_size=8,
                                                  shuffle=True, num_workers=n_worker)

    model_name = 'train_full'
    # Race-free replacement for the exists()+mkdir() pair.
    os.makedirs(args.output_dir, exist_ok=True)

    torch.manual_seed(1234)
    if args.gpuid >= 0:
        model.cuda()
        torch.cuda.manual_seed(1234)
        torch.backends.cudnn.deterministic = True

    # Lower LR for the CNN backbone; default group LR for everything else.
    optimizer = torch.optim.Adam(
        [{'params': cnn_features, 'lr': 5e-5},
         {'params': role_features}],
        lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    def summarise(top1, top5):
        """Average the eight benchmark scores reported by the two scorers."""
        top1_avg = top1.get_average_results()
        top5_avg = top5.get_average_results()
        avg_score = (top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"]
                     + top5_avg["verb"] + top5_avg["value"] + top5_avg["value-all"]
                     + top5_avg["value*"] + top5_avg["value-all*"]) / 8
        return avg_score, top1_avg, top5_avg

    if args.evaluate:
        top1, top5, val_loss = eval(model, dev_loader, encoder, args.gpuid,
                                    write_to_file=True)
        avg_score, top1_avg, top5_avg = summarise(top1, top5)
        print('Dev average :{:.2f} {} {}'.format(
            avg_score * 100,
            utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))

        # Dump per-role prediction details to json for offline analysis.
        role_dict = top1.role_dict
        fail_val_all = top1.value_all_dict
        pass_val_dict = top1.vall_all_correct
        with open('role_pred_data.json', 'w') as fp:
            json.dump(role_dict, fp, indent=4)
        with open('fail_val_all.json', 'w') as fp:
            json.dump(fail_val_all, fp, indent=4)
        with open('pass_val_all.json', 'w') as fp:
            json.dump(pass_val_dict, fp, indent=4)
        print('Writing predictions to file completed !')

    elif args.test:
        top1, top5, val_loss = eval(model, test_loader, encoder, args.gpuid,
                                    write_to_file=True)
        avg_score, top1_avg, top5_avg = summarise(top1, top5)
        print('Test average :{:.2f} {} {}'.format(
            avg_score * 100,
            utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))

    else:
        print('Model training started!')
        train(model, train_loader, dev_loader, traindev_loader, optimizer,
              scheduler, n_epoch, args.output_dir, encoder, args.gpuid,
              clip_norm, lr_max, model_name, args)