def eval(rank, args, shared_model):
    """Evaluation worker: each epoch, snapshot the shared model, score it on
    the eval split, and checkpoint when validation accuracy improves.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace (gpus, vocab_json, input_type, ...).
        shared_model: shared-memory model updated concurrently by train workers.
    """
    print('eval start...')
    # args.gpus.index(args.gpus[k]) maps the chosen GPU id back to its
    # position in args.gpus — effectively device rank % len(args.gpus).
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'vocab': args.vocab_json,
        'batch_size': 1,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
    }
    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    t, epoch, best_eval_acc = 0, 0, 0

    while epoch < int(args.max_epochs):
        # Pull the latest weights published by the training workers.
        model.load_state_dict(shared_model.state_dict())
        model.eval()

        # Fresh metrics object per epoch so stats do not carry over.
        metrics = VqaMetric(
            info={'split': args.eval_split},
            metric_names=[
                'loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'
            ],
            log_json=args.output_log_path)

        if args.input_type == 'ques':
            for batch in eval_loader:
                t += 1
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                print(scores)
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] — indexing a 0-dim
                # tensor fails on PyTorch >= 0.5, and other blocks in this
                # file already use .item().
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                print(metrics.get_stat_string(mode=0))
        elif args.input_type == 'ques,image':
            done = False
            # Env pruning disabled here: treat all environments as loaded.
            #all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
            all_envs_loaded = True
            while done == False:
                for batch in eval_loader:
                    t += 1
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.item(), accuracy, ranks, 1.0 / ranks])
                    print(metrics.get_stat_string(mode=0))
                if all_envs_loaded == False:
                    eval_loader.dataset._load_envs()
                    if len(eval_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True

        epoch += 1

        # checkpoint if best val accuracy
        if metrics.metrics[1][0] > best_eval_acc:
            best_eval_acc = metrics.metrics[1][0]
            if epoch % args.eval_every == 0 and args.log == True:
                metrics.dump_log()
                model_state = get_state(model)
                if args.checkpoint_path != False:
                    # NOTE(review): 'checkpoint' here is presumably a
                    # module-level dict loaded from args.checkpoint_path
                    # elsewhere in the file — verify.
                    ad = checkpoint['args']
                else:
                    ad = args.__dict__
                # FIX: use a distinct local name — assigning to 'checkpoint'
                # would make it local to this function and turn the read
                # above into an UnboundLocalError.
                ckpt = {'args': ad, 'state': model_state, 'epoch': epoch}
                checkpoint_path = '%s/epoch_%d_accuracy_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(ckpt, checkpoint_path)

        print('[best_eval_accuracy:%.04f]' % best_eval_acc)
# --- script-level setup (first variant) ---
# Nest checkpoints and logs under a per-run directory: <time_id>_<identifier>.
args.checkpoint_dir = os.path.join(args.checkpoint_dir,
                                   args.time_id + '_' + args.identifier)
args.log_dir = os.path.join(args.log_dir, args.time_id + '_' + args.identifier)
print(args.__dict__)
# NOTE(review): log_dir is only created when checkpoint_dir is missing; if
# checkpoint_dir already exists but log_dir does not, log_dir is never made.
if not os.path.exists(args.checkpoint_dir) and args.log == True:
    os.makedirs(args.checkpoint_dir)
    os.makedirs(args.log_dir)
# Build the model that will live in shared memory for all worker processes.
if args.input_type == 'ques':
    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    shared_model = VqaLstmModel(**model_kwargs)
elif args.input_type == 'ques,image':
    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    print('[CREATE] SHARED MODEL')
    shared_model = VqaLstmCnnAttentionModel(**model_kwargs)
    print('[FINISH] SHARED MODEL')
if args.checkpoint_path != False:
    # 'checkpoint' is presumably loaded from args.checkpoint_path earlier in
    # the file (not visible in this chunk) — TODO confirm.
    print('Loading params from checkpoint: %s' % args.checkpoint_path)
    shared_model.load_state_dict(checkpoint['state'])
# Move parameters to shared memory so spawned workers see in-place updates.
shared_model.share_memory()
# NOTE(review): the body of this branch appears truncated in this chunk.
if args.mode == 'eval':
def train(rank, args, shared_model):
    """Training worker (A3C/Hogwild style): pulls weights from the shared
    model, computes gradients locally, and pushes them back via
    ensure_shared_grads before each optimizer step.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model whose parameters the optimizer updates.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    # The optimizer owns the SHARED model's parameters; the local model only
    # produces gradients that are copied over before each step.
    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    metrics = VqaMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    print('[TRAIN_LOADER] start')
    train_loader = EqaDataLoader(**train_loader_kwargs)
    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0

    while epoch < int(args.max_epochs):
        if args.input_type == 'ques':
            for batch in train_loader:
                t += 1
                # Resync with the freshest shared weights every step.
                model.load_state_dict(shared_model.state_dict())
                model.train()
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # zero grad
                optim.zero_grad()
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] — consistent with the
                # 'ques,image' branch below and required on PyTorch >= 0.5.
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                # backprop and update
                loss.backward()
                ensure_shared_grads(model.cpu(), shared_model)
                optim.step()
                if t % args.print_every == 0:
                    print(metrics.get_stat_string())
                    if args.log == True:
                        metrics.dump_log()
        elif args.input_type == 'ques,image':
            done = False
            # Env pruning disabled here: treat all environments as loaded.
            #all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()
            all_envs_loaded = True
            while done == False:
                for batch in train_loader:
                    t += 1
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    # Keep the frozen CNN feature extractor in eval mode.
                    model.cnn.eval()
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    print('--- images dim {}'.format(images.size()))
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    print('--- att_probs: {}.'.format(att_probs))
                    # zero grad
                    optim.zero_grad()
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                    # backprop and update
                    loss.backward()
                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()
                    if t % args.print_every == 0:
                        print(metrics.get_stat_string())
                        if args.log == True:
                            metrics.dump_log()
                if all_envs_loaded == False:
                    print('[CHECK][Cache:%d][Total:%d]' %
                          (len(train_loader.dataset.img_data_cache),
                           len(train_loader.dataset.env_list)))
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True
        epoch += 1
def train(rank, args, shared_model, use_vision, use_language):
    """Training worker with ablation switches: when use_language is False the
    question is replaced by <START><END> padding; when use_vision is False the
    image features are zeroed. Writes the finished epoch number to
    '<identifier>.shared_epoch.tmp' so the eval worker can pace itself.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model updated via ensure_shared_grads.
        use_vision: if False, zero out all image features.
        use_language: if False, strip questions down to <START><END> tokens.
    """
    gpu_idx = args.gpus.index(args.gpus[rank % len(args.gpus)])
    torch.cuda.set_device(gpu_idx)
    print("train gpu:" + str(gpu_idx) + " assigned")

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    # Optimizer updates the SHARED parameters; gradients come from the local copy.
    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    metrics = VqaMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)
    if args.input_type == 'ques,image':
        train_loader.dataset._load_envs(start_idx=0, in_order=True)
    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0

    def _blank_questions(questions):
        # Replace every question with <NULL> padding plus back-to-back
        # <START><END> tokens (the language-ablation input).
        questions = torch.zeros_like(questions)
        questions.fill_(model_kwargs['vocab']['questionTokenToIdx']['<NULL>'])
        questions[:, 0] = model_kwargs['vocab']['questionTokenToIdx']['<START>']
        questions[:, 1] = model_kwargs['vocab']['questionTokenToIdx']['<END>']
        return questions

    while epoch < int(args.max_epochs):
        print("train gpu:" + str(gpu_idx) + " running epoch " + str(epoch))

        if args.input_type == 'ques':
            for batch in train_loader:
                t += 1
                model.load_state_dict(shared_model.state_dict())
                model.train()
                model.cuda()
                idx, questions, answers = batch
                # If not using language, replace each question with a start
                # and end token back to back.
                if not use_language:
                    questions = _blank_questions(questions)
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # zero grad
                optim.zero_grad()
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] (fails on PyTorch >= 0.5).
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                # backprop and update
                loss.backward()
                ensure_shared_grads(model.cpu(), shared_model)
                optim.step()
                if t % args.print_every == 0:
                    print(metrics.get_stat_string())
                    if args.to_log == 1:
                        metrics.dump_log()
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in train_loader:
                    t += 1
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    # If not using language, replace each question with a
                    # start and end token back to back.
                    if not use_language:
                        questions = _blank_questions(questions)
                    # If not using vision, replace all image feature data
                    # with zeros.
                    if not use_vision:
                        images = torch.zeros_like(images)
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # zero grad
                    optim.zero_grad()
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.item(), accuracy, ranks, 1.0 / ranks])
                    # backprop and update
                    loss.backward()
                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()
                    if t % args.print_every == 0:
                        print(metrics.get_stat_string())
                        if args.to_log == 1:
                            metrics.dump_log()
                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True

        # Set shared epoch when it finishes on the training side
        with open(args.identifier + '.shared_epoch.tmp', 'w') as f:
            f.write(str(epoch))
        epoch += 1
def eval(rank, args, shared_model, use_vision, use_language):
    """Evaluation worker paced by the train worker through a shared epoch file.

    Applies the same use_vision/use_language ablations as train(); after each
    pass it blocks until '<identifier>.shared_epoch.tmp' reports an epoch
    beyond its own, then checkpoints on improved validation accuracy.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model updated by the train worker.
        use_vision: if False, zero out all image features.
        use_language: if False, strip questions down to <START><END> tokens.
    """
    gpu_idx = args.gpus.index(args.gpus[rank % len(args.gpus)])
    torch.cuda.set_device(gpu_idx)
    print("eval gpu:" + str(gpu_idx) + " assigned")

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': 1,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }
    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    t, epoch, best_eval_acc = 0, 0, 0
    print(epoch, args.max_epochs)  # DEBUG

    def _blank_questions(questions):
        # Language-ablation input: <NULL> padding plus <START><END> tokens.
        questions = torch.zeros_like(questions)
        questions.fill_(model_kwargs['vocab']['questionTokenToIdx']['<NULL>'])
        questions[:, 0] = model_kwargs['vocab']['questionTokenToIdx']['<START>']
        questions[:, 1] = model_kwargs['vocab']['questionTokenToIdx']['<END>']
        return questions

    while epoch < int(args.max_epochs):
        print("eval gpu:" + str(gpu_idx) + " running epoch " + str(epoch))
        model.load_state_dict(shared_model.state_dict())
        model.eval()

        metrics = VqaMetric(
            info={'split': args.eval_split},
            metric_names=[
                'loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'
            ],
            log_json=args.output_log_path)

        if args.input_type == 'ques':
            for batch in eval_loader:
                t += 1
                model.cuda()
                idx, questions, answers = batch
                # If not using language, replace each question with a start
                # and end token back to back.
                if not use_language:
                    questions = _blank_questions(questions)
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] (fails on PyTorch >= 0.5).
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                print(metrics.get_stat_string(mode=0))
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in eval_loader:
                    t += 1
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    # If not using language, replace each question with a
                    # start and end token back to back.
                    if not use_language:
                        questions = _blank_questions(questions)
                    # If not using vision, replace all image feature data
                    # with zeros.
                    if not use_vision:
                        images = torch.zeros_like(images)
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.item(), accuracy, ranks, 1.0 / ranks])
                    print(metrics.get_stat_string(mode=0))
                if all_envs_loaded == False:
                    eval_loader.dataset._load_envs()
                    if len(eval_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True

        # Block until the train worker reports it has moved past our epoch.
        read_epoch = None
        while read_epoch is None or epoch >= read_epoch:
            try:
                with open(args.identifier + '.shared_epoch.tmp', 'r') as f:
                    read_epoch = int(f.read().strip())
            except (IOError, ValueError):
                pass
            # FIX: sleep whenever we are still waiting (previously the sleep
            # only ran while the file was unreadable, so once the file existed
            # with a stale epoch this loop hot-spun re-reading it).
            if read_epoch is None or epoch >= read_epoch:
                # TODO: since merger, this no longer works (hanging); might need to undo changes re: threading that we
                # TODO: made or debug them.
                print("eval gpu:" + str(gpu_idx) +
                      " waiting for train thread to finish epoch " + str(epoch))
                time.sleep(
                    10
                )  # sleep until the training thread finishes another iteration
        epoch = read_epoch

        # checkpoint if best val accuracy
        if metrics.metrics[1][0] > best_eval_acc:
            best_eval_acc = metrics.metrics[1][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                metrics.dump_log()
                model_state = get_state(model)
                if args.checkpoint_path != False:
                    # NOTE(review): 'checkpoint' is presumably a module-level
                    # dict loaded from args.checkpoint_path elsewhere — verify.
                    ad = checkpoint['args']
                else:
                    ad = args.__dict__
                # FIX: distinct local name; assigning to 'checkpoint' would
                # shadow the global and break the read above
                # (UnboundLocalError).
                ckpt = {'args': ad, 'state': model_state, 'epoch': epoch}
                checkpoint_path = '%s/epoch_%d_accuracy_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(ckpt, checkpoint_path)

        print('[best_eval_accuracy:%.04f]' % best_eval_acc)
def train(rank, args, shared_model):
    """Training worker that also runs an inline evaluation pass each epoch
    (for 'ques,image') and checkpoints on improved validation accuracy.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model updated via ensure_shared_grads.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }
    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': 1,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    metrics = VqaMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    train_loader = EqaDataLoader(**train_loader_kwargs)
    if args.input_type == 'ques,image':
        train_loader.dataset._load_envs(start_idx=0, in_order=True)
    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch, best_eval_acc = 0, 0, 0

    while epoch < int(args.max_epochs):
        if args.input_type == 'ques':
            for batch in train_loader:
                t += 1
                model.load_state_dict(shared_model.state_dict())
                model.train()
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # zero grad
                optim.zero_grad()
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] (fails on PyTorch >= 0.5).
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                # backprop and update
                loss.backward()
                ensure_shared_grads(model.cpu(), shared_model)
                optim.step()
                if t % args.print_every == 0:
                    print(metrics.get_stat_string())
                    if args.to_log == 1:
                        metrics.dump_log()
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in train_loader:
                    t += 1
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # zero grad
                    optim.zero_grad()
                    # NOTE: training metrics are deliberately skipped in this
                    # branch; the inline eval pass below reports them instead.
                    # backprop and update
                    loss.backward()
                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()
                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True

            # Inline evaluation pass over the eval split.
            env_done = False
            env_all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded(
            )
            while env_done == False:
                # FIX: removed the dead _loss/_accuracy/_ranks accumulators —
                # they were never read, and torch.cat([_loss, loss.data[0]])
                # would raise a TypeError on the second batch (scalars are not
                # concatenable tensors).
                for batch in eval_loader:
                    t += 1
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                    print(metrics.get_stat_string(mode=0))
                if env_all_envs_loaded == False:
                    eval_loader.dataset._load_envs()
                    if len(eval_loader.dataset.pruned_env_set) == 0:
                        env_done = True
                else:
                    env_done = True

        epoch += 1

        # checkpoint if best val accuracy
        if metrics.metrics[1][0] > best_eval_acc:
            best_eval_acc = metrics.metrics[1][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                metrics.dump_log()
                model_state = get_state(model)
                if args.checkpoint_path != False:
                    # NOTE(review): 'checkpoint' is presumably a module-level
                    # dict loaded from args.checkpoint_path elsewhere — verify.
                    ad = checkpoint['args']
                else:
                    ad = args.__dict__
                # FIX: distinct local name to avoid shadowing the global
                # 'checkpoint' read above (UnboundLocalError otherwise).
                ckpt = {'args': ad, 'state': model_state, 'epoch': epoch}
                checkpoint_path = '%s/epoch_%d_accuracy_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(ckpt, checkpoint_path)

        print('[best_eval_accuracy:%.04f]' % best_eval_acc)
def train(rank, args, shared_model):
    """Co-teaching training worker: two peer models (model, model1) exchange
    small-loss samples via loss_coteaching, with label noise injected at rate
    args.noise_rate and a forget-rate schedule ramped over args.num_gradual
    epochs.

    Args:
        rank: worker index; selects the GPU.
        args: parsed command-line namespace (noise_rate, forget_rate, ...).
        shared_model: shared-memory model updated via ensure_shared_grads.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    # Forget-rate schedule: ramp linearly from 0 to forget_rate**exponent over
    # the first num_gradual epochs, then hold at forget_rate.
    if args.forget_rate is None:
        forget_rate = args.noise_rate
    else:
        forget_rate = args.forget_rate
    rate_schedule = np.ones(args.max_epochs) * forget_rate
    rate_schedule[:args.num_gradual] = np.linspace(
        0, forget_rate**args.exponent, args.num_gradual)

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        # Second peer model for co-teaching.
        model1 = VqaLstmCnnAttentionModel(**model_kwargs)

    # FIX: restore the lossFn definition — it had been commented out but is
    # still called in the 'ques' branch below (NameError without it).
    lossFn = torch.nn.CrossEntropyLoss().cuda()

    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)
    # NOTE(review): model1 is only created in the 'ques,image' branch; with
    # input_type == 'ques' this line would raise NameError — verify intended
    # usage is 'ques,image' only.
    optim1 = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model1.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        # NOTE(review): other variants in this file read args.to_cache here;
        # confirm args.cache is the right attribute for this entry point.
        'to_cache': args.cache
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    metrics = VqaMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)
    if args.input_type == 'ques,image':
        train_loader.dataset._load_envs(start_idx=0, in_order=True)
    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0

    while epoch < int(args.max_epochs):
        if args.input_type == 'ques':
            for batch in train_loader:
                t += 1
                model.load_state_dict(shared_model.state_dict())
                model.train()
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # zero grad
                optim.zero_grad()
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                metrics.update(
                    [loss.data.item(), accuracy, ranks, 1.0 / ranks])
                # backprop and update
                loss.backward()
                ensure_shared_grads(model.cpu(), shared_model)
                optim.step()
                if t % args.print_every == 0:
                    print(metrics.get_stat_string())
                    if args.log == True:
                        metrics.dump_log()
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in train_loader:
                    t += 1
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cnn.eval()
                    model.cuda()
                    # model1.load_state_dict(shared_model1.state_dict())
                    model1.train()
                    model1.cnn.eval()
                    model1.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    # Inject synthetic label noise: with prob. noise_rate,
                    # shift the answer by a random offset, resampling until
                    # it stays within the valid answer range [0, 70].
                    for i in range(len(answers)):
                        if random.random() < args.noise_rate:
                            tempt = random.randint(-10, 10)
                            answers[i] = answers[i] + tempt
                            while (answers[i] < 0 or answers[i] > 70):
                                answers[i] = answers[i] - tempt
                                tempt = random.randint(-10, 10)
                                answers[i] = answers[i] + tempt
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    scores1, att_probs1 = model1(images_var, questions_var)
                    # Each peer trains on the other's small-loss samples.
                    loss, loss1 = loss_coteaching(scores, scores1, answers_var,
                                                  rate_schedule[epoch])
                    # zero grad
                    optim.zero_grad()
                    optim1.zero_grad()
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.data.item(), accuracy, ranks, 1.0 / ranks])
                    # backprop and update
                    loss.backward()
                    loss1.backward()
                    ensure_shared_grads(model.cpu(), shared_model)
                    # ensure_shared_grads(model1.cpu(), shared_model1)
                    optim.step()
                    optim1.step()
                    if t % args.print_every == 0:
                        print(metrics.get_stat_string())
                        if args.log == True:
                            metrics.dump_log()
                if all_envs_loaded == False:
                    print('[CHECK][Cache:%d][Total:%d]' %
                          (len(train_loader.dataset.img_data_cache),
                           len(train_loader.dataset.env_list)))
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True
        epoch += 1
def eval(rank, args, shared_model):
    """Evaluation worker: each epoch, snapshot the shared model, score the
    eval split, and checkpoint when validation accuracy improves.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model updated by the train workers.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': 1,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }
    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    t, epoch, best_eval_acc = 0, 0, 0

    while epoch < int(args.max_epochs):
        model.load_state_dict(shared_model.state_dict())
        model.eval()

        metrics = VqaMetric(
            info={'split': args.eval_split},
            metric_names=[
                'loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'
            ],
            log_json=args.output_log_path)

        if args.input_type == 'ques':
            for batch in eval_loader:
                t += 1
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] (fails on PyTorch >= 0.5).
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                print(metrics.get_stat_string(mode=0))
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in eval_loader:
                    t += 1
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.item(), accuracy, ranks, 1.0 / ranks])
                    print(metrics.get_stat_string(mode=0))
                if all_envs_loaded == False:
                    eval_loader.dataset._load_envs()
                    if len(eval_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True

        epoch += 1

        # checkpoint if best val accuracy
        if metrics.metrics[1][0] > best_eval_acc:
            best_eval_acc = metrics.metrics[1][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                metrics.dump_log()
                model_state = get_state(model)
                if args.checkpoint_path != False:
                    # NOTE(review): 'checkpoint' is presumably a module-level
                    # dict loaded from args.checkpoint_path elsewhere — verify.
                    ad = checkpoint['args']
                else:
                    ad = args.__dict__
                # FIX: distinct local name to avoid shadowing the global
                # 'checkpoint' read above (UnboundLocalError otherwise).
                ckpt = {'args': ad, 'state': model_state, 'epoch': epoch}
                checkpoint_path = '%s/epoch_%d_accuracy_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(ckpt, checkpoint_path)

        print('[best_eval_accuracy:%.04f]' % best_eval_acc)
def train(rank, args, shared_model):
    """Training worker: syncs with the shared model each step, computes
    gradients locally, and pushes them back via ensure_shared_grads.

    Args:
        rank: worker index; selects the GPU and names the per-worker log file.
        args: parsed command-line namespace.
        shared_model: shared-memory model whose parameters the optimizer updates.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.input_type == 'ques':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmModel(**model_kwargs)
    elif args.input_type == 'ques,image':
        model_kwargs = {'vocab': load_vocab(args.vocab_json)}
        model = VqaLstmCnnAttentionModel(**model_kwargs)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    metrics = VqaMetric(
        info={'split': 'train', 'thread': rank},
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)
    if args.input_type == 'ques,image':
        train_loader.dataset._load_envs(start_idx=0, in_order=True)
    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0

    while epoch < int(args.max_epochs):
        if args.input_type == 'ques':
            for batch in train_loader:
                t += 1
                model.load_state_dict(shared_model.state_dict())
                model.train()
                model.cuda()
                idx, questions, answers = batch
                questions_var = Variable(questions.cuda())
                answers_var = Variable(answers.cuda())
                scores = model(questions_var)
                loss = lossFn(scores, answers_var)
                # zero grad
                optim.zero_grad()
                # update metrics
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)
                # FIX: loss.item(), not loss.data[0] (fails on PyTorch >= 0.5).
                metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks])
                # backprop and update
                loss.backward()
                ensure_shared_grads(model.cpu(), shared_model)
                optim.step()
                if t % args.print_every == 0:
                    print(metrics.get_stat_string())
                    if args.to_log == 1:
                        metrics.dump_log()
        elif args.input_type == 'ques,image':
            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()
            while done == False:
                for batch in train_loader:
                    t += 1
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cuda()
                    idx, questions, answers, images, _, _, _ = batch
                    questions_var = Variable(questions.cuda())
                    answers_var = Variable(answers.cuda())
                    images_var = Variable(images.cuda())
                    scores, att_probs = model(images_var, questions_var)
                    loss = lossFn(scores, answers_var)
                    # zero grad
                    optim.zero_grad()
                    # update metrics
                    accuracy, ranks = metrics.compute_ranks(
                        scores.data.cpu(), answers)
                    metrics.update(
                        [loss.item(), accuracy, ranks, 1.0 / ranks])
                    # backprop and update
                    loss.backward()
                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()
                    if t % args.print_every == 0:
                        print(metrics.get_stat_string())
                        if args.to_log == 1:
                            metrics.dump_log()
                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                else:
                    done = True
        epoch += 1
# --- script-level setup (second variant) ---
# Nest checkpoints and logs under a per-run directory: <time_id>_<identifier>.
args.checkpoint_dir = os.path.join(args.checkpoint_dir,
                                   args.time_id + '_' + args.identifier)
args.log_dir = os.path.join(args.log_dir, args.time_id + '_' + args.identifier)
print(args.__dict__)
# NOTE(review): log_dir is only created when checkpoint_dir is missing; if
# checkpoint_dir already exists but log_dir does not, log_dir is never made.
if not os.path.exists(args.checkpoint_dir) and args.to_log == 1:
    os.makedirs(args.checkpoint_dir)
    os.makedirs(args.log_dir)
# Build the model that will live in shared memory for all worker processes.
if args.input_type == 'ques':
    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    shared_model = VqaLstmModel(**model_kwargs)
elif args.input_type == 'ques,image':
    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    shared_model = VqaLstmCnnAttentionModel(**model_kwargs)
if args.checkpoint_path != False:
    # 'checkpoint' is presumably loaded from args.checkpoint_path earlier in
    # the file (not visible in this chunk) — TODO confirm.
    print('Loading params from checkpoint: %s' % args.checkpoint_path)
    shared_model.load_state_dict(checkpoint['state'])
# Move parameters to shared memory so spawned workers see in-place updates.
shared_model.share_memory()
# Rank 0 runs evaluation in-process when the script is launched in eval mode.
if args.mode == 'eval':
    eval(0, args, shared_model)