def train(rank, args, shared_nav_model, shared_ans_model):
    """Fine-tune the shared navigation model with policy-gradient rollouts.

    For every batch (batch size is fixed to 1 in the loader kwargs) a local
    copy of the planner/controller model is refreshed from
    ``shared_nav_model``, fed the ground-truth action prefix returned by
    ``get_hierarchical_features_till_spawn``, and then rolled out in the
    environment by sampling actions.  Rewards returned by ``h3d.step`` plus
    a final answering reward are turned into discounted returns (factor
    0.99) and used to weight the stored log-probabilities.  Gradients are
    copied to the shared model with ``ensure_shared_grads`` and applied by
    an SGD optimizer built over the shared model's parameters.

    Args:
        rank: Worker index; selects the GPU and names the log files.
        args: Parsed options (reads ``gpus``, ``model_type``, ``vocab_json``,
            ``learning_rate``, ``max_epochs``, ``max_episode_length``,
            ``to_log``, ``to_cache`` and the data-path fields used below).
            ``output_nav_log_path`` / ``output_ans_log_path`` are written
            onto it.
        shared_nav_model: Navigation model whose parameters are optimised.
        shared_ans_model: Answering model; only its weights are read.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.model_type == 'pacman':
        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        nav_model = NavPlannerControllerModel(**model_kwargs)
    else:
        # Only the 'pacman' model type is handled by this routine.
        exit()

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    ans_model = VqaLstmCnnAttentionModel(**model_kwargs)

    # The optimizer steps the *shared* parameters, not the local copy.
    optim = torch.optim.SGD(
        filter(lambda p: p.requires_grad, shared_nav_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    args.output_nav_log_path = os.path.join(args.log_dir,
                                            'nav_train_' + str(rank) + '.json')
    args.output_ans_log_path = os.path.join(args.log_dir,
                                            'ans_train_' + str(rank) + '.json')

    nav_model.load_state_dict(shared_nav_model.state_dict())
    nav_model.cuda()

    # The answerer is only used for inference (reward computation).
    ans_model.load_state_dict(shared_ans_model.state_dict())
    ans_model.eval()
    ans_model.cuda()

    nav_metrics = NavMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=[
            'planner_loss', 'controller_loss', 'reward', 'episode_length'
        ],
        log_json=args.output_nav_log_path)

    vqa_metrics = VqaMetric(
        info={
            'split': 'train',
            'thread': rank
        },
        metric_names=['accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_ans_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)

    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0
    p_losses, c_losses, reward_list, episode_length_list = [], [], [], []

    # Seed the metrics so the reward baseline read below exists before the
    # first real update.
    nav_metrics.update([10.0, 10.0, 0, 100])

    # Fraction of the ground-truth action sequence passed as the spawn
    # argument below; raised in steps of 0.1 once the reward statistic
    # exceeds 0.35.
    mult = 0.1

    while epoch < int(args.max_epochs):

        if 'pacman' in args.model_type:

            # NOTE(review): these criteria are created but never used below.
            planner_lossFn = MaskedNLLCriterion().cuda()
            controller_lossFn = MaskedNLLCriterion().cuda()

            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()

            while done == False:

                for batch in train_loader:

                    # Pull the latest shared weights before each rollout.
                    nav_model.load_state_dict(shared_nav_model.state_dict())
                    nav_model.eval()
                    nav_model.cuda()

                    idx, question, answer, actions, action_length = batch
                    metrics_slug = {}

                    h3d = train_loader.dataset.episode_house

                    # evaluate at multiple initializations
                    # for i in [10, 30, 50]:

                    t += 1

                    question_var = Variable(question.cuda())

                    controller_step = False
                    planner_hidden = nav_model.planner_nav_rnn.init_hidden(1)

                    # forward through planner till spawn
                    planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = train_loader.dataset.get_hierarchical_features_till_spawn(
                        actions[0, :action_length[0] + 1].numpy(),
                        max(3, int(mult * action_length[0])))

                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_img_feats_var = Variable(planner_img_feats.cuda())

                    # Warm up the planner hidden state on the ground-truth
                    # prefix, one step at a time.
                    for step in range(planner_actions_in.size(0)):

                        planner_scores, planner_hidden = nav_model.planner_step(
                            question_var,
                            planner_img_feats_var[step].view(1, 1, 3200),
                            planner_actions_in_var[step].view(1, 1),
                            planner_hidden)

                    if controller_step == True:

                        # The prefix ended mid-controller: ask the controller
                        # (greedily) whether to keep repeating the action.
                        controller_img_feat_var = Variable(
                            controller_img_feat.cuda())
                        controller_action_in_var = Variable(
                            torch.LongTensor(1, 1).fill_(
                                int(controller_action_in)).cuda())

                        controller_scores = nav_model.controller_step(
                            controller_img_feat_var.view(1, 1, 3200),
                            controller_action_in_var.view(1, 1),
                            planner_hidden[0])

                        prob = F.softmax(controller_scores, dim=1)
                        controller_action = int(
                            prob.max(1)[1].data.cpu().numpy()[0])

                        if controller_action == 1:
                            controller_step = True
                        else:
                            controller_step = False

                        action = int(controller_action_in)
                        # Planner input uses the action index shifted by one.
                        action_in = torch.LongTensor(1, 1).fill_(action + 1).cuda()

                    else:

                        prob = F.softmax(planner_scores, dim=1)
                        action = int(prob.max(1)[1].data.cpu().numpy()[0])

                        action_in = torch.LongTensor(1, 1).fill_(action + 1).cuda()

                    h3d.env.reset(x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                    init_dist_to_target = h3d.get_dist_to_target(
                        h3d.env.cam.pos)
                    if init_dist_to_target < 0:  # unreachable
                        # invalids.append([idx[0], i])
                        continue

                    episode_length = 0
                    episode_done = True
                    controller_action_counter = 0

                    dists_to_target, pos_queue = [init_dist_to_target
                                                  ], [init_pos]

                    rewards, planner_actions, planner_log_probs, controller_actions, controller_log_probs = [], [], [], [], []

                    # Action 3 ends the rollout (it is also what triggers the
                    # answerer below), so only roll out for other actions.
                    if action != 3:

                        # take the first step
                        img, rwd, episode_done = h3d.step(action, step_reward=True)
                        img = torch.from_numpy(img.transpose(
                            2, 0, 1)).float() / 255.0
                        img_feat_var = train_loader.dataset.cnn(
                            Variable(img.view(1, 3, 224, 224).cuda())).view(1, 1, 3200)

                        for step in range(args.max_episode_length):

                            episode_length += 1

                            if controller_step == False:
                                # Planner turn: sample a new action and keep
                                # its log-probability for the policy gradient.
                                planner_scores, planner_hidden = nav_model.planner_step(
                                    question_var, img_feat_var,
                                    Variable(action_in), planner_hidden)

                                planner_prob = F.softmax(planner_scores, dim=1)
                                planner_log_prob = F.log_softmax(
                                    planner_scores, dim=1)

                                action = planner_prob.multinomial().data
                                planner_log_prob = planner_log_prob.gather(
                                    1, Variable(action))

                                planner_log_probs.append(
                                    planner_log_prob.cpu())

                                action = int(action.cpu().numpy()[0, 0])
                                planner_actions.append(action)

                            img, rwd, episode_done = h3d.step(action, step_reward=True)

                            episode_done = episode_done or episode_length >= args.max_episode_length

                            rewards.append(rwd)

                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = train_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224, 224).cuda())).view(
                                    1, 1, 3200)

                            dists_to_target.append(
                                h3d.get_dist_to_target(h3d.env.cam.pos))
                            pos_queue.append([
                                h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                h3d.env.cam.pos.z, h3d.env.cam.yaw
                            ])

                            if episode_done == True:
                                break

                            # query controller to continue or not
                            controller_action_in = Variable(
                                torch.LongTensor(1, 1).fill_(action).cuda())
                            controller_scores = nav_model.controller_step(
                                img_feat_var, controller_action_in,
                                planner_hidden[0])

                            controller_prob = F.softmax(controller_scores, dim=1)
                            controller_log_prob = F.log_softmax(
                                controller_scores, dim=1)

                            controller_action = controller_prob.multinomial(
                            ).data

                            # The controller may repeat the same action at
                            # most 4 times in a row; otherwise control is
                            # forced back to the planner (action 0).
                            if int(controller_action[0]
                                   ) == 1 and controller_action_counter < 4:
                                controller_action_counter += 1
                                controller_step = True
                            else:
                                controller_action_counter = 0
                                controller_step = False
                                controller_action.fill_(0)

                            controller_log_prob = controller_log_prob.gather(
                                1, Variable(controller_action))
                            controller_log_probs.append(
                                controller_log_prob.cpu())

                            controller_action = int(
                                controller_action.cpu().numpy()[0, 0])
                            controller_actions.append(controller_action)
                            action_in = torch.LongTensor(1, 1).fill_(action + 1).cuda()

                    # run answerer here
                    ans_acc = [0]
                    if action == 3:
                        # Pad with the dataset's stored positions so that five
                        # frames are always available.
                        if len(pos_queue) < 5:
                            pos_queue = train_loader.dataset.episode_pos_queue[
                                len(pos_queue) - 5:] + pos_queue
                        images = train_loader.dataset.get_frames(
                            h3d, pos_queue[-5:], preprocess=True)
                        images_var = Variable(
                            torch.from_numpy(images).cuda()).view(
                                1, 5, 3, 224, 224)
                        scores, att_probs = ans_model(images_var, question_var)
                        ans_acc, ans_rank = vqa_metrics.compute_ranks(
                            scores.data.cpu(), answer)
                        vqa_metrics.update([ans_acc, ans_rank, 1.0 / ans_rank])

                    # Terminal reward: zero unless the answerer ran.
                    rewards.append(h3d.success_reward * ans_acc[0])

                    R = torch.zeros(1, 1)

                    planner_loss = 0
                    controller_loss = 0

                    # Walk the rewards backwards, accumulating the discounted
                    # return; planner log-probs are consumed from the end via
                    # planner_rev_idx because the planner acts on fewer steps
                    # than the controller.
                    planner_rev_idx = -1
                    for i in reversed(range(len(rewards))):
                        R = 0.99 * R + rewards[i]
                        # Baseline is read from the 'reward' metric slot
                        # (index 2); presumably a running average — TODO
                        # confirm against NavMetric.
                        advantage = R - nav_metrics.metrics[2][1]

                        if i < len(controller_actions):
                            controller_loss = controller_loss - controller_log_probs[
                                i] * Variable(advantage)

                            if controller_actions[
                                    i] == 0 and planner_rev_idx + len(
                                        planner_log_probs) >= 0:
                                planner_loss = planner_loss - planner_log_probs[
                                    planner_rev_idx] * Variable(advantage)
                                planner_rev_idx -= 1

                        elif planner_rev_idx + len(planner_log_probs) >= 0:

                            planner_loss = planner_loss - planner_log_probs[
                                planner_rev_idx] * Variable(advantage)
                            planner_rev_idx -= 1

                    controller_loss /= max(1, len(controller_log_probs))
                    planner_loss /= max(1, len(planner_log_probs))

                    optim.zero_grad()

                    # A loss that never received a log-prob term is still a
                    # plain float here; skip the update in that case.
                    if isinstance(planner_loss, float) == False and isinstance(
                            controller_loss, float) == False:
                        p_losses.append(planner_loss.data[0, 0])
                        c_losses.append(controller_loss.data[0, 0])
                        reward_list.append(np.sum(rewards))
                        episode_length_list.append(episode_length)

                        (planner_loss + controller_loss).backward()

                        ensure_shared_grads(nav_model.cpu(), shared_nav_model)
                        optim.step()

                    # Flush the accumulated statistics every 51 updates.
                    if len(reward_list) > 50:

                        nav_metrics.update([
                            p_losses, c_losses, reward_list,
                            episode_length_list
                        ])

                        print(nav_metrics.get_stat_string())
                        if args.to_log == 1:
                            nav_metrics.dump_log()

                        if nav_metrics.metrics[2][1] > 0.35:
                            mult = min(mult + 0.1, 1.0)

                        p_losses, c_losses, reward_list, episode_length_list = [], [], [], []

                # Load the next set of environments, or finish the epoch when
                # everything already fits in memory.
                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                        if args.to_cache == False:
                            train_loader.dataset._load_envs(start_idx=0, in_order=True)
                else:
                    done = True

        epoch += 1
# NOTE(review): this is a fragment of the script entry point; `checkpoint`,
# `shared_nav_model` and `args` are defined before the visible portion.
print('Loading navigation params from checkpoint: %s' % args.nav_checkpoint_path)
shared_nav_model.load_state_dict(checkpoint['state'])

# Load answering model
print('Loading answering checkpoint from %s' % args.ans_checkpoint_path)
# Remap tensors saved on cuda:0 to the CPU while loading.
ans_checkpoint = torch.load(args.ans_checkpoint_path,
                            map_location={'cuda:0': 'cpu'})

ans_model_kwargs = {'vocab': load_vocab(args.vocab_json)}
shared_ans_model = VqaLstmCnnAttentionModel(**ans_model_kwargs)
shared_ans_model.share_memory()

print('Loading params from checkpoint: %s' % args.ans_checkpoint_path)
shared_ans_model.load_state_dict(ans_checkpoint['state'])

# Run a single role in-process, or fall through to the multi-process setup.
if args.mode == 'eval':
    eval(0, args, shared_nav_model, shared_ans_model)
elif args.mode == 'train':
    train(0, args, shared_nav_model, shared_ans_model)
else:
    processes = []
    # Evaluation worker process; the rest of this branch is not visible here.
    p = mp.Process(target=eval,
                   args=(0, args, shared_nav_model, shared_ans_model))
def eval(rank, args, shared_nav_model, shared_ans_model):
    """Evaluate the shared navigation + answering models and checkpoint.

    Each epoch walks the evaluation split (batch size 1).  For every
    question the agent is spawned three times, passing 10, 30 and 50 as the
    spawn argument of ``get_hierarchical_features_till_spawn``; it is then
    rolled out greedily (arg-max actions) and the answerer is run on the
    last five frames.  Per-spawn navigation and VQA statistics are pushed
    into ``NavMetric`` / ``VqaMetric``.  When the third VQA metric
    (``accuracy_50``) improves, and logging is enabled on an
    ``eval_every`` epoch, the navigation model state is saved to
    ``args.checkpoint_dir``.

    Args:
        rank: Worker index; selects the GPU and names the log files.
        args: Parsed options (reads ``gpus``, ``model_type``, ``vocab_json``,
            ``eval_split``, ``max_epochs``, ``max_episode_length``,
            ``eval_every``, ``to_log``, ``checkpoint_dir`` and the data-path
            fields used below).  ``output_nav_log_path`` /
            ``output_ans_log_path`` are written onto it.
        shared_nav_model: Navigation model whose weights are evaluated.
        shared_ans_model: Answering model whose weights are evaluated.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.model_type == 'pacman':
        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        nav_model = NavPlannerControllerModel(**model_kwargs)
    else:
        # Only the 'pacman' model type is handled by this routine.
        exit()

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    ans_model = VqaLstmCnnAttentionModel(**model_kwargs)

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': False
    }

    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_nav_log_path = os.path.join(args.log_dir,
                                            'nav_eval_' + str(rank) + '.json')
    args.output_ans_log_path = os.path.join(args.log_dir,
                                            'ans_eval_' + str(rank) + '.json')

    t, epoch, best_eval_acc = 0, 0, 0.0

    while epoch < int(args.max_epochs):

        invalids = []

        nav_model.load_state_dict(shared_nav_model.state_dict())
        nav_model.eval()

        ans_model.load_state_dict(shared_ans_model.state_dict())
        ans_model.eval()
        ans_model.cuda()

        # One metric per (statistic, spawn value) pair.
        nav_metrics = NavMetric(
            info={
                'split': args.eval_split,
                'thread': rank
            },
            metric_names=[
                'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50',
                'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30',
                'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30',
                'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10',
                'ep_len_30', 'ep_len_50'
            ],
            log_json=args.output_nav_log_path)

        vqa_metrics = VqaMetric(
            info={
                'split': args.eval_split,
                'thread': rank
            },
            metric_names=[
                'accuracy_10', 'accuracy_30', 'accuracy_50', 'mean_rank_10',
                'mean_rank_30', 'mean_rank_50', 'mean_reciprocal_rank_10',
                'mean_reciprocal_rank_30', 'mean_reciprocal_rank_50'
            ],
            log_json=args.output_ans_log_path)

        if 'pacman' in args.model_type:

            done = False

            while done == False:

                for batch in tqdm(eval_loader):

                    # Pull the latest shared weights before each question.
                    nav_model.load_state_dict(shared_nav_model.state_dict())
                    nav_model.eval()
                    nav_model.cuda()

                    idx, question, answer, actions, action_length = batch
                    metrics_slug = {}

                    h3d = eval_loader.dataset.episode_house

                    # evaluate at multiple initializations
                    for i in [10, 30, 50]:

                        t += 1

                        # Skip spawn values larger than the action sequence.
                        if i > action_length[0]:
                            invalids.append([idx[0], i])
                            continue

                        question_var = Variable(question.cuda())

                        controller_step = False
                        planner_hidden = nav_model.planner_nav_rnn.init_hidden(
                            1)

                        # forward through planner till spawn
                        planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = eval_loader.dataset.get_hierarchical_features_till_spawn(
                            actions[0, :action_length[0] + 1].numpy(), i)

                        planner_actions_in_var = Variable(
                            planner_actions_in.cuda())
                        planner_img_feats_var = Variable(
                            planner_img_feats.cuda())

                        # Warm up the planner hidden state on the
                        # ground-truth prefix, one step at a time.
                        for step in range(planner_actions_in.size(0)):

                            planner_scores, planner_hidden = nav_model.planner_step(
                                question_var,
                                planner_img_feats_var[step].view(1, 1, 3200),
                                planner_actions_in_var[step].view(1, 1),
                                planner_hidden)

                        if controller_step == True:

                            # The prefix ended mid-controller: ask the
                            # controller whether to keep repeating the action.
                            controller_img_feat_var = Variable(
                                controller_img_feat.cuda())
                            controller_action_in_var = Variable(
                                torch.LongTensor(1, 1).fill_(
                                    int(controller_action_in)).cuda())

                            controller_scores = nav_model.controller_step(
                                controller_img_feat_var.view(1, 1, 3200),
                                controller_action_in_var.view(1, 1),
                                planner_hidden[0])

                            prob = F.softmax(controller_scores, dim=1)
                            controller_action = int(
                                prob.max(1)[1].data.cpu().numpy()[0])

                            if controller_action == 1:
                                controller_step = True
                            else:
                                controller_step = False

                            action = int(controller_action_in)
                            # Planner input uses the action index shifted by
                            # one.
                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                        else:

                            prob = F.softmax(planner_scores, dim=1)
                            action = int(prob.max(1)[1].data.cpu().numpy()[0])

                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                        h3d.env.reset(
                            x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                        init_dist_to_target = h3d.get_dist_to_target(
                            h3d.env.cam.pos)
                        if init_dist_to_target < 0:  # unreachable
                            invalids.append([idx[0], i])
                            continue

                        episode_length = 0
                        episode_done = True
                        controller_action_counter = 0

                        dists_to_target, pos_queue, pred_actions = [
                            init_dist_to_target
                        ], [init_pos], []
                        planner_actions, controller_actions = [], []

                        # Action 3 is reported as "stop" in the metrics
                        # below, so only roll out for other actions.
                        if action != 3:

                            # take the first step
                            img, _, _ = h3d.step(action)
                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = eval_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224,
                                                  224).cuda())).view(
                                                      1, 1, 3200)

                            for step in range(args.max_episode_length):

                                episode_length += 1

                                if controller_step == False:
                                    # Planner turn: pick the arg-max action.
                                    planner_scores, planner_hidden = nav_model.planner_step(
                                        question_var, img_feat_var,
                                        Variable(action_in), planner_hidden)

                                    prob = F.softmax(planner_scores, dim=1)
                                    action = int(
                                        prob.max(1)[1].data.cpu().numpy()[0])
                                    planner_actions.append(action)

                                pred_actions.append(action)
                                img, _, episode_done = h3d.step(action)

                                episode_done = episode_done or episode_length >= args.max_episode_length

                                img = torch.from_numpy(img.transpose(
                                    2, 0, 1)).float() / 255.0
                                img_feat_var = eval_loader.dataset.cnn(
                                    Variable(img.view(1, 3, 224,
                                                      224).cuda())).view(
                                                          1, 1, 3200)

                                dists_to_target.append(
                                    h3d.get_dist_to_target(h3d.env.cam.pos))
                                pos_queue.append([
                                    h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                    h3d.env.cam.pos.z, h3d.env.cam.yaw
                                ])

                                if episode_done == True:
                                    break

                                # query controller to continue or not
                                controller_action_in = Variable(
                                    torch.LongTensor(1,
                                                     1).fill_(action).cuda())
                                controller_scores = nav_model.controller_step(
                                    img_feat_var, controller_action_in,
                                    planner_hidden[0])

                                prob = F.softmax(controller_scores, dim=1)
                                controller_action = int(
                                    prob.max(1)[1].data.cpu().numpy()[0])

                                # The controller may repeat the same action
                                # at most 4 times in a row.
                                if controller_action == 1 and controller_action_counter < 4:
                                    controller_action_counter += 1
                                    controller_step = True
                                else:
                                    controller_action_counter = 0
                                    controller_step = False
                                    controller_action = 0

                                controller_actions.append(controller_action)

                                action_in = torch.LongTensor(
                                    1, 1).fill_(action + 1).cuda()

                        # run answerer here
                        # Pad with the dataset's stored positions so that
                        # five frames are always available.
                        if len(pos_queue) < 5:
                            pos_queue = eval_loader.dataset.episode_pos_queue[
                                len(pos_queue) - 5:] + pos_queue
                        images = eval_loader.dataset.get_frames(
                            h3d, pos_queue[-5:], preprocess=True)
                        images_var = Variable(
                            torch.from_numpy(images).cuda()).view(
                                1, 5, 3, 224, 224)
                        scores, att_probs = ans_model(images_var, question_var)
                        ans_acc, ans_rank = vqa_metrics.compute_ranks(
                            scores.data.cpu(), answer)

                        pred_answer = scores.max(1)[1].data[0]

                        print(
                            '[Q_GT]', ' '.join([
                                eval_loader.dataset.vocab['questionIdxToToken']
                                [x] for x in question[0] if x != 0
                            ]))
                        print(
                            '[A_GT]',
                            eval_loader.dataset.vocab['answerIdxToToken'][
                                answer[0]])
                        print(
                            '[A_PRED]',
                            eval_loader.dataset.vocab['answerIdxToToken']
                            [pred_answer])

                        # compute stats
                        suffix = str(i)
                        metrics_slug['accuracy_' + suffix] = ans_acc[0]
                        metrics_slug['mean_rank_' + suffix] = ans_rank[0]
                        metrics_slug['mean_reciprocal_rank_' +
                                     suffix] = 1.0 / ans_rank[0]

                        metrics_slug['d_0_' + suffix] = dists_to_target[0]
                        metrics_slug['d_T_' + suffix] = dists_to_target[-1]
                        metrics_slug['d_D_' + suffix] = (
                            dists_to_target[0] - dists_to_target[-1])
                        metrics_slug['d_min_' + suffix] = np.array(
                            dists_to_target).min()

                        metrics_slug['ep_len_' + suffix] = episode_length
                        metrics_slug['stop_' + suffix] = 1 if action == 3 else 0

                        inside_room = [
                            h3d.is_inside_room(
                                pos, eval_loader.dataset.target_room)
                            for pos in pos_queue
                        ]

                        # r_T: ended inside the target room; r_e: entered it
                        # at any point of the trajectory.
                        metrics_slug['r_T_' + suffix] = (
                            1 if inside_room[-1] == True else 0)
                        metrics_slug['r_e_' + suffix] = (
                            1 if any(x == True for x in inside_room) else 0)

                    # navigation metrics: spawns that were skipped fall back
                    # to the metric's current stored value.
                    metrics_list = []
                    for name in nav_metrics.metric_names:
                        if name not in metrics_slug:
                            metrics_list.append(nav_metrics.metrics[
                                nav_metrics.metric_names.index(name)][0])
                        else:
                            metrics_list.append(metrics_slug[name])

                    nav_metrics.update(metrics_list)

                    # vqa metrics
                    metrics_list = []
                    for name in vqa_metrics.metric_names:
                        if name not in metrics_slug:
                            metrics_list.append(vqa_metrics.metrics[
                                vqa_metrics.metric_names.index(name)][0])
                        else:
                            metrics_list.append(metrics_slug[name])

                    vqa_metrics.update(metrics_list)

                # Printing stats is best-effort, but must not hide
                # KeyboardInterrupt/SystemExit or fail silently.
                try:
                    print(nav_metrics.get_stat_string(mode=0))
                    print(vqa_metrics.get_stat_string(mode=0))
                except Exception as err:
                    print('could not format metric stats: %r' % (err, ))

                print('epoch', epoch)
                print('invalids', len(invalids))

                eval_loader.dataset._load_envs()
                if len(eval_loader.dataset.pruned_env_set) == 0:
                    done = True

        epoch += 1

        # checkpoint if best val accuracy
        if vqa_metrics.metrics[2][0] > best_eval_acc:  # ans_acc_50
            best_eval_acc = vqa_metrics.metrics[2][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                vqa_metrics.dump_log()
                nav_metrics.dump_log()

                model_state = get_state(nav_model)

                # Store only the public (non-underscore) options.
                ad = {
                    key: value
                    for key, value in dict(args.__dict__).items()
                    if key[0] != '_'
                }

                checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch}

                checkpoint_path = '%s/epoch_%d_ans_50_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

        print('[best_eval_ans_acc_50:%.04f]' % best_eval_acc)

        # Restart from the first environment for the next epoch.
        eval_loader.dataset._load_envs(start_idx=0, in_order=True)
def fgsm(rank, args, shared_model, number):
    """Run a one-step FGSM attack on the texture input of the VQA model.

    For every batch the model is run once on the clean inputs, the
    cross-entropy loss is back-propagated to the texture tensor ``vt`` and
    the texture is moved by ``epsilon * sign(grad)`` and clamped to
    ``[0, 1]``.  Only the slice ``[begin:end]`` reported by ``get_info`` is
    copied into the texture used for the second (adversarial) forward pass.
    Clean and adversarial rendered frames are written to ``result_test/``
    and the perturbed object is saved with ``nr.save_obj``.

    The prediction bookkeeping uses ``int(answers)`` and index 0 of the
    batch, so it only works with a batch size of 1.

    Args:
        rank: Worker index; selects the GPU and names the log file.
        args: Parsed options (reads ``gpus``, ``vocab_json``, ``eval_split``,
            ``batch_size``, ``input_type``, ``num_frames``, ``cache``,
            ``log_dir`` and the data-path fields used below).
            ``output_log_path`` is written onto it.
        shared_model: Model providing the weights to attack.
        number: Start index passed to the dataset's ``_load_envs``.
    """
    import copy
    import torch.nn as nn
    from PIL import Image

    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))
    all_n = 0

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    model = VqaLstmCnnAttentionModel(**model_kwargs)
    # NOTE(review): the two data-parallel devices are hard-coded.
    device_ids = [0,1]
    model = model.cuda(device_ids[0])
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank%len(args.gpus)],
        'to_cache': args.cache
    }

    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    eval_loader.dataset._load_envs(start_idx=number)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    model.load_state_dict(handle_load(shared_model.state_dict()))
    model.eval()

    metrics = VqaMetric(
        info={'split': args.eval_split},
        metric_names=[
            'loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'
        ],
        log_json=args.output_log_path)

    all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
    done = False

    print(number, ' begin')

    # scores[0] is indexed with a single int below, i.e. it is 1-D, so the
    # previously implicit softmax dimension is 0.
    softmax = nn.Softmax(dim=0)
    # Image.save does not create missing directories.
    os.makedirs('result_test', exist_ok=True)
    # Attack step size.
    epsilon = 32.0/256.0
    allcor = []

    while done == False:
        for ii,batch in enumerate(eval_loader):
            idx, questions, answers, house, v, f, vt, pos, _, _, _ = batch
            print('all size:',v.size(),f.size(),vt.size())
            print(questions, answers)
            questions_var = Variable(questions.cuda())
            print(questions_var.size())
            answers_var = Variable(answers.cuda())
            v_var = Variable(v.cuda(),requires_grad=True)
            f_var = Variable(f.cuda())
            vt_var = Variable(vt.cuda(),requires_grad=True)

            begin, end = get_info(idx[0], house[0])

            # Untouched copy of the textures; only [begin:end] is replaced
            # by perturbed values later on.
            vt_test = copy.deepcopy(vt_var)

            # --- clean forward pass -------------------------------------
            scores, att_probs,img_clean = model(v_var, f_var, vt_var, pos, questions_var)
            i1, i2 = torch.max(scores[0],0)
            mi = i2.cpu().numpy().tolist()
            ms = int(answers)
            print(mi)
            print(mi==int(ms))
            # Record whether the clean prediction was correct.
            if(mi==int(ms)):
                allcor.append(1.0)
            else:
                allcor.append(0.0)
            print(softmax(scores[0]))
            print(softmax(scores[0])[ms])

            img_clean = img_clean[0]
            for iii in range(img_clean.size()[0]):
                imggg = img_clean[iii].detach().cpu().numpy()
                imggg = imggg * 255.0
                imggg = imggg.transpose((1,2,0))
                imggg = Image.fromarray(imggg.astype('uint8'))
                imggg.save('result_test/'+str(ii)+'_'+str(iii)+'_clean.jpg')

            # --- FGSM step on the textures ------------------------------
            loss = lossFn(scores, answers_var)
            loss.backward()
            vt_grad = vt_var.grad
            print('max grad',torch.max(vt_grad.data))

            vt_var = vt_var.detach() + epsilon * torch.sign(vt_grad)
            vt_var = torch.clamp(vt_var, 0, 1)

            # --- adversarial forward pass -------------------------------
            with torch.no_grad():
                model.eval()
                begin, end = get_info(idx[0], house[0])
                # In-place write into a tensor that requires grad must
                # happen with autograd disabled.
                vt_test[0][0][begin:end] = vt_var[0][0][begin:end]
                scores, att_probs,imgg = model(v_var, f_var, vt_test, pos, questions_var)

            imgg = imgg[0]
            for iii in range(imgg.size()[0]):
                imggg = imgg[iii].detach().cpu().numpy()
                imggg = imggg * 255.0
                imggg = imggg.transpose((1,2,0))
                imggg = Image.fromarray(imggg.astype('uint8'))
                imggg.save('result_test/'+str(ii)+'_'+str(iii)+'_adv.jpg')

            i1, i2 = torch.max(scores[0],0)
            mi = i2.cpu().numpy().tolist()
            ms = int(answers)
            print(mi)
            print(mi==int(ms))
            print(softmax(scores[0]))
            print(softmax(scores[0])[ms])

            # --- save the perturbed object and release GPU tensors ------
            for k in range(idx.shape[0]):
                begin, end = get_info(idx[k], house[k])
                v_m = v_var[k][0]
                f_m = f_var[k][0][begin:end]
                vt_m = vt_var[k][0][begin:end]
                nr.save_obj('/media/trs1/dataset/suncg_data/house/' + house[k] + '/attack_' + str(int(idx[k])) + '.obj', v_m, f_m, vt_m)

                # Move everything back to the CPU so the GPU memory held by
                # this batch can be reclaimed.
                idx = idx.cpu()
                questions = questions.cpu()
                answers = answers.cpu()
                questions_var = questions_var.cpu()
                answers_var = answers_var.cpu()
                v = v.cpu()
                f = f.cpu()
                vt = vt.cpu()
                v_m = v_m.cpu()
                f_m = f_m.cpu()
                vt_m = vt_m.cpu()
                v_var = v_var.detach().cpu()
                f_var = f_var.cpu()
                vt_var = vt_var.detach().cpu()
                vt_grad = vt_grad.cpu()
                print(house[k] + ' ' + str(int(idx[k])) + ' ok')
                all_n += 1

        # Load the next set of environments, or stop when all were loaded.
        if all_envs_loaded == False:
            eval_loader.dataset._load_envs()
            if len(eval_loader.dataset.pruned_env_set) == 0:
                done = True
        else:
            done = True

    print(allcor)
    print(number, ' over')
def test(rank):
    """Run the navigation + VQA pipeline on generated questions in simulation.

    Loads the planner/controller model, the multitask CNN and the VQA model
    from the paths in the module-level ``args``, builds a simulated
    environment, and for each generated question lets the planner emit up
    to 30 actions.  Action 9 stops the plan, action 0 marks a push, any
    other action moves the tracked position via ``order2action``.  The
    resulting plan is executed on the UR5 (suck or push) and the VQA model
    then predicts whether the object exists.

    Args:
        rank: Unused.

    Raises:
        Exception: If the predicted answer index is neither 0 nor 1.
    """
    # --- models ----------------------------------------------------------
    nav_model = NavPlannerControllerModel(
        **{'question_vocab': load_vocab(args.vocab_json)})
    nav_state = torch.load(args.nav_weight)
    nav_model.load_state_dict(nav_state['state'])
    print('--- nav_model loaded checkpoint ---')

    cnn = MultitaskCNN(**{'num_classes': 191, 'pretrained': True})
    cnn.eval()
    cnn.cuda()

    vqa_model = VqaLstmCnnAttentionModel(
        **{'vocab': load_vocab(args.vocab_json)})
    vqa_state = torch.load(args.vqa_weight)
    vqa_model.load_state_dict(vqa_state['state'])
    print('--- vqa_model loaded checkpoint ---')

    # --- simulation environment and questions ------------------------------
    scene = "test-10-obj-100.txt"
    my_env = enviroment.Environment(is_testing=0, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("Objetcts that exist: ")
    print(object_exist_list)

    my_question = Qusetion(object_exist_list)
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  # start token fed to the planner
        actions = []

        print(question['question'])
        tokens = my_question.tokenize(question['question'],
                                      punctToRemove=['?'],
                                      addStartToken=False)
        encoded = my_question.encode(tokens, vocab['questionTokenToIdx'])
        encoded.append(0)
        encoded_question = Variable(
            _dataset_to_tensor(np.array(encoded))).unsqueeze(0)
        print(encoded_question)

        action_times = 0
        push_signal = 0
        push_point = 0

        # --- plan: query the planner one action at a time ------------------
        while action_times < max_action:
            action_in = Variable(_dataset_to_tensor(action_in_raw))
            action_in = action_in.unsqueeze(0).unsqueeze(0)

            _, rgb_image_raw = my_env.camera.get_camera_data()
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)

            output_data, planner_hidden = nav_model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)

            log_probs = F.log_softmax(output_data, dim=1).data.numpy()[0]
            # First index holding the maximum log-probability.
            action_out = np.where(log_probs == np.max(log_probs))[0][0]

            actions.append(action_out)
            action_in_raw = [action_out]

            if action_out == 9:
                print('stop')
                break

            if action_out == 0:
                # Remember where in the plan the push was requested.
                push_signal = 1
                push_point = action_times
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        # --- act: execute the plan on the robot ----------------------------
        if len(actions) > 2:
            if push_signal == 0:
                # Suck: start and end coordinates are the same point.
                my_env.UR5_action(position + position, 2)
            elif push_signal == 1:
                # Push: start is the position up to and including the push
                # step, end is the position after the whole plan.
                position_start = [0, 0]
                position_end = [0, 0]
                for step_idx, step_action in enumerate(actions):
                    dx, dy = order2action(step_action)
                    if step_idx <= push_point:
                        position_start[0] += dx
                        position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                my_env.UR5_action(position_start + position_end, 1)

        # NOTE(review): the post-action frame is captured here but the VQA
        # input below is built from rgb_image_raw (the last pre-action
        # frame) — confirm this is intended.
        _, rgb_image_after = my_env.camera.get_camera_data()

        # --- answer: preprocess the frame and run the VQA model ------------
        frame = cv.resize(rgb_image_raw, (224, 224),
                          interpolation=cv.INTER_AREA)
        frame = np.array(frame).transpose((2, 0, 1)).reshape(1, 3, 224, 224)
        frame = (frame / 255.0).astype(np.float32)
        images = Variable(torch.FloatTensor(frame)).unsqueeze(0)

        scores, _ = vqa_model(images, encoded_question)
        scores = scores.data.numpy()[0]
        answer_predict = np.where(scores == np.max(scores))[0][0]

        if answer_predict == 0:
            print('--- Predict: Exists not')
        elif answer_predict == 1:
            print('--- Predict: Exists')
        else:
            raise Exception('Prediction neither 0 nor 1')
def test(rank):
    """Answer one question from a before/after image pair and save a heatmap.

    Loads the VQA model, the map CNN and the multitask CNN, encodes
    ``args.question`` (zero-padded to at least 10 tokens), resizes the
    "after" RGB and depth images to 256x256, writes the heatmap produced by
    ``rgbd2heatmap`` to ``args.heatmap_output_dir`` (HDF5 dataset
    ``'heatmap'``), overwrites the "after" image files with their resized
    versions, and prints the predicted answer token.

    ``args`` is read from module scope.

    Args:
        rank: Unused.
    """
    cnn_model_dir = os.path.abspath("../train/models/03_13_h3d_hybrid_cnn.pt")

    # --- models ----------------------------------------------------------
    vqa_model_kwargs = {
        'vocab': load_vocab(args.vocab_json),
        'checkpoint_path': cnn_model_dir
    }
    vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs)
    vqa_checkpoint = torch.load(args.vqa_weight)
    vqa_model.load_state_dict(vqa_checkpoint['state'])
    print('--- vqa_model loaded checkpoint ---')

    res_model_dir = os.path.abspath("../train/models/resnet101.pth")
    my_map_cnn = mapCNN(checkpoint_path=res_model_dir)
    map_checkpoint = torch.load('mapcnn.pt', map_location='cpu')
    my_map_cnn.load_state_dict(map_checkpoint['state'])
    print('--- map_model loaded checkpoint ---')

    cnn_kwargs = {
        'num_classes': 191,
        'pretrained': True,
        'checkpoint_path': cnn_model_dir
    }
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()

    # --- question encoding -------------------------------------------------
    vocab_dir = os.path.abspath("vocab.json")
    # Close the vocabulary file as soon as it has been parsed.
    with open(vocab_dir, 'r', encoding='utf-8') as vocab_file:
        vocab = json.load(vocab_file)

    question = args.question
    print(question)
    questionTokens = tokenize(question,
                              punctToRemove=['?'],
                              addStartToken=False)

    encoded_question_raw = encode(questionTokens, vocab['questionTokenToIdx'])
    # Zero-pad short questions up to 10 tokens.
    while (len(encoded_question_raw) < 10):
        encoded_question_raw.append(0)
    encoded_question_raw = np.array(encoded_question_raw)
    encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
    encoded_question = Variable(encoded_question_tensor)
    encoded_question = encoded_question.unsqueeze(0)

    # --- images ------------------------------------------------------------
    rgb_before = cv.imread(args.rgb_image_before_dir)
    rgb_after = cv.imread(args.rgb_image_after_dir)
    depth_after = cv.imread(args.depth_image_after_dir)
    # NOTE(review): this keeps only index 0 along the first axis of the
    # decoded depth image — confirm this is the intended slice.
    depth_after = depth_after[0]
    depth_dim = depth_after.shape
    print(depth_dim)

    rgb_after_resize = cv.resize(rgb_after, (256, 256),
                                 interpolation=cv.INTER_AREA)
    depth_after_resize = cv.resize(depth_after, (256, 256),
                                   interpolation=cv.INTER_AREA)

    rgb_tensor, depth_tensor = rgbd2tensor(rgb_after_resize,
                                           depth_after_resize)
    heatmap_output = rgbd2heatmap(rgb_tensor, depth_tensor, my_map_cnn)

    # The context manager flushes and closes the HDF5 file even on error.
    with h5py.File(args.heatmap_output_dir, 'w') as f:
        f['heatmap'] = heatmap_output

    # The resized images replace the original "after" files on disk.
    cv.imwrite(args.rgb_image_after_dir, rgb_after_resize)
    cv.imwrite(args.depth_image_after_dir, depth_after_resize)

    # --- VQA ---------------------------------------------------------------
    before_image_feat = data2input(rgb_before)
    after_image_feat = data2input(rgb_after_resize)
    input_image = [before_image_feat, after_image_feat]
    input_image_feats = Variable(torch.FloatTensor(input_image))
    input_image_feats = input_image_feats.view(1, 2, 3, 224, 224)

    scores, _ = vqa_model(input_image_feats, encoded_question)
    scores = scores.data.numpy()
    scores = scores[0]
    # First index holding the maximum score.
    answer_predict = np.where(scores == np.max(scores))
    answer_predict = answer_predict[0][0]

    # Reverse lookup from answer index to answer token.
    answer_dic = vocab["answerTokenToIdx"]
    answer = [k for k, v in answer_dic.items() if v == answer_predict]
    print(answer[0])
# NOTE(review): this is a fragment of the script entry point; `args` and
# `checkpoint` are defined before the visible portion.
# Make the checkpoint and log directories specific to this run.
args.checkpoint_dir = os.path.join(args.checkpoint_dir,
                                   args.time_id + '_' + args.identifier)
args.log_dir = os.path.join(args.log_dir,
                            args.time_id + '_' + args.identifier)

print(args.__dict__)

# Both directories are created only when logging is enabled and the
# checkpoint directory does not exist yet.
if not os.path.exists(args.checkpoint_dir) and args.log == True:
    os.makedirs(args.checkpoint_dir)
    os.makedirs(args.log_dir)

model_kwargs = {'vocab': load_vocab(args.vocab_json)}
shared_model = VqaLstmCnnAttentionModel(**model_kwargs)

if args.checkpoint_path != False:
    print('Loading params from checkpoint: %s' % args.checkpoint_path)
    shared_model.load_state_dict(checkpoint['state'])

shared_model.share_memory()

# Run the attack in-process as rank 0, starting from environment index 0.
fgsm(0, args, shared_model, 0)
torch.cuda.empty_cache()
def attack_fgsm(rank, args, shared_model, number, epsilon=12.0 / 255.0):
    """Run a one-step FGSM attack on object textures and save the results.

    For every batch the texture tensor ``vt`` is perturbed by
    ``epsilon * sign(grad)`` of the cross-entropy answering loss, with the
    gradient zeroed outside the ``[begin, end)`` range that ``get_info``
    returns for each sample. The perturbed range of every sample is written
    to an ``attack_<idx>_<oid>.obj`` file and the model is re-run on the
    perturbed textures.

    Args:
        rank: Worker index; selects the loader GPU and the log file name.
        args: Parsed options (``vocab_json``, ``eval_split``, ``data_json``,
            ``batch_size``, ``input_type``, ``num_frames``,
            ``max_threads_per_gpu``, ``gpus``, ``cache``, ``log_dir``).
            ``args.output_log_path`` is set as a side effect.
        shared_model: Model whose ``state_dict`` is loaded (through
            ``handle_load``) into the local ``DataParallel`` copy.
        number: Passed to ``dataset._load_envs(start_idx=number)``.
        epsilon: Step size of the attack (noise level). Defaults to the
            previously hard-coded ``12 / 255``.
    """
    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    model = VqaLstmCnnAttentionModel(**model_kwargs)
    # NOTE(review): GPUs 0 and 1 are hard-coded and the per-rank
    # torch.cuda.set_device call is disabled in this function — confirm this
    # matches the machines the attack is run on.
    device_ids = [0, 1]
    model = model.cuda(device_ids[0])
    model = torch.nn.DataParallel(model, device_ids=device_ids)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.cache
    }
    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    eval_loader.dataset._load_envs(start_idx=number)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    model.load_state_dict(handle_load(shared_model.state_dict()))
    model.eval()

    # NOTE(review): `metrics` is only used for compute_ranks below; nothing
    # is accumulated or dumped to the log file in this function.
    metrics = VqaMetric(
        info={'split': args.eval_split},
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
    done = False

    while not done:
        for batch in eval_loader:
            idx, questions, answers, house, v, f, vt, pos, _, _, _ = batch
            questions_var = Variable(questions.cuda())
            answers_var = Variable(answers.cuda())
            v_var = Variable(v.cuda(), requires_grad=True)
            f_var = Variable(f.cuda())
            vt_var = Variable(vt.cuda(), requires_grad=True)

            # Forward/backward pass to obtain the loss gradient w.r.t. the
            # textures.
            scores, att_probs = model(v_var, f_var, vt_var, pos,
                                      questions_var, 0, '0')
            loss = lossFn(scores, answers_var)
            loss.backward()
            vt_grad = vt_var.grad

            # Look up each sample's (begin, end, oid) once and reuse it for
            # both masking and saving.
            spans = [
                get_info(idx[k], house[k]) for k in range(idx.shape[0])
            ]

            # Restrict the perturbation to each sample's own [begin, end)
            # range. Previously only sample 0 was masked, so for batches
            # larger than one the other samples had their whole texture
            # perturbed.
            for k, (begin, end, _) in enumerate(spans):
                # (1000, 2000) is reported as an error, presumably the
                # fallback range of get_info — confirm.
                if begin == 1000 and end == 2000:
                    print(str(int(idx[k])), 'error')
                vt_grad[k][0][:begin] = 0
                vt_grad[k][0][end:] = 0

            # Single signed-gradient step on a detached copy.
            vt_var = vt_var.detach() + epsilon * torch.sign(vt_grad)

            # Save the changed object of every sample to an .obj file.
            for k, (begin, end, oid) in enumerate(spans):
                v_m = v_var[k][0]
                f_m = f_var[k][0][begin:end]
                vt_m = vt_var[k][0][begin:end]
                nr.save_obj(
                    '/path/to/data/house/' + house[k] + '/attack_' +
                    str(int(idx[k])) + '_' + str(oid) + '.obj', v_m, f_m,
                    vt_m)

            # Re-evaluate on the perturbed textures without tracking grads.
            with torch.no_grad():
                model.eval()
                scores, att_probs = model(v_var, f_var, vt_var, pos,
                                          questions_var, 0, '0')
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)

        # Keep loading further environments until none are left; the
        # explicit comparison with False is kept from the original.
        if all_envs_loaded == False:
            eval_loader.dataset._load_envs()
            if len(eval_loader.dataset.pruned_env_set) == 0:
                done = True
        else:
            done = True
def attack_pgd(rank,
               args,
               shared_model,
               number,
               attack_iter=10,
               attack_momentum=1,
               alpha=2.0 / 255,
               epsilon=16.0 / 255):
    """Run an iterative (PGD-style, with momentum) attack on object textures.

    For every batch the texture tensor is updated ``attack_iter`` times by
    ``alpha * sign(accumulated gradient)``, projected back into the
    ``epsilon`` ball around the clean texture and clamped to ``[0, 1]``.
    The gradient is zeroed outside the ``[begin, end)`` range that
    ``get_info`` returns for sample 0. After the last step the model is
    re-run on the perturbed textures and the perturbed range of sample 0 is
    written to an ``attack_<idx>_<oid>.obj`` file.

    NOTE(review): masking, the model's id argument and saving all use index
    0 of the batch, so this function appears to assume ``batch_size == 1``.

    Args:
        rank: Worker index; selects the CUDA device, loader GPU and log name.
        args: Parsed options (same fields as read below);
            ``args.output_log_path`` is set as a side effect.
        shared_model: Model whose ``state_dict`` is loaded (through
            ``handle_load``) into the local ``DataParallel`` copy.
        number: Passed to ``dataset._load_envs(start_idx=number)``.
        attack_iter: Number of gradient steps (previously hard-coded 10).
        attack_momentum: Weight of the accumulated gradient (previously 1).
        alpha: Per-step size (previously ``2 / 255``).
        epsilon: Maximum per-element perturbation (previously ``16 / 255``).
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    model = VqaLstmCnnAttentionModel(**model_kwargs)
    # NOTE(review): GPUs 0 and 1 are hard-coded for DataParallel.
    device_ids = [0, 1]
    model = model.cuda(device_ids[0])
    model = torch.nn.DataParallel(model, device_ids=device_ids)

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.input_type,
        'num_frames': args.num_frames,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.cache
    }
    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    eval_loader.dataset._load_envs(start_idx=number)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_log_path = os.path.join(args.log_dir,
                                        'eval_' + str(rank) + '.json')

    model.load_state_dict(handle_load(shared_model.state_dict()))
    model.eval()

    # NOTE(review): `metrics` is only used for compute_ranks below; nothing
    # is accumulated or dumped to the log file in this function.
    metrics = VqaMetric(
        info={'split': args.eval_split},
        metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_log_path)

    all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded()
    done = False

    while not done:
        for batch in eval_loader:
            idx, questions, answers, house, v, f, vt, pos, _, _, _ = batch
            questions_var = Variable(questions.cuda())
            answers_var = Variable(answers.cuda())
            v_var = Variable(v.cuda(), requires_grad=True)
            f_var = Variable(f.cuda())

            # Clean textures on the GPU: the projection below must compare
            # tensors on the same device as the adversarial texture.
            vt_clean = vt.cuda().detach()
            vt_upper = vt_clean + epsilon
            vt_lower = vt_clean - epsilon
            vt_var = Variable(vt_clean.clone(), requires_grad=True)

            # Range and id are the same for every step; look them up once.
            begin, end, oid = get_info(idx[0], house[0])
            # (1000, 2000) is reported as an error, presumably the fallback
            # range of get_info — confirm.
            if begin == 1000 and end == 2000:
                print(str(int(idx[0])), 'error')

            sample_id = str(int(idx[0]))
            velocity = torch.zeros(vt_var.size()).cuda()

            for j in range(attack_iter):
                scores, att_probs = model(v_var, f_var, vt_var, pos,
                                          questions_var, j, sample_id)
                loss = lossFn(scores, answers_var)
                loss.backward()

                # Only the [begin, end) range of sample 0 may change.
                vg = vt_var.grad
                vg[0][0][:begin] = 0
                vg[0][0][end:] = 0

                # Momentum accumulation of the masked gradient.
                velocity = attack_momentum * velocity + vg

                vt_adv = vt_var.detach() + alpha * torch.sign(velocity)
                # Project into the epsilon ball, keeping values in [0, 1]
                # (same order of operations as before).
                vt_adv = torch.where(vt_adv > vt_upper, vt_upper, vt_adv)
                vt_adv = torch.clamp(vt_adv, 0, 1)
                vt_adv = torch.where(vt_adv < vt_lower, vt_lower, vt_adv)
                vt_adv = torch.clamp(vt_adv, 0, 1)

                # Fresh leaf for the next step so `.grad` starts empty. The
                # tensor is already on the GPU; no trailing .cuda(), which
                # would turn a moved tensor into a non-leaf without `.grad`.
                vt_var = Variable(vt_adv.data, requires_grad=True)

            # Re-evaluate on the final perturbed textures.
            with torch.no_grad():
                model.eval()
                scores, att_probs = model(v_var, f_var, vt_var, pos,
                                          questions_var, 100, sample_id)
                accuracy, ranks = metrics.compute_ranks(
                    scores.data.cpu(), answers)

            # Save the changed object of sample 0 to an .obj file.
            v_m = v_var[0][0]
            f_m = f_var[0][0][begin:end]
            vt_m = vt_var[0][0][begin:end]
            nr.save_obj(
                '/path/to/data/house/' + house[0] + '/attack_' +
                str(int(idx[0])) + '_' + str(oid) + '.obj', v_m, f_m, vt_m)

        # Keep loading further environments until none are left; the
        # explicit comparison with False is kept from the original.
        if all_envs_loaded == False:
            eval_loader.dataset._load_envs()
            if len(eval_loader.dataset.pruned_env_set) == 0:
                done = True
        else:
            done = True
def train(rank, args, shared_nav_model, shared_ans_model):
    """Train the planner/controller navigator with a policy-gradient loss.

    Each training sample is one episode: the planner is first run along a
    prefix of the ground-truth action sequence, the agent is then placed at
    the returned spawn position and acts by sampling from the planner and
    controller distributions. If the episode ends with action 3 the frozen
    answering model is run and its accuracy contributes a final reward. The
    summed planner and controller losses are back-propagated and the update
    is applied to ``shared_nav_model`` through ``ensure_shared_grads``.

    Args:
        rank: Worker index; selects the CUDA device, loader GPU and the
            names of the JSON log files.
        args: Parsed options. ``args.output_nav_log_path`` and
            ``args.output_ans_log_path`` are set as a side effect.
        shared_nav_model: Navigation model whose parameters are optimised;
            the local copy is re-synchronised from it before every episode.
        shared_ans_model: Answering model; its weights are copied once into
            a local copy that stays in eval mode.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    # Only the 'pacman' model type is supported; anything else terminates
    # the process.
    if args.model_type == 'pacman':
        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        nav_model = NavPlannerControllerModel(**model_kwargs)
    else:
        exit()

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    ans_model = VqaLstmCnnAttentionModel(**model_kwargs)

    # The optimiser steps the *shared* model's parameters.
    optim = torch.optim.SGD(
        filter(lambda p: p.requires_grad, shared_nav_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    args.output_nav_log_path = os.path.join(args.log_dir,
                                            'nav_train_' + str(rank) + '.json')
    args.output_ans_log_path = os.path.join(args.log_dir,
                                            'ans_train_' + str(rank) + '.json')

    nav_model.load_state_dict(shared_nav_model.state_dict())
    nav_model.cuda()

    ans_model.load_state_dict(shared_ans_model.state_dict())
    ans_model.eval()
    ans_model.cuda()

    nav_metrics = NavMetric(
        info={'split': 'train',
              'thread': rank},
        metric_names=[
            'planner_loss', 'controller_loss', 'reward', 'episode_length'
        ],
        log_json=args.output_nav_log_path)

    vqa_metrics = VqaMetric(
        info={'split': 'train',
              'thread': rank},
        metric_names=['accuracy', 'mean_rank', 'mean_reciprocal_rank'],
        log_json=args.output_ans_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)

    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch = 0, 0
    p_losses, c_losses, reward_list, episode_length_list = [], [], [], []

    # Seed the metrics with initial values (planner loss, controller loss,
    # reward, episode length) so that the reward baseline read below exists
    # before the first real update.
    nav_metrics.update([10.0, 10.0, 0, 100])

    # Fraction of the ground-truth path length passed as the spawn argument;
    # raised in steps of 0.1 (up to 1.0) further below.
    mult = 0.1

    while epoch < int(args.max_epochs):

        if 'pacman' in args.model_type:

            # NOTE(review): both criteria are created but not used in the
            # visible code of this function.
            planner_lossFn = MaskedNLLCriterion().cuda()
            controller_lossFn = MaskedNLLCriterion().cuda()

            done = False
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()

            while done == False:

                for batch in train_loader:

                    # Re-sync the local copy with the shared weights and move
                    # it back to the GPU (it is moved to the CPU before the
                    # gradients are shared, see below).
                    nav_model.load_state_dict(shared_nav_model.state_dict())
                    nav_model.eval()
                    nav_model.cuda()

                    idx, question, answer, actions, action_length = batch
                    metrics_slug = {}

                    h3d = train_loader.dataset.episode_house

                    # evaluate at multiple initializations
                    # for i in [10, 30, 50]:

                    t += 1

                    question_var = Variable(question.cuda())

                    controller_step = False
                    planner_hidden = nav_model.planner_nav_rnn.init_hidden(1)

                    # forward through planner till spawn
                    planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = train_loader.dataset.get_hierarchical_features_till_spawn(
                        actions[0, :action_length[0] + 1].numpy(),
                        max(3, int(mult * action_length[0])))

                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_img_feats_var = Variable(planner_img_feats.cuda())

                    # Warm up the planner's hidden state on the prefix.
                    for step in range(planner_actions_in.size(0)):

                        planner_scores, planner_hidden = nav_model.planner_step(
                            question_var, planner_img_feats_var[step].view(
                                1, 1, 3200), planner_actions_in_var[step].view(
                                    1, 1), planner_hidden)

                    # Choose the first action: if the dataset says a
                    # controller step is pending, the controller greedily
                    # decides whether to continue and the pending action is
                    # used; otherwise the planner's greedy action is used.
                    if controller_step == True:

                        controller_img_feat_var = Variable(
                            controller_img_feat.cuda())
                        controller_action_in_var = Variable(
                            torch.LongTensor(1, 1).fill_(
                                int(controller_action_in)).cuda())

                        controller_scores = nav_model.controller_step(
                            controller_img_feat_var.view(1, 1, 3200),
                            controller_action_in_var.view(1, 1),
                            planner_hidden[0])

                        prob = F.softmax(controller_scores, dim=1)
                        controller_action = int(
                            prob.max(1)[1].data.cpu().numpy()[0])

                        if controller_action == 1:
                            controller_step = True
                        else:
                            controller_step = False

                        action = int(controller_action_in)
                        # The planner input is the action index shifted by
                        # one.
                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    else:

                        prob = F.softmax(planner_scores, dim=1)
                        action = int(prob.max(1)[1].data.cpu().numpy()[0])

                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    h3d.env.reset(
                        x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                    init_dist_to_target = h3d.get_dist_to_target(
                        h3d.env.cam.pos)
                    if init_dist_to_target < 0:  # unreachable
                        # invalids.append([idx[0], i])
                        continue

                    episode_length = 0
                    episode_done = True
                    controller_action_counter = 0

                    dists_to_target, pos_queue = [init_dist_to_target], [
                        init_pos
                    ]

                    rewards, planner_actions, planner_log_probs, controller_actions, controller_log_probs = [], [], [], [], []

                    # Action 3 ends the episode immediately (the answerer
                    # below only runs when the last action is 3).
                    if action != 3:

                        # take the first step
                        img, rwd, episode_done = h3d.step(action, step_reward=True)
                        img = torch.from_numpy(img.transpose(
                            2, 0, 1)).float() / 255.0
                        img_feat_var = train_loader.dataset.cnn(
                            Variable(img.view(1, 3, 224, 224).cuda())).view(
                                1, 1, 3200)

                        for step in range(args.max_episode_length):

                            episode_length += 1

                            # The planner picks a new action only when the
                            # controller did not ask to repeat the last one.
                            if controller_step == False:
                                planner_scores, planner_hidden = nav_model.planner_step(
                                    question_var, img_feat_var,
                                    Variable(action_in), planner_hidden)

                                planner_prob = F.softmax(planner_scores, dim=1)
                                planner_log_prob = F.log_softmax(
                                    planner_scores, dim=1)

                                # Sample (not arg-max) and keep the log
                                # probability of the sampled action.
                                action = planner_prob.multinomial().data

                                planner_log_prob = planner_log_prob.gather(
                                    1, Variable(action))

                                planner_log_probs.append(
                                    planner_log_prob.cpu())

                                action = int(action.cpu().numpy()[0, 0])
                                planner_actions.append(action)

                            img, rwd, episode_done = h3d.step(action, step_reward=True)

                            episode_done = episode_done or episode_length >= args.max_episode_length

                            rewards.append(rwd)

                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = train_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224, 224)
                                         .cuda())).view(1, 1, 3200)

                            dists_to_target.append(
                                h3d.get_dist_to_target(h3d.env.cam.pos))
                            pos_queue.append([
                                h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                h3d.env.cam.pos.z, h3d.env.cam.yaw
                            ])

                            if episode_done == True:
                                break

                            # query controller to continue or not
                            controller_action_in = Variable(
                                torch.LongTensor(1, 1).fill_(action).cuda())
                            controller_scores = nav_model.controller_step(
                                img_feat_var, controller_action_in,
                                planner_hidden[0])

                            controller_prob = F.softmax(
                                controller_scores, dim=1)
                            controller_log_prob = F.log_softmax(
                                controller_scores, dim=1)

                            controller_action = controller_prob.multinomial(
                            ).data

                            # The same action is repeated at most 4 times in
                            # a row; after that the decision is forced to 0
                            # and control returns to the planner.
                            if int(controller_action[0]
                                   ) == 1 and controller_action_counter < 4:
                                controller_action_counter += 1
                                controller_step = True
                            else:
                                controller_action_counter = 0
                                controller_step = False
                                controller_action.fill_(0)

                            controller_log_prob = controller_log_prob.gather(
                                1, Variable(controller_action))
                            controller_log_probs.append(
                                controller_log_prob.cpu())

                            controller_action = int(
                                controller_action.cpu().numpy()[0, 0])
                            controller_actions.append(controller_action)
                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                    # run answerer here
                    ans_acc = [0]
                    if action == 3:
                        # Fewer than 5 visited positions: prepend positions
                        # from the dataset's episode queue to fill 5 frames.
                        if len(pos_queue) < 5:
                            pos_queue = train_loader.dataset.episode_pos_queue[len(
                                pos_queue) - 5:] + pos_queue
                        images = train_loader.dataset.get_frames(
                            h3d, pos_queue[-5:], preprocess=True)
                        images_var = Variable(
                            torch.from_numpy(images).cuda()).view(
                                1, 5, 3, 224, 224)
                        scores, att_probs = ans_model(images_var, question_var)
                        ans_acc, ans_rank = vqa_metrics.compute_ranks(
                            scores.data.cpu(), answer)
                        vqa_metrics.update([ans_acc, ans_rank, 1.0 / ans_rank])

                    # Terminal reward: zero unless the answerer ran and was
                    # correct.
                    rewards.append(h3d.success_reward * ans_acc[0])

                    R = torch.zeros(1, 1)

                    planner_loss = 0
                    controller_loss = 0

                    # Walk the rewards backwards, accumulating the discounted
                    # return (factor 0.99). The baseline subtracted from it is
                    # the second stored value of the 'reward' metric —
                    # presumably a running average; confirm in NavMetric.
                    planner_rev_idx = -1
                    for i in reversed(range(len(rewards))):
                        R = 0.99 * R + rewards[i]
                        advantage = R - nav_metrics.metrics[2][1]

                        if i < len(controller_actions):
                            controller_loss = controller_loss - controller_log_probs[i] * Variable(
                                advantage)

                            # A planner decision is consumed only at steps
                            # where the controller handed control back (0)
                            # and planner log-probs remain.
                            if controller_actions[i] == 0 and planner_rev_idx + len(planner_log_probs) >= 0:
                                planner_loss = planner_loss - planner_log_probs[planner_rev_idx] * Variable(
                                    advantage)
                                planner_rev_idx -= 1

                        elif planner_rev_idx + len(planner_log_probs) >= 0:

                            planner_loss = planner_loss - planner_log_probs[planner_rev_idx] * Variable(
                                advantage)
                            planner_rev_idx -= 1

                    controller_loss /= max(1, len(controller_log_probs))
                    planner_loss /= max(1, len(planner_log_probs))

                    optim.zero_grad()

                    # A loss that is still a plain float means no log-probs
                    # were collected for it; the update is skipped then.
                    if isinstance(planner_loss, float) == False and isinstance(
                            controller_loss, float) == False:
                        p_losses.append(planner_loss.data[0, 0])
                        c_losses.append(controller_loss.data[0, 0])
                        reward_list.append(np.sum(rewards))
                        episode_length_list.append(episode_length)

                        (planner_loss + controller_loss).backward()

                        # The local model is moved to the CPU before its
                        # gradients are handed to the shared model
                        # (presumably copied there by ensure_shared_grads).
                        ensure_shared_grads(nav_model.cpu(), shared_nav_model)
                        optim.step()

                    # Flush statistics once more than 50 episodes have been
                    # collected.
                    if len(reward_list) > 50:

                        nav_metrics.update([
                            p_losses, c_losses, reward_list,
                            episode_length_list
                        ])

                        print(nav_metrics.get_stat_string())
                        if args.to_log == 1:
                            nav_metrics.dump_log()

                        # Curriculum: lengthen the ground-truth prefix once
                        # the reward statistic exceeds 0.35.
                        if nav_metrics.metrics[2][1] > 0.35:
                            mult = min(mult + 0.1, 1.0)

                        p_losses, c_losses, reward_list, episode_length_list = [], [], [], []

                # Load the next set of environments; the pass is finished
                # when none remain.
                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                        if args.to_cache == False:
                            train_loader.dataset._load_envs(
                                start_idx=0, in_order=True)
                else:
                    done = True

        epoch += 1
def eval(rank, args, shared_nav_model, shared_ans_model):
    """Evaluate navigation and answering, checkpointing the best navigator.

    For every question the agent is spawned after 10, 30 and 50 steps of the
    ground-truth action sequence (prefixes longer than the sequence and
    unreachable spawns are recorded as invalid). From there the planner and
    controller act greedily, the answering model is run on the last five
    frames, and per-spawn metrics are recorded:

      * ``d_0`` / ``d_T`` / ``d_D`` / ``d_min``: initial, final, reduction in
        and minimum distance to target;
      * ``r_T`` / ``r_e``: last / any position inside the target room;
      * ``stop``: whether the final action was 3;
      * ``ep_len``; answer ``accuracy``, ``mean_rank`` and
        ``mean_reciprocal_rank``.

    After each epoch, if the third VQA metric (``accuracy_50``) improved and
    ``epoch % args.eval_every == 0`` with logging enabled, the logs are
    dumped and a checkpoint of ``nav_model`` is saved.

    Args:
        rank: Worker index; selects the CUDA device, loader GPU and the
            names of the JSON log files.
        args: Parsed options. ``args.output_nav_log_path`` and
            ``args.output_ans_log_path`` are set as a side effect.
        shared_nav_model: Navigation model whose weights are copied into the
            local model before every batch.
        shared_ans_model: Answering model whose weights are copied into the
            local model at the start of every epoch.
    """
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    # Only the 'pacman' model type is supported; anything else terminates
    # the process.
    if args.model_type == 'pacman':
        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        nav_model = NavPlannerControllerModel(**model_kwargs)
    else:
        exit()

    model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    ans_model = VqaLstmCnnAttentionModel(**model_kwargs)

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': False
    }

    eval_loader = EqaDataLoader(**eval_loader_kwargs)
    print('eval_loader has %d samples' % len(eval_loader.dataset))

    args.output_nav_log_path = os.path.join(args.log_dir,
                                            'nav_eval_' + str(rank) + '.json')
    args.output_ans_log_path = os.path.join(args.log_dir,
                                            'ans_eval_' + str(rank) + '.json')

    t, epoch, best_eval_acc = 0, 0, 0.0

    while epoch < int(args.max_epochs):

        invalids = []

        nav_model.load_state_dict(shared_nav_model.state_dict())
        nav_model.eval()

        ans_model.load_state_dict(shared_ans_model.state_dict())
        ans_model.eval()
        ans_model.cuda()

        # One metric per (quantity, spawn point) pair.
        nav_metrics = NavMetric(
            info={'split': args.eval_split,
                  'thread': rank},
            metric_names=[
                'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50',
                'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30',
                'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30',
                'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10',
                'ep_len_30', 'ep_len_50'
            ],
            log_json=args.output_nav_log_path)

        vqa_metrics = VqaMetric(
            info={'split': args.eval_split,
                  'thread': rank},
            metric_names=[
                'accuracy_10', 'accuracy_30', 'accuracy_50', 'mean_rank_10',
                'mean_rank_30', 'mean_rank_50', 'mean_reciprocal_rank_10',
                'mean_reciprocal_rank_30', 'mean_reciprocal_rank_50'
            ],
            log_json=args.output_ans_log_path)

        if 'pacman' in args.model_type:

            done = False

            while done == False:

                for batch in tqdm(eval_loader):

                    # Always evaluate the latest shared weights.
                    nav_model.load_state_dict(shared_nav_model.state_dict())
                    nav_model.eval()
                    nav_model.cuda()

                    idx, question, answer, actions, action_length = batch
                    metrics_slug = {}

                    h3d = eval_loader.dataset.episode_house

                    # evaluate at multiple initializations
                    for i in [10, 30, 50]:

                        t += 1

                        # Ground-truth path shorter than the requested prefix.
                        if i > action_length[0]:
                            invalids.append([idx[0], i])
                            continue

                        question_var = Variable(question.cuda())

                        controller_step = False
                        planner_hidden = nav_model.planner_nav_rnn.init_hidden(
                            1)

                        # forward through planner till spawn
                        planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = eval_loader.dataset.get_hierarchical_features_till_spawn(
                            actions[0, :action_length[0] + 1].numpy(), i)

                        planner_actions_in_var = Variable(
                            planner_actions_in.cuda())
                        planner_img_feats_var = Variable(
                            planner_img_feats.cuda())

                        # Warm up the planner's hidden state on the prefix.
                        for step in range(planner_actions_in.size(0)):

                            planner_scores, planner_hidden = nav_model.planner_step(
                                question_var, planner_img_feats_var[step].view(
                                    1, 1, 3200),
                                planner_actions_in_var[step].view(
                                    1, 1), planner_hidden)

                        # Choose the first action (see the same logic in the
                        # rollout below): a pending controller step reuses
                        # the pending action, otherwise the planner's greedy
                        # action is taken.
                        if controller_step == True:

                            controller_img_feat_var = Variable(
                                controller_img_feat.cuda())
                            controller_action_in_var = Variable(
                                torch.LongTensor(1, 1).fill_(
                                    int(controller_action_in)).cuda())

                            controller_scores = nav_model.controller_step(
                                controller_img_feat_var.view(1, 1, 3200),
                                controller_action_in_var.view(1, 1),
                                planner_hidden[0])

                            prob = F.softmax(controller_scores, dim=1)
                            controller_action = int(
                                prob.max(1)[1].data.cpu().numpy()[0])

                            if controller_action == 1:
                                controller_step = True
                            else:
                                controller_step = False

                            action = int(controller_action_in)
                            # The planner input is the action index shifted
                            # by one.
                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                        else:

                            prob = F.softmax(planner_scores, dim=1)
                            action = int(prob.max(1)[1].data.cpu().numpy()[0])

                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                        h3d.env.reset(
                            x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                        init_dist_to_target = h3d.get_dist_to_target(
                            h3d.env.cam.pos)
                        if init_dist_to_target < 0:  # unreachable
                            invalids.append([idx[0], i])
                            continue

                        episode_length = 0
                        episode_done = True
                        controller_action_counter = 0

                        dists_to_target, pos_queue, pred_actions = [
                            init_dist_to_target
                        ], [init_pos], []
                        planner_actions, controller_actions = [], []

                        # Action 3 is the stop action (see 'stop_' metric).
                        if action != 3:

                            # take the first step
                            img, _, _ = h3d.step(action)
                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = eval_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224, 224).cuda())).view(
                                    1, 1, 3200)

                            for step in range(args.max_episode_length):

                                episode_length += 1

                                # The planner picks a new (greedy) action
                                # only when the controller did not ask to
                                # repeat the last one.
                                if controller_step == False:
                                    planner_scores, planner_hidden = nav_model.planner_step(
                                        question_var, img_feat_var,
                                        Variable(action_in), planner_hidden)

                                    prob = F.softmax(planner_scores, dim=1)
                                    action = int(
                                        prob.max(1)[1].data.cpu().numpy()[0])
                                    planner_actions.append(action)

                                pred_actions.append(action)
                                img, _, episode_done = h3d.step(action)

                                episode_done = episode_done or episode_length >= args.max_episode_length

                                img = torch.from_numpy(img.transpose(
                                    2, 0, 1)).float() / 255.0
                                img_feat_var = eval_loader.dataset.cnn(
                                    Variable(img.view(1, 3, 224, 224)
                                             .cuda())).view(1, 1, 3200)

                                dists_to_target.append(
                                    h3d.get_dist_to_target(h3d.env.cam.pos))
                                pos_queue.append([
                                    h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                    h3d.env.cam.pos.z, h3d.env.cam.yaw
                                ])

                                if episode_done == True:
                                    break

                                # query controller to continue or not
                                controller_action_in = Variable(
                                    torch.LongTensor(1,
                                                     1).fill_(action).cuda())
                                controller_scores = nav_model.controller_step(
                                    img_feat_var, controller_action_in,
                                    planner_hidden[0])

                                prob = F.softmax(controller_scores, dim=1)
                                controller_action = int(
                                    prob.max(1)[1].data.cpu().numpy()[0])

                                # The same action is repeated at most 4
                                # times in a row before control returns to
                                # the planner.
                                if controller_action == 1 and controller_action_counter < 4:
                                    controller_action_counter += 1
                                    controller_step = True
                                else:
                                    controller_action_counter = 0
                                    controller_step = False
                                    controller_action = 0

                                controller_actions.append(controller_action)

                                action_in = torch.LongTensor(
                                    1, 1).fill_(action + 1).cuda()

                        # run answerer here
                        # Fewer than 5 visited positions: prepend positions
                        # from the dataset's episode queue to fill 5 frames.
                        if len(pos_queue) < 5:
                            pos_queue = eval_loader.dataset.episode_pos_queue[len(
                                pos_queue) - 5:] + pos_queue
                        images = eval_loader.dataset.get_frames(
                            h3d, pos_queue[-5:], preprocess=True)
                        images_var = Variable(
                            torch.from_numpy(images).cuda()).view(
                                1, 5, 3, 224, 224)
                        scores, att_probs = ans_model(images_var, question_var)
                        ans_acc, ans_rank = vqa_metrics.compute_ranks(
                            scores.data.cpu(), answer)

                        pred_answer = scores.max(1)[1].data[0]

                        print('[Q_GT]', ' '.join([
                            eval_loader.dataset.vocab['questionIdxToToken'][x]
                            for x in question[0] if x != 0
                        ]))
                        print('[A_GT]', eval_loader.dataset.vocab[
                            'answerIdxToToken'][answer[0]])
                        print('[A_PRED]', eval_loader.dataset.vocab[
                            'answerIdxToToken'][pred_answer])

                        # compute stats
                        metrics_slug['accuracy_' + str(i)] = ans_acc[0]
                        metrics_slug['mean_rank_' + str(i)] = ans_rank[0]
                        metrics_slug['mean_reciprocal_rank_'
                                     + str(i)] = 1.0 / ans_rank[0]

                        metrics_slug['d_0_' + str(i)] = dists_to_target[0]
                        metrics_slug['d_T_' + str(i)] = dists_to_target[-1]
                        metrics_slug['d_D_' + str(
                            i)] = dists_to_target[0] - dists_to_target[-1]
                        metrics_slug['d_min_' + str(i)] = np.array(
                            dists_to_target).min()
                        metrics_slug['ep_len_' + str(i)] = episode_length
                        if action == 3:
                            metrics_slug['stop_' + str(i)] = 1
                        else:
                            metrics_slug['stop_' + str(i)] = 0

                        inside_room = []
                        for p in pos_queue:
                            inside_room.append(
                                h3d.is_inside_room(
                                    p, eval_loader.dataset.target_room))
                        if inside_room[-1] == True:
                            metrics_slug['r_T_' + str(i)] = 1
                        else:
                            metrics_slug['r_T_' + str(i)] = 0
                        if any([x == True for x in inside_room]) == True:
                            metrics_slug['r_e_' + str(i)] = 1
                        else:
                            metrics_slug['r_e_' + str(i)] = 0

                    # navigation metrics
                    # Metrics missing for this sample (invalid spawn) are
                    # filled with the first stored value of that metric.
                    metrics_list = []
                    for name in nav_metrics.metric_names:
                        if name not in metrics_slug:
                            metrics_list.append(nav_metrics.metrics[
                                nav_metrics.metric_names.index(name)][0])
                        else:
                            metrics_list.append(metrics_slug[name])

                    nav_metrics.update(metrics_list)

                    # vqa metrics
                    metrics_list = []
                    for name in vqa_metrics.metric_names:
                        if name not in metrics_slug:
                            metrics_list.append(vqa_metrics.metrics[
                                vqa_metrics.metric_names.index(name)][0])
                        else:
                            metrics_list.append(metrics_slug[name])

                    vqa_metrics.update(metrics_list)

                # Printing the summary is best-effort, but only ordinary
                # errors are tolerated: a bare `except` here used to swallow
                # KeyboardInterrupt / SystemExit as well.
                try:
                    print(nav_metrics.get_stat_string(mode=0))
                    print(vqa_metrics.get_stat_string(mode=0))
                except Exception as err:
                    print('could not print metric summary:', err)

                print('epoch', epoch)
                print('invalids', len(invalids))

                # Load the next set of environments; the pass is finished
                # when none remain.
                eval_loader.dataset._load_envs()
                if len(eval_loader.dataset.pruned_env_set) == 0:
                    done = True

        epoch += 1

        # checkpoint if best val accuracy
        if vqa_metrics.metrics[2][0] > best_eval_acc:  # ans_acc_50
            best_eval_acc = vqa_metrics.metrics[2][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                vqa_metrics.dump_log()
                nav_metrics.dump_log()

                model_state = get_state(nav_model)

                # Store the options without private (underscore) entries.
                ad = {
                    key: value
                    for key, value in dict(args.__dict__).items()
                    if key[0] != '_'
                }

                checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch}

                checkpoint_path = '%s/epoch_%d_ans_50_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

        print('[best_eval_ans_acc_50:%.04f]' % best_eval_acc)

        # Rewind the environments for the next epoch.
        eval_loader.dataset._load_envs(start_idx=0, in_order=True)
shared_nav_model.load_state_dict(checkpoint['state']) # Load answering model print('Loading answering checkpoint from %s' % args.ans_checkpoint_path) ans_checkpoint = torch.load( args.ans_checkpoint_path, map_location={ 'cuda:0': 'cpu' }) ans_model_kwargs = {'vocab': load_vocab(args.vocab_json)} shared_ans_model = VqaLstmCnnAttentionModel(**ans_model_kwargs) shared_ans_model.share_memory() print('Loading params from checkpoint: %s' % args.ans_checkpoint_path) shared_ans_model.load_state_dict(ans_checkpoint['state']) if args.mode == 'eval': eval(0, args, shared_nav_model, shared_ans_model) elif args.mode == 'train': train(0, args, shared_nav_model, shared_ans_model) else: processes = [] p = mp.Process( target=eval, args=(0, args, shared_nav_model, shared_ans_model))