def thunk():
    _env = grounding_env.GroundingEnv(args, args.seed + rank, img_encoder=None,
                                      fixed=False, manual_set_task=True,
                                      n_stack=variant['n_stack'])
    _env.game_init()
    _env.tasks = _env.sample_tasks(variant['task_params']['n_tasks'],
                                   variants=variant['all_tasks'])
    return _env
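# NOTE: a minimal sketch of how an env factory like thunk() is typically
# consumed. The wrapper (SubprocVecEnv from OpenAI baselines) and n_envs are
# assumptions, not part of this excerpt: each callable is invoked inside its
# own worker process, so every environment builds its own seed and task set.
#
#     from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
#     envs = SubprocVecEnv([thunk for _ in range(n_envs)])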
if args.evaluate == 0:
    args.use_train_instructions = 1
    log_filename = "train.log"
elif args.evaluate == 1:
    args.use_train_instructions = 1
    args.num_processes = 0
    log_filename = "test-MT.log"
elif args.evaluate == 2:
    args.use_train_instructions = 0
    args.num_processes = 0
    log_filename = "test-ZSL.log"
else:
    assert False, "Invalid evaluation type"

env = grounding_env.GroundingEnv(args)
args.input_size = len(env.word_to_idx)

# Setup logging
if not os.path.exists(args.dump_location):
    os.makedirs(args.dump_location)
logging.basicConfig(filename=args.dump_location + log_filename,
                    level=logging.INFO)

shared_model = A3C_LSTM_GA(args)

# Load the model
if args.load != "0":
    shared_model.load_state_dict(
        torch.load(args.load, map_location=lambda storage, loc: storage))
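# A minimal sketch of how the train/test workers are usually launched around
# the shared model. The exact launch code is not shown in this excerpt; the
# standard A3C pattern (an assumption here) is torch.multiprocessing plus
# share_memory(), with one test process and args.num_processes train workers:
#
#     import torch.multiprocessing as mp
#     shared_model.share_memory()
#     processes = []
#     p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
#     p.start()
#     processes.append(p)
#     for rank in range(args.num_processes):
#         p = mp.Process(target=train, args=(rank, args, shared_model))
#         p.start()
#         processes.append(p)
#     for p in processes:
#         p.join()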
def train(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)
    if args.load != "0":
        print(str(rank) + " Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))
    model.train()

    optimizer = optim.SGD(shared_model.parameters(), lr=args.lr)

    p_losses = []
    v_losses = []

    (image, instruction), _, _, _ = env.reset()

    # Get word indices for the instruction
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = np.array(instruction_idx)

    image = torch.from_numpy(image).float() / 255.0
    instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

    done = True
    episode_length = 0
    num_iters = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            episode_length = 0
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())

            value, logit, (hx, cx) = model((Variable(image.unsqueeze(0)),
                                            Variable(instruction_idx),
                                            (tx, hx, cx)))

            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            # Sample an action from the policy distribution
            action = prob.multinomial(1).data
            log_prob = log_prob.gather(1, Variable(action))
            action = action.numpy()[0, 0]

            (image, _), reward, done, _ = env.step(action)

            done = done or episode_length >= args.max_episode_length

            if done:
                (image, instruction), _, _, _ = env.reset()
                instruction_idx = []
                for word in instruction.split(" "):
                    instruction_idx.append(env.word_to_idx[word])
                instruction_idx = np.array(instruction_idx)
                instruction_idx = torch.from_numpy(
                    instruction_idx).view(1, -1)

            image = torch.from_numpy(image).float() / 255.0

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the value function if the rollout was
        # truncated mid-episode
        R = torch.zeros(1, 1)
        if not done:
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())
            value, _, _ = model((Variable(image.unsqueeze(0)),
                                 Variable(instruction_idx),
                                 (tx, hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)

        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        p_losses.append(policy_loss.data[0, 0])
        v_losses.append(value_loss.data[0, 0])

        if len(p_losses) > 1000:
            num_iters += 1
            print(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))]))
            logging.info(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))]))
            p_losses = []
            v_losses = []

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
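# train() relies on ensure_shared_grads(), which is defined elsewhere. A
# common A3C implementation (an assumption, not necessarily this repo's exact
# code) points the shared model's gradients at the worker's gradients before
# optimizer.step(), skipping the copy once they are already linked:
#
#     def ensure_shared_grads(model, shared_model):
#         for param, shared_param in zip(model.parameters(),
#                                        shared_model.parameters()):
#             if shared_param.grad is not None:
#                 return
#             shared_param._grad = param.grad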
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)
    if args.load != "0":
        print("Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))
    model.eval()

    (image, instruction), _, _, _ = env.reset()

    # Print instruction while evaluating and visualizing
    if args.evaluate != 0 and args.visualize == 1:
        print("Instruction: {}".format(instruction))

    # Get word indices for the instruction
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = np.array(instruction_idx)

    image = torch.from_numpy(image).float() / 255.0
    instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

    reward_sum = 0
    done = True

    start_time = time.time()

    episode_length = 0
    rewards_list = []
    accuracy_list = []
    episode_length_list = []
    num_episode = 0
    best_reward = 0.0
    test_freq = 50
    while True:
        episode_length += 1
        if done:
            if args.evaluate == 0:
                model.load_state_dict(shared_model.state_dict())

            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        tx = Variable(torch.from_numpy(np.array([episode_length])).long(),
                      volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(image.unsqueeze(0), volatile=True),
             Variable(instruction_idx, volatile=True),
             (tx, hx, cx)))

        # Greedy action selection at test time
        prob = F.softmax(logit, dim=1)
        action = prob.max(1)[1].data.numpy()

        (image, _), reward, done, _ = env.step(action[0])
        done = done or episode_length >= args.max_episode_length

        reward_sum += reward
        if done:
            num_episode += 1
            rewards_list.append(reward_sum)
            # Print reward while evaluating and visualizing
            if args.evaluate != 0 and args.visualize == 1:
                print("Total reward: {}".format(reward_sum))

            episode_length_list.append(episode_length)
            if reward == CORRECT_OBJECT_REWARD:
                accuracy = 1
            else:
                accuracy = 0
            accuracy_list.append(accuracy)

            if len(rewards_list) >= test_freq:
                print(" ".join([
                    "Time {},".format(time.strftime(
                        "%Hh %Mm %Ss",
                        time.gmtime(time.time() - start_time))),
                    "Avg Reward {},".format(np.mean(rewards_list)),
                    "Avg Accuracy {},".format(np.mean(accuracy_list)),
                    "Avg Ep length {},".format(np.mean(episode_length_list)),
                    "Best Reward {}".format(best_reward)]))
                logging.info(" ".join([
                    "Time {},".format(time.strftime(
                        "%Hh %Mm %Ss",
                        time.gmtime(time.time() - start_time))),
                    "Avg Reward {},".format(np.mean(rewards_list)),
                    "Avg Accuracy {},".format(np.mean(accuracy_list)),
                    "Avg Ep length {},".format(np.mean(episode_length_list)),
                    "Best Reward {}".format(best_reward)]))

                if np.mean(rewards_list) >= best_reward and args.evaluate == 0:
                    torch.save(model.state_dict(),
                               args.dump_location + "model_best")
                    best_reward = np.mean(rewards_list)

                rewards_list = []
                accuracy_list = []
                episode_length_list = []

            reward_sum = 0
            episode_length = 0
            (image, instruction), _, _, _ = env.reset()
            # Print instruction while evaluating and visualizing
            if args.evaluate != 0 and args.visualize == 1:
                print("Instruction: {}".format(instruction))

            # Get word indices for the instruction
            instruction_idx = []
            for word in instruction.split(" "):
                instruction_idx.append(env.word_to_idx[word])
            instruction_idx = np.array(instruction_idx)
            instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

        image = torch.from_numpy(image).float() / 255.0
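# NOTE: Variable(..., volatile=True) is the pre-0.4 PyTorch idiom for running
# inference without autograd bookkeeping. On PyTorch >= 0.4 the equivalent
# sketch (an assumption about porting, not code from this repo) is:
#
#     with torch.no_grad():
#         value, logit, (hx, cx) = model(
#             (image.unsqueeze(0), instruction_idx, (tx, hx, cx)))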
def train(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = grounding_env.GroundingEnv(args)
    env.game_init()

    model = A3C_LSTM_GA(args)
    if args.load != "0":
        print(str(rank) + " Loading model ... " + args.load)
        model.load_state_dict(
            torch.load(args.load, map_location=lambda storage, loc: storage))
    model.train()

    optimizer = optim.SGD(shared_model.parameters(), lr=args.lr)

    p_losses = []
    v_losses = []

    (images, instruction), _, _, _ = env.reset()

    # Get word indices for the instruction
    instruction_idx = []
    for word in instruction.split(" "):
        instruction_idx.append(env.word_to_idx[word])
    instruction_idx = torch.from_numpy(np.array(instruction_idx)).view(1, -1)

    # We stack because the model takes in the last 5 images.
    images = torch.from_numpy(np.stack(images)).float() / 255.0

    done = True

    lamb = 0.2  # weight on the policy gradient loss term added below

    '''
    # Curiosity bookkeeping
    prevState = images
    prevAction = None
    beta = .2
    lamb = .2  # TODO tune this. Language grounding is important
    # eta = 1  # TODO tune this hyperparameter
    '''

    episode_length = 0
    num_iters = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            episode_length = 0
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Optimizing over this
        policy_loss = Variable(torch.zeros(1, 1))
        value_loss = 0

        for step in range(args.num_steps):
            episode_length += 1
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())

            value, logit, (hx, cx) = model(
                (Variable(images), Variable(instruction_idx), (tx, hx, cx)),
                teacher=True, inverse=False)

            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            # Action is sampled once from the multinomial distribution
            action = prob.multinomial(1).data
            log_prob = log_prob.gather(1, Variable(action))
            oldAction = action
            action = action.numpy()[0, 0]

            # Process entire stack of last 5 images through the CNN
            (images, _), reward, done, _ = env.step(action)

            done = done or episode_length >= args.max_episode_length

            if done:
                (images, instruction), _, _, _ = env.reset()
                instruction_idx = []
                for word in instruction.split(" "):
                    instruction_idx.append(env.word_to_idx[word])
                instruction_idx = np.array(instruction_idx)
                instruction_idx = torch.from_numpy(instruction_idx).view(1, -1)

            # We stack now because we take in the last 5 images.
            images = torch.from_numpy(np.stack(images)).float() / 255.0

            # Curiosity loss and reward. This is done in pretraining now.
            '''
            if prevAction is not None:
                pred_action = model((Variable(prevState), Variable(images)),
                                    teacher=False, inverse=True)
                a_prob = F.softmax(pred_action)
                a_loss = 1 / 2 * torch.norm(a_prob - prob)
                # Because we have access to softmax, might as well use it TODO
                # actionTensor = torch.eye(3)[prevAction[0]]

                # We are predicting the final next state
                pred_state = model((Variable(prevState), prevAction[0]),
                                   teacher=False, inverse=False)
                s_loss = 1 / 2 * torch.norm(
                    pred_state -
                    model.getImageRep(Variable(images[-1].unsqueeze(0))))

                policy_loss += (1 - beta) * a_loss + beta * s_loss
                # curReward += eta * s_loss.item()

            # Updating curiosity
            prevAction = oldAction
            prevState = images
            '''

            values.append(value)  # critic in actor-critic
            log_probs.append(log_prob)
            rewards.append(reward)  # +2 if found, -.1 if not found, plus intrinsic

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            tx = Variable(torch.from_numpy(np.array([episode_length])).long())
            value, _, _ = model(
                (Variable(images), Variable(instruction_idx), (tx, hx, cx)),
                teacher=True, inverse=False)
            R = value.data

        values.append(Variable(R))
        R = Variable(R)

        new_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            new_loss = new_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]
        policy_loss += lamb * new_loss

        optimizer.zero_grad()

        p_losses.append(policy_loss.data[0, 0])
        v_losses.append(value_loss.data[0, 0])

        if len(p_losses) > 1000:
            num_iters += 1
            print(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))]))
            logging.info(" ".join([
                "Training thread: {}".format(rank),
                "Num iters: {}K".format(num_iters),
                "Avg policy loss: {}".format(np.mean(p_losses)),
                "Avg value loss: {}".format(np.mean(v_losses))]))
            p_losses = []
            v_losses = []

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
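# For reference, the GAE loop above implements (with gamma = args.gamma and
# lambda = args.tau):
#
#     delta_t   = r_t + gamma * V(s_{t+1}) - V(s_t)
#     A_t^{GAE} = sum_{l >= 0} (gamma * lambda)^l * delta_{t+l}
#
# computed backwards over the rollout via gae = gamma * lambda * gae + delta_t,
# with the bootstrap value R = V(s_T) used when the rollout is truncated
# mid-episode rather than ending at a terminal state.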