def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    name = "compiled_dataset_08131950"  # add 50 back in
    embed_dim = 300  # switch this later!!
    embed_size = embed_dim

    with open('data/' + name + '_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         vocabulary=vocab)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(
                file_name, num_trajectories=4, subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
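            # (added note) masks zero out the recurrent state and return
            # accumulation at true episode ends; bad_masks additionally flag
            # time-limit terminations so that, with use_proper_time_limits,
            # compute_returns does not treat a timeout as a real failure.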
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.model_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f},"
                " entropy {:.2f}, value loss {:.2f}, action loss {:.2f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            env = make_vec_envs(args.env_name, args.seed + 101, 1, None, None,
                                device, False, vocabulary=vocab)

            recurrent_hidden_states = torch.zeros(
                1, actor_critic.recurrent_hidden_state_size)
            # note: masks is never updated in the loop below, so a recurrent
            # policy's hidden state is effectively reset at every step
            masks = torch.zeros(1, 1)

            obs = env.reset()
            count = {}  # initial step budget -> [wins, attempts]

            for i in range(100):
                tot_steps = obs[0, 0].item()
                for step in range(98):
                    with torch.no_grad():
                        value, action, _, recurrent_hidden_states = actor_critic.act(
                            obs, recurrent_hidden_states, masks, True)

                    # Observe reward and next obs
                    obs, reward, done, _ = env.step(action)

                    if done:
                        if tot_steps in count:
                            count[tot_steps][0] += 1
                            count[tot_steps][1] += 1
                        else:
                            count[tot_steps] = [1, 1]
                        break

                if not done:
                    obs = env.reset()
                    if tot_steps in count:
                        count[tot_steps][1] += 1  # attempt without a win
                    else:
                        count[tot_steps] = [0, 1]

            # assumes the save block above has run at least once so that
            # save_path is defined
            filename = os.path.join(save_path, args.model_name) + ".txt"
            append_write = 'a' if os.path.exists(filename) else 'w'  # append if the file exists
            with open(filename, append_write) as f:
                f.write(str(j) + "\n")
                f.write(str(count) + "\n")
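
# Illustrative helper, not part of the original script: the eval block in
# main() dumps `count` dicts of the form {step_budget: [wins, attempts]};
# this turns one such dict into per-budget success rates.
def success_rates(count):
    return {budget: wins / float(attempts)
            for budget, (wins, attempts) in count.items()}
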
with open(name + 'inventories', 'rb') as f:
    train_inventories = pickle.load(f)

with open(name + 'actions', 'rb') as f:
    train_actions = pickle.load(f)

with open(name + 'goals', 'rb') as f:
    train_goals = pickle.load(f)

with open(name + 'instructions', 'rb') as f:
    train_instructions = pickle.load(f)

with open(name + 'all_instructions', 'rb') as f:
    all_instructions = pickle.load(f)

vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

temp = np.zeros((1, 300), dtype=np.float32)
vocab_weights = np.concatenate((vocab_weights, temp), axis=0)

lstm_embed_dim = 32

train_loss = []
train_loss1 = []
val_loss = []
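# Added sanity check (a sketch, assuming the four pickles are parallel lists
# with one training example per index):
assert len(train_inventories) == len(train_actions) \
    == len(train_goals) == len(train_instructions), \
    "training pickles are expected to be aligned"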
def load_model_play_game_with_lang_glove():

    # load model
    if torch.cuda.is_available():
        print("using cuda")
        device = torch.device('cuda')
    else:
        print("using cpu")
        device = torch.device('cpu')

    name = "compiled_dataset_08131950"  # add 50 back in
    embed_dim = 300  # switch this later!!
    embed_size = embed_dim

    if embed_dim == 50:
        glove = vocabtorch.GloVe(name='6B', dim=50)
    else:
        glove = vocabtorch.GloVe(name='840B', dim=300)

    with open('data/' + name + '_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    temp = np.zeros((1, 300), dtype=np.float32)
    temp1 = np.random.uniform(-0.01, 0.01, (1, 300)).astype("float32")  # unused
    vocab_weights = np.concatenate((vocab_weights, temp), axis=0)
    vocab_weights = torch.Tensor(vocab_weights).to(device)

    language_model = LanguageWithAttentionGLOVE(
        len(vocab), embed_dim, vocab_weights, training=False)
    language_model.to(device)
    language_model.load_state_dict(
        torch.load("TRAINED_MODELS/LanguageWithAttentionGLOVE_clipped.pt"))
    language_model.eval()

    # or do the all obs.
    action_model = AllObsPredictAtten(embed_dim, vocab_weights,
                                      vocab_words=vocab)
    action_model.to(device)
    action_model.load_state_dict(
        torch.load("TRAINED_MODELS/AllObsPredictAtten_both.pt"))
    action_model.eval()

    # action_model = CNNAction(embed_dim, vocab, vocab_weights)
    # action_model.to(device)
    # action_model.load_state_dict(
    #     torch.load("TRAINED_MODELS/CNNAction_8epochs_nllsoftmax.pt"))
    # action_model.eval()

    # play a fixed number of games, printing the running win count
    tot_games = 20
    tot_win = 0

    for i in range(tot_games):
        res, sentences = play_game_w_language_glove(
            language_model, action_model, glove, embed_size, vocab,
            vocab_weights, device)
        tot_win = tot_win + res
        print(tot_win, i + 1)

    print(tot_win, tot_games)
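
# Hypothetical convenience wrapper (not in the original file): express the
# final tally printed above as a win rate.
def report_win_rate(tot_win, tot_games):
    print("won {}/{} games ({:.0%})".format(
        tot_win, tot_games, tot_win / float(tot_games)))
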
def play_game_by_hand_glove():

    if torch.cuda.is_available():
        print("using cuda")
        device = torch.device('cuda')
    else:
        print("using cpu")
        device = torch.device('cpu')

    name = "compiled_dataset_08131950"  # add 50 back in
    embed_dim = 300  # switch this later!!
    embed_size = embed_dim

    with open('data/' + name + '_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)
    vocab_weights = torch.from_numpy(vocab_weights).to(device)
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    lstm_embed_dim = 16

    # model = LanguageNetv1(len(vocab), lstm_embed_dim)
    # model = LanguageNetv2(len(vocab), embed_dim, vocab_weights, training=False)
    # model = LanguageWithAttention(len(vocab), embed_dim, vocab_weights, training=False)
    model = LanguageWithAttentionGLOVE(
        len(vocab), embed_dim, vocab_weights, training=False)
    model.to(device)
    model.load_state_dict(
        torch.load("TRAINED_MODELS/LanguageWithAttentionGLOVE_01RMSProp.pt"))

    if embed_size == 300:
        glove = vocabtorch.GloVe(name='840B', dim=300)
    elif embed_size == 50:
        glove = vocabtorch.GloVe(name='6B', dim=50)

    count = 0
    game = generate_new_game()
    print(game.game.goal)

    past_moves = []

    # cap the interactive session at 250 moves
    while not game.is_over() and count < 250:
        count = count + 1

        state = game.observe()['observation'][0]

        # fix this printing so it is easier..
        for line in state:
            print(line)

        goal = game.game.goal
        inventory = game.game.inventory

        states_embedding = torch.from_numpy(
            np.array([get_grid_embedding(state, glove, embed_size)]))
        states_onehot = torch.from_numpy(
            np.array([one_hot_grid(state, glove, embed_size)]))
        goal = torch.from_numpy(get_goal_embedding(goal, glove, embed_size))
        inventory = torch.Tensor(
            np.array([get_inventory_embedding(inventory, glove, embed_size)]))

        states_onehot = states_onehot.to(device)
        states_embedding = states_embedding.to(device)
        goal = goal.to(device)
        inventory = inventory.to(device)

        sampled_ids, hiddens = model.get_hidden_state_new(
            states_embedding, states_onehot, inventory, goal, device, vocab,
            vocab_weights)

        sampled_caption = []
        for word_id in sampled_ids[0]:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        print(sentence)

        print('1:up, 2:down, 3:left, 4:right, 5:toggle, 6:grab, 7:mine, 0: craft')
        a = input("Enter a move: ")
        action = get_action_name(int(a))
        game.act(action)
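
# Hypothetical helper (not in the original file): the loop above crashes on
# malformed input at int(a); a hardened prompt would retry instead.
def prompt_move():
    valid = {'0', '1', '2', '3', '4', '5', '6', '7'}
    while True:
        a = input("Enter a move: ").strip()
        if a in valid:
            return int(a)
        print("invalid move, expected one of", sorted(valid))
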
    '--non-det',
    action='store_true',
    default=False,
    help='whether to use a non-deterministic policy')
args = parser.parse_args()

args.det = not args.non_det

embed_dim = 300
embed_size = embed_dim

with open('data/dataset_all_instructions', 'rb') as f:
    all_instructions = pickle.load(f)

vocab, vocab_weights = build_vocabulary(all_instructions, 'blah', embed_dim)
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

device = torch.device("cuda:0")

env = make_vec_envs(args.env_name, args.seed + 101, 1, None, None, device,
                    False, vocabulary=vocab)

actor_critic, ob_rms = torch.load(args.load_dir + ".pt")

vec_norm = get_vec_normalize(env)
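
# Added sketch: the upstream pytorch-a2c-ppo-acktr enjoy script copies the
# saved observation statistics into the eval env at this point; assuming the
# same VecNormalize wrapper, that looks like:
if vec_norm is not None:
    vec_norm.eval()           # freeze running stats during evaluation
    vec_norm.ob_rms = ob_rms  # reuse the statistics saved with the model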