Example #1
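# Training entry point: builds the instruction vocabulary, creates vectorized
# environments, and trains an A2C/PPO/ACKTR policy (optionally with GAIL),
# with periodic checkpointing and evaluation.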
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    name = "compiled_dataset_08131950"  #add 50 back in
    embed_dim = 300  # switch this later!!
    embed_size = embed_dim

    with open('data/' + name + '_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)
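    # vocab maps words to indices; vocab_weights is the matching word-embedding matrix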

    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         vocabulary=vocab)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
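        # GAIL: a discriminator is trained on expert trajectories and later used
        # to relabel the rollout rewards (see the update loop below)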
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    #print(args.num_env_steps)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
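    # each update consumes num_steps * num_processes environment steps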
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    #print(num_updates)
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
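            # bad_masks marks transitions cut short by a time limit ('bad_transition')
            # rather than ended by a true termination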
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.model_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
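            # periodic evaluation: run 100 rollouts in a fresh single-process env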

            env = make_vec_envs(args.env_name,
                                args.seed + 101,
                                1,
                                None,
                                None,
                                device,
                                False,
                                vocabulary=vocab)

            recurrent_hidden_states = torch.zeros(
                1, actor_critic.recurrent_hidden_state_size)
            masks = torch.zeros(1, 1)

            obs = env.reset()

            count = {}
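            # count maps the episode's initial obs[0, 0] value to [successes, attempts]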
            for i in range(100):
                tot_steps = obs[0, 0].item()

                for step in range(98):

                    with torch.no_grad():
                        value, action, _, recurrent_hidden_states = actor_critic.act(
                            obs, recurrent_hidden_states, masks, True)

                    # Observe reward and next obs
                    obs, reward, done, _ = env.step(action)

                    if done:
                        if tot_steps in count:
                            count[tot_steps][0] = count[tot_steps][0] + 1
                            count[tot_steps][1] = count[tot_steps][1] + 1
                        else:
                            count[tot_steps] = [1, 1]
                        break

                if not done:
                    obs = env.reset()

                    if tot_steps in count:
                        count[tot_steps][0] = count[tot_steps][0] + 0
                        count[tot_steps][1] = count[tot_steps][1] + 1
                    else:
                        count[tot_steps] = [0, 1]

            #f=open(os.path.join(save_path, args.model_name) + ".txt", "a+")

            filename = os.path.join(save_path, args.model_name) + ".txt"
            if os.path.exists(filename):
                append_write = 'a'  # append if already exists
            else:
                append_write = 'w'  # make a new file if not

            f = open(filename, append_write)

            f.write(str(j) + "\n")
            f.write(str(count) + "\n")
            f.close()
Example #2
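# Load the preprocessed splits (inventories, actions, goals, instructions)
# and build the vocabulary used for training.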
with open(name + 'inventories', 'rb') as f:
    train_inventories = pickle.load(f)

with open(name + 'actions', 'rb') as f:
    train_actions = pickle.load(f)

with open(name + 'goals', 'rb') as f:
    train_goals = pickle.load(f)

with open(name + 'instructions', 'rb') as f:
    train_instructions = pickle.load(f)

with open(name + 'all_instructions', 'rb') as f:
    all_instructions = pickle.load(f)

vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)

vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

temp = np.zeros((1, 300), dtype=np.float32)

vocab_weights = np.concatenate((vocab_weights, temp), axis=0)
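# (an all-zero row is appended so the embedding matrix covers one extra index)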

lstm_embed_dim = 32

train_loss = []
train_loss1 = []
val_loss = []
Example #3
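# Load a trained instruction-generation model and an action model, then play a
# number of games end to end with GloVe embeddings, tracking the win count.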
def load_model_play_game_with_lang_glove():

    # load model

    if torch.cuda.is_available():
        print("using cuda")
        device = torch.device('cuda')
    else:
        print("using cpu")
        device = torch.device('cpu')

    name = "compiled_dataset_08131950" #add 50 back in
    embed_dim = 300 # switch this later!!
    embed_size = embed_dim

    if embed_dim == 50:
        glove = vocabtorch.GloVe(name='6B', dim=50)
    else:
        glove = vocabtorch.GloVe(name='840B', dim=300)

    with open('data/'+name+'_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)

    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    temp = np.zeros((1,300), dtype=np.float32)
    temp1 = np.random.uniform(-0.01, 0.01, (1,300)).astype("float32")

    vocab_weights = np.concatenate((vocab_weights, temp), axis=0)

    vocab_weights = torch.Tensor(vocab_weights).to(device)

    language_model = LanguageWithAttentionGLOVE(len(vocab), embed_dim, vocab_weights, training=False)
    language_model.to(device)
    language_model.load_state_dict(torch.load("TRAINED_MODELS/LanguageWithAttentionGLOVE_clipped.pt"))
    language_model.eval()

    # or do the all obs. 
    action_model = AllObsPredictAtten(embed_dim, vocab_weights, vocab_words=vocab)
    action_model.to(device)
    action_model.load_state_dict(torch.load("TRAINED_MODELS/AllObsPredictAtten_both.pt"))
    action_model.eval()

    #action_model = CNNAction(embed_dim, vocab, vocab_weights)
    #action_model.to(device)
    #action_model.load_state_dict(torch.load("TRAINED_MODELS/CNNAction_8epochs_nllsoftmax.pt"))
    #action_model.eval()

    # play x number of games:
    tot_games = 20
    tot_win = 0
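    # play tot_games games and keep a running count of wins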
    for i in range(tot_games):
        #print(i)
        res, sentences = play_game_w_language_glove(language_model, action_model, glove, embed_size, vocab, vocab_weights, device)
        #print(res)
        #print(sentences)
        tot_win = tot_win + res
        print(tot_win, i+1)

    print(tot_win, tot_games)
Example #4
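# Interactive play: the trained language model proposes an instruction each turn,
# the grid is printed, and a human types the action to execute.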
def play_game_by_hand_glove():

    if torch.cuda.is_available():
        print("using cuda")
        device = torch.device('cuda')
    else:
        print("using cpu")
        device = torch.device('cpu')

    name = "compiled_dataset_08131950" #add 50 back in
    embed_dim = 300 # switch this later!!
    embed_size = embed_dim

    with open('data/'+name+'_all_instructions', 'rb') as f:
        all_instructions = pickle.load(f)

    vocab, vocab_weights = build_vocabulary(all_instructions, name, embed_dim)

    vocab_weights = torch.from_numpy(vocab_weights).to(device)

    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    lstm_embed_dim = 16

    #model = LanguageNetv1(len(vocab), lstm_embed_dim)
    #model = LanguageNetv2(len(vocab), embed_dim, vocab_weights, training=False)
    #model = LanguageWithAttention(len(vocab), embed_dim, vocab_weights, training=False)
    model = LanguageWithAttentionGLOVE(len(vocab), embed_dim, vocab_weights, training=False)
    model.to(device)
    model.load_state_dict(torch.load("TRAINED_MODELS/LanguageWithAttentionGLOVE_01RMSProp.pt"))

    if embed_size == 300:
        glove = vocabtorch.GloVe(name='840B', dim=300)
    elif embed_size == 50:
        glove = vocabtorch.GloVe(name='6B', dim=50)

    count = 0
    game = generate_new_game()

    print(game.game.goal)

    past_moves = []

    while not game.is_over() and count < 250:

        count = count + 1
        state = game.observe()['observation'][0]
        
        #fix this printing so it is easier.. 
        for line in state:
            print(line)

        goal = game.game.goal
        inventory = game.game.inventory

        states_embedding = torch.from_numpy(np.array([get_grid_embedding(state, glove, embed_size)]))
        states_onehot = torch.from_numpy(np.array([one_hot_grid(state, glove, embed_size)]))
        goal = torch.from_numpy(get_goal_embedding(goal, glove, embed_size))
        inventory = torch.Tensor(np.array([get_inventory_embedding(inventory, glove, embed_size)]))

        states_onehot = states_onehot.to(device)
        states_embedding = states_embedding.to(device)
        goal = goal.to(device)
        inventory = inventory.to(device)

        sampled_ids, hiddens = model.get_hidden_state_new(states_embedding, states_onehot, inventory, goal, device, vocab, vocab_weights)         

        sampled_caption = []
        for word_id in sampled_ids[0]:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption) 
        print(sentence)

        print('1:up, 2:down, 3:left, 4:right, 5:toggle, 6:grab, 7:mine, 0: craft')

        a = input("Enter a move: ")
        action = get_action_name(int(a))
        game.act(action)
Example #5
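# Evaluation-script fragment: the snippet begins inside the argparse setup, then
# rebuilds the vocabulary and a single evaluation env and loads a saved policy.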
parser.add_argument(
    '--non-det',
    action='store_true',
    default=False,
    help='whether to use a non-deterministic policy')
args = parser.parse_args()

args.det = not args.non_det


embed_dim = 300 
embed_size = embed_dim

with open('data/dataset_all_instructions', 'rb') as f:
    all_instructions = pickle.load(f)

vocab, vocab_weights = build_vocabulary(all_instructions, 'blah', embed_dim)

vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

device = torch.device("cuda:0")

env = make_vec_envs(args.env_name, args.seed + 101, 1,
                    None, None, device, False, vocabulary=vocab)


actor_critic, ob_rms = torch.load(args.load_dir + ".pt")

vec_norm = get_vec_normalize(env)