Example #1
def make_env(env_id, seed, rank, add_timestep):
    # Enclosing factory (reconstructed here) binding the free variables
    # env_id, seed, rank and add_timestep; it returns a thunk so the
    # environment can be built lazily, e.g. inside a vectorized env.
    def _thunk():

        env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env

    return _thunk
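The TransposeImage wrapper used above is not defined in this snippet. A minimal sketch of an equivalent HWC-to-CHW observation wrapper (an assumption about its behavior, not the example's own implementation) could look like this:

import numpy as np
import gym


class TransposeImage(gym.ObservationWrapper):
    """Sketch: move the channel axis first so PyTorch convolutions get (C, H, W)."""

    def __init__(self, env):
        super().__init__(env)
        h, w, c = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=env.observation_space.low.min(),
            high=env.observation_space.high.max(),
            shape=(c, h, w),
            dtype=env.observation_space.dtype)

    def observation(self, observation):
        # (H, W, C) -> (C, H, W)
        return np.transpose(observation, (2, 0, 1))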
Example #2
def rollout_episode(agents, GAMMA, MAX_STEPS, ENV_ID):
    if ENV_ID == "dense-v0":
        env = gym.make(ENV_ID)
    else:
        # Wrap the ATARI env in DeepMind Wrapper
        env = make_atari(ENV_ID)
        env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                            frame_stack=True, scale=True)
        env = wrap_pytorch(env)
    # Roll out the policy for a single episode (epsilon-greedy, epsilon = 0.05)
    replay_buffer = ReplayBuffer(capacity=5000)

    obs = env.reset()
    episode_rew = 0
    steps = 0

    while steps < MAX_STEPS:
        if ENV_ID == "dense-v0":
            action = agents["current"].act(obs.flatten(), epsilon=0.05)
        else:
            action = agents["current"].act(obs, epsilon=0.05)
        next_obs, reward, done, _ = env.step(action)
        steps += 1

        replay_buffer.push(0, steps, obs, action,
                           reward, next_obs, done)

        obs = next_obs

        episode_rew += GAMMA**(steps - 1) * reward
        if done:
            break
    return steps, episode_rew, replay_buffer.buffer
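The ReplayBuffer class is imported from elsewhere. Judging only from the calls visible in these examples (a capacity keyword, a seven-argument push, len(), a .buffer attribute, and minibatch sampling for TD updates), a compatible minimal sketch would be:

import random
from collections import deque


class ReplayBuffer:
    """Minimal sketch matching the usage in these examples (assumed, not the original)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, ep_id, step, obs, action, reward, next_obs, done):
        # Store the transition together with episode/step bookkeeping.
        self.buffer.append((ep_id, step, obs, action, reward, next_obs, done))

    def sample(self, batch_size):
        # Uniform random minibatch, e.g. for a TD-loss update.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)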
Example #3
def get_env(task, seed, monitor=True):
    env = gym.make(task)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = "tmp/gym-results"
    if monitor:
        env = wrappers.Monitor(env, expt_dir, force=True)
    env = wrap_deepmind(env)

    return env
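set_global_seeds comes from the surrounding codebase and is not shown. A plausible minimal version (assumed; the real helper may differ, e.g. it may seed TensorFlow instead of PyTorch) is:

import random

import numpy as np
import torch


def set_global_seeds(seed):
    # Seed the usual sources of randomness.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)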
Example #4
def get_env(env_id, seed):

    env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = 'tmp/gym-results'
    env = wrappers.Monitor(env, expt_dir, force=True)
    env = wrap_deepmind(env)

    return env
Example #5
def get_env(task, seed):
    env_id = task.env_id

    env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = 'results/pong'
    env = wrappers.Monitor(env, expt_dir, force=True)
    env = wrap_deepmind(env)

    return env
Example #6
def get_env(seed):
    #     env_id = task.env_id

    env = gym.make('MsPacman-v0')

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = 'tmp/gym-results'
    env = wrappers.Monitor(env, expt_dir, force=True)
    env = wrap_deepmind(env)

    return env
Example #7
def main(env_id):
    env = wrap_deepmind(make_atari(env_id), scale=True)
    policy_model = DQNSoftmax(env.action_space.n)
    value_function_model = DQNRegressor()
    agent = TRPOAgent(env, policy_model, value_function_model)

    subprocess.Popen(["tensorboard", "--logdir", "runs"])
    configure("runs/pong-run")

    for t in count():
        reward = agent.step()
        log_value('score', reward, t)
        if t % 100 == 0:
            torch.save(policy_model.state_dict(), "policy_model.pth")
Example #8
def rollout_macro_episode(agents, GAMMA, MAX_STEPS, ENV_ID, macros=None):
    if ENV_ID == "dense-v0":
        env = gym.make(ENV_ID)
    else:
        # Wrap the ATARI env in DeepMind Wrapper
        env = make_atari(ENV_ID)
        env = wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=True,
                            scale=True)
        env = wrap_pytorch(env)
    # Roll out the policy for a single episode (epsilon-greedy, epsilon = 0.05)
    replay_buffer = ReplayBuffer(capacity=20000)
    obs = env.reset()
    episode_rew = 0
    steps = 0

    if ENV_ID == "dense-v0":
        NUM_PRIMITIVES = 4
    elif ENV_ID == "PongNoFrameskip-v4":
        NUM_PRIMITIVES = 6
    elif ENV_ID == "SeaquestNoFrameskip-v4":
        NUM_PRIMITIVES = 18
    elif ENV_ID == "MsPacmanNoFrameskip-v4":
        NUM_PRIMITIVES = 9

    while steps < MAX_STEPS:
        if ENV_ID == "dense-v0":
            action = agents["current"].act(obs.flatten(), epsilon=0.05)
        else:
            action = agents["current"].act(obs, epsilon=0.05)
        if action < NUM_PRIMITIVES:
            next_obs, reward, done, _ = env.step(action)
            steps += 1

            # Push transition to ER Buffer
            replay_buffer.push(0, steps, obs, int(action), reward, next_obs,
                               done)
        else:
            # Need to execute a macro action
            macro = macros[action - NUM_PRIMITIVES]
            next_obs, reward, done, _ = macro_action_exec(
                0, obs, steps, replay_buffer, macro, env, GAMMA)
            steps += len(macro)

        episode_rew += GAMMA**(steps - 1) * reward
        if done:
            break
    return steps, episode_rew, replay_buffer.buffer
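macro_action_exec is not shown either. Based purely on its call sites (it receives the episode id, current observation, step counter, replay buffer, macro, env, and GAMMA, and returns an (obs, reward, done, info)-style tuple), a rough sketch of the assumed semantics is:

def macro_action_exec(ep_id, obs, steps, replay_buffer, macro, env, GAMMA):
    # Sketch (assumed semantics): execute the macro's primitive actions in
    # sequence, push every transition, and return the discounted macro reward.
    macro_reward, done, info = 0.0, False, {}
    for i, action in enumerate(macro):
        next_obs, reward, done, info = env.step(int(action))
        replay_buffer.push(ep_id, steps + i + 1, obs, int(action),
                           reward, next_obs, done)
        macro_reward += GAMMA ** i * reward
        obs = next_obs
        if done:
            break
    return obs, macro_reward, done, info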
Example #9
def get_env(env, seed):
    # env_id = task.env_id

    # env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = 'tmp/gym-results'
    env = wrappers.Monitor(env, expt_dir, force=True)
    print(env.observation_space)
    print(env.spec.id)
    env = wrap_deepmind(env)
    print("get_env")
    return env
Example #10
def main(env_id, embedding_size):
    env = wrap_deepmind(make_atari(env_id), scale=True)
    embedding_model = DQN(embedding_size)
    agent = NECAgent(env, embedding_model)

    # subprocess.Popen(["tensorboard", "--logdir", "runs"])
    configure("runs/pong-run")

    for t in count():
        if t == 0:
            reward = agent.warmup()
        else:
            reward = agent.episode()
        print("Episode {}\nTotal Reward: {}".format(t, reward))
        log_value('score', reward, t)
Example #11
def get_env(task, seed):
    env_id = task.env_id

    env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = 'tmp/gym-results'
    env = wrappers.Monitor(
        env,
        expt_dir,
        force=True,
        video_callable=lambda episode_id: episode_id % 100 == 0)
    env = wrap_deepmind(env)

    return env
Example #12
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )

if __name__ == '__main__':
    env = gym.make("ppaquette/SuperMarioBros-1-1-v0")

    # set global seeds
    env.seed(SEED)
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

    # monitor & wrap the game
    env = wrap_deepmind(env)

    expt_dir = 'Game_video'
    # env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda episode_id: episode_id % 10 == 0)
    env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda episode_id: True)

    # main
    main(env)


Example #13
def run_online_dqn_smdp_learning(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Set the GPU device on which to run the agent
    USE_CUDA = torch.cuda.is_available()
    if USE_CUDA:
        torch.cuda.set_device(args.device_id)
        print("USING CUDA DEVICE {}".format(args.device_id))
    else:
        print("USING CPU")
    Variable = lambda *args, **kwargs: (autograd.Variable(*args, **kwargs).cuda()
                                        if USE_CUDA else
                                        autograd.Variable(*args, **kwargs))
    start = time.time()

    # Extract variables for arguments
    TRAIN_BATCH_SIZE = args.TRAIN_BATCH_SIZE
    EPS_START, EPS_STOP, EPS_DECAY = args.EPS_START, args.EPS_STOP, args.EPS_DECAY
    GAMMA, L_RATE = args.GAMMA, args.L_RATE

    NUM_UPDATES = args.NUM_UPDATES
    NUM_ROLLOUTS = args.NUM_ROLLOUTS
    MAX_STEPS = args.MAX_STEPS
    ROLLOUT_EVERY = args.ROLLOUT_EVERY
    UPDATE_EVERY = args.UPDATE_EVERY
    VERBOSE = args.VERBOSE
    PRINT_EVERY = args.PRINT_EVERY
    CAPACITY = args.CAPACITY

    ENV_ID = args.ENV_ID
    AGENT = args.AGENT
    AGENT_FNAME = args.AGENT_FNAME
    STATS_FNAME = args.SAVE_FNAME

    # Get macros from expert dqn rollout
    LOAD_CKPT = args.LOAD_CKPT
    NUM_MACROS = args.NUM_MACROS
    GRAMMAR_TYPE = args.GRAMMAR_TYPE
    GRAMMAR_EVERY = args.GRAMMAR_EVERY

    if GRAMMAR_TYPE == "sequitur":
        GRAMMAR_DIR = SEQ_DIR
    elif GRAMMAR_TYPE == "lexis":
        GRAMMAR_DIR = LEXIS_DIR

    NUM_ACTIONS = 4 + NUM_MACROS
    TRAIN_DOUBLE = (AGENT == "DOUBLE")

    # Set up the agent, replay buffer, and logging stats dataframes
    if AGENT == "MLP-DQN" or AGENT == "DOUBLE":
        agents, optimizer = init_agent(MLP_DQN, L_RATE, USE_CUDA)
    elif AGENT == "MLP-Dueling-DQN":
        agents, optimizer = init_agent(MLP_DDQN, L_RATE, USE_CUDA)
    elif AGENT == "CNN-Dueling-DQN":
        agents, optimizer = init_agent(CNN_DDQN, L_RATE, USE_CUDA)

    # Get random rollout and add num-macros actions
    torch.save(agents["current"].state_dict(), LOAD_CKPT)
    macros, counts, stats = get_macro_from_agent(NUM_MACROS,
                                                 4,
                                                 USE_CUDA,
                                                 AGENT,
                                                 LOAD_CKPT,
                                                 GRAMMAR_DIR,
                                                 ENV_ID,
                                                 g_type=GRAMMAR_TYPE)

    # Re-initialize the agent with the extended action space (primitives + macros)
    if AGENT == "MLP-DQN" or AGENT == "DOUBLE":
        agents, optimizer = init_agent(MLP_DQN, L_RATE, USE_CUDA, NUM_ACTIONS)
    elif AGENT == "MLP-Dueling-DQN":
        agents, optimizer = init_agent(MLP_DDQN, L_RATE, USE_CUDA, NUM_ACTIONS)
    elif AGENT == "CNN-Dueling-DQN":
        agents, optimizer = init_agent(CNN_DDQN, L_RATE, USE_CUDA, NUM_ACTIONS)

    replay_buffer = ReplayBuffer(capacity=args.CAPACITY)
    macro_buffer = MacroBuffer(capacity=args.CAPACITY)

    reward_stats = pd.DataFrame(columns=[
        "opt_counter", "rew_mean", "rew_sd", "rew_median", "rew_10th_p",
        "rew_90th_p"
    ])

    step_stats = pd.DataFrame(columns=[
        "opt_counter", "steps_mean", "steps_sd", "steps_median",
        "steps_10th_p", "steps_90th_p"
    ])

    # Initialize optimization update counter and environment
    opt_counter = 0
    if ENV_ID == "dense-v0":
        env = gym.make(ENV_ID)
    else:
        # Wrap the ATARI env in DeepMind Wrapper
        env = make_atari(ENV_ID)
        env = wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=True,
                            scale=True)
        env = wrap_pytorch(env)

    if ENV_ID == "dense-v0":
        NUM_PRIMITIVES = 4
    elif ENV_ID == "PongNoFrameskip-v4":
        NUM_PRIMITIVES = 6
    elif ENV_ID == "SeaquestNoFrameskip-v4":
        NUM_PRIMITIVES = 18
    elif ENV_ID == "MsPacmanNoFrameskip-v4":
        NUM_PRIMITIVES = 9

    ep_id = 0
    # RUN TRAINING LOOP OVER EPISODES
    while opt_counter < NUM_UPDATES:
        epsilon = epsilon_by_episode(ep_id + 1, EPS_START, EPS_STOP, EPS_DECAY)

        obs = env.reset()

        steps = 0
        while steps < MAX_STEPS:
            if ENV_ID == "dense-v0":
                action = agents["current"].act(obs.flatten(), epsilon)
            else:
                action = agents["current"].act(obs, epsilon)

            if action < NUM_PRIMITIVES:
                next_obs, rew, done, _ = env.step(action)
                steps += 1

                # Push transition to ER Buffer
                replay_buffer.push(ep_id, steps, obs, action, rew, next_obs,
                                   done)
            else:
                # Need to execute a macro action
                macro = macros[action - NUM_PRIMITIVES]
                next_obs, macro_rew, done, _ = macro_action_exec(
                    ep_id, obs, steps, replay_buffer, macro, env, GAMMA)
                steps += len(macro)
                # Push macro transition to ER Buffer
                macro_buffer.push(ep_id, steps, obs, action, macro_rew,
                                  next_obs, done, len(macro), macro)

            if len(replay_buffer) > TRAIN_BATCH_SIZE:
                opt_counter += 1
                loss = compute_td_loss(agents, optimizer, replay_buffer,
                                       TRAIN_BATCH_SIZE, GAMMA, Variable,
                                       TRAIN_DOUBLE, ENV_ID)

            # Check for Online Transfer
            if (opt_counter + 1) % GRAMMAR_EVERY == 0:
                torch.save(agents["current"].state_dict(), LOAD_CKPT)
                macros, counts, stats = get_macro_from_agent(
                    NUM_MACROS, NUM_ACTIONS, USE_CUDA, AGENT, LOAD_CKPT,
                    GRAMMAR_DIR, ENV_ID, macros, GRAMMAR_TYPE)

            # On-Policy Rollout for Performance evaluation
            if (opt_counter + 1) % ROLLOUT_EVERY == 0:
                r_stats, s_stats = get_logging_stats(opt_counter, agents,
                                                     GAMMA, NUM_ROLLOUTS,
                                                     MAX_STEPS, ENV_ID)
                reward_stats = pd.concat([reward_stats, r_stats], axis=0)
                step_stats = pd.concat([step_stats, s_stats], axis=0)

            if VERBOSE and (opt_counter + 1) % PRINT_EVERY == 0:
                stop = time.time()
                print(
                    log_template.format(opt_counter + 1, stop - start,
                                        r_stats.loc[0, "rew_median"],
                                        r_stats.loc[0, "rew_mean"],
                                        s_stats.loc[0, "steps_median"],
                                        s_stats.loc[0, "steps_mean"]))
                start = time.time()

            if (opt_counter + 1) % UPDATE_EVERY == 0:
                update_target(agents["current"], agents["target"])

            # Go to next episode if current one terminated or update obs
            if done: break
            else: obs = next_obs

        ep_id += 1
    # Finally save all results!
    df_to_save = pd.concat([reward_stats, step_stats], axis=1)
    df_to_save = df_to_save.loc[:, ~df_to_save.columns.duplicated()]
    df_to_save = df_to_save.reset_index()
    if args.SAVE:
        torch.save(agents["current"].state_dict(),
                   "agents/online_" + AGENT_FNAME)
        # Save the logging dataframe
        df_to_save.to_csv("results/online_" + STATS_FNAME)
    return df_to_save
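epsilon_by_episode is the exploration schedule used in Examples #13 and #14. A standard exponentially decaying schedule with these parameter names (a sketch; the repository's exact form may differ) is:

import math


def epsilon_by_episode(episode, eps_start, eps_stop, eps_decay):
    # Decay epsilon from eps_start toward eps_stop with time constant eps_decay.
    return eps_stop + (eps_start - eps_stop) * math.exp(-episode / eps_decay)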
Example #14
def run_dqn_learning(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Set the GPU device on which to run the agent
    USE_CUDA = torch.cuda.is_available()
    if USE_CUDA:
        torch.cuda.set_device(args.device_id)
        print("USING CUDA DEVICE {}".format(args.device_id))
    else:
        print("USING CPU")
    Variable = lambda *args, **kwargs: (autograd.Variable(*args, **kwargs).cuda()
                                        if USE_CUDA else
                                        autograd.Variable(*args, **kwargs))
    start = time.time()

    # Extract variables for arguments
    TRAIN_BATCH_SIZE = args.TRAIN_BATCH_SIZE
    EPS_START, EPS_STOP, EPS_DECAY = args.EPS_START, args.EPS_STOP, args.EPS_DECAY
    GAMMA, L_RATE = args.GAMMA, args.L_RATE

    NUM_UPDATES = args.NUM_UPDATES
    NUM_ROLLOUTS = args.NUM_ROLLOUTS
    MAX_STEPS = args.MAX_STEPS
    ROLLOUT_EVERY = args.ROLLOUT_EVERY
    UPDATE_EVERY = args.UPDATE_EVERY
    VERBOSE = args.VERBOSE
    PRINT_EVERY = args.PRINT_EVERY
    CAPACITY = args.CAPACITY

    ENV_ID = args.ENV_ID
    AGENT = args.AGENT
    AGENT_FNAME = args.AGENT_FNAME
    STATS_FNAME = args.SAVE_FNAME

    TRAIN_DOUBLE = bool(args.DOUBLE)

    # Set up the agent, replay buffer, and logging stats dataframes
    if AGENT == "MLP-DQN" or AGENT == "DOUBLE":
        agents, optimizer = init_agent(MLP_DQN, L_RATE, USE_CUDA)
    elif AGENT == "MLP-Dueling-DQN":
        agents, optimizer = init_agent(MLP_DDQN, L_RATE, USE_CUDA)
    elif AGENT == "CNN-Dueling-DQN":
        agents, optimizer = init_agent(CNN_DDQN, L_RATE, USE_CUDA)

    replay_buffer = ReplayBuffer(capacity=CAPACITY)

    reward_stats = pd.DataFrame(columns=[
        "opt_counter", "rew_mean", "rew_sd", "rew_median", "rew_10th_p",
        "rew_90th_p"
    ])

    step_stats = pd.DataFrame(columns=[
        "opt_counter", "steps_mean", "steps_sd", "steps_median",
        "steps_10th_p", "steps_90th_p"
    ])

    # Initialize optimization update counter and environment
    opt_counter = 0
    if ENV_ID == "dense-v0":
        env = gym.make(ENV_ID)
    else:
        # Wrap the ATARI env in DeepMind Wrapper
        env = make_atari(ENV_ID)
        env = wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=True,
                            scale=True)
        env = wrap_pytorch(env)

    # RUN TRAINING LOOP OVER EPISODES
    ep_id = 0  # episode counter, incremented after every episode
    while opt_counter < NUM_UPDATES:
        epsilon = epsilon_by_episode(opt_counter + 1, EPS_START, EPS_STOP,
                                     EPS_DECAY)

        obs = env.reset()
        steps = 0
        while steps < MAX_STEPS:
            if ENV_ID == "dense-v0":
                action = agents["current"].act(obs.flatten(), epsilon)
            else:
                action = agents["current"].act(obs, epsilon)
            next_obs, rew, done, _ = env.step(action)
            steps += 1

            # Push transition to ER Buffer
            replay_buffer.push(ep_id, steps, obs, action, rew, next_obs, done)

            if len(replay_buffer) > TRAIN_BATCH_SIZE:
                opt_counter += 1
                loss = compute_td_loss(agents, optimizer, replay_buffer,
                                       TRAIN_BATCH_SIZE, GAMMA, Variable,
                                       TRAIN_DOUBLE, ENV_ID)

            # On-Policy Rollout for Performance evaluation
            if (opt_counter + 1) % ROLLOUT_EVERY == 0:
                r_stats, s_stats = get_logging_stats(opt_counter, agents,
                                                     GAMMA, NUM_ROLLOUTS,
                                                     MAX_STEPS, ENV_ID)
                reward_stats = pd.concat([reward_stats, r_stats], axis=0)
                step_stats = pd.concat([step_stats, s_stats], axis=0)

            if (opt_counter + 1) % UPDATE_EVERY == 0:
                update_target(agents["current"], agents["target"])

            if VERBOSE and (opt_counter + 1) % PRINT_EVERY == 0:
                stop = time.time()
                print(
                    log_template.format(opt_counter + 1, stop - start,
                                        r_stats.loc[0, "rew_median"],
                                        r_stats.loc[0, "rew_mean"],
                                        s_stats.loc[0, "steps_median"],
                                        s_stats.loc[0, "steps_mean"]))
                start = time.time()

            if args.SAVE:
                if ENV_ID == "dense-v0":
                    # Gridworld - Updates after which to save expert/transfer agent
                    save_after_upd = [250000, 500000, 1000000]
                else:
                    # ATARI - Updates after which to save expert/transfer agent
                    save_after_upd = [1000000, 2500000, 5000000]
                if opt_counter + 1 in save_after_upd:
                    agent_path = ("agents/trained/" + str(opt_counter + 1)
                                  + "_" + AGENT_FNAME)
                    torch.save(agents["current"].state_dict(), agent_path)
                    print("Saved expert agent to {}".format(agent_path))

            # Go to next episode if current one terminated or update obs
            if done: break
            else: obs = next_obs
        ep_id += 1

    # Save the logging dataframe
    df_to_save = pd.concat([reward_stats, step_stats], axis=1)
    df_to_save = df_to_save.loc[:, ~df_to_save.columns.duplicated()]
    df_to_save = df_to_save.reset_index()

    if args.SAVE:
        # Finally save all results!
        torch.save(agents["current"].state_dict(),
                   "agents/" + str(NUM_UPDATES) + "_" + AGENT_FNAME)
        df_to_save.to_csv("results/" + args.AGENT + "_" + STATS_FNAME)
    return df_to_save
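update_target, called every UPDATE_EVERY optimization steps above, synchronizes the target network with the online network. A minimal hard-update sketch (assuming standard PyTorch modules) is:

def update_target(current_model, target_model):
    # Hard update: copy the online network's weights into the target network.
    target_model.load_state_dict(current_model.state_dict())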