Example 1
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  num_frame_stack=1,
                  downsample=True,
                  color=False,
                  gamma=0.99,
                  log_dir='./tmp/',
                  device=torch.device('cpu')):
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    envs = [
        make_env(env_name, seed, i, log_dir, downsample, color)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)

    return envs
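
A minimal usage sketch for the factory above; the environment id, seed, and process count are illustrative, and make_env plus the Vec* wrappers are assumed to be importable exactly as in this example:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build 8 parallel environments with a 4-frame stack; the VecPyTorch wrapper
# hands back observations as torch tensors already placed on `device`.
envs = make_vec_envs('PongNoFrameskip-v4', seed=42, num_processes=8,
                     num_frame_stack=4, device=device)
obs = envs.reset()
envs.close()
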
Example 2
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game,
                         state=state,
                         scenario=scenario,
                         record=record,
                         players=players,
                         obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(1e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
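
In baselines' ppo2, lr (and cliprange) may be either a constant or a callable of the fraction of training remaining, which starts at 1.0 and decays toward 0, so lambda f: f * 2.5e-4 anneals the learning rate linearly to zero. A quick check of the schedule values:

lr = lambda f: f * 2.5e-4
print(lr(1.0), lr(0.5), lr(0.1))  # roughly 2.5e-4, 1.25e-4, 2.5e-5
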
Example 3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        env = make_retro(game=args.game,
                         state=args.state,
                         scenario=args.scenario)
        env = wrap_deepmind_retro(env)
        return env

    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=4,
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
    )
Example 4
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game,
                         state=state,
                         scenario=scenario,
                         record=record,
                         players=players,
                         obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        print(retro.__path__)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    demo_dir = os.path.join(currentdir, "human_data/demonstrations")

    demo_fname = os.path.join(demo_dir, "human_demonstration_10.pkl")

    if not os.path.exists(base_dirname):
        os.mkdir(base_dirname)
    dir_name = "pitfall_ppo2"
    dir_name += dir_note
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    with open(demo_fname, "rb") as f:
        demos = pickle.load(f)

    valid_demos = []
    for demo in demos:
        action, score = demo
        valid_action = np.array(action, dtype=np.float32).reshape(1, -1)
        valid_demos.append(valid_action)

    venv = SubprocVecEnv([make_env] * 1)
    performance = ppo2.learn(network='cnn',
                             env=venv,
                             total_timesteps=int(2e5),
                             nsteps=32,
                             nminibatches=4,
                             lam=0.95,
                             gamma=0.99,
                             noptepochs=16,
                             log_interval=10,
                             save_interval=500,
                             ent_coef=.02,
                             lr=lambda f: f * 3e-4,
                             cliprange=0.2,
                             base_path=dir_name,
                             use_demo=True,
                             demos=valid_demos,
                             render_env=False)

    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
Example 5
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game='Pitfall-Atari2600',
                         state=retro.State.DEFAULT,
                         scenario='scenario',
                         record='.',
                         players=1,
                         obs_type=obs_type)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    #dir_name = "pitfall_ppo2_rl_baseline1"
    dir_name = "pitfall_ppo2testing_D191211_073544"
    dir_name = os.path.join(base_dirname, dir_name)
    load_path = os.path.join(dir_name, 'models/00781')

    venv = SubprocVecEnv([make_env] * 1)  #Vectorized
    network = 'cnn'
    policy = build_policy(venv, network)
    nenvs = venv.num_envs  # Get the nb of env

    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Instantiate the model object
    model_fn = Model
    nsteps = 2048
    nbatch = nenvs * nsteps
    nminibatches = 4
    nbatch_train = nbatch // nminibatches
    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=2048,
                     ent_coef=0.0,
                     vf_coef=0.5,
                     max_grad_norm=0.5)
    model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=venv, model=model, nsteps=nsteps, gamma=0.99, lam=0.95)

    # run the Runner and record video
    total_timesteps = int(1e4)
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        print("progress: ", update, "/", nupdates)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )
Example 6
    def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, frame_diff=False):
        # Bind rank as a default argument so each thunk keeps its own rank
        # (a bare lambda would capture the loop variable by reference).
        venv = SubprocVecEnv([
            lambda rank=rank: make_env(rank, grayscale=grayscale)
            for rank in range(nenvs)
        ])
        # Uncomment this line in place of the one above for debugging.
        # venv = DummyVecEnv([lambda: make_env(0)])

        if not recurrent:
            if frame_diff:
                venv = VecFrameDiff(venv)
            else:
                # Perform the frame stack at the vectorized environment level as opposed to at
                # the individual environment level. I think this allows you to communicate fewer
                # images across processes.
                venv = VecFrameStack(venv, frame_stack)
        return venv
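
The thunk list above binds rank as a default argument; a bare lambda: make_env(rank, ...) would be subject to Python's late-binding closures, so every worker would receive the final loop value of rank. A standalone illustration of the pitfall, independent of any environment code:

bad = [lambda: r for r in range(3)]
good = [lambda r=r: r for r in range(3)]
print([f() for f in bad])   # [2, 2, 2] - every closure sees the last value
print([f() for f in good])  # [0, 1, 2] - the default argument freezes each value
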
Example 7
def get_envs(factory: EnvFactory):
    num_envs = len(os.sched_getaffinity(0))

    env = factory.make_env()

    def make_env():
        def _thunk():
            env = factory.make_env()
            return env

        return _thunk

    envs = [make_env() for _ in range(num_envs)]
    envs = SubprocVecEnv(envs)
    return env, envs
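
Sizing the pool with os.sched_getaffinity(0) uses the CPUs this process is actually allowed to run on, so it respects taskset and cgroup limits, unlike os.cpu_count(). A quick comparison (Linux-only; sched_getaffinity is not available on macOS or Windows):

import os

print(len(os.sched_getaffinity(0)))  # CPUs usable by this process
print(os.cpu_count())                # all CPUs on the machine
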
Example 8
def make_envs(env_id, device, seed=0, num_envs=1, frame_stack=1, **kwargs):
    envs = [
        env_generator(env_id, seed=seed + 1000 * i, **kwargs)
        for i in range(num_envs)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)

    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)

    return envs
Example 9
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  num_frame_stack=1,
                  downsample=True,
                  color=False,
                  gamma=0.99,
                  log_dir='./tmp/',
                  device=torch.device('cpu'),
                  use_extended_wrapper=False,
                  train_mode="train_encoder"):
    try:
        Path(log_dir).mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass
    envs = [
        make_env(env_name,
                 seed,
                 i,
                 log_dir,
                 downsample,
                 color,
                 frame_stack=num_frame_stack,
                 use_extended_wrapper=use_extended_wrapper,
                 train_mode=train_mode) for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    # if num_frame_stack > 1:
    #     envs = VecPyTorchFrameStack(envs, num_frame_stack, device)

    return envs
Example 10
def main(arguments: argparse.Namespace) -> None:
    """
    Main training loop.
    :param arguments: User input
    :return:
    """
    n_steps = arguments.steps
    n_agents = arguments.envs

    print(f'Training {arguments.game} using {"cpu" if arguments.cpu else "gpu"}')
    print(f'Number of concurrent environments {arguments.envs}')
    print(f'Number of steps per batch {arguments.steps}')

    if arguments.model:
        print(f'Using existing model {arguments.model}')

    env = SubprocVecEnv(
        [make_env(env_id=arguments.game, rank=i) for i in range(n_agents)])
    agent = DeepLearningAgent(observation_space=env.observation_space,
                              action_space=int(env.action_space.n),
                              n_envs=n_agents,
                              n_steps=n_steps,
                              model_path=arguments.model,
                              use_cpu=arguments.cpu)

    # This is the current state (or observation)
    observations = reshape_observations(env.reset())
    actions = agent.get_action(observations)
    initial_training_time = time.time()

    for ep in range(EPISODES):
        # Reset the frame counter each time the batch size is complete
        for i in range(n_steps):
            new_observations, rewards, done, info = env.step(
                actions.cpu().numpy())
            new_observations = reshape_observations(new_observations)

            agent.train(s=observations,
                        r=rewards,
                        s_next=new_observations,
                        a=actions,
                        done=done,
                        step=i)

            actions = agent.get_action(new_observations)
            observations = new_observations

        if ep % 100 == 0:
            fps = ((ep + 1) * n_steps * n_agents) / (time.time() -
                                                     initial_training_time)
            print(f'FPS {fps}')

    env.close()
Example 11
def main():
    # Alter reward in scenario.json (C:\Users\Fergus\Anaconda3\envs\AIGym\Lib\site-packages\retro\data\stable\SonicTheHedgehog-Genesis)

    env = SubprocVecEnv([make_env_3])
    obs = env.reset()
    # env = make_env_3()
    # env2 = make_env_4()
    print(env.observation_space)
    print(env.action_space.n)
    print(obs.shape)
    print(obs[0].shape)
    # obs = env2.reset()
    rew_mb = []
    dones_mb = []
    obs_mb = []
    step = 0
    while True:
        action = env.action_space.sample()
        obs, rew, done, info = env.step([0])
        print("Step {} Reward: {}, Done: {}".format(step, rew, done))
        rew_mb.append(rew)
        dones_mb.append(done)
        obs_mb.append(obs)
        env.render()

        step += 1
        # obs = obs[1] / 255.
        # for i in range(4):
        #     cv2.imshow('GrayScale'+str(i), np.squeeze(obs[:,:,i]))
        #     cv2.waitKey(1)
        if done[0]:
            env.close()
            break
    rew_mb = np.array(rew_mb)
    dones_mb = np.array(dones_mb)
    obs_mb = np.array(obs_mb)
    print("Rewards: ", rew_mb)
    print(rew_mb.shape)
    print(dones_mb)
    print(dones_mb.shape)
    print(obs_mb.shape)
Example 12
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game=game, state=state, scenario=scenario, record=record, players=players, obs_type=obs_type)
        # env = retro.make(game=game, state=state, scenario=scenario)
        print(retro.__path__)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")

    if not os.path.exists(base_dirname):
        os.mkdir(base_dirname)
    dir_name = "pitfall_ppo2"
    dir_name += dir_note
    dir_name = addDateTime(dir_name)
    dir_name = os.path.join(base_dirname, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    venv = SubprocVecEnv([make_env] * 8)
    performance = ppo2.learn(
        network='cnn',
        env=venv,
        total_timesteps=int(2e5),
        nsteps=32,
        nminibatches=4,
        lam=0.95,
        gamma=0.99,
        noptepochs=16,
        log_interval=10,
        save_interval=500,
        ent_coef=.02,
        lr=lambda f: f * 3e-4,
        cliprange=0.2,
        base_path=dir_name
    )

    performance_fname = os.path.join(dir_name, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)
Example 13
def make_rl_envs(env_id, seed, n_envs, device, frame_stack=4, add_video=False, add_frames=False, vid_path=None, **kwargs):
    envs = [env_generator(env_id, seed=seed+1000*i) for i in range(n_envs)]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if add_video:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=True)
    elif add_frames:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=False)

    envs = VecPyTorch(envs, device)

    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)

    return envs
Example 14
    def make_vec_env(nenvs=4,
                     recurrent=False,
                     grayscale=True,
                     frame_stack=4,
                     num_agents=2):
        # Bind rank as a default argument so each thunk keeps its own rank
        # (a bare lambda would capture the loop variable by reference).
        venv = SubprocVecEnv([
            lambda rank=rank: make_env(
                rank, grayscale=grayscale, num_agents=num_agents)
            for rank in range(nenvs)
        ])
        # Uncomment this line in place of the one above for debugging.
        # venv = DummyVecEnv([lambda: make_env(0)])

        if not recurrent:
            # Perform the frame stack at the vectorized environment level as opposed to at
            # the individual environment level. I think this allows you to communicate fewer
            # images across processes.
            venv = VecFrameStack(venv, frame_stack)

        venv = MultiAgentToSingleAgent(venv, num_agents=num_agents)
        venv = VecMonitor(venv, filename=monitor_filepath)
        return venv
Example 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', default='Airstriker-Genesis')
    parser.add_argument('--state', default=retro.State.DEFAULT)
    parser.add_argument('--scenario', default=None)
    args = parser.parse_args()

    def make_env():
        # To record gameplay video, pass record='.' to make_retro
        env = make_retro(game=args.game, state=args.state, scenario=args.scenario, record='.')
        env = wrap_deepmind_retro(env)
        env = Monitor(env, None, True)
        # and run the rollout below so the recording is written
        env.reset()
        while True:
            _obs, _rew, done, _info = env.step(env.action_space.sample())
            if done:
                break

        return env
    
    venv = SubprocVecEnv([make_env] * 8)
    ppo2.learn(
        network='cnn', 
        env=venv, 
        total_timesteps=int(100e6),
        nsteps=128,
        nminibatches=4,
        lam=0.95, 
        gamma=0.99, 
        noptepochs=4, 
        log_interval=1,
        ent_coef=.01,
        lr=lambda f: f * 2.5e-4,
        cliprange=0.1,
        load_path='/home/dhodwo/PycharmProjects/untitled/check_pts/checkpoints/1'
    )
Example 16
def main7():
    retro.data.add_custom_integration("custom")

    def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1, grayscale=False):
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env, width=150, height=100, grayscale=grayscale)
        env = FrameStack(env, frame_stack)
        env = ScaledFloatFrame(env)
        env = RewardScaler(env, scale=reward_scale)
        return env

    def make_env():
        retro.data.add_custom_integration("custom")
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE)
        env = wrap_deepmind_n64(env)
        return env

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    nenvs = 2
    # env = DummyVecEnv([make_env] * nenvs)
    env = SubprocVecEnv([make_env] * nenvs)
    network_name = "impala_cnn_lstm"
    policy = build_policy(env, network_name)
    recurrent = "lstm" in network_name
    ob_space = env.observation_space
    ac_space = env.action_space
    nsteps = 10
    nminibatches = 2
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=0.01,
                  vf_coef=0.5,
                  max_grad_norm=0.5,
                  comm=None,
                  mpi_rank_weight=1)
    runner = Runner(env=env, model=model, nsteps=10, gamma=.99, lam=.95)

    env.reset()
    start = time.time()  # start timing; referenced by the elapsed-time print below
    num_steps = 20000
    action = [np.array([0, 0, 0]), np.array([0, 0, 0])]
    for i in range(num_steps):
        sys.stdout.write(f"\r{i+1} / {num_steps}")
        action = [env.action_space.sample() for _ in range(nenvs)]
        obs, reward, dones, info = env.step(action)
        # env.reset(dones)
        # env.render()

        if i % 50 == 0:
            if recurrent:
                fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
            else:
                fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 12))
            for env_index in range(nenvs):
                if recurrent:
                    axs[env_index].imshow(obs[env_index, :, :, :])
                else:
                    for j in range(4):
                        row = env_index * 2 + j // 2
                        col = j % 2
                        print(row)
                        print(col)
                        axs[row, col].imshow(obs[env_index, :, :, j])
            plt.show()
            plt.close()
    end = time.time()
    print(end - start)

    return env
Example 17
def main6():
    retro.data.add_custom_integration("custom")

    def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1):
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env, width=450, height=300, grayscale=False)
        env = FrameStack(env, frame_stack)
        env = ScaledFloatFrame(env)
        env = RewardScaler(env, scale=reward_scale)
        return env

    def make_env():
        retro.data.add_custom_integration("custom")
        state = "ssb64.pikachu.level9dk.dreamland.state"
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE,
                                   state=state)
        env = wrap_deepmind_n64(env)
        return env

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    # env = make_env()
    env = SubprocVecEnv([make_env] * 1)
    # env = DummyVecEnv([make_env] * 1)

    env.reset()
    start = time.time()  # start timing; referenced by the elapsed-time print below
    num_steps = 20000
    # action = [np.array([0, 0, 0])]
    # action = [env.action_space.sample() for _ in range(2)]
    for i in range(num_steps):
        sys.stdout.write(f"\r{i+1} / {num_steps}")
        # action = env.action_space.sample()
        action = [env.action_space.sample() for _ in range(1)]
        obs, reward, done, info = env.step(action)

        print(f"\nreward: {reward} done: {done}")
        # input()
        if (isinstance(done, bool) and done) or (isinstance(done, list) and all(done)):
            env.reset()
        # env.render()

        if i % 50 == 0:
            image = Image.fromarray((obs[0] * 255).astype(np.uint8))
            image.save("/home/wulfebw/Desktop/color.png")

            plt.imshow(obs[0, :, :, 0])

            # fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
            # for j in range(1):
            #     row = j // 2
            #     col = j % 2
            #     print(row)
            #     print(col)
            #     axs[row, col].imshow(obs[:, :])
            plt.show()
            plt.close()
    end = time.time()
    print(end - start)

    return env
Example 18
def main(hParams, n_run, total_timesteps):
    nsteps = hParams['N_STEPS']
    n_epochs = hParams['N_EPOCHS']
    n_train = 4
    n_minibatch = 8

    log_loss_int = 1
    save_int = 5
    test_int = 10
    test_episodes = 5

    gamma = 0.95
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(ent_coef)
    testenvfn = SonicEnv.make_env_3

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/sonic_long_test/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    env = SubprocVecEnv([SonicEnv.make_env_3])

    nenv = env.num_envs
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = PGNetwork(state_size,
                      num_actions,
                      lr=lr,
                      vf_coef=vf_coef,
                      ent_coef=ent_coef)

    #  Runner used to create training data
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    # total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):

        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run(
        )

        for _ in range(n_train):
            indices = np.arange(nbatch)

            np.random.shuffle(indices)

            for start in range(0, nbatch, nbatch // n_minibatch):
                end = start + nbatch // n_minibatch
                bind = indices[start:end]
                policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
                    states_mb[bind], actions_mb[bind], rewards_mb[bind],
                    values_mb[bind])

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        r2 = 1 - (np.var(rewards_mb - values_mb) / np.var(rewards_mb))

        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)
            tf.summary.scalar("R-squared", r2, step=update)

        if update % log_loss_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)

        if update % save_int == 0:
            pgnet.model.save_weights('sonic_long_test/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")

        if update % test_int == 0:
            TestRewardWriter(summ_writer,
                             testenvfn,
                             pgnet,
                             test_episodes,
                             global_step=update)

    with summ_writer.as_default():
        hp.hparams(hParams)

    env.close()
Example 19
def train_multi(env: gym.Env, agent: AgentBase, settings: TrainSettings):
    # Initialize variables for logging.
    scores = ContiguousRingBuffer(capacity=128)
    max_avg_score = -np.inf

    # Ensure settings.directory exists for logging / saving.
    os.makedirs(settings.directory, exist_ok=True)
    # Optionally load from existing checkpoint.
    if settings.load:
        agent.load(settings.load)

    # Instantiate vectorized environment.
    if isinstance(env, SubprocVecEnv):
        # No further action is required.
        pass
    elif isinstance(env, gym.Env):
        # A single gym.Env cannot be broadcast across subprocess workers.
        logger.error("Unable to broadcast single environment {}".format(env))
    else:
        # Assume that env is a constructor function.
        env = SubprocVecEnv(
            [functools.partial(env, i) for i in range(settings.num_env)])

    # Initialize handlers for data collection.
    total_rewards = np.zeros(settings.num_env, dtype=np.float32)
    dones = np.zeros(settings.num_env, dtype=np.uint8)
    states = env.reset()
    # FIXME(yycho0108): EPS should be configurable.
    # eps = LinearEpsilon(0.8 * settings.num_episodes)
    eps = ExponentialEpsilon(0.99, 0.05, 0.8 * settings.num_episodes, True)

    i_episode = 0
    pbar = tqdm(total=settings.num_episodes)
    while i_episode < settings.num_episodes:
        # Reset the environments that are done, so that at each moment the
        # agent is always dealing with a live state; SubprocVecEnv.reset()
        # does not allow granular, per-environment control.
        for s, d, e in zip(states, dones, env.remotes):
            if not d:
                continue
            e.send(('reset', None))
            # FIXME(yycho0108): Applying a reshape here as e.recv()
            # Was seen to return a list for whatever reason.
            # May silently allow an error to pass through.
            s[:] = np.reshape(e.recv(), s.shape)
        scores.extend(total_rewards[dones == True])
        total_rewards[dones == True] = 0.0
        num_done = dones.sum()
        dones[:] = False

        # Process each state and interact with each env.
        actions = agent.select_action(states, eps(i_episode))
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        total_rewards += rewards
        states = next_states

        # Increment episode counts accordingly.
        pbar.set_postfix(score=np.mean(scores.array))

        # Optionally enable printing episode statistics.
        # The logging happens at each crossing of the discretized log-period boundary.
        if count_boundaries(i_episode, num_done, settings.log_period) > 0:
            # Compute statistics.
            avg_score = np.mean(scores.array)
            if avg_score > max_avg_score:
                max_avg_score = avg_score

            # Print statistics.
            logger.info(
                "Episode {}/{} | Max Avg: {:.2f} | Eps : {:.2f}".format(
                    i_episode, settings.num_episodes, max_avg_score,
                    eps(i_episode)))
            if isinstance(agent.memory, PrioritizedReplayBuffer):
                logger.info('mp : {} vs {}'.format(
                    agent.memory.max_priority,
                    agent.memory.memory.array['priority'].max()))

        # Save agent checkpoint as well.
        if count_boundaries(i_episode, num_done, settings.save_period) > 0:
            agent.save(settings.directory, i_episode + num_done)

        i_episode += num_done
        pbar.update(num_done)
    pbar.close()

    # Save results and return.
    agent.save(settings.directory)
    return scores
Example 20
def main():
    os.environ['OPENAI_LOGDIR'] = LOG_PATH

    number_of_environments = 1
    venv = SubprocVecEnv([make_sf2_env] * number_of_environments)
    video_path = './recording'
    video_length = 5 * 60 * FPS
    venv = VecVideoRecorder(
        venv,
        video_path,
        record_video_trigger=lambda step: step % video_length == 0,
        video_length=video_length)
    # ppo2.learn(
    #     network='mlp',
    #     env=venv,
    #     # eval_env=venv,
    #     total_timesteps=40000000,
    #     nsteps=128,  # 5 * FPS,
    #     nminibatches=number_of_environments,
    #     lam=0.95,
    #     gamma=0.99,
    #     noptepochs=3,
    #     log_interval=1000,
    #     ent_coef=.01,
    #     lr=lambda alpha: 2.5e-4 * alpha,
    #     vf_coef=1.0,
    #     cliprange=lambda alpha: 0.1 * alpha,
    #     save_interval=1000,
    #     # load_path=MODEL_PATH,
    #     # neural network parameters
    #     activation=tf.nn.relu,
    #     num_layers=2,  # 4, 2
    #     num_hidden=48,  # 64, 64
    #     layer_norm=False
    # )

    acer.learn(
        network='mlp',  # 'impala_cnn'
        env=venv,
        total_timesteps=40000000,
        nsteps=128,  # 5 * FPS,
        q_coef=1.0,
        ent_coef=0.001,
        max_grad_norm=10,
        lr=7e-4,
        lrschedule='linear',
        rprop_epsilon=1e-5,
        rprop_alpha=0.99,
        gamma=0.99,
        log_interval=1000,
        buffer_size=50000,
        replay_ratio=4,
        replay_start=10000,
        c=10.0,
        trust_region=True,
        delta=1,
        alpha=0.99,
        # load_path=MODEL_PATH,
        save_interval=1000,
        # neural network parameters
        activation=tf.nn.relu,
        num_layers=2,  # 4, 2
        num_hidden=48,  # 64, 64
        layer_norm=False)
Example 21
    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for the RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    envs = SubprocVecEnv([make_env(env_name, i) for i in range(num_envs)])
    env = gym.make(env_name)

    num_inputs = envs.observation_space.shape
    num_outputs = envs.action_space.shape

    model = ActorCritic(num_inputs[0], num_outputs[0]).to(device)
    if os.path.isfile(modelpath):
        model.load_state_dict(torch.load(modelpath))

    ppo = PPO(model=model,
              envs=envs,
              device=device,
              lr=lr,
              modelpath=modelpath)
    if not play_mode:
Example 22
def main(hParams, n_run):
    nsteps = hParams['N_STEPS']
    nenv = hParams[HP_N_ENV]
    n_epochs = hParams['N_EPOCHS']
    total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps

    update_int = 1
    save_int = 5
    test_int = 10

    gamma = 0.99
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(
        ent_coef) + 'env' + str(nenv)

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/cart_hparam_tuning/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    envfn = lambda: gym.make('CartPole-v1')
    env = SubprocVecEnv([envfn] * nenv)
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = SimplePGNet(state_size,
                        num_actions,
                        learning_rate=lr,
                        vf_coef=vf_coef,
                        ent_coef=ent_coef)

    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):

        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run(
        )

        tf.summary.trace_on(graph=True)
        policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
            states_mb, actions_mb, rewards_mb, values_mb)
        if update == 1:
            with summ_writer.as_default():
                tf.summary.trace_export(name="grad_trace", step=0)

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)

        if update % update_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)

        if update % save_int == 0:
            pgnet.model.save_weights('cart_hparams_tuning_models/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")

        if update % test_int == 0:
            test_rewards = TestRewardWriter(summ_writer,
                                            envfn,
                                            pgnet,
                                            20,
                                            global_step=update)
            print("Test Rewards: ", test_rewards)

    with summ_writer.as_default():
        hp.hparams(hParams)

    env.close()
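
Common to all of the examples above is the same pattern: build a list of zero-argument constructors ("thunks") and hand it to SubprocVecEnv, which steps each environment in a worker process and returns batched arrays. A minimal end-to-end sketch, assuming OpenAI baselines and an older gym with env.seed() are installed (stable-baselines' SubprocVecEnv behaves the same way); CartPole-v1 is only illustrative:

import gym
import numpy as np
from baselines.common.vec_env import SubprocVecEnv


def make_env(rank, seed=0):
    def _thunk():
        env = gym.make('CartPole-v1')
        env.seed(seed + rank)  # distinct seed per worker
        return env
    return _thunk


if __name__ == '__main__':
    venv = SubprocVecEnv([make_env(i) for i in range(4)])
    obs = venv.reset()  # batched observations: shape (4, obs_dim)
    for _ in range(10):
        actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
        obs, rewards, dones, infos = venv.step(actions)  # each has leading dimension 4
    venv.close()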