Example 1
    def __init__(self, env, env_type, stochastic):
        """
        The constructor uses the environment to construct the policy-building function and then builds the agent's model.

        Parameters
        ----------
        env : gym.env
            The env the agent needs to interact with.
        env_type : str
            The type of env.
        stochastic : bool
            Whether the agent's behavior is stochastic, i.e. actions are sampled from the policy rather than chosen deterministically.
        """
        ob_space = env.observation_space
        ac_space = env.action_space
        self.stochastic = stochastic

        # choose the policy builder that matches the environment type
        if env_type == 'atari':
            policy = build_policy(env, 'cnn')
        elif env_type == "ChessWrapper":
            policy = build_policy(env, 'mlp', {'num_layers':5})
        else:
            policy = build_policy(env, 'mlp')

        # construct the agent model from the policy builder
        make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=1,
                                   nsteps=1, ent_coef=0., vf_coef=0.,
                                   max_grad_norm=0.)
        self.model = make_model()
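The snippet ends at construction. A minimal, hypothetical acting helper for such a class, assuming only the standard baselines Model.step interface (which samples an action and also returns the value estimate, recurrent state and negative log-probability), could look like this; a deterministic path for the stochastic=False case would need extra support from the policy and is not shown.

    def act(self, observation):
        # Hypothetical sketch, not part of the original snippet.
        action, value, state, neglogp = self.model.step(observation)
        return action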
Example 2
    def __init__(self, env, env_type, path, stochastic=False, gpu=True):
        from baselines.common.policies import build_policy
        from baselines.ppo2.model import Model

        self.graph = tf.Graph()

        if gpu:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        self.sess = tf.Session(graph=self.graph, config=config)

        with self.graph.as_default():
            with self.sess.as_default():
                if isinstance(env.observation_space, gym.spaces.Dict):
                    ob_space = env.observation_space.spaces['ob_flattened']
                else:
                    ob_space = env.observation_space
                ac_space = env.action_space

                if env_type == 'atari':
                    policy = build_policy(env, 'cnn')
                elif env_type in ['mujoco', 'robosuite']:
                    policy = build_policy(env, 'mlp')
                else:
                    assert False, 'unsupported env_type: {}'.format(env_type)

                make_model = lambda: Model(policy=policy,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           nbatch_act=1,
                                           nbatch_train=1,
                                           nsteps=1,
                                           ent_coef=0.,
                                           vf_coef=0.,
                                           max_grad_norm=0.)
                self.model = make_model()

                self.model_path = path
                self.model.load(path)

        if env_type in ['mujoco', 'robosuite']:
            with open(path + '.env_stat.pkl', 'rb') as f:
                import pickle
                s = pickle.load(f)
            self.ob_rms = s['ob_rms']
            #self.ret_rms = s['ret_rms']
            self.clipob = 10.
            self.epsilon = 1e-8
        else:
            self.ob_rms = None

        self.stochastic = stochastic
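The running statistics loaded above are normally applied to observations before querying the model, mirroring baselines' VecNormalize. A minimal, hypothetical acting helper under that assumption (ob_rms exposing .mean and .var as RunningMeanStd does, numpy imported as np):

    def act(self, obs):
        # Hypothetical sketch: normalize with the stored running statistics
        # (same clipping formula as VecNormalize), then sample from the policy.
        if self.ob_rms is not None:
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
        with self.graph.as_default(), self.sess.as_default():
            action, value, state, neglogp = self.model.step(obs)
        return action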
Example 3
def main(network, env, **network_kwargs):
    policy = build_policy(env, network, **network_kwargs)

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    nenvs = env.num_envs

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
Example 4
def init_wrapper(environment,
                 network_type,
                 number_steps,
                 entropy_coefficient,
                 vf_coefficient,
                 gradient_clipping,
                 learning_rate,
                 alpha,
                 epsilon,
                 total_timesteps,
                 learning_rate_schedule='constant',
                 **network_kwargs):
    policy = build_policy(environment, network_type, **network_kwargs)

    model = Model(policy=policy,
                  env=environment,
                  nsteps=number_steps,
                  ent_coef=entropy_coefficient,
                  vf_coef=vf_coefficient,
                  max_grad_norm=gradient_clipping,
                  lr=learning_rate,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=learning_rate_schedule)

    return {'policy': policy, 'model': model}
Example 5
def test(config, env):
    ob_space = env.observation_space
    ac_space = env.action_space
    tf.reset_default_graph()
    gpu_opts = tf.GPUOptions(allow_growth=True)
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        gpu_options=gpu_opts,
    )
    with tf.Session(config=tf_config) as sess:
        policy = build_policy(env, 'cnn', estimate_q=True)
        model = Model(policy=policy,
                      ob_space=ob_space,
                      ac_space=ac_space,
                      nenvs=config.number_of_environments,
                      nsteps=config.number_of_steps,
                      ent_coef=config.entropy_weight,
                      q_coef=config.critic_weight,
                      gamma=config.discount_factor,
                      max_grad_norm=config.max_grad_norm,
                      lr=config.learning_rate,
                      rprop_alpha=config.rmsp_decay,
                      rprop_epsilon=config.rmsp_epsilon,
                      total_timesteps=config.timesteps,
                      lrschedule='linear',
                      c=config.clipping_factor,
                      trust_region=True,
                      alpha=config.momentum,
                      delta=config.trust_region_delta)
        tf_util.load_variables(config.load_path, sess=sess)
        return make_rollouts(config, env, model)
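make_rollouts is not defined in any of these snippets. A rough, hypothetical stand-in that steps the vectorized environment with the loaded model might look as follows; the exact return signature of model.step differs between baselines algorithms, so only the sampled actions (the first element) are used.

def make_rollouts(config, env, model):
    # Hypothetical stand-in for the undefined make_rollouts helper.
    import numpy as np
    obs = env.reset()
    episode_rewards = np.zeros(env.num_envs)
    for _ in range(config.number_of_steps):
        actions = model.step(obs)[0]
        obs, rewards, dones, infos = env.step(actions)
        episode_rewards += rewards
    return episode_rewards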
Example 6
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break


        assert step_counter > 5
Example 7
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv(
        [lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1,
                                                               nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
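In both LSTM examples the recurrent state S and the done mask M are threaded through policy.step so that the LSTM state is reset at episode boundaries. For a feed-forward policy (policy.initial_state is None) the same loop reduces, roughly, to:

while True:
    action, _, _, _ = policy.step(ob)   # no S/M needed without recurrence
    ob, reward, done, _ = venv.step(action)
    if done:
        break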
Example 8
def test(config, env):
    ob_space = env.observation_space
    ac_space = env.action_space
    tf.reset_default_graph()
    gpu_opts = tf.GPUOptions(allow_growth=True)
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        gpu_options=gpu_opts,
    )
    with tf.Session(config=tf_config) as sess:
        nenvs = env.num_envs
        nbatch = nenvs * config.number_of_steps
        nbatch_train = nbatch // 4
        policy = build_policy(env, 'cnn')
        model = Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=config.number_of_steps,
            ent_coef=config.entropy_weight,
            vf_coef=config.critic_weight,
            max_grad_norm=config.max_grad_norm,
            comm=None,
            mpi_rank_weight=1
        )
        model.load(config.load_path)
        return make_rollouts(config, env, model)
Example 9
def test(config, env):
    ob_space = env.observation_space
    ac_space = env.action_space
    tf.reset_default_graph()
    gpu_opts = tf.GPUOptions(allow_growth=True)
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        gpu_options=gpu_opts,
    )
    with tf.Session(config=tf_config) as sess:
        config.batch_size = 2
        config.number_of_steps = 2
        policy = build_policy(env, 'cnn')
        model = Model(policy=policy,
                      env=env,
                      nsteps=config.number_of_steps,
                      ent_coef=config.entropy_weight,
                      vf_coef=config.critic_weight,
                      max_grad_norm=config.max_grad_norm,
                      lr=config.learning_rate,
                      alpha=config.rmsp_decay,
                      epsilon=config.discount_factor,
                      total_timesteps=config.timesteps,
                      lrschedule='linear')
        model.load(config.load_path)
        return make_rollouts(config, env, model)
Example 10
def play():
    env_args = dict()
    network_kwargs = dict(nlstm=512)

    # create vectorized environment
    pysc2_env_vec = SubprocVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)])

    policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs)
    nenvs = pysc2_env_vec.num_envs
    # Calculate the batch_size
    nsteps=256
    nminibatches=1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    ent_coef=0.0
    vf_coef=0.5
    max_grad_norm=0.5

    make_model = lambda: ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65,
                                   nbatch_act=nenvs, nbatch_train=nbatch_train,
                                   nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                                   max_grad_norm=max_grad_norm)
    model = make_model()
    model.load("2170_ppo_cnn_lstm_512_easy")

    ob = pysc2_env_vec.reset()
    state = model.initial_state
    done = [False]
    step_counter = 0

    # run a single episode until the end (i.e. until done)
    while True:
        # print(step_counter)
        action, _, state, _ = model.step(ob, S=state, M=done)
        ob, reward, done, _ = pysc2_env_vec.step(action)
        step_counter += 1
        if done[0]:
            break
Example 11
def save_lucid_model(config, params, *, model_path, metadata_path):
    config = config.copy()
    config.pop("num_envs")
    library = config.get("library", "baselines")
    venv = create_env(1, **config)
    arch = get_arch(**config)

    with tf.Graph().as_default(), tf.Session() as sess:
        observation_space = venv.observation_space
        observations_placeholder = tf.placeholder(shape=(None, ) +
                                                  observation_space.shape,
                                                  dtype=tf.float32)

        if library == "baselines":
            from baselines.common.policies import build_policy

            with tf.variable_scope("ppo2_model", reuse=tf.AUTO_REUSE):
                policy_fn = build_policy(venv, arch)
                policy = policy_fn(
                    nbatch=None,
                    nsteps=1,
                    sess=sess,
                    observ_placeholder=(observations_placeholder * 255),
                )
                pd = policy.pd
                vf = policy.vf

        else:
            raise ValueError(f"Unsupported library: {library}")

        load_params(params, sess=sess)

        Model.save(
            model_path,
            input_name=observations_placeholder.op.name,
            output_names=[pd.logits.op.name, vf.op.name],
            image_shape=observation_space.shape,
            image_value_range=[0.0, 1.0],
        )

    metadata = {
        "policy_logits_name": pd.logits.op.name,
        "value_function_name": vf.op.name,
        "env_name": config.get("env_name"),
        "gae_gamma": config.get("gamma"),
        "gae_lambda": config.get("lambda"),
    }
    env = venv
    while hasattr(env, "env") and (not hasattr(env, "combos")):
        env = env.env
    if hasattr(env, "combos"):
        metadata["action_combos"] = env.combos
    else:
        metadata["action_combos"] = None

    save_joblib(metadata, metadata_path)
    return {
        "model_bytes": read(model_path, cache=False, mode="rb"),
        **metadata
    }
Example 12
def create_policy(env, network, value_network='copy', **network_kwargs):
    policy_fn = build_policy(env, network, value_network, **network_kwargs)

    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    return policy_fn(observ_placeholder=ob)
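A hypothetical usage sketch for create_policy, assuming env is already defined (a gym env or VecEnv): the returned object exposes the usual step/value interface once its variables are initialized.

import tensorflow as tf

with tf.Session() as sess:
    pi = create_policy(env, 'mlp')
    sess.run(tf.global_variables_initializer())
    obs = env.reset()
    action, value, _, neglogp = pi.step(obs)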
Example 13
def create_model(network,
                 seed=None,
                 nsteps=20,
                 total_timesteps=int(80e6),
                 q_coef=0.5,
                 ent_coef=0.01,
                 max_grad_norm=10,
                 lr=7e-4,
                 lrschedule='linear',
                 rprop_epsilon=1e-5,
                 rprop_alpha=0.99,
                 gamma=0.99,
                 log_interval=100,
                 buffer_size=50000,
                 replay_ratio=4,
                 replay_start=10000,
                 c=10.0,
                 trust_region=True,
                 alpha=0.99,
                 delta=1,
                 load_path=None,
                 **network_kwargs):
    set_global_seeds(seed)
    env = HaliteEnv()
    # if not isinstance(env, VecFrameStack):
    #     env = VecFrameStack(env, env.nstack)

    # network = 'halite_net'  # not yet; for now, pretend the halite layer is the only input
    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model_params = {
        "policy": policy,
        "ob_space": ob_space,
        "ac_space": ac_space,
        "nenvs": nenvs,
        "nsteps": nsteps,
        "ent_coef": ent_coef,
        "q_coef": q_coef,
        "gamma": gamma,
        "max_grad_norm": max_grad_norm,
        "lr": lr,
        "rprop_alpha": rprop_alpha,
        "rprop_epsilon": rprop_epsilon,
        "total_timesteps": total_timesteps,
        "lrschedule": lrschedule,
        "c": c,
        "trust_region": trust_region,
        "alpha": alpha,
        "delta": delta
    }
    model = Model(**model_params)

    return env, policy, nenvs, ob_space, ac_space, nstack, model
Example 14
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
    set_global_seeds(seed)


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", np.nan if len(rewards) == 0 else np.mean(rewards))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
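For reference, the explained_variance diagnostic logged above is, in baselines, essentially 1 - Var[y - ypred] / Var[y]; a minimal version:

import numpy as np

def explained_variance(ypred, y):
    # 1 = perfect prediction, 0 = no better than a constant, negative = worse;
    # NaN when the targets have zero variance.
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary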
Example 15
def learn(
    network,
    env,
    save_path,
    seed=None,
    nsteps=10,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    # log_interval=100,
    log_interval=10,
    load_path=None,
    **network_kwargs):


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True
    set_global_seeds(seed)
    assert save_path is not None
    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch_size (nsteps is set to 1 here)
    nbatch = nenvs*nsteps

    observation = []
    action = []
    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, output = runner.run()
        observation.append(obs)
        print('times', update)
    obs = np.concatenate(observation)

    # compute the Fisher matrix
    FM = model.compute_fisher(obs, plot_diffs=True, disp_freq=10)
    # FM = model.compute_exact_fisher(obs, plot_diffs=True, disp_freq=10)

    joblib.dump(FM, save_path)
Example 16
def main():
    def make_env():
        obs_type = retro.Observations.IMAGE  # retro.Observations.RAM
        env = retro.make(game='Pitfall-Atari2600',
                         state=retro.State.DEFAULT,
                         scenario='scenario',
                         record='.',
                         players=1,
                         obs_type=obs_type)
        env = wrap_deepmind_retro(env)
        return env

    base_dirname = os.path.join(currentdir, "results")
    #dir_name = "pitfall_ppo2_rl_baseline1"
    dir_name = "pitfall_ppo2testing_D191211_073544"
    dir_name = os.path.join(base_dirname, dir_name)
    load_path = os.path.join(dir_name, 'models/00781')

    venv = SubprocVecEnv([make_env] * 1)  #Vectorized
    network = 'cnn'
    policy = build_policy(venv, network)
    nenvs = venv.num_envs  # Get the nb of env

    # Get state_space and action_space
    ob_space = venv.observation_space
    ac_space = venv.action_space

    # Instantiate the model object
    model_fn = Model
    nsteps = 2048
    nbatch = nenvs * nsteps
    nminibatches = 4
    nbatch_train = nbatch // nminibatches
    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=2048,
                     ent_coef=0.0,
                     vf_coef=0.5,
                     max_grad_norm=0.5)
    model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=venv, model=model, nsteps=nsteps, gamma=0.99, lam=0.95)

    # run the Runner and record video
    total_timesteps = int(1e4)
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        print("progress: ", update, "/", nupdates)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
Example 17
def main():
    numOfTests = 40
    env_args = {
        'episode_life': False,
        'clip_rewards': False,
        'crop': True,
        'rotate': True
    }
    env = VecFrameStack(
        make_vec_env("gvgai-zelda-lvl0-v0",
                     numOfTests,
                     43,
                     wrapper_kwargs=env_args), 4)
    policy = build_policy(env, "cnn")
    model = Model(policy=policy, env=env, nsteps=5)
    model.load('logs/test_4*5_r1_right/checkpoints/260000')
    nh, nw, nc = env.observation_space.shape
    result = dict()
    for j in range(201, 601):
        # obs = np.zeros((numOfTests, nh, nw, nc), dtype=np.uint8)
        done = np.array([False] * numOfTests)
        env.venv.set_level(
            "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format(
                j))
        obs = env.reset()
        infos = [False] * numOfTests
        # dones = [False] * numOfTests

        while not all(done):
            actions, values, state, _ = model.step(obs)
            obs, rewards, dones, info = env.step(actions)
            done[np.where(dones != False)] = True
            for i in np.where(dones != False)[0].tolist():
                if not infos[i]:
                    # print(info)
                    del info[i]["grid"]
                    del info[i]["ascii"]
                    infos[i] = info[i]
            # print(np.where(dones!=False)[0])
            # print(done)
            # print(infos)

        # print(dones)
        win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos]
        # score = [i['episode']['r'] for i in infos]
        # steps = [i['episode']['l'] for i in infos]
        # time = [i['episode']['t'] for i in infos]
        print("level {}".format(j), win)
        result[j] = infos

    env.close()

    with open("result_4*5_r1_right_200~600", "wb") as f:
        pickle.dump(result, f)
Example 18
    def __init__(self, env, env_type, stochastic=False):
        ob_space = env.observation_space
        ac_space = env.action_space

        if env_type == 'atari':
            policy = build_policy(env, 'cnn')
        elif env_type == 'mujoco':
            policy = build_policy(env, 'mlp')

        make_model = lambda: Model(policy=policy,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   nbatch_act=1,
                                   nbatch_train=1,
                                   nsteps=1,
                                   ent_coef=0.,
                                   vf_coef=0.,
                                   max_grad_norm=0.)
        self.model = make_model()
        self.stochastic = stochastic
Example 19
def make_leg_model(leg, env):
    leg_env = gym.make('PhantomXLeg-v0')
    leg_env.set_info(env.info)
    leg_env.leg_name = leg
    policy = build_policy(leg_env, defaults['network'], **alg_kwargs)

    model = ppo2.Model(policy=policy, ob_space=leg_env.observation_space, ac_space=leg_env.action_space, nbatch_act=nenvs,
                    nbatch_train=nbatch_train,
                    nsteps=defaults['nsteps'], ent_coef=defaults['ent_coef'], vf_coef=defaults['vf_coef'],
                    max_grad_norm=defaults['max_grad_norm'])
    model.load('' + leg + '/checkpoints/05000')
    return model
Example 20
def get_step_fn(config, params, *, num_envs, full_resolution):
    config = config.copy()
    config.pop("num_envs")
    library = config.get("library", "baselines")
    venv = create_env(num_envs, **config)
    arch = get_arch(**config)

    with tf.Graph().as_default(), tf.Session() as sess:
        if library == "baselines":
            from baselines.common.policies import build_policy

            with tf.variable_scope("ppo2_model", reuse=tf.AUTO_REUSE):
                policy_fn = build_policy(venv, arch)
                policy = policy_fn(nbatch=venv.num_envs, nsteps=1, sess=sess)

            stepdata = {
                "ob": venv.reset(),
                "state": policy.initial_state,
                "first": np.ones((venv.num_envs, ), bool),
            }
            if full_resolution:
                stepdata["ob_full"] = np.stack(
                    [info["rgb"] for info in venv.env.get_info()], axis=0)

            def step_fn():
                result = {
                    "ob": stepdata["ob"],
                    "first": stepdata["first"].astype(bool)
                }
                if full_resolution:
                    result["ob_full"] = stepdata["ob_full"]
                result["ac"], _, stepdata["state"], _ = policy.step(
                    stepdata["ob"],
                    S=stepdata["state"],
                    M=stepdata["first"].astype(float),
                )
                (
                    stepdata["ob"],
                    result["reward"],
                    stepdata["first"],
                    result["info"],
                ) = venv.step(result["ac"])
                if full_resolution:
                    stepdata["ob_full"] = np.stack(
                        [info["rgb"] for info in result["info"]], axis=0)
                return result

        else:
            raise ValueError(f"Unsupported library: {library}")

        load_params(params, sess=sess)

        yield step_fn
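Because get_step_fn yields from inside the graph/session context it is a generator; a hypothetical consumer keeps the session alive by holding the iterator while calling the yielded step_fn:

# Hypothetical usage sketch (config and params assumed to exist):
for step_fn in get_step_fn(config, params, num_envs=8, full_resolution=False):
    trajectory = [step_fn() for _ in range(128)]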
Example 21
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)
    #workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    #set_global_seeds(workerseed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)
    
    env = build_env(args,normalize_ob=False,normalize_ret=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)
   
    #timesteps_per_batch=1024
    #timesteps_per_batch=2048
    beta = -1
    if beta < 0:
        #print(alg_kwargs)
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Automatically compute beta based on initial entropy and number of iterations
        policy = build_policy(env, alg_kwargs['network'], value_network='copy', normalize_observations=alg_kwargs['normalize_observations'], copos=True)
        ob = observation_placeholder(env.observation_space)
        
        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())
        
        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Constantly set beta: " + str(beta))
    
    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))
    model=learn(env=env, seed=seed, beta=beta,
                total_timesteps=total_timesteps,
                **alg_kwargs)
    return model, env
Example 22
  def __init__(self, checkpoint_path):
    player_base.PlayerBase.__init__(self)
    self._action_set = 'default'
    self._player_prefix = 'player_0'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self._sess = tf.Session(config=config)
    with tf.variable_scope(self._player_prefix):
      with tf.variable_scope('ppo2_model'):
        policy_fn = build_policy(DummyEnv(self._action_set), 'mlp', num_layers=5, num_hidden=128)
        self._policy = policy_fn(nbatch=1, sess=self._sess)
    _load_variables(checkpoint_path, self._sess, prefix=self._player_prefix + '/')
    saver = tf.train.Saver()
    saver.save(self._sess, "/home/alex/Dropbox/projects/python/kaggle/football/saved_models/simple_ppo2/simple_ppo2")
Example 23
def learn(
    *,
    network,
    env,
    total_timesteps,
    seed=None,
    **network_kwargs,
):

    # set up a runnable policy
    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    ob_space = env.observation_space
    ac_space = env.action_space

    # initialize the gradient descent policy directly
    return GradientDescent(ob_space, ac_space)
Example 24
def test(env, load_path, img_path, display_steps=500):
    with tf.Session() as sess:
        policy = build_policy(env, a2c_discrete_cnn)
        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            model = policy(1, 1, sess)

        tf_util.load_variables(load_path, sess=sess)

        def display_actor(obs):
            actions = model.step([obs])[0]
            return actions[0]

        if img_path is None:
            show(env, display_actor, display_steps)
        else:
            save_images(env, display_actor, display_steps, img_path, 'img_')
Example 25
  def __init__(self, player_config, env_config):
    player_base.PlayerBase.__init__(self, player_config)

    self._action_set = 'default'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self._sess = tf.Session(config=config)
    self._player_prefix = 'player_{}'.format(player_config['index'])
    stacking = 4 if player_config.get('stacked', True) else 1
    policy = player_config.get('policy', 'cnn')
    self._stacker = ObservationStacker(stacking)
    with tf.variable_scope(self._player_prefix):
      with tf.variable_scope('ppo2_model'):
        policy_fn = build_policy(DummyEnv(self._action_set, stacking), policy)
        self._policy = policy_fn(nbatch=1, sess=self._sess)
    _load_variables(player_config['checkpoint'], self._sess,
                    prefix=self._player_prefix + '/')
Example 26
def demonstrate(network,
                env,
                nsteps,
                mvs,
                load_path,
                ent_coef=0.0,
                vf_coef=0.5,
                max_grad_norm=0.5,
                mpi_rank_weight=1,
                comm=None,
                gamma=0.99,
                lam=0.95):

    policy = build_policy(env, network)

    model = Model(policy=policy,
                  nbatch_act=1,
                  nbatch_train=None,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  comm=comm,
                  mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        print(
            'No model has been loaded. Neural network with random weights is used.'
        )

    # Instantiate the runner object and episode buffer

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    mvs=mvs)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        render=True)

    print('Demo completed! Reward: {0}'.format(epinfos[0]['r']))
    print('\nPress Ctrl+C to stop the demo...')
Example 27
def prelearn(network, env, trainX, trainY, testX, testY, seed=None, lr=3e-4):
    set_global_seeds(seed)

    policy = build_policy(env, network)

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Instantiate the model object (that creates act_model and train_model)
    from baselines.ppo2.model import Model
    model_fn = Model

    batch_size = 128
    ndata = len(trainX)
    nepochs = 10

    # Set up model with some dummy arguments
    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=1, nbatch_train=batch_size,
                     nsteps=8192, ent_coef=0.00, vf_coef=0.03,
                     max_grad_norm=0.5)

    # train
    for _ in range(nepochs):
        for start in range(0, ndata, batch_size):
            end = start + batch_size
            obs = trainX[start:end]
            actions = trainY[start:end]
            model.pretrain(obs, actions, lr)

    # validate with MSE
    pred_actions = []
    for o in testX:
        pred_actions.append(model.evaluate(o))
    sse = 0
    for pred_action, action in zip(pred_actions, testY):
        sse += (action[0] - float(pred_action[0]))**2

    mse = sse / len(pred_actions)

    print(type(mse))
    print("Validation loss (mse): " + str(mse))
    logdir = logger.get_dir()
    model.save(osp.join(logdir, 'pretrained_model.pkl'))
Example 28
def runner(leg, env):
    leg_env = gym.make('PhantomXLeg-v0')
    leg_env.set_info(env.info)
    leg_env.leg_name = leg
    policy = build_policy(leg_env, defaults['network'], **alg_kwargs)

    model = ppo2.Model(policy=policy,
                       ob_space=leg_env.observation_space,
                       ac_space=leg_env.action_space,
                       nbatch_act=nenvs,
                       nbatch_train=nbatch_train,
                       nsteps=defaults['nsteps'],
                       ent_coef=defaults['ent_coef'],
                       vf_coef=defaults['vf_coef'],
                       max_grad_norm=defaults['max_grad_norm'])
    model.load('' + leg + '/checkpoints/05000')
    obs = leg_env.reset()
    ep_reward = 0
    rewards = []
    episode = 0
    step = 0
    while True:
        step += 1
        action, value_estimate, next_state, neglogp = model.step(obs)
        obs, reward, done, _ = leg_env.step(action[0])
        ep_reward += reward
        if done:
            leg_env.reset()
            episode += 1
            print(step)
            print(ep_reward)
            rewards.append(ep_reward)
            step = 0
            ep_reward = 0
        if episode >= 100:
            break
    f = open(filename, "w+")
    f.write("Variance: " + str(np.var(rewards)))
    rewards = np.array(rewards, dtype=float)
    f.write(",Median: " + str(statistics.median(rewards)))
    f.write(",Mean: " + str(np.mean(rewards)))
    f.close()
    while True:
        time.sleep(2)
        print("DONE")
Example 29
def train(env_id, seed, policy, load_path, num_episodes, frame_skip,
          no_render):

    env = make_neyboy_env(env_id,
                          1,
                          seed,
                          allow_early_resets=True,
                          frame_skip=frame_skip,
                          save_video=True)
    env = VecFrameStack(env, 4)
    policy = build_policy(env, policy)
    ob_space = env.observation_space
    ac_space = env.action_space
    ent_coef = .01
    vf_coef = 0.5
    max_grad_norm = 0.5
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=env.num_envs,
                  nbatch_train=0,
                  nsteps=0,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)
    model.load(load_path)

    for _ in range(num_episodes):
        if not no_render:
            env.render()
        observation, done = env.reset(), False
        if not no_render:
            env.render()
        episode_rew = 0
        score = 0
        while not done:
            if not no_render:
                env.render()
            action, _, _, _ = model.step(observation)
            observation, reward, done, info = env.step(action)
            episode_rew += reward
            score = info[0]
        print('Episode reward={}, info={}'.format(episode_rew, score))
Example 30
def load_model(venv, load_path, network, **network_kwargs):
    policy = build_policy(venv, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy,
                  env=venv,
                  nsteps=0,
                  ent_coef=0,
                  vf_coef=0,
                  max_grad_norm=0,
                  lr=0,
                  alpha=0,
                  epsilon=0,
                  total_timesteps=0,
                  lrschedule='linear')

    model.load(load_path)

    return model
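A hypothetical usage sketch for the loader above, rolling the restored model in the vectorized environment (the path and venv are placeholders):

model = load_model(venv, 'checkpoints/00100', 'cnn')
obs = venv.reset()
for _ in range(1000):
    actions, values, states, neglogps = model.step(obs)
    obs, rewards, dones, infos = venv.step(actions)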
Example 31
    def __init__(self, player_config, env_config):
        player_base.PlayerBase.__init__(self, player_config)

        self._action_set = (env_config['action_set']
                            if 'action_set' in env_config else 'default')
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self._sess = tf.Session(config=config)
        self._player_prefix = 'player_{}'.format(player_config['index'])
        with tf.variable_scope(self._player_prefix):
            with tf.variable_scope('ppo2_model'):
                policy_fn = build_policy(DummyEnv(self._action_set),
                                         'mlp',
                                         num_layers=5,
                                         num_hidden=128)
                self._policy = policy_fn(nbatch=1, sess=self._sess)
        _load_variables(player_config['checkpoint'],
                        self._sess,
                        prefix=self._player_prefix + '/')
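DummyEnv is not shown in these football snippets; build_policy only reads observation_space and action_space from the env it is given, so a minimal, hypothetical stand-in only needs those two attributes (the shapes below are placeholders, the real wrapper derives them from the action set and observation stacking):

import gym
import numpy as np

class DummyEnv(object):
    # Hypothetical stand-in: build_policy only needs these two attributes.
    def __init__(self, action_set, stacking=1):
        self.action_space = gym.spaces.Discrete(19)   # placeholder size
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(115 * stacking,), dtype=np.float32)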
Example 32
def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture on a given environment using A2C.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''



    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs*nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps//nbatch+1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        # Calculate the fps (frame per second)
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than just predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
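The safemean helper used in the logging block is, in baselines, just a mean that tolerates an empty episode buffer:

import numpy as np

def safemean(xs):
    # Avoid NaN warnings before any episode has finished.
    return np.nan if len(xs) == 0 else np.mean(xs)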
Example 33
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be less than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than just predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
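constfn, used above to turn a float lr or cliprange into a schedule, is in baselines essentially a closure that ignores the progress fraction and returns the constant:

def constfn(val):
    # Wrap a constant so it can be called like a schedule: f(frac) -> value.
    def f(_):
        return val
    return f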
Example 34
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):

    '''
    Main entrypoint for the ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf).
    Trains an agent with the given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel) (default: 20)

    nstack:             int, size of the frame stack, i.e. number of frames passed to the step model, stacked along the channel
                        (last image) dimension. Not a direct argument of this function; it is determined by wrapping env in
                        VecFrameStack before calling learn (unwrapped envs are wrapped with nstack=1 here)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping is applied (default: 10)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output

    rprop_epsilon:      float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each batch collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the buffer contains at least this many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether the algorithm estimates the gradient of the KL divergence between the old and updated policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:               keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                    For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)
    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs*nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # replay updates: sample from the buffer, no new environment steps

    return model
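
As the docstring notes, replay_ratio is an average: after each on-policy batch the loop above draws the number of off-policy replay updates from a Poisson distribution. A standalone sketch of that schedule (purely illustrative; replay_schedule is not part of baselines):

import numpy as np

def replay_schedule(n_batches, replay_ratio, seed=0):
    # yields one on-policy update per collected batch, followed by Poisson(replay_ratio) replay updates
    rng = np.random.RandomState(seed)
    for _ in range(n_batches):
        yield 'on_policy'
        for _ in range(rng.poisson(replay_ratio)):
            yield 'off_policy'
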
Esempio n. 35
0
def learn(*,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024, # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0, # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0, max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs
        ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=cpus_per_worker,
            intra_op_parallelism_threads=cpus_per_worker
    ))


    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
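    # Fisher-vector product: F v is computed as the gradient of (grad_KL . v),
    # so the Fisher matrix itself is never formed explicitly (see gvp/fvp below).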
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************"%iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]
        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new() # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0)
            assert np.isfinite(stepdir).all()
            shs = .5*stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
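            # Backtracking line search: start from the full natural-gradient step and halve it
            # until the surrogate improves without violating the KL constraint; if no step works,
            # fall back to the pre-update parameters.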
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank==0:
            logger.dump_tabular()

    return pi
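
The cg call in the step computation above approximately solves F x = g, where the Fisher matrix F is only ever accessed through fisher_vector_product. A minimal conjugate-gradient sketch of what such a solver does (an illustration, not the baselines implementation):

import numpy as np

def conjugate_gradient(f_Ax, b, iters=10, residual_tol=1e-10):
    # solve A x = b using only matrix-vector products f_Ax(v) = A v
    x = np.zeros_like(b)
    r = b.copy()   # residual; x starts at zero, so r = b
    p = r.copy()   # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x
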
Esempio n. 36
0
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)


    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs*nsteps
    tstart = time.time()
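    # In async mode the K-FAC optimizer's queued update ops run in background threads
    # started from model.q_runner and managed by the coordinator below; otherwise no
    # extra threads are started.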
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    return model
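
A hypothetical way to drive this ACKTR-style learn function on Atari, assuming the usual baselines helpers are available (the env id, number of environments and timestep budget below are placeholders, not values taken from the code above):

from baselines.common.cmd_util import make_vec_env
from baselines.common.vec_env import VecFrameStack

venv = VecFrameStack(make_vec_env('BreakoutNoFrameskip-v4', 'atari', 8, seed=0), 4)
model = learn('cnn', venv, seed=0, total_timesteps=int(1e6))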