Example #1
def train(env_id, num_timesteps, seed):

    env = make_mujoco_env(env_id, seed)
    eval_env = make_mujoco_env(env_id, seed)

    sess = tf.InteractiveSession()
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)

    # log_dir = './result/%s'%(args.alg)

    learn(env,
          policy=policy,
          vf=vf,
          gamma=0.99,
          lam=0.97,
          timesteps_per_batch=2500,
          desired_kl=0.002,
          num_timesteps=num_timesteps,
          animate=False,
          eval_env=eval_env)

    env.close()
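
All of these snippets call make_mujoco_env from baselines.common.cmd_util. As a rough sketch of what that helper does (the real version also offsets the seed by MPI rank and derives the Monitor path from the logger directory, so treat this as an approximation rather than the library source):

import os
import gym
from baselines import logger
from baselines.bench import Monitor
from baselines.common import set_global_seeds

def make_mujoco_env(env_id, seed):
    # seed the global RNGs, build the raw env, and wrap it in a Monitor
    set_global_seeds(seed)
    env = gym.make(env_id)
    logdir = logger.get_dir()
    monitor_path = os.path.join(logdir, 'monitor') if logdir else None
    env = Monitor(env, monitor_path, allow_early_resets=True)
    env.seed(seed)
    return env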
Example #2
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=32,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    logger.log("========observation_space %s action_space %s" %
               (str(env.observation_space), str(env.action_space)))
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=1024,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #3
def train(env_id, num_timesteps, seed=0):
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return cartpole_policy.CartPolePolicy(name=name,
                                              ob_space=ob_space,
                                              ac_space=ac_space,
                                              hid_size=6,
                                              num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pi = pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()

    return pi
Example #4
def train(env_id, num_timesteps, seed, model_path=None):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, 
            optim_stepsize=3e-4, 
            optim_batchsize=64, 
            gamma=0.99, 
            lam=0.95,
            schedule='linear',
        )
    print(pi)
    env.close()
    if model_path:
        U.save_state(model_path)
        
    return pi
Example #5
def teacher_replay():
    env = make_mujoco_env("Reacher-v2", 0)
    rets = []
    episode = 0
    ret = 0
    timestep = 1
    with tf.Session() as sess:
        teacher = TeacherAgent(env, sess, True)
        ob = env.reset()
        ob = np.expand_dims(ob, axis=0)
        print(ob)
        while episode < TOTAL_EPISODES:
            timestep += 1
            ac, _ = teacher.pi.act(False, ob)
            ob, reward, new, _ = env.step(ac)
            ob = np.expand_dims(ob, axis=0)
            ret += reward
            if new:
                print(
                    "********** Episode {0}, timestep {1} ***********".format(
                        episode, timestep))
                print("return: {0}".format(reward))
                ob = env.reset()
                ob = np.expand_dims(ob, axis=0)
                rets.append(ret)
                ret = 0
                timestep = 1
                episode += 1
            env.render()

    # save_results
    np.save(teacher_ret_path, rets)
Example #6
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break


        assert step_counter > 5
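
In the step call above, S threads the recurrent state between steps and M is the episode-start mask: whenever an entry of M is true, the LSTM state for that environment is reset, which is why the previous step's done flags are fed back in.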
Example #7
def train(env_id, num_timesteps, seed, save, clip_param, optim_stepsize,
          optim_batchsize, gamma, lam):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    ret = pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=clip_param,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=optim_stepsize,
        optim_batchsize=optim_batchsize,
        gamma=gamma,
        lam=lam,
        schedule='linear',
    )
    env.close()
    np.savetxt(save, np.array([ret]))
Example #8
def train(env_id, num_timesteps, seed, hid_size=64, num_hid_layers=2):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    assert env_id in (_MujocoEnvs + _RoboticsEnvs)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hid_size, num_hid_layers=num_hid_layers)
    if env_id in _MujocoEnvs:
        env = make_mujoco_env(env_id, seed)
    elif env_id in _RoboticsEnvs:
        env = make_robotics_env(env_id, seed)
    else:
        raise ValueError('Environment `{0}` is not supported.'.format(env_id))
    # Not putting these params in config as we do not plan on changing them.
    optim_epochs = 10 if env_id in _MujocoEnvs else 5
    optim_batchsize = 64 if env_id in _MujocoEnvs else 256
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=optim_epochs, optim_stepsize=3e-4,
            optim_batchsize=optim_batchsize,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
    return pi
Example #9
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #10
def train(env_id, num_timesteps, timesteps_per_actor_batch, seed,
          entropy_coeff, filepath):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=timesteps_per_actor_batch,
                        clip_param=0.2,
                        entcoeff=entropy_coeff,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()

    # Save policy etc.
    saver = tf.train.Saver()
    saver.save(sess, filepath + "_final")
Example #11
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for the Mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    sess=sess,
                                    placeholders=placeholders)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #12
def train(env_id, num_timesteps, seed, alg, lr, momentum):
    env = make_mujoco_env(env_id, seed)

    if alg == 'sgd':
        from baselines.acktr.acktr_cont import learn
    elif alg == 'mid':
        from baselines.acktr.acktr_cont_midpoint import learn
    elif alg == 'geo':
        from baselines.acktr.acktr_cont_geo import learn
    else:
        raise ValueError('Unknown alg: {0}'.format(alg))
    nprocs = 4
    with tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True,
                                  intra_op_parallelism_threads=nprocs,
                                  inter_op_parallelism_threads=nprocs)):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        policy = GaussianMlpPolicy(ob_dim, ac_dim, 'pi')

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False,
              lr=lr,
              momentum=momentum)

        env.close()
Example #13
def train(env_id, num_timesteps, seed, save_file, load_file, render,
          stochastic):
    from baselines.ppo1 import mlp_policy
    import my_pposgd_simple as pposgd_simple
    sess = U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        sess=sess,
                        save_file=save_file,
                        load_file=load_file,
                        render=render,
                        stochastic=stochastic)
    env.close()
Example #14
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    from baselines.ppo1 import lstm_fc_policy
    from baselines.ppo1 import one_lstm_policy
    U.make_session(num_cpu=4).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #15
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path',
                        default='checkpoints_best/Humanoid-v2-6914')
    parser.set_defaults(num_timesteps=int(2e8))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=123)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            time.sleep(0.01)
            if done:
                ob = env.reset()
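
A note on the rollout loops in this and similar examples: in baselines' ppo1, pi.act(stochastic, ob) returns an (action, value_estimate) pair, so indexing [0] picks out the action:

action, vpred = pi.act(stochastic=False, ob=ob)  # pi.act(...)[0] yields the action alone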
Example #16
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, 
            optim_stepsize=3e-4, 
            optim_batchsize=64, 
            gamma=0.99, 
            lam=0.95,
            schedule='linear',
        )
    env.close()
    if model_path:
        U.save_state(model_path)
        
    return pi
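
RewScale is not defined in these snippets. In baselines' run_humanoid.py it is a plain gym.RewardWrapper that multiplies every reward by a constant; a minimal sketch consistent with that usage:

import gym

class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        # scale the raw reward; the Monitor wrapped inside still logs unscaled values
        return r * self.scale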
Example #17
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'policy'))
    parser.set_defaults(num_timesteps=int(2e7))
   
    args = parser.parse_args()
    
    if not args.play:
        # train the model
        train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:       
        # construct the model object, load pre-trained model and render
        pi = train(args.env, num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env(args.env, seed=0)

        ob = env.reset()        
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            print(ob, action)
            #env.render()
            if done:
                ob = env.reset()
Example #18
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv(
        [lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1,
                                                               nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
Example #19
def main():
    """
    Runs the test
    """
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path',
                        default=os.path.join(logger.get_dir(),
                                             'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        policy = train(num_timesteps=1, seed=args.seed)
        tf_util.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        obs = env.reset()
        while True:
            action = policy.act(stochastic=False, obs=obs)[0]
            obs, _, done, _ = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #20
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    pi = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    env = make_mujoco_env('Walker2d-v2', seed=0)
    ob = env.reset()
    while True:
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, _, done, _ = env.step(action)
        env.render()
        time.sleep(0.01)
        if done:
            ob = env.reset()
Example #21
def createEnv(env_id='CartPole-v1', seed=0):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    return make_mujoco_env(env_id, workerseed)
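
The workerseed = seed + 10000 * rank pattern used here (and in Examples #9 and #27) gives every MPI worker a distinct seed so parallel rollouts stay decorrelated, while only rank 0 keeps an active logger. The seeding idea in isolation, as a minimal sketch:

from mpi4py import MPI

def worker_seed(base_seed):
    # offset the base seed by rank so no two workers share an RNG stream
    return base_seed + 10000 * MPI.COMM_WORLD.Get_rank()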
Example #22
def train(num_timesteps, seed, model_path=None):
    env_id = 'Odie-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    env = MinotaurMonitor(env_id, env)

    def callback(locals, globals):
        if len(locals['rewbuffer']) > 0:
            meanlosses = locals['meanlosses']
            loss_names = locals['loss_names']
            data = {
                'iters_so_far': locals['iters_so_far'],
                'episode_reward_mean': np.mean(locals['rewbuffer']),
            }
            for (lossval, name) in zipsame(meanlosses, loss_names):
                data['loss_' + name] = float(lossval)
            env.post_data(data)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    pi = pposgd_simple.learn(env,
                             policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2,
                             entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=3e-4,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear',
                             callback=callback)
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
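
In baselines' pposgd_simple, learn calls callback(locals(), globals()) once per outer iteration, so the callback can read any of the learner's locals such as rewbuffer, iters_so_far, meanlosses, and loss_names, as the example above does. A hedged minimal callback that only prints progress:

import numpy as np

def print_progress(lcl, glb):
    # lcl is learn's locals(); rewbuffer holds recent episode returns
    if len(lcl['rewbuffer']) > 0:
        print('iter {0}: mean episode reward {1:.2f}'.format(
            lcl['iters_so_far'], np.mean(lcl['rewbuffer'])))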
Example #23
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
Example #24
def train(env_id, num_timesteps, seed):
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
            gamma=0.99, lam=0.97, timesteps_per_batch=2500,
            desired_kl=0.002,
            num_timesteps=num_timesteps, animate=False)

        env.close()
Example #25
def train(env_id, parameters, seed):
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)

        with tf.variable_scope('pi'):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        #with tf.variable_scope ('policy_prev'):
        #    policy_previous = GaussianMlpPolicy(ob_dim, ac_dim, name='policy_prev')

        learn(env, policy=policy, vf=vf, parameters=parameters)

        env.close()
Example #26
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy
    sess = U.make_session(num_cpu=4)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=8)

    env = make_mujoco_env(env_id, seed)
    learn(sess, env_id, env, policy_fn,
          max_timesteps=num_timesteps,
          timesteps_per_actorbatch=2048,
          clip_param=0.2, entcoeff=0.0,
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
          gamma=0.99, lam=0.95, schedule='linear'
          )
    env.close()
    tf.train.Saver().save(sess, directory + env_id + '/model.ckpt')
Example #27
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
        max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #28
def train(num_timesteps, seed, model_path=None):
    """
    Train PPO1 model for the Humanoid environment, for testing purposes

    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param model_path: (str) path to the model
    """
    env_id = 'Humanoid-v2'

    def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    sess=sess,
                                    placeholders=placeholders)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    policy = pposgd_simple.learn(env,
                                 policy_fn,
                                 max_timesteps=num_timesteps,
                                 timesteps_per_actorbatch=2048,
                                 clip_param=0.2,
                                 entcoeff=0.0,
                                 optim_epochs=10,
                                 optim_stepsize=3e-4,
                                 optim_batchsize=64,
                                 gamma=0.99,
                                 lam=0.95,
                                 schedule='linear')
    env.close()
    if model_path:
        tf_util.save_state(model_path)

    return policy
Example #29
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    pi = train(args.env, num_timesteps=1, seed=args.seed, play=False)
    run = 'run-20180703_034952-1a24a6ik/'
    run_home = '/home/ubuntu/wandb_baselines/wandb/' + run  #run-20180702_220411-4xtopfue/'
    model_path = run_home + 'humanoid_policy'
    # model_path = '/home/ubuntu/wandb_baselines/wandb/run-20180702_220411-4xtopfue/humanoid_policy'
    U.load_state(model_path)
    seed = random.randint(1, 1000)
    env = make_mujoco_env('RoboschoolHumanoid-v1', seed=seed)
    tot_r = 0
    ob = env.reset()
    runs = 0
    video = True
    if video:
        video_recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(
            env=env,
            base_path=os.path.join('/home/ubuntu/wandb_baselines',
                                   'humanoid_run2_%i' % seed),
            enabled=True)

    while True:
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, r, done, _ = env.step(action)
        if video:
            video_recorder.capture_frame()
        tot_r += r
        if done:
            ob = env.reset()
            runs += 1
            #            if video:
            #                video_recorder.close()
            # video_recorder = gym.wrappers.monitoring.video_recorder.VideoRecorder(env=env, base_path=os.path.join(run_home, 'humanoid_run_%i'%runs), enabled=True)

            print(tot_r)
            tot_r = 0
            print("@@@@@@@@@@@@@@@")
        if runs > 0:
            break
Example #30
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    # enter a tensorflow session
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2, gaussian_fixed_var=True)
    env = make_mujoco_env(env_id, seed)
    pi, result, graph_data = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(resultfile, 'a+') as file:
            file.write("{0} result: {1}\n".format(env_id, result))
        with open(graphfile, 'wb') as file:
            pickle.dump([np.array(l) for l in list(zip(*graph_data))], file)
    env.close()
Example #31
def train(env_id, num_timesteps, seed, logdir):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    import os
    fname = os.path.join(logdir, 'final_state')
    saver = tf.train.Saver()
    saver.save(tf.get_default_session(), fname)

    env.close()
Example #32
def student_replay(klts):
    env = make_mujoco_env("Reacher-v2", 0)
    timestep = 0
    ret = 0
    episode = 0
    with tf.Session() as sess:
        student = StudentAgent(env, sess, True, klts)
        ob = env.reset()
        while episode < TOTAL_EPISODES:
            ac, _ = student.pi.act(False, ob)
            ob, reward, new, _ = env.step(ac)
            print("********** Episode {0}, timestep {1} ***********".format(
                episode, timestep))
            print("reward: {0}".format(reward))
            ret += reward
            timestep += 1
            if new:
                print(ob)
                ob = env.reset()
                timestep = 0
                episode += 1
Example #33
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    logger.log(
        "NOTE: reward will be scaled by a factor of 10 in logged stats. Check the monitor for unscaled reward."
    )
    pi = pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.1,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=1e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='constant',
    )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
Example #34
def train(env_id, num_timesteps, seed):
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=True)

        env.close()
Example #35
def train(env_id, num_timesteps, seed):
    """
    train an ACKTR model on MuJoCo

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
Example #36
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))
   
    args = parser.parse_args()
    
    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:       
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()        
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()