def main(args):
    """
    Run a trained model for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """

    logger.configure()
    env = make_mujoco_env(args.env, 0)
    model = TRPO.load(args.dir, env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if not args.no_render:
                img = env.render(mode="rgb_array")
                img = np.array(img) / 255.
                img = np.transpose(img, (2, 0, 1))
                vis.image(img, win='frame')
            action, _, _, _ = model.policy_pi.step(obs.reshape(-1, *obs.shape),
                                                   deterministic=True)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
        # No render is only used for automatic testing
        if args.no_render:
            break
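
This excerpt assumes that numpy (np), a visdom client (vis), logger, make_mujoco_env and TRPO are imported elsewhere in the original file. As a rough illustration, a parser that would produce the three argument fields read above (env, dir, no_render) could look like the following; the function name and defaults are assumptions, not part of the original:

import argparse

def cartpole_arg_parser():
    # Hypothetical parser for the fields used by main() above.
    parser = argparse.ArgumentParser(description="Run a trained TRPO model")
    parser.add_argument('--env', default='InvertedPendulum-v2',
                        help='environment ID')
    parser.add_argument('--dir', default='trpo_cartpole.zip',
                        help='path of the saved model passed to TRPO.load')
    parser.add_argument('--no-render', action='store_true',
                        help='disable rendering (used for automated testing)')
    return parser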
Code example #2
def main():
    """
    Runs the test
    """
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)

    else:
        # construct the model object, load pre-trained model and render
        model = train(num_timesteps=1, seed=args.seed)
        tf_util.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        obs = env.reset()
        while True:
            action = model.policy.act(stochastic=False, obs=obs)[0]
            obs, _, done, _ = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Code example #3
def train(env_id, num_timesteps, seed):
    """
    train an ACKTR model on MuJoCo

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            value_fn = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              value_fn=value_fn,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False)

        env.close()
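
A hedged example of invoking the ACKTR train() function above; the environment ID and timestep budget are illustrative assumptions:

if __name__ == '__main__':
    train('Reacher-v2', num_timesteps=int(1e6), seed=0)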
Code example #4
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        tblog = "/cvgl2/u/surajn/workspace/tb_logs/reacher/"
        env = make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     tensorboard_log=tblog)
        model.learn(total_timesteps=num_timesteps)
        env.close()
Code example #5
def train(env_id, num_timesteps, run, kappa, vf_phi_update_interval, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param run: (int) run index, also used as the random seed
    :param kappa: kappa hyperparameter for this modified TRPO implementation
    :param vf_phi_update_interval: (int) update interval for the vf_phi network
    :param log: (bool) if True, log to the default logger directory instead of the per-experiment path
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/'+str(env_id)+'./updated_nkappa_x7_ent_0.01_new/'+str(kappa)+'_'+str(vf_phi_update_interval)+'_'+str(run)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)
        seed = run
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #set_global_seeds(run)
        env = make_mujoco_env(env_id, workerseed)
        test_env = None  # make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy, env, test_env=test_env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.01,
                     gamma=0.99, kappa=kappa, vf_iters=5, vf_stepsize=1e-3, verbose=1, vf_phi_update_interval=vf_phi_update_interval, seed=run)
        model.learn(total_timesteps=int(2e6), seed=run)
        #model.save("./"+str(env_id)+"./models/"+str(kappa)+"_"+str(run)+'_xnew_longer_slower'+str(vf_phi_update_interval)+'.pkl')
        env.close()
Code example #6
def train_SAC(env, out_dir, seed=None, **kwargs):
    """
    Train a SAC model and write Monitor logs under out_dir.

    :param env: (str) Environment ID
    :param out_dir: (str) output directory for logs and the trained model
    :param seed: (int) random seed for learning
    """
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
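
train_SAC reads several module-level names that are defined elsewhere in the original file (policy, n_timesteps, callback, and the output_dir global it reassigns). One assumed set of definitions that would make the excerpt self-contained; these values are illustrative, not the originals:

from stable_baselines.sac.policies import MlpPolicy as policy  # assumed policy class

n_timesteps = int(1e6)  # assumed training budget
callback = None         # no checkpoint/evaluation callback in this sketch
output_dir = None       # reassigned by train_SAC via the global statement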
Code example #7
def main():
    """
    Runs the test
    """
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    env = make_mujoco_env(args.env, args.seed)
    model = PPO1(MlpPolicy,
                 env,
                 timesteps_per_actorbatch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear')
    model.learn(total_timesteps=args.num_timesteps)

    model.save("ppo1")
    # env.close()

    del model  # remove to demonstrate saving and loading
    # env = make_mujoco_env(args.env, args.seed)

    model = PPO1.load("ppo1")
    logger.log("Loaded saved PPO1 model")
    episode_rew = 0
    obs = env.reset()

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_rew += reward
        env.render()
        if done:
            print(f'episode_rew={episode_rew}')
            episode_rew = 0
            obs = env.reset()
Code example #8
def train(env_id, num_timesteps, seed, algorithm, model_save_file=None, log_dir=None):
    """
    Train a TRPO or PPO1 model on a MuJoCo environment.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param algorithm: (str) "TRPO" to use TRPO, otherwise PPO1
    :param model_save_file: (str) optional path to save the trained model
    :param log_dir: (str) optional folder for logger output
    """
    with tf_util.single_threaded_session():
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv'])

        workerseed = seed + MPI.COMM_WORLD.Get_rank()
        env = make_mujoco_env(env_id, workerseed)

        if algorithm == "TRPO":
            model = TRPO(MlpPolicy, env, seed=workerseed, verbose=1)
        else:
            # Algorithm is PPO
            model = PPO1(MlpPolicy, env, seed=workerseed, verbose=1)

        model.learn(total_timesteps=num_timesteps)

        if model_save_file is not None:
            model.save(model_save_file)

        env.close()
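
A hedged example of invoking the function above; the environment, budget and paths are illustrative assumptions:

if __name__ == '__main__':
    train('Hopper-v2', num_timesteps=int(1e6), seed=0,
          algorithm="TRPO", model_save_file='./trpo_hopper', log_dir='./logs')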
Code example #9
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for the Mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)
    model = PPO1(MlpPolicy,
                 env,
                 timesteps_per_actorbatch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()
Code example #10
def train(num_timesteps, seed, model_path=None):
    """
    Train PPO1 model for the Humanoid environment, for testing purposes

    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param model_path: (str) path to the model
    """
    env_id = 'Humanoid-v2'

    env = make_mujoco_env(env_id, seed)

    # The parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but are not
    # necessarily optimal.
    env = RewScale(env, 0.1)
    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()
    if model_path:
        tf_util.save_state(model_path)

    return model
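
RewScale is not defined in this excerpt; in the original humanoid example it is a small gym.RewardWrapper that multiplies every reward by a constant. A minimal equivalent sketch:

import gym

class RewScale(gym.RewardWrapper):
    # Scale every reward by a constant factor.
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, rew):
        return rew * self.scale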
Code example #11
File: MyCartpole_v0.py  Project: ZWO2VPY6UT/grewRL
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.cmd_util import make_mujoco_env
#from stable_baselines.common.vec_env.vec_normalize import VecNormalize
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

import pybullet_envs

#env = gym.make('CartPole-v1')
env = make_mujoco_env('AntBulletEnv-v0', seed=0)
#env = VecNormalize(env0, norm_obs = True, norm_reward = False)

#env = DummyVecEnv([lambda: AHUenv() for i in range(1)])
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
#env = DummyVecEnv([lambda: env])
model = PPO2(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=500)
print("end training =========================================")
#obs = env.reset()
#for i in range(1000):
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()

model.save("ppo2_cartpole")
print("saved model =========================================")
#env.close()
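
The rollout loop in this script is commented out; a hedged sketch of loading the model saved above and running it (the step count and the reset-on-done handling are assumptions):

model = PPO2.load("ppo2_cartpole")
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()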
Code example #12
    actor_options = {
        'learning_rate': lr,
        'gamma': 1.,
        'verbose': 0,
        'n_steps': 100,
        'ent_coef': 0.,
        'max_grad_norm': 1e2,
    }

    description = ','.join(
        ['{}={}'.format(k, v) for k, v in actor_options.items()])
    description += ',num_env={},norm_obs={},norm_reward={}'.format(
        num_env, norm_obs, norm_reward)

    learning_options = {'total_timesteps': int(1e6)}

    # Wrap in a try statement to close the environment properly in case of keyboard interrupt.
    try:
        envs = [make_mujoco_env(env_name, 2) for _ in range(num_env)]
        # env = DummyVecEnv([lambda: env for env in envs])
        # Bind each env via a default argument so every worker gets its own
        # instance (a bare `lambda: env` would make all workers share the last one).
        env = SubprocVecEnv([lambda env=env: env for env in envs])
        env = VecNormalize(env, norm_obs=norm_obs, norm_reward=norm_reward)

        # Create the actor and learn
        actor_options['tensorboard_log'] = os.path.join(
            tensorboard_logdir, env_name)
        model = PPO2(MlpPolicy, env, **actor_options)
        # model = PPO2(MlpLstmPolicy, env, **actor_options)
        model.learn(**learning_options, tb_log_name=description)
    finally:
        env.close()
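
This last excerpt starts inside a larger function, so several names (lr, num_env, env_name, norm_obs, norm_reward, tensorboard_logdir) come from the enclosing scope. Purely illustrative values that would make the snippet self-contained, not the original settings:

lr = 3e-4                   # learning rate fed into actor_options (assumed)
num_env = 4                 # number of parallel workers (assumed)
env_name = 'Reacher-v2'     # environment ID (assumed)
norm_obs, norm_reward = True, True
tensorboard_logdir = './tb_logs'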