Code example #1
def test_ddpg_eval_env():
    """
    Additional test to check that everything is working when passing
    an eval env.
    """
    eval_env = gym.make("Pendulum-v0")
    model = DDPG("MlpPolicy", "Pendulum-v0", nb_rollout_steps=5,
                nb_train_steps=2, nb_eval_steps=10,
                eval_env=eval_env, verbose=0)
    model.learn(1000)
Code example #2
def test_ddpg_popart():
    """
    Test DDPG with pop-art normalization
    """
    n_actions = 1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000, normalize_observations=True,
                 normalize_returns=True, nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, action_noise=action_noise, enable_popart=True)
    model.learn(1000)
Code example #3
def train_agent(train, pickle_file, agent_type, env_kwargs, parms):

    bin_path = "bin/" + pickle_file

    if path.exists(bin_path):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()

        agent = ipagent.IPRLAgent(env=env_train)

        model = agent.get_model(model_name=agent_type, model_kwargs=parms)

        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)

        RL_model.save(bin_path)

    return RL_model
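
The helper above either loads a cached agent from bin/ or trains and pickles a new one. Below is a hypothetical invocation sketch; the DataFrame, env_kwargs and parms values are placeholders, and config, ipenv and ipagent are project-local modules assumed to be importable.

# Hypothetical usage of train_agent(); all values below are placeholders,
# not settings from the original project.
import pandas as pd

train_df = pd.read_csv("data/train.csv")                 # assumed pre-processed training data
env_kwargs = {"initial_amount": 1000000}                 # forwarded to ipenv.PortfolioAllocEnv
ddpg_params = {"batch_size": 64, "buffer_size": 50000}   # forwarded to agent.get_model()

RL_model = train_agent(train=train_df,
                       pickle_file="ddpg_portfolio.pkl",
                       agent_type="ddpg",
                       env_kwargs=env_kwargs,
                       parms=ddpg_params)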
Code example #4
def test_ddpg_normalization():
    """
    Test that observations and returns normalizations are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=0.05)
    model = DDPG('MlpPolicy',
                 'Pendulum-v0',
                 memory_limit=50000,
                 normalize_observations=True,
                 normalize_returns=True,
                 nb_rollout_steps=128,
                 nb_train_steps=1,
                 batch_size=64,
                 param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg')

    loaded_model = DDPG.load("test_ddpg")
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg"):
        os.remove("./test_ddpg")
Code example #5
import gym
from stable_baselines.ddpg import LnMlpPolicy
from stable_baselines.ddpg import DDPG
env = gym.make('HalfCheetah-v3')
model = DDPG(LnMlpPolicy, env, gamma=0.95, buffer_size=1000000,
             param_noise_adaption_interval=0.22, batch_size=256,
             normalize_observations=True, normalize_returns=False,
             policy_kwargs=dict(layers=[400, 300]), verbose=1)
model.learn(total_timesteps=1000000)
model.save('Cheetah_model_DDPG')
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
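
Because the script saves the trained weights with model.save, the policy can be reloaded later without retraining. A minimal sketch, assuming the same stable_baselines installation and the 'Cheetah_model_DDPG' file written above:

# Reload the saved DDPG policy and run a short greedy rollout (sketch).
import gym
from stable_baselines.ddpg import DDPG

env = gym.make('HalfCheetah-v3')
model = DDPG.load('Cheetah_model_DDPG')

obs = env.reset()
total_reward = 0.0
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        obs = env.reset()
print('return over 1000 steps:', total_reward)
env.close()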
Code example #6
File: main.py  Project: safrooze/stable-baselines
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise types ('adaptive-param', 'normal' or 'ou'); multiple noise types can
        be combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only track the start time on rank 0.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
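
As the parsing loop above shows, noise_type is a comma-separated list of '<type>_<stddev>' tokens, so several noise sources can be enabled at once. A hypothetical invocation (the environment id and values are placeholders, and an MPI runtime is required since run() queries MPI.COMM_WORLD):

# Hypothetical call enabling both adaptive parameter noise and Ornstein-Uhlenbeck action noise.
run(env_id='HalfCheetah-v2',
    seed=0,
    noise_type='adaptive-param_0.2,ou_0.2',
    layer_norm=True,
    evaluation=False)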
Code example #7
def main(env, load_path, fig_path):

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = DDPG.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
Code example #8
def main(env,
         load,
         save_path,
         load_path=None,
         train_timesteps=1.25e6,
         eval_timesteps=5e3):

    # arguments
    print(
        "env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
        % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = DDPG.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = DDPG(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
Code example #9
def train(env_id, num_timesteps, seed, model_path=None, images=False):
    """
    Train a DDPG model on the robosuite SawyerLift task, for testing purposes

    :param env_id: (str) the environment id string (unused here; the SawyerLift task is constructed directly)
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param model_path: (str) if given, the path to save the trained model to
    :param images: (bool) use pixel (image and depth) observations instead of object-state observations
    """
    def make_env():
        if images:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=False,
                    use_camera_obs=True,           # use pixel observations
                    has_offscreen_renderer=True,   # needed for camera observations
                    has_renderer=False,            # no on-screen rendering
                    camera_depth=True,
                    reward_shaping=True,           # use dense rewards
                    control_freq=10,               # control frequency high enough for smooth simulation
                    render_visual_mesh=False,
                ),
                keys=["image", "depth"],
                images=True,
            )
        else:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=True,
                    use_camera_obs=False,           # do not use pixel observations
                    has_offscreen_renderer=False,   # not needed since not using pixel obs
                    has_renderer=False,             # no on-screen rendering
                    camera_depth=False,
                    reward_shaping=True,            # use dense rewards
                    control_freq=10,                # control frequency high enough for smooth simulation
                    render_visual_mesh=False,
                )  # , keys=["image", "depth"], images=True,
            )
        env_out.reward_range = None
        env_out.metadata = None
        env_out.spec = None
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    #env = make_env()

    if images:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        set_global_seeds(seed)
        policy = CnnPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    else:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        set_global_seeds(seed)
        policy = MlpPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    nb_actions = env.action_space.shape[-1]
    #model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10,
    #             ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1, tensorboard_log=tblog)
    #model = TRPO(policy=policy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0,
    #                 gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, tensorboard_log=tblog, verbose=1)
    model = DDPG(policy=ddpgMlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=None,
                 param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                    desired_action_stddev=0.2),
                 action_noise=OrnsteinUhlenbeckActionNoise(
                     mean=np.zeros(nb_actions),
                     sigma=float(0.2) * np.ones(nb_actions)),
                 memory_limit=int(1e6),
                 verbose=2,
                 tensorboard_log=tblog)

    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        model.save(model_path)
        #tf_util.save_state(model_path)

    return model, env
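
A hypothetical call to train(); the timestep budget and save path are placeholders, and robosuite with its GymWrapper must be installed for suite.make("SawyerLift", ...) to work. Note that env_id is accepted but unused, since the function always builds the SawyerLift task.

model, env = train(env_id="SawyerLift", num_timesteps=int(1e6), seed=0,
                   model_path="sawyer_ddpg", images=False)

Because train() closes the environment before returning, the returned env should not be stepped further; the saved weights can be reloaded later with DDPG.load("sawyer_ddpg").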
Code example #10
def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)
    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Track the training start time.
    start_time = time.time()

    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }
    if agent in td3_variants:
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                           tau=0.005, policy_delay=2, learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                           n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma, nb_eval_steps=5, batch_size=100,
                     nb_train_steps=100, nb_rollout_steps=100, learning_starts=10000,
                     actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0,
                     tau=0.005, normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6),
                     verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6), verbose=2,
                    n_cpu_tf_sess=10, learning_starts=10000,
                    policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                          tau=0.005, policy_delay=2, learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                          n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1,
                          num_q=4, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                               tau=0.005, policy_delay=2, learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                               n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError

    print("model building finished")
    model.learn(total_timesteps=kwargs['num_timesteps'])

    env.close()
    if eval_env is not None:
        eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))
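
create_env and create_action_noise are project-local helpers that are not shown in this snippet. A sketch of what create_action_noise could look like, reusing the '<type>_<stddev>' convention parsed in code example #6 (an assumption, not the project's implementation; the import path of the noise classes may differ across stable_baselines versions):

import numpy as np
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

def create_action_noise(env, noise_type):
    """Sketch: build action noise from a '<type>_<stddev>' string such as 'normal_0.1'."""
    nb_actions = env.action_space.shape[-1]
    kind, stddev = noise_type.split('_')
    if kind == 'normal':
        return NormalActionNoise(mean=np.zeros(nb_actions),
                                 sigma=float(stddev) * np.ones(nb_actions))
    if kind == 'ou':
        return OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                            sigma=float(stddev) * np.ones(nb_actions))
    raise ValueError('unknown action noise type "{}"'.format(noise_type))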