def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):

    """
    Runs the test
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)



    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out
    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)


    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
Beispiel #2
0
def get_ppo2(
    vec_env=None,
    policy='CnnPolicy',
    seed=0,
    number_of_steps_per_epoch=128,  # nsteps
    number_of_mini_batches_in_epoch=8,  # nminibatches
    number_of_updates_per_epoch=4,  # noptepochs
    max_grad_norm=0.5,
    gamma=0.993,  # discount factor
    entropy_coefficient=0.01,  # ent_coef
    learning_rate=0.00008,  # lr
    clip_range=0.27,  # cliprange
    vf_coefficient=0.5,
) -> PPO2:
    """
    Parameter's default values are taken from football.gfootball.examples.run_ppo2.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)

    return PPO2(
        policy=policy,
        env=vec_env,
        gamma=gamma,
        n_steps=number_of_steps_per_epoch,
        ent_coef=entropy_coefficient,
        learning_rate=learning_rate,
        vf_coef=vf_coefficient,
        max_grad_norm=max_grad_norm,
        nminibatches=number_of_mini_batches_in_epoch,
        noptepochs=number_of_updates_per_epoch,
        cliprange=clip_range,
        seed=seed,
        verbose=2,
    )
Beispiel #3
0
    def __init__(self,
                 policy_file="../models/controllers/PPO/Pendulum-v0.pkl",
                 mass_prior=None,
                 length_prior=None,
                 episodes_per_params=1,
                 seed=1995,
                 params=["length", "mass"],
                 steps_per_episode=200,
                 sufficient_stats="Cross-Correlation",
                 load_from_file=False,
                 assets_path=".",
                 filename=""):

        self.env = PendulumEnv()
        self.seed = seed

        self.cached_data = None
        self.params_scaler = None
        self.params = params
        self.steps_per_episode = steps_per_episode
        self.sufficient_stats = sufficient_stats
        self.assets_path = assets_path
        self.load_from_file = load_from_file
        self.data_file = os.path.join(assets_path + filename)
        self.policy = PPO2.load(policy_file)
        if mass_prior is None:
            self.m_low = 0.1
            self.m_high = 2.0
            self.m_prior = self.sample_mass_from_uniform_prior

        if length_prior is None:
            self.l_low = 0.1
            self.l_high = 2.0
            self.l_prior = self.sample_length_from_uniform_prior
Beispiel #4
0
def train_agent(train, pickle_file, agent_type, env_kwargs, parms):

    bin_path = "bin/" + pickle_file

    if (path.exists(bin_path)):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()

        agent = ipagent.IPRLAgent(env=env_train)

        model = agent.get_model(model_name=agent_type, model_kwargs=parms)

        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)

        RL_model.save(bin_path)

    return RL_model
Beispiel #5
0
def test(env_id, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Need for lstm
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
            if done:
                print('Episode reward:', rew)
Beispiel #6
0
    def __init__(self,
                 policy_file="../models/controllers/PPO/CartPole-v1.pkl",
                 mass_prior=None,
                 length_prior=None,
                 episodes_per_params=1,
                 seed=1995,
                 params=["length", "masspole"],
                 steps_per_episode=50,
                 sufficient_stats="Cross-Correlation"):

        self.env = CartPoleEnv()
        self.seed = seed

        self.cached_data = None
        self.params_scaler = None
        self.params = params
        self.steps_per_episode = steps_per_episode
        self.sufficient_stats = sufficient_stats

        self.policy = PPO2.load(policy_file)
        if mass_prior is None:
            self.m_low = 0.1
            self.m_high = 2.0
            self.m_prior = self.sample_mass_from_uniform_prior

        if length_prior is None:
            self.l_low = 0.1
            self.l_high = 2.0
            self.l_prior = self.sample_length_from_uniform_prior
Beispiel #7
0
def main():
    """
    Train and save the PPO model, for the cartpole problem

    """
    print("Making a new model")

    env = ControlCarRacing(gym.make('CarRacing-v0'))
    env = MaxAndSkipEnv(env, skip=4)
    env = FrameStack(env, 4)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO2(policy=CnnPolicy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 noptepochs=10,
                 learning_rate=3e-4,
                 cliprange=lambda f: f * 0.2,
                 verbose=0,
                 tensorboard_log='graph/')

    print("Learning started. It takes some time...")
    model.learn(total_timesteps=300000,
                callback=callback,
                tb_log_name='190317')
    print("Saving model to CarRacing_model.pkl")
    model.save("CarRacing_model_PPO2")
    print("Plotting Learning Curve")
    plot_results(log_dir)
    plot_results(log_dir, smoothing=False)
Beispiel #8
0
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
Beispiel #9
0
 def load(self, path, env):
     if self.trpo():
         return TRPO.load(path, env=env)
     elif self.ppo():
         return PPO2.load(path, env=env)
     else:
         return SAC.load(path, env=env)
Beispiel #10
0
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=2048,
                 nminibatches=32,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
Beispiel #11
0
def __main():
    from stable_baselines.ppo2 import PPO2
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    env = DummyVecEnv([OptLRs])
    agent = PPO2(MlpPolicy, env, verbose=1)
    agent.learn(total_timesteps=10**2)
Beispiel #12
0
 def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
     if self.autoencoder is None:
         self.setup_autoencoder(ne.get_obs())
         assert (self.autoencoder) is not None
     if ne.autoencoder is None:
         ne.set_autoencoder(self.autoencode)
         ne.autoencoder = self.autoencode
     if train:
         fn = "models/model1.h5"
         self.mf_policy = PPO2(env=ne,
                               policy=MlpPolicy,
                               n_steps=40,
                               verbose=2,
                               noptepochs=10,
                               learning_rate=3e-4,
                               ent_coef=0.1,
                               gamma=0.1)
         if load_model:
             self.mf_policy.load(fn, env=make_vec_env(lambda: ne))
         else:
             self.mf_policy.learn(total_timesteps=n_epochs * 40)
             self.mf_policy.save(fn)
     encoded_obs = ne.rl_obs()
     return self.mf_policy.step([encoded_obs],
                                deterministic=True)[0].flatten()
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)

    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
Beispiel #14
0
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir,
                                                      params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)

    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)

    train_op = trainer.apply_gradients(grads)
    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while 1:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)

        # neuron_values_list.append( neuron_values )
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:

            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
Beispiel #16
0
def cont_learn():
    print('Continue learning....')

    env = gym.make('CarRacing-v0')
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    trained_model = PPO2.load("CarRacing_model_PPO2.pkl")
    trained_model.set_env(env)
    trained_model.learn(300000)
    print("Saving model to CarRacing_model.pkl")
    trained_model.save("CarRacing_model_PPO2.pkl")
    plot_results(log_dir)
Beispiel #17
0
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)

        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
Beispiel #18
0
def run():
    """
    Run a trained model for the pong problem
    """
    env = gym.make('CarRacing-v0')
    env = DummyVecEnv([lambda: env])

    # model = PPO2.load("CarRacing_model_PPO1_"+ str(5) +".pkl", env)
    model = PPO2.load("CarRacing_model_PPO2_5.pkl", env)
    avg_rew = evaluate(model=model, env=env, num_steps=10000)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)

            episode_rew += rew
        print("Episode reward", episode_rew)
Beispiel #19
0
def train(env_id,
          num_timesteps,
          seed,
          policy,
          n_envs=8,
          nminibatches=4,
          n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """

    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
    del model
def train(num_timesteps, model_to_load):

    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75

        model = PPO2(policy=policy,
                     env=env,
                     n_steps=2048,
                     nminibatches=32,
                     lam=0.95,
                     gamma=0.99,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=linear_schedule(lr),
                     cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(
                model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyinterrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
Beispiel #21
0
    def create_learner(self, env, parameters):
        if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
            env = DummyVecEnv([lambda: env])

        if self.trpo():
            model = TRPO(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = TRPOInterface(model, env.observation_space.shape[0])
        elif self.ppo():
            model = PPO2(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = PPOInterface(model, env.observation_space.shape[0])
        else:
            model = SAC(SACMlpPolicy, env, **parameters["common"],
                        **parameters[str(self)])
            interface = SACInterface(model, env.observation_space.shape[0])

        if "pretrain_data_path" in parameters:
            data_path = parameters["pretrain_data_path"]
            model.pretrain(ExpertDataset(expert_path=data_path, verbose=0),
                           n_epochs=25)

        return model, interface
Beispiel #22
0
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=256,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
Beispiel #23
0
        'sha_pol': sha_pol if bool(flags.learn_sha_pol) else None,
        'mov_pol': None,
        'rot_pol': None
    }
}

env = make_env()
model = PPO2(PackingPolicy,
             env,
             n_steps=flags.num_steps,
             verbose=1,
             tensorboard_log=tensorboard_log,
             nminibatches=int((flags.num_steps * flags.num_pro) / 64),
             noptepochs=flags.noptepochs,
             make_env=make_env,
             gamma=flags.gamma,
             lam=flags.lam,
             vf_coef=flags.vf_coef,
             ent_coef=flags.ent_coef,
             zero_mean_advs=bool(flags.zero_mean_advs),
             packing_id_start=flags.id_start,
             learning_rate=flags.lr,
             policy_config=policy_config,
             restore_exp=not (bool(flags.learn_or_evaluate)),
             restore_path="./{}/{}".format(tensorboard_log, flags.model_name))

if bool(flags.learn_or_evaluate):
    model.learn(flags.num_steps * flags.num_pro * 400)
else:
    if bool(flags.eval_va_or_te):
        pack_file_name_evaluate = [
            "pack_va/" + str(i) + "_va"
Beispiel #24
0
def main(env, load_path, fig_path):

    # skip over 1-baxter-no-penalty (no log monitor.csv)
    if load_path == "1-baxter-no-penalty":
        plot = False
    else:
        plot = True

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = PPO2.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    if plot:
        plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timeste][i] /= niter
        for j in range(4):
            results[0][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
Beispiel #25
0
    qpos[i, :] = env_in.GetMotorAngles()
    #     qvel[i,:] = env_in.sim.data.qvel[[7,8,10,12,14,16,17,19,21,23]]
    qvel[i, :] = env_in.GetMotorVelocities()
    #     torque[i,:] = env_in.sim.data.actuator_force
    torque[i, :] = env_in.GetMotorTorques()
    i = (i + 1) % total_data_length
    return True


###############################################################################
# # Use this code for testing the basic controller
# Create the stoch mujoco environment
# env = stoch2_gym_mjc_env.Stoch2Env()
env = vision60_gym_bullet_env.Vision60BulletEnv(render=True)

model_test = PPO2.load(dir_name + "/model_trot")
obs = env.reset()

print("Render mode...")

for _ in range(10):
    action, _states = model_test.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action, callback=render_callback)
#     if done:
#         break

pickle.dump(qpos[0:total_data_length:int(total_data_length / 100)],
            open("save.p", "wb"))  # save it into a file named save.p
# print(np.shape(qpos[0:total_data_length:int(total_data_length/100)]))
# print(np.shape(qpos))
Beispiel #26
0
 def __init__(self):
     self.model = PPO2.load(os.path.expanduser(
         "~/Code/drl_local_planner_ros_stable_baselines/example_agents/ppo2_1_raw_data_cont_0/ppo2_1_raw_data_cont_0.pkl"))  # noqa
Beispiel #27
0
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")
    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True
    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes

    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=args.n_steps,
                 nminibatches=args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 optimizer=args.optimizer,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
def visualize_augment_experiment(augment_num_timesteps,
                                 top_num_to_include_slice,
                                 augment_seed,
                                 augment_run_num,
                                 network_size,
                                 policy_env,
                                 policy_num_timesteps,
                                 policy_run_num,
                                 policy_seed,
                                 eval_seed,
                                 eval_run_num,
                                 learning_rate,
                                 additional_note,
                                 result_dir,
                                 lagrangian_inds_to_include=None):

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplemented()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env,
                                                   policy_num_timesteps,
                                                   policy_run_num, policy_seed,
                                                   eval_seed, eval_run_num,
                                                   additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(
            keys_to_include, linear_top_vars_list, top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include})

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=4096,
                 nminibatches=64,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=learning_rate,
                 cliprange=0.2,
                 optimizer='adam',
                 policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
def visualize_policy_and_collect_COM(
        augment_num_timesteps, top_num_to_include_slice, augment_seed,
        augment_run_num, network_size, policy_env, policy_num_timesteps,
        policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
        additional_note, metric_param):
    result_dir = get_result_dir(policy_env, policy_num_timesteps,
                                policy_run_num, policy_seed, eval_seed,
                                eval_run_num, additional_note, metric_param)
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######VISUALIZE: {args}")
    # non_linear_global_dict
    linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = read_all_data(
        policy_env,
        policy_num_timesteps,
        policy_run_num,
        policy_seed,
        eval_seed,
        eval_run_num,
        additional_note=additional_note)
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size,
        metric_param=metric_param)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis"

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={
                 'linear_global_dict': linear_global_dict,
                 'non_linear_global_dict': non_linear_global_dict,
                 'top_to_include_slice': top_num_to_include_slice,
                 'aug_plot_dir': aug_plot_dir,
                 "lagrangian_values": lagrangian_values,
                 "layers_values": layers_values
             })

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env

    walker_env.disableViewer = False

    if args.normalize:
        env = VecNormalize(env)

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()

    env = VecVideoRecorder(env,
                           aug_plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="vis_this_policy")

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))]
                             for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [
        np.hstack(layer_list) for layer_list in raw_layer_values_list
    ][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{aug_plot_dir}/COM{i}.jpg")
        plt.close()
Beispiel #30
0
        #print(prediction[:20])
        #prediction = sigmoid(sw)
        #objective = mse(prediction, self.labels)
        objective = cross_entropy(prediction, self.labels)
        reward = -objective
        #print(reward)
        self.rewards.append(reward)
        if np.any(np.isnan(state)):
            print(state)
            print("NAN DETECTED")
            exit()
        return state, reward, terminal, {}

    def _terminal(self):
        return self.steps >= 40

    def _get_state(self):
        pass

    def render(self, mode='human'):
        pass

    def close(self):
        pass


if __name__ == '__main__':
    env = DummyVecEnv([OptDist])
    agent = PPO2(MlpPolicy, env)
    agent.learn(total_timesteps=10**7)