Example #1
def main():
    """
    Train and save the PPO2 model for the CarRacing problem

    """
    print("Making a new model")

    env = ControlCarRacing(gym.make('CarRacing-v0'))
    env = MaxAndSkipEnv(env, skip=4)
    env = FrameStack(env, 4)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO2(policy=CnnPolicy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 noptepochs=10,
                 learning_rate=3e-4,
                 cliprange=lambda f: f * 0.2,
                 verbose=0,
                 tensorboard_log='graph/')

    print("Learning started. It takes some time...")
    model.learn(total_timesteps=300000,
                callback=callback,
                tb_log_name='190317')
    print("Saving model to CarRacing_model_PPO2")
    model.save("CarRacing_model_PPO2")
    print("Plotting Learning Curve")
    plot_results(log_dir)
    plot_results(log_dir, smoothing=False)
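# A minimal follow-up sketch (not part of the original): reload the saved weights with
# PPO2.load and run the greedy policy. Assumes `env` has been rebuilt with the same
# wrappers used above.
loaded_model = PPO2.load("CarRacing_model_PPO2")
obs = env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()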
Example #2
def __main():
    from stable_baselines.ppo2 import PPO2
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    env = DummyVecEnv([OptLRs])
    agent = PPO2(MlpPolicy, env, verbose=1)
    agent.learn(total_timesteps=10**2)
Example #3
def get_ppo2(
    vec_env=None,
    policy='CnnPolicy',
    seed=0,
    number_of_steps_per_epoch=128,  # nsteps
    number_of_mini_batches_in_epoch=8,  # nminibatches
    number_of_updates_per_epoch=4,  # noptepochs
    max_grad_norm=0.5,
    gamma=0.993,  # discount factor
    entropy_coefficient=0.01,  # ent_coef
    learning_rate=0.00008,  # lr
    clip_range=0.27,  # cliprange
    vf_coefficient=0.5,
) -> PPO2:
    """
    Default parameter values are taken from football.gfootball.examples.run_ppo2.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)

    return PPO2(
        policy=policy,
        env=vec_env,
        gamma=gamma,
        n_steps=number_of_steps_per_epoch,
        ent_coef=entropy_coefficient,
        learning_rate=learning_rate,
        vf_coef=vf_coefficient,
        max_grad_norm=max_grad_norm,
        nminibatches=number_of_mini_batches_in_epoch,
        noptepochs=number_of_updates_per_epoch,
        cliprange=clip_range,
        seed=seed,
        verbose=2,
    )
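# Usage sketch (not in the original): build the default agent and train it briefly.
# Assumes create_training_env and the PPO2 import used above are available.
model = get_ppo2(seed=0)
model.learn(total_timesteps=100000)
model.save("gfootball_ppo2")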
Example #4
 def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
     if self.autoencoder is None:
         self.setup_autoencoder(ne.get_obs())
         assert (self.autoencoder) is not None
     if ne.autoencoder is None:
         ne.set_autoencoder(self.autoencoder)
         ne.autoencoder = self.autoencoder
     if train:
         fn = "models/model1.h5"
         self.mf_policy = PPO2(env=ne,
                               policy=MlpPolicy,
                               n_steps=40,
                               verbose=2,
                               noptepochs=10,
                               learning_rate=3e-4,
                               ent_coef=0.1,
                               gamma=0.1)
         if load_model:
             self.mf_policy = PPO2.load(fn, env=make_vec_env(lambda: ne))
         else:
             self.mf_policy.learn(total_timesteps=n_epochs * 40)
             self.mf_policy.save(fn)
     encoded_obs = ne.rl_obs()
     return self.mf_policy.step([encoded_obs],
                                deterministic=True)[0].flatten()
Example #5
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
Example #6
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=2048,
                 nminibatches=32,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
Example #7
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)

    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
Example #8
def train(env_id,
          num_timesteps,
          seed,
          policy,
          n_envs=8,
          nminibatches=4,
          n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """

    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
    del model
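# Example call (hypothetical env id, not in the original). As the docstring notes, for the
# recurrent 'lstm'/'lnlstm' policies n_envs should be a multiple of nminibatches.
train('BreakoutNoFrameskip-v4', num_timesteps=10000, seed=0,
      policy='lstm', n_envs=8, nminibatches=4)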
Example #9
def train(num_timesteps, model_to_load):

    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75

        model = PPO2(policy=policy,
                     env=env,
                     n_steps=2048,
                     nminibatches=32,
                     lam=0.95,
                     gamma=0.99,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=linear_schedule(lr),
                     cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(
                model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyboard interrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
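# `linear_schedule` (and `linear_schedule_start_zero`) are not shown in this snippet. In
# stable-baselines 2 a schedule is a callable that receives the remaining training progress
# as a fraction going from 1 to 0, so one plausible (assumed) implementation is:
def linear_schedule(initial_value):
    """Return a schedule that decays linearly from initial_value to 0."""
    def schedule(progress_remaining):
        return progress_remaining * initial_value
    return schedule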
Example #10
    def create_learner(self, env, parameters):
        if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
            env = DummyVecEnv([lambda: env])

        if self.trpo():
            model = TRPO(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = TRPOInterface(model, env.observation_space.shape[0])
        elif self.ppo():
            model = PPO2(MlpPolicy, env, **parameters["common"],
                         **parameters[str(self)])
            interface = PPOInterface(model, env.observation_space.shape[0])
        else:
            model = SAC(SACMlpPolicy, env, **parameters["common"],
                        **parameters[str(self)])
            interface = SACInterface(model, env.observation_space.shape[0])

        if "pretrain_data_path" in parameters:
            data_path = parameters["pretrain_data_path"]
            model.pretrain(ExpertDataset(expert_path=data_path, verbose=0),
                           n_epochs=25)

        return model, interface
Example #11
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=256,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
Example #12
def visualize_augment_experiment(augment_num_timesteps,
                                 top_num_to_include_slice,
                                 augment_seed,
                                 augment_run_num,
                                 network_size,
                                 policy_env,
                                 policy_num_timesteps,
                                 policy_run_num,
                                 policy_seed,
                                 eval_seed,
                                 eval_run_num,
                                 learning_rate,
                                 additional_note,
                                 result_dir,
                                 lagrangian_inds_to_include=None):

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env,
                                                   policy_num_timesteps,
                                                   policy_run_num, policy_seed,
                                                   eval_seed, eval_run_num,
                                                   additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(
            keys_to_include, linear_top_vars_list, top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include})

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=4096,
                 nminibatches=64,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=learning_rate,
                 cliprange=0.2,
                 optimizer='adam',
                 policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
Example #13
        'sha_pol': sha_pol if bool(flags.learn_sha_pol) else None,
        'mov_pol': None,
        'rot_pol': None
    }
}

env = make_env()
model = PPO2(PackingPolicy,
             env,
             n_steps=flags.num_steps,
             verbose=1,
             tensorboard_log=tensorboard_log,
             nminibatches=int((flags.num_steps * flags.num_pro) / 64),
             noptepochs=flags.noptepochs,
             make_env=make_env,
             gamma=flags.gamma,
             lam=flags.lam,
             vf_coef=flags.vf_coef,
             ent_coef=flags.ent_coef,
             zero_mean_advs=bool(flags.zero_mean_advs),
             packing_id_start=flags.id_start,
             learning_rate=flags.lr,
             policy_config=policy_config,
             restore_exp=not (bool(flags.learn_or_evaluate)),
             restore_path="./{}/{}".format(tensorboard_log, flags.model_name))

if bool(flags.learn_or_evaluate):
    model.learn(flags.num_steps * flags.num_pro * 400)
else:
    if bool(flags.eval_va_or_te):
        pack_file_name_evaluate = [
            "pack_va/" + str(i) + "_va"
Example #14
        #print(prediction[:20])
        #prediction = sigmoid(sw)
        #objective = mse(prediction, self.labels)
        objective = cross_entropy(prediction, self.labels)
        reward = -objective
        #print(reward)
        self.rewards.append(reward)
        if np.any(np.isnan(state)):
            print(state)
            print("NAN DETECTED")
            exit()
        return state, reward, terminal, {}

    def _terminal(self):
        return self.steps >= 40

    def _get_state(self):
        pass

    def render(self, mode='human'):
        pass

    def close(self):
        pass


if __name__ == '__main__':
    env = DummyVecEnv([OptDist])
    agent = PPO2(MlpPolicy, env)
    agent.learn(total_timesteps=10**7)
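# `OptDist` itself is not shown above. A minimal sketch (not the original class) of a custom
# environment that DummyVecEnv can wrap, using the pre-0.26 gym API expected by
# stable-baselines 2:
import gym
import numpy as np
from gym import spaces

class MinimalOptEnv(gym.Env):
    def __init__(self):
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
        self.steps = 0

    def reset(self):
        self.steps = 0
        return np.zeros(4, dtype=np.float32)

    def step(self, action):
        self.steps += 1
        state = np.random.randn(4).astype(np.float32)
        reward = -float(np.square(action).sum())
        done = self.steps >= 40
        return state, reward, done, {}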
Example #15
def train_agent_ppo2(config,
                     agent_name,
                     total_timesteps,
                     policy,
                     gamma=0.99,
                     n_steps=128,
                     ent_coef=0.01,
                     learning_rate=0.00025,
                     vf_coef=0.5,
                     max_grad_norm=0.5,
                     lam=0.95,
                     nminibatches=4,
                     noptepochs=4,
                     cliprange=0.2,
                     num_envs=1,
                     robot_radius=0.46,
                     rew_fnc=3,
                     num_stacks=1,
                     stack_offset=15,
                     disc_action_space=False,
                     debug=False,
                     normalize=False,
                     stage=0,
                     pretrained_model_name="",
                     task_mode="static"):

    # Setting seed
    seed = random.randint(0, 1000)
    np.random.seed(seed)
    tf.random.set_random_seed(seed)
    random.seed(seed)

    # Define paths to store things
    path_to_tensorboard_log = config['PATHES']['path_to_tensorboard_log']
    global path_to_models
    path_to_models = config['PATHES']['path_to_models']

    agent_dir = '%s/%s' % (path_to_models, agent_name)
    if not os.path.exists(agent_dir):
        os.makedirs(agent_dir)

    # Loading simulation environment
    env = load_train_env(num_envs, robot_radius, rew_fnc, num_stacks,
                         stack_offset, debug, task_mode, policy,
                         disc_action_space, normalize)

    if stage == 0:
        model = PPO2(eval(policy),
                     env,
                     gamma=gamma,
                     n_steps=n_steps,
                     ent_coef=ent_coef,
                     learning_rate=learning_rate,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     lam=lam,
                     nminibatches=nminibatches,
                     noptepochs=noptepochs,
                     cliprange=cliprange,
                     verbose=1,
                     tensorboard_log='%s' % (path_to_tensorboard_log))
    else:
        # Pretrained model is loaded to continue training.
        model = PPO2.load(
            "%s/%s/%s.pkl" %
            (path_to_models, pretrained_model_name, pretrained_model_name),
            env,
            tensorboard_log='%s' % (path_to_tensorboard_log))

    # Document agent
    print("Starting PPO2 Training of agent: %s" % (agent_name))
    print("------------------------------------------------------")
    print("gamma \t\t\t\t %f" % model.gamma)
    print("n_steps \t\t\t %d" % model.n_steps)
    print("ent_coef \t\t\t %f" % model.ent_coef)
    print("learning_rate \t\t\t %f" % learning_rate)
    print("vf_coef \t\t\t %f" % model.vf_coef)
    print("max_grad_norm \t\t\t %f" % model.max_grad_norm)
    print("lam \t\t\t\t %f" % model.lam)
    print("nminibatches \t\t\t %d" % model.nminibatches)
    print("noptepochs \t\t\t %d" % model.noptepochs)
    print("cliprange \t\t\t %f" % cliprange)
    print("total_timesteps \t\t %d" % total_timesteps)
    print("Policy \t\t\t\t %s" % policy)
    print("reward_fnc \t\t\t %d" % rew_fnc)
    print("Normalized state: %d" % normalize)
    print("Discrete action space: %d" % disc_action_space)
    print("Number of stacks: %d, stack offset: %d" %
          (num_stacks, stack_offset))
    print("\n")

    # Starting training
    reset_num_timesteps = False
    if stage == 0:
        reset_num_timesteps = True

    model.learn(total_timesteps=total_timesteps,
                log_interval=100,
                callback=train_callback,
                tb_log_name=agent_name,
                reset_num_timesteps=reset_num_timesteps)

    # Saving final model
    model.save("%s/%s/%s" % (path_to_models, agent_name, "%s_stage_%d" %
                             (agent_name, stage)))
    print("Training finished.")
    env.close()
Example #16
        return env

    set_global_seeds(seed)
    return _init


env_id = 'Pendulum-v0'
env = gym.make(env_id)
# env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
env = SubprocVecEnv([make_env(env_id, i) for i in range(128)])
env = VecNormalize(env)

model = PPO2(CustomPolicy,
             env,
             n_steps=int(2048 / 128),
             nminibatches=64,
             noptepochs=10,
             lam=0.98,
             verbose=1,
             tensorboard_log='/home/xi/model/log')
# model = PPO2.load("ppo2_ipadgame")
# model.set_env(env)
# model.tensorboard_log='/home/xi/model/log'
# env.load_running_average("/home/xi/model/")

model.learn(total_timesteps=50000)

# model.save("ppo2_ipadgame")
# env.save_running_average("/home/xi/model/")
# print ('done')

env = gym.make(env_id)
Example #17
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

from stable_baselines.ppo2 import PPO2

env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
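# Save/load round trip (standard stable-baselines 2 pattern; the file name is arbitrary):
model.save("ppo2_cartpole")
loaded_model = PPO2.load("ppo2_cartpole")
action, _states = loaded_model.predict(obs)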
Example #18
  return True

###############################################################################
# Function to Create Vectorized Environment
def make_env(rank, seed=0):
    """
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = stoch2_gym_env.Stoch2Env()
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

###############################################################################
# Create Vectorized Environment for Multiprocessing
# Define the number of processes to use
num_cpu = 6
# Create the vectorized environment
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

# Custom MLP policy: two hidden layers of 64 units each with ReLU activation
policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[64, 64])
model = PPO2(MlpPolicy, env, policy_kwargs=policy_kwargs, tensorboard_log=tflow_log, verbose=0)

###############################################################################
# Start training
print("RL Training begins....")
model.learn(total_timesteps=3 * 10**7, tb_log_name="log", callback=callback)
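# The `callback` passed to learn() above is only partially visible here (its trailing
# `return True` appears at the top of this snippet). In stable-baselines 2 a callback is a
# plain function that receives the local and global variable dicts and returns a bool
# (returning False stops training). A minimal sketch:
def callback(locals_, globals_):
    # e.g. inspect locals_['self'].num_timesteps here to save periodic checkpoints
    return True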
Example #19
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
Example #20
# Make Environments
frame_segs = [
    int(x * ((nframes - min(max_steps * 2, nframes) - 1) / (nprocs + 1)))
    for x in range(nprocs + 2)
]
envs = [
    make_env(rank=i, seed=0, framerange=(frame_segs[i], frame_segs[i + 2]))
    for i in range(nprocs)
]
env = SubprocVecEnv(envs)

# Create Networks
policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[1024, 512])

# Create Training Agent
agent = PPO2(MlpPolicy,
             env,
             gamma=0.95,
             lam=0.95,
             n_steps=nsteps,
             verbose=0,
             policy_kwargs=policy_kwargs,
             cliprange=0.2,
             learning_rate=5 * 1e-5,
             nminibatches=16)

# Start Learning
agent.learn(nbatches * max_steps * batch_size, callback=callback)

# Save File
agent.save(affix)
Example #21
def main(env,
         load,
         save_path,
         load_path=None,
         train_timesteps=1.25e6,
         eval_timesteps=5e3):

    # arguments
    print(
        "env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
        % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = PPO2.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = PPO2(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
Example #22
    #     gamma=0.99,  # Discount factor (TODO: THINK ABOUT THIS)
    #     lam=0.97,
    #     adam_epsilon=1E-5,
    #     schedule='linear',
    #     _init_setup_model=True,

    #     # Misc. Params
    #     tensorboard_log='./logs/',
    #     full_tensorboard_log=False,
    #     seed=0,
    #     n_cpu_tf_sess=None,
    #     verbose=1)

    model = PPO2(
        # Setting environment and Policy
        env=env,
        policy=MlpPolicy,
        policy_kwargs=policy_kwargs)
    print("training model ...")

    model.learn(total_timesteps=400,
                log_interval=210,
                tb_log_name="test_2",
                # callback=[],
                )
    print("saving")

    # attrs = vars(model)
    # print(', '.join("%s: %s" % item for item in attrs.items()))

    # model.save("ppo_reacher")
Example #23
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")
    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True
    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes

    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=args.n_steps,
                 nminibatches=args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 optimizer=args.optimizer,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
Example #24
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size,
                                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None,
                                linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None,
                                neurons_inds_to_include=None, use_lagrangian=True):
    trained_model = None
    if not use_lagrangian:
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()


    this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num,
                                                    args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold,
                                                    result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)


    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:

            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                                               eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list, linear_correlation_neuron_list, linear_co_threshold)



    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)


    args.env = f'{experiment_label}_{entry_point}-v1'

    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize


    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy



    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)




    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch" : [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
Example #25
        env = AnimalSkip(env, skip=SKIP_FRAMES)                  
        env = AnimalWrapper(env)
        env = AnimalStack(env, VISUAL_FRAMES_COUNT, VEL_FRAMES_COUNT, greyscale=USE_GREYSCALE_OBSES)
        return env
        
    return env

# Define environments
env = create_env_fn(num_actors=1, inference=False, seed=0)
env = make_vec_env(env, n_envs=4)

# # register policy
register_policy('MyPolicy', LstmPolicy)

# # define algorithm
model = PPO2('MyPolicy', env, n_steps=256)

#########################
# Dataset concatenation #
#########################

def dataset_concatenation(dataset_path):
    '''
    Use only when you have datasets from separate environments.
    If the directory already contains a concatenated all_data.npz, ***do not use this function***.

    Input: directory containing the per-environment expert-trajectory .npz files
    Output: an all_data.npz file in the same directory
    '''
    all_npzs = sorted(glob.glob(dataset_path+'*.npz'))
    print(all_npzs)
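    # The function body is truncated here. A plausible continuation (an assumption, using the
    # field names of stable-baselines ExpertDataset archives) would concatenate each array
    # across files and write the combined archive, e.g.:
    #     data = [np.load(path) for path in all_npzs]
    #     merged = {key: np.concatenate([d[key] for d in data]) for key in data[0].files}
    #     np.savez(dataset_path + 'all_data.npz', **merged)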
Example #26
    learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e, n_steps=1, replay_ratio=1).learn(
        total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4, n_steps=1
                    ).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]),
                    batch_size=16,
                    gamma=0.1,
                    exploration_fraction=0.001,
                    env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy,
                   env=e,
                   lam=0.7,
                   optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3, lam=0.8
                   ).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05, lam=0.7).learn(
        total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])
Example #27
def get_PPO(env_name, ckpt_name="ppo_default_bin"):
    new_env = make_vec_env(env_name, n_envs=1)
    model = PPO2(CnnLstmPolicy, new_env, verbose=1)

    return new_env, model