def main():
    """
    Train and save the PPO model for the CarRacing problem
    """
    print("Making a new model")
    env = ControlCarRacing(gym.make('CarRacing-v0'))
    env = MaxAndSkipEnv(env, skip=4)
    env = FrameStack(env, 4)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    model = PPO2(policy=CnnPolicy, env=env, n_steps=128, nminibatches=4,
                 noptepochs=10, learning_rate=3e-4, cliprange=lambda f: f * 0.2,
                 verbose=0, tensorboard_log='graph/')
    print("Learning started. It takes some time...")
    model.learn(total_timesteps=300000, callback=callback, tb_log_name='190317')
    print("Saving model to CarRacing_model_PPO2")
    model.save("CarRacing_model_PPO2")
    print("Plotting Learning Curve")
    plot_results(log_dir)
    plot_results(log_dir, smoothing=False)
def __main():
    from stable_baselines.ppo2 import PPO2
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    env = DummyVecEnv([OptLRs])
    agent = PPO2(MlpPolicy, env, verbose=1)
    agent.learn(total_timesteps=10**2)
def get_ppo2(
        vec_env=None,
        policy='CnnPolicy',
        seed=0,
        number_of_steps_per_epoch=128,      # n_steps
        number_of_mini_batches_in_epoch=8,  # nminibatches
        number_of_updates_per_epoch=4,      # noptepochs
        max_grad_norm=0.5,
        gamma=0.993,                        # discount factor
        entropy_coefficient=0.01,           # ent_coef
        learning_rate=0.00008,              # lr
        clip_range=0.27,                    # cliprange
        vf_coefficient=0.5,
) -> PPO2:
    """
    Parameters' default values are taken from football.gfootball.examples.run_ppo2.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)
    return PPO2(
        policy=policy,
        env=vec_env,
        gamma=gamma,
        n_steps=number_of_steps_per_epoch,
        ent_coef=entropy_coefficient,
        learning_rate=learning_rate,
        vf_coef=vf_coefficient,
        max_grad_norm=max_grad_norm,
        nminibatches=number_of_mini_batches_in_epoch,
        noptepochs=number_of_updates_per_epoch,
        cliprange=clip_range,
        seed=seed,
        verbose=2,
    )
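# Hedged usage sketch (not in the original source): how get_ppo2 above might be
# driven. `create_training_env` is the snippet's own helper; the seed, timestep
# budget, and save name here are illustrative assumptions.
if __name__ == '__main__':
    model = get_ppo2(seed=42)
    model.learn(total_timesteps=100000)
    model.save("ppo2_gfootball")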
def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
    if self.autoencoder is None:
        self.setup_autoencoder(ne.get_obs())
        assert self.autoencoder is not None
    if ne.autoencoder is None:
        ne.set_autoencoder(self.autoencode)
        ne.autoencoder = self.autoencode
    if train:
        fn = "models/model1.h5"
        self.mf_policy = PPO2(env=ne, policy=MlpPolicy, n_steps=40,
                              verbose=2, noptepochs=10, learning_rate=3e-4,
                              ent_coef=0.1, gamma=0.1)
        if load_model:
            # PPO2.load returns a new model, so the result must be kept;
            # calling .load on the instance would discard it
            self.mf_policy = PPO2.load(fn, env=make_vec_env(lambda: ne))
        else:
            self.mf_policy.learn(total_timesteps=n_epochs * 40)
            self.mf_policy.save(fn)
    encoded_obs = ne.rl_obs()
    return self.mf_policy.step([encoded_obs], deterministic=True)[0].flatten()
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (str) The policy model to use ('cnn', 'lstm', 'lnlstm', 'mlp')
    """
    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=3e-4, cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
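# Hedged follow-up sketch (not in the original source): when training with
# VecNormalize as above, the observation/reward running averages must be saved
# alongside the model weights, otherwise a reloaded policy sees unnormalized
# inputs. The env id, timestep budget, and save path are assumptions; the
# save_running_average call follows the older stable-baselines VecNormalize API
# used elsewhere in these snippets.
import os

model, env = train('Hopper-v2', num_timesteps=10000, seed=0)
save_dir = './mujoco_ppo2'
os.makedirs(save_dir, exist_ok=True)
model.save(save_dir + '/ppo2')
env.save_running_average(save_dir)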
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)
    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (str) The policy model to use ('cnn', 'lstm', 'lnlstm', 'mlp')
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent
        policies, the number of environments run in parallel should be a multiple of
        nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment
        copies running in parallel)
    """
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)

    del model
def train(num_timesteps, model_to_load):
    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75
        model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32,
                     lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.01,
                     learning_rate=linear_schedule(lr), cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyboard interrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
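# Hedged sketch (not from the original source): `linear_schedule` above is an
# external helper. In stable-baselines 2.x a callable learning rate receives
# the remaining-progress fraction, which anneals from 1 to 0 over training
# (the same convention as the `lambda f: f * 2.5e-4` schedules in the other
# snippets), so a linear decay could plausibly look like this:
def linear_schedule(initial_value):
    def schedule(progress_remaining):
        # progress_remaining goes 1 -> 0, so the lr decays initial_value -> 0
        return progress_remaining * initial_value
    return schedule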
def create_learner(self, env, parameters):
    if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
        env = DummyVecEnv([lambda: env])

    if self.trpo():
        model = TRPO(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = TRPOInterface(model, env.observation_space.shape[0])
    elif self.ppo():
        model = PPO2(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = PPOInterface(model, env.observation_space.shape[0])
    else:
        model = SAC(SACMlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = SACInterface(model, env.observation_space.shape[0])

    if "pretrain_data_path" in parameters:
        data_path = parameters["pretrain_data_path"]
        model.pretrain(ExpertDataset(expert_path=data_path, verbose=0), n_epochs=25)
    return model, interface
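# Hedged sketch (not from the original source): one way the expert .npz consumed
# by ExpertDataset above could be produced, using stable-baselines'
# generate_expert_traj helper from the gail module. The env, timestep budget,
# and save path are illustrative assumptions.
from stable_baselines.gail import generate_expert_traj

expert_model = PPO2(MlpPolicy, env)  # env: any Gym/VecEnv matching the task
expert_model.learn(total_timesteps=100000)
generate_expert_traj(expert_model, 'expert_data', n_episodes=10)  # writes expert_data.npz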
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (str) The policy model to use ('cnn', 'lstm', 'lnlstm', 'mlp')
    """
    def make_env():
        # build a fresh monitored env per slot; the original list comprehension
        # `[lambda: env for _ in range(16)]` reused one env instance 16 times
        return Monitor(PadEnv(), './logs', allow_early_resets=True)

    env = DummyVecEnv([make_env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=256, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')
    model.save('./pad_5combo_ppo2.pkl')
def visualize_augment_experiment(augment_num_timesteps, top_num_to_include_slice, augment_seed,
                                 augment_run_num, network_size, policy_env, policy_num_timesteps,
                                 policy_run_num, policy_seed, eval_seed, eval_run_num,
                                 learning_rate, additional_note, result_dir,
                                 lagrangian_inds_to_include=None):
    args = AttributeDict()
    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")

    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point, args.num_timesteps, args.run_num, args.seed,
        learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env, policy_num_timesteps,
                                                   policy_run_num, policy_seed, eval_seed,
                                                   eval_run_num, additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(keys_to_include,
                                                            linear_top_vars_list,
                                                            top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include})

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=learning_rate, cliprange=0.2, optimizer='adam',
                 policy_kwargs=policy_kwargs, seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
    return log_dir
        'sha_pol': sha_pol if bool(flags.learn_sha_pol) else None,
        'mov_pol': None,
        'rot_pol': None
    }
}

env = make_env()
model = PPO2(PackingPolicy, env,
             n_steps=flags.num_steps,
             verbose=1,
             tensorboard_log=tensorboard_log,
             nminibatches=int((flags.num_steps * flags.num_pro) / 64),
             noptepochs=flags.noptepochs,
             make_env=make_env,
             gamma=flags.gamma,
             lam=flags.lam,
             vf_coef=flags.vf_coef,
             ent_coef=flags.ent_coef,
             zero_mean_advs=bool(flags.zero_mean_advs),
             packing_id_start=flags.id_start,
             learning_rate=flags.lr,
             policy_config=policy_config,
             restore_exp=not bool(flags.learn_or_evaluate),
             restore_path="./{}/{}".format(tensorboard_log, flags.model_name))

if bool(flags.learn_or_evaluate):
    model.learn(flags.num_steps * flags.num_pro * 400)
else:
    if bool(flags.eval_va_or_te):
        pack_file_name_evaluate = [
            "pack_va/" + str(i) + "_va"
        # print(prediction[:20])
        # prediction = sigmoid(sw)
        # objective = mse(prediction, self.labels)
        objective = cross_entropy(prediction, self.labels)
        reward = -objective
        # print(reward)
        self.rewards.append(reward)
        if np.any(np.isnan(state)):
            print(state)
            print("NAN DETECTED")
            exit()
        return state, reward, terminal, {}

    def _terminal(self):
        return self.steps >= 40

    def _get_state(self):
        pass

    def render(self, mode='human'):
        pass

    def close(self):
        pass


if __name__ == '__main__':
    env = DummyVecEnv([OptDist])
    agent = PPO2(MlpPolicy, env)
    agent.learn(total_timesteps=10**7)
def train_agent_ppo2(config, agent_name, total_timesteps, policy,
                     gamma=0.99, n_steps=128, ent_coef=0.01,
                     learning_rate=0.00025, vf_coef=0.5, max_grad_norm=0.5,
                     lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2,
                     num_envs=1, robot_radius=0.46, rew_fnc=3, num_stacks=1,
                     stack_offset=15, disc_action_space=False, debug=False,
                     normalize=False, stage=0, pretrained_model_name="",
                     task_mode="static"):
    # Setting seed
    seed = random.randint(0, 1000)
    np.random.seed(seed)
    tf.random.set_random_seed(seed)
    random.seed(seed)

    # Define paths to store things
    path_to_tensorboard_log = config['PATHES']['path_to_tensorboard_log']
    global path_to_models
    path_to_models = config['PATHES']['path_to_models']

    agent_dir = '%s/%s' % (path_to_models, agent_name)
    if not os.path.exists(agent_dir):
        os.makedirs(agent_dir)

    # Loading simulation environment
    env = load_train_env(num_envs, robot_radius, rew_fnc, num_stacks,
                         stack_offset, debug, task_mode, policy,
                         disc_action_space, normalize)

    if stage == 0:
        model = PPO2(eval(policy), env,
                     gamma=gamma, n_steps=n_steps, ent_coef=ent_coef,
                     learning_rate=learning_rate, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, lam=lam,
                     nminibatches=nminibatches, noptepochs=noptepochs,
                     cliprange=cliprange, verbose=1,
                     tensorboard_log='%s' % path_to_tensorboard_log)
    else:
        # Pretrained model is loaded to continue training.
        model = PPO2.load("%s/%s/%s.pkl" % (path_to_models, pretrained_model_name,
                                            pretrained_model_name),
                          env, tensorboard_log='%s' % path_to_tensorboard_log)

    # Document agent
    print("Starting PPO2 Training of agent: %s" % agent_name)
    print("------------------------------------------------------")
    print("gamma \t\t\t\t %f" % model.gamma)
    print("n_steps \t\t\t %d" % model.n_steps)
    print("ent_coef \t\t\t %f" % model.ent_coef)
    print("learning_rate \t\t\t %f" % learning_rate)
    print("vf_coef \t\t\t %f" % model.vf_coef)
    print("max_grad_norm \t\t\t %f" % model.max_grad_norm)
    print("lam \t\t\t\t %f" % model.lam)
    print("nminibatches \t\t\t %d" % model.nminibatches)
    print("noptepochs \t\t\t %d" % model.noptepochs)
    print("cliprange \t\t\t %f" % cliprange)
    print("total_timesteps \t\t %d" % total_timesteps)
    print("Policy \t\t\t\t %s" % policy)
    print("reward_fnc \t\t\t %d" % rew_fnc)
    print("Normalized state: %d" % normalize)
    print("discrete action space %d" % disc_action_space)
    print("Number of stacks: %d, stack offset: %d" % (num_stacks, stack_offset))
    print("\n")

    # Starting training
    reset_num_timesteps = False
    if stage == 0:
        reset_num_timesteps = True

    model.learn(total_timesteps=total_timesteps, log_interval=100,
                callback=train_callback, tb_log_name=agent_name,
                reset_num_timesteps=reset_num_timesteps)

    # Saving final model
    model.save("%s/%s/%s" % (path_to_models, agent_name,
                             "%s_stage_%d" % (agent_name, stage)))
    print("Training finished.")
    env.close()
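# Hedged sketch (not in the original source): a minimal `train_callback` of the
# kind stable-baselines 2.x expects, i.e. a function of (_locals, _globals)
# that returns False to stop training early. The body here is illustrative.
def train_callback(_locals, _globals):
    # e.g. inspect _locals['self'].num_timesteps here to checkpoint periodically
    return True  # returning True keeps training running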
        return env
    set_global_seeds(seed)
    return _init


env_id = 'Pendulum-v0'
env = gym.make(env_id)
# env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
env = SubprocVecEnv([make_env(env_id, i) for i in range(128)])
env = VecNormalize(env)

model = PPO2(CustomPolicy, env, n_steps=int(2048 / 128), nminibatches=64,
             noptepochs=10, lam=0.98, verbose=1,
             tensorboard_log='/home/xi/model/log')
# model = PPO2.load("ppo2_ipadgame")
# model.set_env(env)
# model.tensorboard_log = '/home/xi/model/log'
# env.load_running_average("/home/xi/model/")

model.learn(total_timesteps=50000)
# model.save("ppo2_ipadgame")
# env.save_running_average("/home/xi/model/")
# print('done')

env = gym.make(env_id)
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ppo2 import PPO2

env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

env.close()
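# Hedged extension of the quickstart above (not part of the original snippet):
# persisting and restoring the trained agent via the PPO2 save/load API.
# "ppo2_cartpole" is an assumed filename; run this before env.close().
model.save("ppo2_cartpole")
del model  # the saved file is self-contained
model = PPO2.load("ppo2_cartpole", env=env)
obs = env.reset()
action, _states = model.predict(obs, deterministic=True)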
    return True


###############################################################################
# Function to Create Vectorized Environment

def make_env(rank, seed=0):
    """
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = stoch2_gym_env.Stoch2Env()
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


###############################################################################
# Create Vectorized Environment for Multiprocessing

# Define the number of processes to use
num_cpu = 6

# Create the vectorized environment
env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

# Custom MLP policy: two layers of size 64 each with ReLU activation
policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[64, 64])

model = PPO2(MlpPolicy, env, policy_kwargs=policy_kwargs,
             tensorboard_log=tflow_log, verbose=0)

###############################################################################
# Start training
print("RL Training begins....")
model.learn(total_timesteps=3 * 10**7, tb_log_name="log", callback=callback)
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action) with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])
    model = model_func(env)
# Make Environments
frame_segs = [
    int(x * ((nframes - min(max_steps * 2, nframes) - 1) / (nprocs + 1)))
    for x in range(nprocs + 2)
]
envs = [
    make_env(rank=i, seed=0, framerange=(frame_segs[i], frame_segs[i + 2]))
    for i in range(nprocs)
]
env = SubprocVecEnv(envs)

# Create Networks
policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=[1024, 512])

# Create Training Agent
agent = PPO2(MlpPolicy, env,
             gamma=0.95,
             lam=0.95,
             n_steps=nsteps,
             verbose=0,
             policy_kwargs=policy_kwargs,
             cliprange=0.2,
             learning_rate=5e-5,
             nminibatches=16)

# Start Learning
agent.learn(nbatches * max_steps * batch_size, callback=callback)

# Save File
agent.save(affix)
def main(env, load, save_path, load_path=None, train_timesteps=1.25e6, eval_timesteps=5e3):
    # arguments
    print("env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
          % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = PPO2.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = PPO2(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
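# Hedged sketch (not in the original source): one plausible implementation of
# the `evaluate` helper used above, in the style of the stable-baselines docs
# examples. It averages per-episode reward over a fixed number of steps on a
# single-env DummyVecEnv.
import numpy as np

def evaluate(model, env, num_steps=1000):
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, _info = env.step(action)
        episode_rewards[-1] += rewards[0]  # index 0: single vectorized env
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    return np.mean(episode_rewards)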
    # gamma=0.99,  # Discount factor (TODO: THINK ABOUT THIS)
    # lam=0.97,
    # adam_epsilon=1E-5,
    # schedule='linear',
    # _init_setup_model=True,
    # # Misc. Params
    # tensorboard_log='./logs/',
    # full_tensorboard_log=False,
    # seed=0,
    # n_cpu_tf_sess=None,
    # verbose=1)

model = PPO2(
    # Setting environment and Policy
    env=env,
    policy=MlpPolicy,
    policy_kwargs=policy_kwargs)

print("training model ...")
model.learn(total_timesteps=400,
            log_interval=210,
            tb_log_name="test_2",
            # callback=[],
            )
print("saving")
# attrs = vars(model)
# print(', '.join("%s: %s" % item for item in attrs.items()))
# model.save("ppo_reacher")
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")

    args.alg = "ppo2"
    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True

    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy, env=env, n_steps=args.n_steps,
                 nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2,
                 optimizer=args.optimizer, seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed,
                                augment_run_num, network_size, policy_env, policy_num_timesteps,
                                policy_run_num, policy_seed, eval_seed, eval_run_num,
                                learning_rate, additional_note, result_dir, keys_to_include,
                                metric_param, linear_top_vars_list=None,
                                linear_correlation_neuron_list=None, visualize=False,
                                lagrangian_inds_to_include=None, neurons_inds_to_include=None,
                                use_lagrangian=True):
    trained_model = None
    if not use_lagrangian:
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name,
                                                               "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()
    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")

    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point, args.num_timesteps, args.run_num, args.seed,
        learning_rate=learning_rate, top_num_to_include=linear_co_threshold,
        result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) \
            or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:
            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(
                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list,
                                               linear_correlation_neuron_list, linear_co_threshold)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'

    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None,
                    "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include,
                    "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)

    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}

    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=learning_rate, cliprange=0.2, optimizer='adam',
                 policy_kwargs=policy_kwargs, seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
    return log_dir
        env = AnimalSkip(env, skip=SKIP_FRAMES)
        env = AnimalWrapper(env)
        env = AnimalStack(env, VISUAL_FRAMES_COUNT, VEL_FRAMES_COUNT,
                          greyscale=USE_GREYSCALE_OBSES)
        return env
    return env


# Define environments
env = create_env_fn(num_actors=1, inference=False, seed=0)
env = make_vec_env(env, n_envs=4)

# register policy
register_policy('MyPolicy', LstmPolicy)

# define algorithm
model = PPO2('MyPolicy', env, n_steps=256)


#########################
# Dataset concatenation #
#########################
def dataset_concatenation(dataset_path):
    '''
    Use only when you have datasets of separate environments.
    If not, and the code already has a concatenated all_data.npz,
    ***do not use the function***
    Input: Directory where expert trajectory per environment .npz files are present
    Output: An all_data.npz in the same directory
    '''
    all_npzs = sorted(glob.glob(dataset_path + '*.npz'))
    print(all_npzs)
                              learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e, n_steps=1,
                   replay_ratio=1).learn(total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4,
                    n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
                    exploration_fraction=0.001, env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7, optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3,
                   lam=0.8).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05,
                   lam=0.7).learn(total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])
def get_PPO(env_name, ckpt_name="ppo_default_bin"):
    new_env = make_vec_env(env_name, n_envs=1)
    model = PPO2(CnnLstmPolicy, new_env, verbose=1)
    return new_env, model