def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """
    Runs the test
    """
    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
def get_ppo2(
        vec_env=None,
        policy='CnnPolicy',
        seed=0,
        number_of_steps_per_epoch=128,      # n_steps
        number_of_mini_batches_in_epoch=8,  # nminibatches
        number_of_updates_per_epoch=4,      # noptepochs
        max_grad_norm=0.5,
        gamma=0.993,                        # discount factor
        entropy_coefficient=0.01,           # ent_coef
        learning_rate=0.00008,              # lr
        clip_range=0.27,                    # cliprange
        vf_coefficient=0.5,
) -> PPO2:
    """
    Parameters' default values are taken from football.gfootball.examples.run_ppo2.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)

    return PPO2(
        policy=policy,
        env=vec_env,
        gamma=gamma,
        n_steps=number_of_steps_per_epoch,
        ent_coef=entropy_coefficient,
        learning_rate=learning_rate,
        vf_coef=vf_coefficient,
        max_grad_norm=max_grad_norm,
        nminibatches=number_of_mini_batches_in_epoch,
        noptepochs=number_of_updates_per_epoch,
        cliprange=clip_range,
        seed=seed,
        verbose=2,
    )
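# Usage sketch for get_ppo2 (a minimal example, assuming this module's own helpers
# such as create_training_env are importable; the timestep budget and save path
# below are illustrative, not taken from the original code):
model = get_ppo2(seed=42)
model.learn(total_timesteps=100_000)
model.save("ppo2_gfootball_sketch")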
def __init__(self, policy_file="../models/controllers/PPO/Pendulum-v0.pkl", mass_prior=None, length_prior=None, episodes_per_params=1, seed=1995, params=["length", "mass"], steps_per_episode=200, sufficient_stats="Cross-Correlation", load_from_file=False, assets_path=".", filename=""): self.env = PendulumEnv() self.seed = seed self.cached_data = None self.params_scaler = None self.params = params self.steps_per_episode = steps_per_episode self.sufficient_stats = sufficient_stats self.assets_path = assets_path self.load_from_file = load_from_file self.data_file = os.path.join(assets_path + filename) self.policy = PPO2.load(policy_file) if mass_prior is None: self.m_low = 0.1 self.m_high = 2.0 self.m_prior = self.sample_mass_from_uniform_prior if length_prior is None: self.l_low = 0.1 self.l_high = 2.0 self.l_prior = self.sample_length_from_uniform_prior
def train_agent(train, pickle_file, agent_type, env_kwargs, parms):
    bin_path = "bin/" + pickle_file

    if path.exists(bin_path):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()

        agent = ipagent.IPRLAgent(env=env_train)
        model = agent.get_model(model_name=agent_type, model_kwargs=parms)

        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)
        RL_model.save(bin_path)

    return RL_model
def test(env_id, seed, policy):
    """
    Render a trained PPO2 model, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Need for lstm
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
        if done:
            print('Episode reward:', episode_rew)
def __init__(self, policy_file="../models/controllers/PPO/CartPole-v1.pkl", mass_prior=None, length_prior=None, episodes_per_params=1, seed=1995, params=["length", "masspole"], steps_per_episode=50, sufficient_stats="Cross-Correlation"): self.env = CartPoleEnv() self.seed = seed self.cached_data = None self.params_scaler = None self.params = params self.steps_per_episode = steps_per_episode self.sufficient_stats = sufficient_stats self.policy = PPO2.load(policy_file) if mass_prior is None: self.m_low = 0.1 self.m_high = 2.0 self.m_prior = self.sample_mass_from_uniform_prior if length_prior is None: self.l_low = 0.1 self.l_high = 2.0 self.l_prior = self.sample_length_from_uniform_prior
def main(): """ Train and save the PPO model, for the cartpole problem """ print("Making a new model") env = ControlCarRacing(gym.make('CarRacing-v0')) env = MaxAndSkipEnv(env, skip=4) env = FrameStack(env, 4) env = Monitor(env, log_dir, allow_early_resets=True) env = DummyVecEnv([lambda: env]) model = PPO2(policy=CnnPolicy, env=env, n_steps=128, nminibatches=4, noptepochs=10, learning_rate=3e-4, cliprange=lambda f: f * 0.2, verbose=0, tensorboard_log='graph/') print("Learning started. It takes some time...") model.learn(total_timesteps=300000, callback=callback, tb_log_name='190317') print("Saving model to CarRacing_model.pkl") model.save("CarRacing_model_PPO2") print("Plotting Learning Curve") plot_results(log_dir) plot_results(log_dir, smoothing=False)
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
def load(self, path, env):
    if self.trpo():
        return TRPO.load(path, env=env)
    elif self.ppo():
        return PPO2.load(path, env=env)
    else:
        return SAC.load(path, env=env)
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=3e-4, cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
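# Usage sketch for the Mujoco train() above (the environment id, timestep budget and
# output paths are illustrative assumptions; VecNormalize.save_running_average is the
# older stable-baselines API already used elsewhere in these snippets):
model, env = train("Hopper-v2", num_timesteps=1_000_000, seed=0)
model.save("ppo2_hopper_sketch")
env.save_running_average(".")  # keep normalization statistics for later evaluation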
def __main():
    from stable_baselines.ppo2 import PPO2
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    env = DummyVecEnv([OptLRs])
    agent = PPO2(MlpPolicy, env, verbose=1)
    agent.learn(total_timesteps=10**2)
def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
    if self.autoencoder is None:
        self.setup_autoencoder(ne.get_obs())
    assert self.autoencoder is not None
    if ne.autoencoder is None:
        ne.set_autoencoder(self.autoencode)
        ne.autoencoder = self.autoencode
    if train:
        fn = "models/model1.h5"
        self.mf_policy = PPO2(env=ne, policy=MlpPolicy, n_steps=40,
                              verbose=2, noptepochs=10, learning_rate=3e-4,
                              ent_coef=0.1, gamma=0.1)
        if load_model:
            self.mf_policy.load(fn, env=make_vec_env(lambda: ne))
        else:
            self.mf_policy.learn(total_timesteps=n_epochs * 40)
            self.mf_policy.save(fn)

    encoded_obs = ne.rl_obs()
    return self.mf_policy.step([encoded_obs], deterministic=True)[0].flatten()
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)
    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
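# The `policy` argument above is expected to come from pytest parametrization; a
# minimal sketch of how it could be wired (the exact policy list is an assumption):
#
# @pytest.mark.parametrize("policy", [CnnLstmPolicy, CnnLnLstmPolicy])
# def test_cnn_lstm_policy(request, policy):
#     ...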
def main():
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)

    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)

    train_op = trainer.apply_gradients(grads)
    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs,) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while 1:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)
        # neuron_values_list.append(neuron_values)
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()
        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
def cont_learn():
    print('Continue learning....')
    env = gym.make('CarRacing-v0')
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    trained_model = PPO2.load("CarRacing_model_PPO2.pkl")
    trained_model.set_env(env)
    trained_model.learn(300000)

    print("Saving model to CarRacing_model_PPO2.pkl")
    trained_model.save("CarRacing_model_PPO2.pkl")
    plot_results(log_dir)
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs,) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
def run(): """ Run a trained model for the pong problem """ env = gym.make('CarRacing-v0') env = DummyVecEnv([lambda: env]) # model = PPO2.load("CarRacing_model_PPO1_"+ str(5) +".pkl", env) model = PPO2.load("CarRacing_model_PPO2_5.pkl", env) avg_rew = evaluate(model=model, env=env, num_steps=10000) while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() action, _ = model.predict(obs) obs, rew, done, _ = env.step(action) episode_rew += rew print("Episode reward", episode_rew)
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)

    del model
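# Usage sketch for the Atari train() above (the environment id and timestep budget
# are illustrative assumptions, not taken from the original snippet):
train('BreakoutNoFrameskip-v4', num_timesteps=40_000, seed=0, policy='cnn')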
def train(num_timesteps, model_to_load):
    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75
        model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32,
                     lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.01,
                     learning_rate=linear_schedule(lr), cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyinterrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
def create_learner(self, env, parameters):
    if (self.trpo() or self.ppo()) and not issubclass(type(env), VecEnv):
        env = DummyVecEnv([lambda: env])

    if self.trpo():
        model = TRPO(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = TRPOInterface(model, env.observation_space.shape[0])
    elif self.ppo():
        model = PPO2(MlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = PPOInterface(model, env.observation_space.shape[0])
    else:
        model = SAC(SACMlpPolicy, env, **parameters["common"], **parameters[str(self)])
        interface = SACInterface(model, env.observation_space.shape[0])

    if "pretrain_data_path" in parameters:
        data_path = parameters["pretrain_data_path"]
        model.pretrain(ExpertDataset(expert_path=data_path, verbose=0), n_epochs=25)

    return model, interface
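# Sketch of the `parameters` dict create_learner expects (keys and values below are
# illustrative assumptions; str(self) is presumed to return the algorithm name, e.g. "ppo"):
parameters = {
    "common": {"gamma": 0.99, "verbose": 1},
    "ppo": {"n_steps": 2048, "nminibatches": 32, "learning_rate": 3e-4},
    # "pretrain_data_path": "expert_trajectories.npz",  # optional behavioural-cloning warm start
}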
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=256, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
        'sha_pol': sha_pol if bool(flags.learn_sha_pol) else None,
        'mov_pol': None,
        'rot_pol': None
    }
}

env = make_env()

model = PPO2(PackingPolicy, env,
             n_steps=flags.num_steps,
             verbose=1,
             tensorboard_log=tensorboard_log,
             nminibatches=int((flags.num_steps * flags.num_pro) / 64),
             noptepochs=flags.noptepochs,
             make_env=make_env,
             gamma=flags.gamma,
             lam=flags.lam,
             vf_coef=flags.vf_coef,
             ent_coef=flags.ent_coef,
             zero_mean_advs=bool(flags.zero_mean_advs),
             packing_id_start=flags.id_start,
             learning_rate=flags.lr,
             policy_config=policy_config,
             restore_exp=not (bool(flags.learn_or_evaluate)),
             restore_path="./{}/{}".format(tensorboard_log, flags.model_name))

if bool(flags.learn_or_evaluate):
    model.learn(flags.num_steps * flags.num_pro * 400)
else:
    if bool(flags.eval_va_or_te):
        pack_file_name_evaluate = [
            "pack_va/" + str(i) + "_va"
def main(env, load_path, fig_path):

    # skip over 1-baxter-no-penalty (no log monitor.csv)
    if load_path == "1-baxter-no-penalty":
        plot = False
    else:
        plot = True

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = PPO2.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    if plot:
        plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))

            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1

        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
    qpos[i, :] = env_in.GetMotorAngles()
    # qvel[i,:] = env_in.sim.data.qvel[[7,8,10,12,14,16,17,19,21,23]]
    qvel[i, :] = env_in.GetMotorVelocities()
    # torque[i,:] = env_in.sim.data.actuator_force
    torque[i, :] = env_in.GetMotorTorques()
    i = (i + 1) % total_data_length
    return True


###############################################################################
#
# Use this code for testing the basic controller

# Create the stoch mujoco environment
# env = stoch2_gym_mjc_env.Stoch2Env()
env = vision60_gym_bullet_env.Vision60BulletEnv(render=True)

model_test = PPO2.load(dir_name + "/model_trot")

obs = env.reset()
print("Render mode...")

for _ in range(10):
    action, _states = model_test.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action, callback=render_callback)
    # if done:
    #     break

pickle.dump(qpos[0:total_data_length:int(total_data_length / 100)],
            open("save.p", "wb"))  # save it into a file named save.p
# print(np.shape(qpos[0:total_data_length:int(total_data_length/100)]))
# print(np.shape(qpos))
def __init__(self):
    self.model = PPO2.load(os.path.expanduser(
        "~/Code/drl_local_planner_ros_stable_baselines/example_agents/ppo2_1_raw_data_cont_0/ppo2_1_raw_data_cont_0.pkl"))  # noqa
def train(args): """ Runs the test """ args, argv = mujoco_arg_parser().parse_known_args(args) logger.log(f"#######TRAIN: {args}") args.alg = "ppo2" this_run_dir = get_dir_path_for_this_run(args) if os.path.exists(this_run_dir): import shutil shutil.rmtree(this_run_dir) os.makedirs(this_run_dir) log_dir = get_log_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) logger.configure(log_dir) def make_env(): env_out = gym.make(args.env) env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) env.envs[0].env.env.disableViewer = True set_global_seeds(args.seed) env.envs[0].env.env.seed(args.seed) if args.normalize: env = VecNormalize(env) policy = MlpPolicy # extra run info I added for my purposes full_param_traj_dir_path = get_full_params_dir(this_run_dir) if os.path.exists(full_param_traj_dir_path): import shutil shutil.rmtree(full_param_traj_dir_path) os.makedirs(full_param_traj_dir_path) if os.path.exists(save_dir): import shutil shutil.rmtree(save_dir) os.makedirs(save_dir) run_info = { "run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path, "state_samples_to_collect": args.state_samples_to_collect } model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer, seed=args.seed) model.tell_run_info(run_info) model.learn(total_timesteps=args.num_timesteps) model.save(f"{save_dir}/ppo2") if args.normalize: env.save_running_average(save_dir)
def visualize_augment_experiment(augment_num_timesteps, top_num_to_include_slice, augment_seed,
                                 augment_run_num, network_size, policy_env, policy_num_timesteps,
                                 policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                 additional_note, result_dir, lagrangian_inds_to_include=None):
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")

    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point, args.num_timesteps, args.run_num, args.seed,
        learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num,
                                                   policy_seed, eval_seed, eval_run_num, additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(keys_to_include, linear_top_vars_list,
                                                            top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(
        id=args.env,
        entry_point=entry_point,
        max_episode_steps=1000,
        kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include}
    )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=learning_rate, cliprange=0.2, optimizer='adam',
                 policy_kwargs=policy_kwargs, seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
    return log_dir
def visualize_policy_and_collect_COM(augment_num_timesteps, top_num_to_include_slice, augment_seed,
                                     augment_run_num, network_size, policy_env, policy_num_timesteps,
                                     policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                     additional_note, metric_param):
    result_dir = get_result_dir(policy_env, policy_num_timesteps, policy_run_num, policy_seed,
                                eval_seed, eval_run_num, additional_note, metric_param)
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######VISUALIZE: {args}")

    # non_linear_global_dict
    linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = \
        read_all_data(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                      eval_run_num, additional_note=additional_note)
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'

    this_run_dir = get_experiment_path_for_this_run(
        entry_point, args.num_timesteps, args.run_num, args.seed,
        learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir, network_size=network_size, metric_param=metric_param)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis"

    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(
        id=args.env,
        entry_point=entry_point,
        max_episode_steps=1000,
        kwargs={
            'linear_global_dict': linear_global_dict,
            'non_linear_global_dict': non_linear_global_dict,
            'top_to_include_slice': top_num_to_include_slice,
            'aug_plot_dir': aug_plot_dir,
            "lagrangian_values": lagrangian_values,
            "layers_values": layers_values
        }
    )

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env

    walker_env.disableViewer = False

    if args.normalize:
        env = VecNormalize(env)

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed)
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs,) + env.observation_space.shape)

    obs[:] = env.reset()

    env = VecVideoRecorder(env, aug_plot_dir, record_video_trigger=lambda x: x == 0,
                           video_length=3000, name_prefix="vis_this_policy")

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))] for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew += rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    # Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])

    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [np.hstack(layer_list) for layer_list in raw_layer_values_list][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{aug_plot_dir}/COM{i}.jpg")
        plt.close()
        # print(prediction[:20])
        # prediction = sigmoid(sw)
        # objective = mse(prediction, self.labels)
        objective = cross_entropy(prediction, self.labels)

        reward = -objective
        # print(reward)
        self.rewards.append(reward)

        if np.any(np.isnan(state)):
            print(state)
            print("NAN DETECTED")
            exit()
        return state, reward, terminal, {}

    def _terminal(self):
        return self.steps >= 40

    def _get_state(self):
        pass

    def render(self, mode='human'):
        pass

    def close(self):
        pass


if __name__ == '__main__':
    env = DummyVecEnv([OptDist])
    agent = PPO2(MlpPolicy, env)
    agent.learn(total_timesteps=10**7)