def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """Continue PPO training from the CMA-produced parameter vector start_theta."""
    import shutil

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)
    if os.path.exists(full_param_traj_dir_path):
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=3e-4,
    #              cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")
    if args.normalize:
        env.save_running_average(conti_ppo_save_dir)

    return episode_returns, full_param_traj_dir_path
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
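# A minimal companion sketch (not part of the original corpus) for persisting the result of
# train() above: the PPO2 weights and the VecNormalize running averages have to be saved
# together, otherwise a reloaded policy sees unnormalized observations. `save_dir` is a
# hypothetical output directory; older stable-baselines exposes save_running_average /
# load_running_average (used throughout this corpus), newer releases use VecNormalize.save/load.
def save_trained(model, env, save_dir):
    model.save(f"{save_dir}/ppo2")       # policy + value function weights
    env.save_running_average(save_dir)   # observation/return normalization statistics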
def test_lstm_train():
    """Test that LSTM models are able to achieve >=150 (out of 500) reward on CartPoleNoVelEnv.
    This environment requires memory to perform well in."""
    def make_env(i):
        env = CartPoleNoVelEnv()
        env = TimeLimit(env, max_episode_steps=500)
        env = bench.Monitor(env, None, allow_early_resets=True)
        env.seed(i)
        return env

    # Bind i per lambda: a bare `lambda: make_env(i)` would give every worker the same,
    # final value of i, and hence the same seed.
    env = SubprocVecEnv([lambda i=i: make_env(i) for i in range(NUM_ENVS)])
    env = VecNormalize(env)
    model = PPO2(MlpLstmPolicy, env, n_steps=128, nminibatches=NUM_ENVS,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=3e-4, cliprange=0.2, verbose=1)

    eprewmeans = []

    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))

    model.learn(total_timesteps=100000, callback=reward_callback)

    # Maximum episode reward is 500.
    # In CartPole-v1, a non-recurrent policy can easily get >= 450.
    # In CartPoleNoVelEnv, a non-recurrent policy doesn't get more than ~50.
    # LSTM policies can reach above 400, but it varies a lot between runs; consistently get >=150.
    # See PR #244 for more detailed benchmarks.
    average_reward = sum(eprewmeans[-NUM_EPISODES_FOR_SCORE:]) / NUM_EPISODES_FOR_SCORE
    assert average_reward >= 150, "Mean reward below 150; got mean reward {}".format(average_reward)
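# The test above relies on a few module-level names that are not shown here. The sketch below
# gives plausible definitions so the snippet is self-contained; the constant values are
# illustrative assumptions, not the original settings, and safe_mean mirrors the usual
# "nan on empty buffer" helper from stable-baselines.
NUM_ENVS = 8
NUM_EPISODES_FOR_SCORE = 10

def safe_mean(arr):
    # Return NaN instead of raising on an empty episode-info buffer.
    return np.nan if len(arr) == 0 else np.mean(arr)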
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # normalize the envs during training and evaluation
    envs = VecNormalize(envs)

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy, envs, nminibatches=1, verbose=1, **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
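# The evaluation branch above normalizes the eval env from scratch, so its running statistics
# drift away from the ones seen during training. A hedged alternative sketch: reuse the training
# statistics and freeze them (training=False disables updates, norm_reward=False keeps raw
# episode returns comparable). `stats_dir` is a hypothetical directory holding the saved averages.
def make_eval_env(env_id, stats_dir):
    eval_env = SubprocVecEnv([make_env(env_id)])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    eval_env.load_running_average(stats_dir)
    return eval_env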
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None): if "num_population" in args.__dict__: args.num_cpu = args.num_population * 2 assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \ "Error: cannot have more than 1 CPU for the environment {}".format(args.env) if env_kwargs is not None and env_kwargs.get("use_srl", False): srl_model = MultiprocessSRLModel(args.num_cpu, args.env, env_kwargs) env_kwargs["state_dim"] = srl_model.state_dim env_kwargs["srl_pipe"] = srl_model.pipe envs = [ makeEnv(args.env, args.seed, i, args.log_dir, allow_early_resets=True, env_kwargs=env_kwargs) for i in range(args.num_cpu) ] envs = SubprocVecEnv(envs) envs = VecFrameStack(envs, args.num_stack) if args.srl_model != "raw_pixels" and args.algo_type == "v2": envs = VecNormalize(envs, norm_obs=True, norm_reward=False) envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise) return envs
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log): """ Train TRPO model for the mujoco environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ with tf_util.single_threaded_session(): rank = MPI.COMM_WORLD.Get_rank() log_path = './experiments/' + str( env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' + str( sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed) #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed) if not log: if rank == 0: logger.configure(log_path) else: logger.configure(log_path, format_strs=[]) logger.set_level(logger.DISABLED) else: if rank == 0: logger.configure() else: logger.configure(format_strs=[]) logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() #env = make_mujoco_env(env_id, workerseed) def make_env(): env_out = gym.make(env_id) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) env_out.seed(seed) return env_out env = DummyVecEnv([make_env]) env = VecNormalize(env) #, norm_reward=False, norm_obs=False) #env = VecNormalize(env) model = TRPO(MlpPolicy, env, timesteps_per_batch=2048, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.95, vf_iters=5, vf_stepsize=1e-3, verbose=1, seed=seed, sgd_steps=sgd_steps, klcoeff=klcoeff, method="multistep-SGD") model.learn(total_timesteps=10e6) #num_timesteps, seed=seed) env.close()
def main(_):
    def make_env():
        env_out = gym.make('CartPole-v0')
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {'cnn': CnnPolicy,
              'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[FLAGS.policy]
    model = PPO2(policy=policy, env=env, n_steps=FLAGS.n_steps, nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam, gamma=FLAGS.gamma, noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef, learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange, verbose=FLAGS.verbose)
    model.learn(total_timesteps=FLAGS.num_timesteps)
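# main(_) above reads its hyperparameters from FLAGS but the flag definitions are not shown.
# A hedged sketch using absl-style flags (the flag library and defaults are assumptions; only
# the flag names mirror the PPO2 call above).
from absl import app, flags

flags.DEFINE_string('policy', 'mlp', 'Policy architecture: cnn, lstm, lnlstm or mlp')
flags.DEFINE_integer('n_steps', 2048, 'Number of steps per rollout')
flags.DEFINE_integer('nminibatches', 32, 'Number of minibatches per update')
flags.DEFINE_float('lam', 0.95, 'GAE lambda')
flags.DEFINE_float('gamma', 0.99, 'Discount factor')
flags.DEFINE_integer('noptepochs', 10, 'Optimization epochs per update')
flags.DEFINE_float('ent_coef', 0.0, 'Entropy coefficient')
flags.DEFINE_float('learning_rate', 3e-4, 'Learning rate')
flags.DEFINE_float('cliprange', 0.2, 'PPO clip range')
flags.DEFINE_integer('verbose', 1, 'Verbosity level')
flags.DEFINE_integer('num_timesteps', int(1e6), 'Total training timesteps')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)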
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log): """ Train TRPO model for the mujoco environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ with tf_util.single_threaded_session(): rank = MPI.COMM_WORLD.Get_rank() log_path = './experiments/' + str( env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) + '_c' + str( 0.5) + '_e' + str(klcoeff) + '_' + str(seed) #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed) if not log: if rank == 0: logger.configure(log_path) else: logger.configure(log_path, format_strs=[]) logger.set_level(logger.DISABLED) else: if rank == 0: logger.configure() else: logger.configure(format_strs=[]) logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() #env = make_mujoco_env(env_id, workerseed) def make_env(): env_out = gym.make(env_id) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) env_out.seed(seed) return env_out env = DummyVecEnv([make_env]) env = VecNormalize(env, norm_reward=False, norm_obs=False) #env = VecNormalize(env) model = MDPO(MlpPolicy, env, gamma=0.99, verbose=1, seed=seed, buffer_size=1000000, ent_coef=1.0, gradient_steps=sgd_steps, lam=klcoeff, train_freq=1, tsallis_q=1, reparameterize=True, klconst=0.5) model.learn( total_timesteps=int(num_timesteps)) #num_timesteps, seed=seed) env.close()
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() args, cma_unknown_args = common_arg_parser.parse_known_args() this_run_dir = get_dir_path_for_this_run(args) plot_dir_alg = get_plot_dir(args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi") save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) if not os.path.exists(plot_dir_alg): os.makedirs(plot_dir_alg) final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final") final_params = pd.read_csv(final_file, header=None).values[0] def make_env(): env_out = gym.make(args.env) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function model.set_pi_from_flat(final_params) if args.normalize: env.load_running_average(save_dir) obz_tensor = model.act_model.fake_input_tensor some_neuron = model.act_model.policy_neurons[2][-1] grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor) grads = list(zip(grads, obz_tensor)) trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5) train_op = trainer.apply_gradients(grads) for i in range(10000): obz, _ = model.sess.run([obz_tensor, train_op])
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps): # logger.log(f"#######EVAL: {args}") neuron_values_list = [] def make_env(): env_out = gym.make(args.env) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) # policy = MlpPolicy # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10, # ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer) model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function if pi_theta is not None: model.set_pi_from_flat(pi_theta) if args.normalize: env.load_running_average(save_dir) obs = np.zeros((env.num_envs, ) + env.observation_space.shape) obs[:] = env.reset() env.render() ep_infos = [] while 1: neuron_values, actions, _, _, _ = model.step_with_neurons(obs) # neuron_values = model.give_neuron_values(obs) # neuron_values_list.append( neuron_values ) yield neuron_values obs, rew, done, infos = env.step(actions) env.render() # time.sleep(1) for info in infos: maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_infos.append(maybe_ep_info) # env.render() done = done.any() if done: episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos]) print(f'episode_rew={episode_rew}') obs = env.reset()
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
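# Example use of the warm-start helper above. This is an assumed pytest-style check, not one of
# the original tests: after 100 random steps the observation running mean should no longer be
# the all-zeros initialization, and the return statistics should have accumulated samples.
def test_warmstart_updates_running_stats():
    venv = _make_warmstart_cartpole()
    assert np.any(venv.obs_rms.mean != 0.0)
    assert venv.ret_rms.count > 1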
def load_old_ppo2(root_dir, env, env_name, index, transparent_params): try: from baselines.ppo2 import ppo2 as ppo2_old except ImportError as e: msg = "{}. HINT: you need to install (OpenAI) Baselines to use old_ppo2".format( e) raise ImportError(msg) denv = FakeSingleSpacesVec(env, agent_id=index) possible_fnames = ["model.pkl", "final_model.pkl"] model_path = None for fname in possible_fnames: candidate_path = os.path.join(root_dir, fname) if os.path.exists(candidate_path): model_path = candidate_path if model_path is None: raise FileNotFoundError(f"Could not find model at '{root_dir}' " f"under any filename '{possible_fnames}'") graph = tf.Graph() sess = tf.Session(graph=graph) with sess.as_default(): with graph.as_default(): pylog.info(f"Loading Baselines PPO2 policy from '{model_path}'") policy = ppo2_old.learn( network="mlp", env=denv, total_timesteps=1, seed=0, nminibatches=4, log_interval=1, save_interval=1, load_path=model_path, ) stable_policy = OpenAIToStablePolicy(policy, ob_space=denv.observation_space, ac_space=denv.action_space) model = PolicyToModel(stable_policy) try: normalize_path = os.path.join(root_dir, "normalize.pkl") with open(normalize_path, "rb") as f: old_vec_normalize = pickle.load(f) vec_normalize = VecNormalize(denv, training=False) vec_normalize.obs_rms = old_vec_normalize.ob_rms vec_normalize.ret_rms = old_vec_normalize.ret_rms model = NormalizeModel(model, vec_normalize) pylog.info(f"Loaded normalization statistics from '{normalize_path}'") except FileNotFoundError: # We did not use VecNormalize during training, skip pass return model
def f(root_dir, env, env_name, index, transparent_params):
    denv = FakeSingleSpacesVec(env, agent_id=index)
    pylog.info(f"Loading Stable Baselines policy for '{cls}' from '{root_dir}'")
    model = load_backward_compatible_model(cls, root_dir, denv)
    try:
        vec_normalize = VecNormalize(denv, training=False)
        vec_normalize.load_running_average(root_dir)
        model = NormalizeModel(model, vec_normalize)
        pylog.info(f"Loaded normalization statistics from '{root_dir}'")
    except FileNotFoundError:
        # We did not use VecNormalize during training, skip
        pass
    return model
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps): # logger.log(f"#######EVAL: {args}") def make_env(): env_out = gym.make(args.env) env_out.env.disableViewer = True env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function if pi_theta is not None: model.set_pi_from_flat(pi_theta) if args.normalize: env.load_running_average(save_dir) obs = np.zeros((env.num_envs, ) + env.observation_space.shape) obs[:] = env.reset() ep_infos = [] for _ in range(eval_timesteps): actions = model.step(obs)[0] neuron_values = model.give_neuron_values(obs) obs, rew, done, infos = env.step(actions) for info in infos: maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_infos.append(maybe_ep_info) # env.render() done = done.any() if done: if pi_theta is None: episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos]) print(f'episode_rew={episode_rew}') obs = env.reset() return safe_mean([ep_info['r'] for ep_info in ep_infos])
def test_vec_env():
    """Test VecNormalize Object"""
    def make_env():
        return gym.make(ENV_ID)

    env = DummyVecEnv([make_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    _, done = env.reset(), [False]
    obs = None
    while not done[0]:
        actions = [env.action_space.sample()]
        obs, _, done, _ = env.step(actions)
    # clip_obs bounds the observation on both sides, so check the magnitude
    assert np.max(np.abs(obs)) <= 10
def single_wrappers(single_venv, scheduler, our_idx, normalize, rew_shape, rew_shape_params, victim_index, victim_path, victim_type, debug, env_name, load_policy, lookback_params, transparent_params, log_callbacks, save_callbacks): if rew_shape: rew_shape_venv = apply_reward_wrapper(single_env=single_venv, scheduler=scheduler, shaping_params=rew_shape_params, agent_idx=our_idx) log_callbacks.append(lambda logger, locals, globals: rew_shape_venv. log_callback(logger)) single_venv = rew_shape_venv for anneal_type in ['noise', 'rew_shape']: if scheduler.is_conditional(anneal_type): scheduler.set_annealer_get_logs(anneal_type, rew_shape_venv.get_logs) if lookback_params['lb_num'] > 0: lookback_venv = LookbackRewardVecWrapper(single_venv, env_name, debug, victim_index, victim_path, victim_type, transparent_params, **lookback_params) single_venv = lookback_venv if normalize: normalized_venv = VecNormalize(single_venv) if load_policy['path'] is not None: if load_policy['type'] == 'zoo': raise ValueError( "Trying to normalize twice. Bansal et al's Zoo agents normalize " "implicitly. Please set normalize=False to disable VecNormalize." ) normalized_venv.load_running_average(load_policy['path']) save_callbacks.append( lambda root_dir: normalized_venv.save_running_average(root_dir)) single_venv = normalized_venv return single_venv
def main(_): p_dic = getattr(conf.dic.path_dic, FLAGS.env_name) register(id=FLAGS.env_id, entry_point='env.env_ep:Env', kwargs={ 'env_name': FLAGS.env_name, 'done_step': 8760 }) def make_env(): env_out = gym.make(FLAGS.env_id) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) env = VecNormalize(env) policy = { 'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy }[FLAGS.policy] model = PPO2(policy=policy, env=env, n_steps=FLAGS.n_steps, nminibatches=FLAGS.nminibatches, lam=FLAGS.lam, gamma=FLAGS.gamma, noptepochs=FLAGS.noptepochs, ent_coef=FLAGS.ent_coef, learning_rate=FLAGS.learning_rate, cliprange=FLAGS.cliprange, verbose=FLAGS.verbose, log_dir=p_dic.get('agent_log_dir')) model.learn(total_timesteps=FLAGS.num_timesteps)
def single_wrappers(single_venv, scheduler, our_idx, normalize, load_policy, rew_shape,
                    rew_shape_params, log_callbacks, save_callbacks):
    if rew_shape:
        rew_shape_venv = apply_reward_wrapper(single_env=single_venv, scheduler=scheduler,
                                              shaping_params=rew_shape_params, agent_idx=our_idx)
        log_callbacks.append(lambda logger, locals, globals: rew_shape_venv.log_callback(logger))
        single_venv = rew_shape_venv

        for anneal_type in ['noise', 'rew_shape']:
            if scheduler.is_conditional(anneal_type):
                scheduler.set_annealer_get_logs(anneal_type, rew_shape_venv.get_logs)

    if normalize:
        if load_policy['type'] == 'zoo':
            raise ValueError("Trying to normalize twice. Bansal et al's Zoo agents normalize "
                             "implicitly. Please set normalize=False to disable VecNormalize.")
        normalized_venv = VecNormalize(single_venv)
        save_callbacks.append(lambda root_dir: normalized_venv.save_running_average(root_dir))
        single_venv = normalized_venv

    return single_venv
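# The save_callbacks list built above is presumably invoked by the caller at checkpoint time.
# A hypothetical sketch of such a call site (function name and paths are assumptions), saving
# the VecNormalize running averages next to the policy weights.
def save_checkpoint(model, save_callbacks, checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    model.save(os.path.join(checkpoint_dir, "model.pkl"))
    for callback in save_callbacks:
        callback(checkpoint_dir)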
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
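# check_vec_norm_equal is referenced above but not shown. A plausible sketch of what such a
# helper might compare (the attribute names follow VecNormalize in stable-baselines; this is
# an assumption, not the original helper).
def check_vec_norm_equal(norm_venv, deserialized):
    assert np.allclose(norm_venv.obs_rms.mean, deserialized.obs_rms.mean)
    assert np.allclose(norm_venv.obs_rms.var, deserialized.obs_rms.var)
    assert np.allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward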
def _train(env_id, model_params, total_epochs, use_sigmoid_layer=False, is_evaluation=False): if is_evaluation: # evaluate_policy() must only take one environment envs = SubprocVecEnv([make_env(env_id)]) else: envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)]) envs = VecNormalize(envs) # normalize the envs during training and evaluation # activation fn: use tanh for delta hedging and relu for mean reversion # learning rate: use 1e-7 for delta hedging and 1e-5 for mean reversion if use_sigmoid_layer: model = PPO2(SigmoidMlpPolicy, envs, n_steps=1, nminibatches=1, learning_rate=lambda f: f * 1e-5, verbose=1, policy_kwargs=dict(act_fun=tf.nn.relu), **model_params) else: model = PPO2(MlpLstmPolicy, envs, n_steps=1, nminibatches=1, learning_rate=lambda f: f * 1e-5, verbose=1, policy_kwargs=dict(act_fun=tf.nn.relu), **model_params) model.learn(total_timesteps=total_epochs * L) return envs, model
def train(num_timesteps, model_to_load): try: env = DummyVecEnv([dsgym]) env = VecNormalize(env) policy = MlpPolicy lr = 3e-4 * 0.75 model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.01, learning_rate=linear_schedule(lr), cliprange=0.2) if model_to_load: env = DummyVecEnv([dsgym]) env = VecNormalize.load( model_to_load.replace(".zip", "vec_normalize.pkl"), env) model = model.load(model_to_load) model.set_env(env) print("Loaded model from: ", model_to_load) model.set_learning_rate_func(linear_schedule_start_zero(lr)) model.learn(total_timesteps=num_timesteps) except KeyboardInterrupt: print("Saving on keyinterrupt") model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S")) # quit sys.exit() except BaseException as error: model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S")) print('An exception occurred: {}'.format(error)) traceback.print_exception(*sys.exc_info()) sys.exit() model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
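# train() above reloads normalization statistics from "<model>vec_normalize.pkl" when resuming,
# but only saves the PPO2 weights on exit. A hedged companion sketch that also writes the
# matching normalization file so a later resume can find it (paths follow the load convention
# above and are otherwise assumptions).
def save_with_stats(model, env, save_path):
    # save_path is expected to end in ".zip" to mirror the load convention used in train()
    model.save(save_path)
    env.save(save_path.replace(".zip", "vec_normalize.pkl"))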
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size, policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate, additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None, linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None, neurons_inds_to_include=None, use_lagrangian=True): trained_model = None if not use_lagrangian: with tf.variable_scope("trained_model"): common_arg_parser = get_common_parser() trained_args, cma_unknown_args = common_arg_parser.parse_known_args() trained_args.env = policy_env trained_args.seed = policy_seed trained_args.num_timesteps = policy_num_timesteps trained_args.run_num = policy_run_num trained_this_run_dir = get_dir_path_for_this_run(trained_args) trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir) trained_save_dir = get_save_dir(trained_this_run_dir) trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final") trained_final_params = pd.read_csv(trained_final_file, header=None).values[0] trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed) trained_model.set_pi_from_flat(trained_final_params) args = AttributeDict() args.normalize = True args.num_timesteps = augment_num_timesteps args.run_num = augment_run_num args.alg = "ppo2" args.seed = augment_seed logger.log(f"#######TRAIN: {args}") # non_linear_global_dict timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S') experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \ f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \ f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \ f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \ f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}" if policy_env == "DartWalker2d-v1": entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input' elif policy_env == "DartHopper-v1": entry_point = 'gym.envs.dart:DartHopperEnv_aug_input' elif policy_env == "DartHalfCheetah-v1": entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input' elif policy_env == "DartSnake7Link-v1": entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input' else: raise NotImplemented() this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num, args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold, result_dir=result_dir, network_size=network_size) full_param_traj_dir_path = get_full_params_dir(this_run_dir) log_dir = get_log_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) create_dir_remove(this_run_dir) create_dir_remove(full_param_traj_dir_path) create_dir_remove(save_dir) create_dir_remove(log_dir) logger.configure(log_dir) linear_top_vars_list_wanted_to_print = [] if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None): # note this is only linear if linear_top_vars_list is None or linear_correlation_neuron_list is None: linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, additional_note, metric_param=metric_param) lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \ 
get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list, linear_correlation_neuron_list, linear_co_threshold) with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp: json.dump(lagrangian_inds_to_include, fp) with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp: json.dump(linear_top_vars_list_wanted_to_print, fp) with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp: json.dump(neurons_inds_to_include, fp) args.env = f'{experiment_label}_{entry_point}-v1' if not use_lagrangian: register( id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model, "neurons_inds_to_include": neurons_inds_to_include} ) else: register( id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None, "neurons_inds_to_include": None} ) def make_env(): env_out = gym.make(args.env) env_out.env.visualize = visualize env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) walker_env = env.envs[0].env.env walker_env.disableViewer = not visualize if args.normalize: env = VecNormalize(env) policy = MlpPolicy set_global_seeds(args.seed) walker_env.seed(args.seed) num_dof = walker_env.robot_skeleton.ndofs show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir) # extra run info I added for my purposes run_info = {"run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path} layers = [network_size, network_size] policy_kwargs = {"net_arch" : [dict(vf=layers, pi=layers)]} model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs, seed=args.seed) model.tell_run_info(run_info) model.learn(total_timesteps=args.num_timesteps, seed=args.seed) model.save(f"{save_dir}/ppo2") if args.normalize: env.save_running_average(save_dir) return log_dir
def single_wrappers( single_venv, scheduler, our_idx, normalize, normalize_observations, rew_shape, rew_shape_params, embed_index, embed_paths, embed_types, debug, env_name, load_policy, lookback_params, transparent_params, log_callbacks, save_callbacks, ): if rew_shape: rew_shape_venv = apply_reward_wrapper( single_env=single_venv, scheduler=scheduler, shaping_params=rew_shape_params, agent_idx=our_idx, ) log_callbacks.append(LoggerOnlyLogCallback(rew_shape_venv)) single_venv = rew_shape_venv for anneal_type in ["noise", "rew_shape"]: if scheduler.is_conditional(anneal_type): scheduler.set_annealer_get_logs(anneal_type, rew_shape_venv.get_logs) if lookback_params["lb_num"] > 0: if len(embed_types) > 1: raise ValueError( "Lookback is not supported with multiple embedded agents") embed_path = embed_paths[0] embed_type = embed_types[0] lookback_venv = LookbackRewardVecWrapper( single_venv, env_name, debug, embed_index, embed_path, embed_type, transparent_params, **lookback_params, ) single_venv = lookback_venv if normalize: if normalize_observations: if load_policy["path"] is not None: if load_policy["type"] == "zoo": raise ValueError( "Trying to normalize twice. Bansal et al's Zoo agents normalize " "implicitly. Please set normalize=False to disable VecNormalize." ) normalized_venv = VecNormalize(single_venv) else: normalized_venv = VecNormalize(single_venv, norm_obs=False) if load_policy["path"] is not None and load_policy["type"] != "zoo": normalized_venv.load_running_average(load_policy["path"]) save_callbacks.append(lambda root_dir: normalized_venv.save( os.path.join(root_dir, "vec_normalize.pkl"))) single_venv = normalized_venv return single_venv
def train(args): """ Runs the test """ args, argv = mujoco_arg_parser().parse_known_args(args) logger.log(f"#######TRAIN: {args}") args.alg = "ppo2" this_run_dir = get_dir_path_for_this_run(args) if os.path.exists(this_run_dir): import shutil shutil.rmtree(this_run_dir) os.makedirs(this_run_dir) log_dir = get_log_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) logger.configure(log_dir) def make_env(): env_out = gym.make(args.env) env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) env.envs[0].env.env.disableViewer = True set_global_seeds(args.seed) env.envs[0].env.env.seed(args.seed) if args.normalize: env = VecNormalize(env) policy = MlpPolicy # extra run info I added for my purposes full_param_traj_dir_path = get_full_params_dir(this_run_dir) if os.path.exists(full_param_traj_dir_path): import shutil shutil.rmtree(full_param_traj_dir_path) os.makedirs(full_param_traj_dir_path) if os.path.exists(save_dir): import shutil shutil.rmtree(save_dir) os.makedirs(save_dir) run_info = { "run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path, "state_samples_to_collect": args.state_samples_to_collect } model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer, seed=args.seed) model.tell_run_info(run_info) model.learn(total_timesteps=args.num_timesteps) model.save(f"{save_dir}/ppo2") if args.normalize: env.save_running_average(save_dir)
def visualize_policy_and_collect_COM(seed, run_num, policy_env, policy_num_timesteps, policy_seed, policy_run_num): logger.log(sys.argv) common_arg_parser = get_common_parser() args, cma_unknown_args = common_arg_parser.parse_known_args() args.env = policy_env args.seed = policy_seed args.num_timesteps = policy_num_timesteps args.run_num = policy_run_num this_run_dir = get_dir_path_for_this_run(args) traj_params_dir_name = get_full_params_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final") final_params = pd.read_csv(final_file, header=None).values[0] def make_env(): env_out = gym.make(args.env) env_out.env.disableViewer = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) env_out.seed(seed) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{save_dir}/ppo2", seed=seed) model.set_pi_from_flat(final_params) if args.normalize: env.load_running_average(save_dir) sk = env.venv.envs[0].env.env.robot_skeleton lagrangian_values = {} obs = np.zeros((env.num_envs, ) + env.observation_space.shape) obs[:] = env.reset() plot_dir = get_plot_dir(policy_env=args.env, policy_num_timesteps=policy_num_timesteps, policy_run_num=policy_run_num, policy_seed=policy_seed, eval_seed=seed, eval_run_num=run_num, additional_note="") if os.path.exists(plot_dir): shutil.rmtree(plot_dir) os.makedirs(plot_dir) env = VecVideoRecorder(env, plot_dir, record_video_trigger=lambda x: x == 0, video_length=3000, name_prefix="3000000agent-{}".format(args.env)) lagrangian_values["M"] = [sk.M.reshape((-1, 1))] lagrangian_values["COM"] = [sk.C.reshape((-1, 1))] lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))] lagrangian_values["q"] = [sk.q.reshape((-1, 1))] lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))] contact_values = {} neuron_values = model.give_neuron_values(obs) raw_layer_values_list = [[neuron_value.reshape((-1, 1))] for neuron_value in neuron_values] env.render() ep_infos = [] steps_to_first_done = 0 first_done = False # epi_rew = 0 for _ in range(3000): actions = model.step(obs)[0] # yield neuron_values obs, rew, done, infos = env.step(actions) # epi_rew+= rew[0] if done and not first_done: first_done = True if not first_done: steps_to_first_done += 1 neuron_values = model.give_neuron_values(obs) for i, layer in enumerate(neuron_values): raw_layer_values_list[i].append(layer.reshape((-1, 1))) # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values) lagrangian_values["M"].append(sk.M.reshape((-1, 1))) lagrangian_values["q"].append(sk.q.reshape((-1, 1))) lagrangian_values["dq"].append(sk.dq.reshape((-1, 1))) lagrangian_values["COM"].append(sk.C.reshape((-1, 1))) lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1))) # env.render() # time.sleep(1) for info in infos: maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_infos.append(maybe_ep_info) env.render() done = done.any() if done: episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos]) print(f'episode_rew={episode_rew}') # print(f'episode_rew={epi_rew}') # epi_rew = 0 obs = env.reset() #Hstack into a big matrix lagrangian_values["M"] = np.hstack(lagrangian_values["M"]) lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"]) lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"]) lagrangian_values["q"] = np.hstack(lagrangian_values["q"]) lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"]) 
# for contact_body_name, l in contact_values.items(): # body_contact_dict = contact_values[contact_body_name] # for name, l in body_contact_dict.items(): # body_contact_dict[name] = np.hstack(body_contact_dict[name]) input_values = np.hstack(raw_layer_values_list[0]) layers_values = [ np.hstack(layer_list) for layer_list in raw_layer_values_list ][1:-2] # drop variance and inputs for i, com in enumerate(lagrangian_values["COM"]): plt.figure() plt.plot(np.arange(len(com)), com) plt.xlabel("time") plt.ylabel(f"COM{i}") plt.savefig(f"{plot_dir}/COM{i}.jpg") plt.close()
def visualize_augment_experiment(augment_num_timesteps, top_num_to_include_slice, augment_seed, augment_run_num, network_size, policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate, additional_note, result_dir, lagrangian_inds_to_include=None): args = AttributeDict() args.normalize = True args.num_timesteps = augment_num_timesteps args.run_num = augment_run_num args.alg = "ppo2" args.seed = augment_seed logger.log(f"#######TRAIN: {args}") # non_linear_global_dict timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S') experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \ f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \ f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \ f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \ f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}" if policy_env == "DartWalker2d-v1": entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input' elif policy_env == "DartHopper-v1": entry_point = 'gym.envs.dart:DartHopperEnv_aug_input' elif policy_env == "DartHalfCheetah-v1": entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input' elif policy_env == "DartSnake7Link-v1": entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input' else: raise NotImplemented() this_run_dir = get_experiment_path_for_this_run( entry_point, args.num_timesteps, args.run_num, args.seed, learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice, result_dir=result_dir, network_size=network_size) full_param_traj_dir_path = get_full_params_dir(this_run_dir) log_dir = get_log_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) create_dir_remove(this_run_dir) create_dir_remove(full_param_traj_dir_path) create_dir_remove(save_dir) create_dir_remove(log_dir) logger.configure(log_dir) # note this is only linear if lagrangian_inds_to_include is None: linear_top_vars_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, additional_note) # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode", # "com_jacobian", "contact_bodynode_jacobian"] keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"] # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice] lagrangian_inds_to_include = get_wanted_lagrangians( keys_to_include, linear_top_vars_list, top_num_to_include_slice) with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp: json.dump(lagrangian_inds_to_include, fp) args.env = f'{experiment_label}_{entry_point}-v1' register(id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include}) def make_env(): env_out = gym.make(args.env) env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) walker_env = env.envs[0].env.env walker_env.disableViewer = True if args.normalize: env = VecNormalize(env) policy = MlpPolicy # extra run info I added for my purposes run_info = { "run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path } layers = [network_size, network_size] set_global_seeds(args.seed) walker_env.seed(args.seed) policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]} model = PPO2(policy=policy, env=env, 
n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs, seed=args.seed) model.tell_run_info(run_info) model.learn(total_timesteps=args.num_timesteps, seed=args.seed) model.save(f"{save_dir}/ppo2") if args.normalize: env.save_running_average(save_dir) return log_dir
def visualize_policy_and_collect_COM( augment_num_timesteps, top_num_to_include_slice, augment_seed, augment_run_num, network_size, policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate, additional_note, metric_param): result_dir = get_result_dir(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, additional_note, metric_param) args = AttributeDict() args.normalize = True args.num_timesteps = augment_num_timesteps args.run_num = augment_run_num args.alg = "ppo2" args.seed = augment_seed logger.log(f"#######VISUALIZE: {args}") # non_linear_global_dict linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = read_all_data( policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, additional_note=additional_note) timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S') experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \ f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \ f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \ f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \ f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}" entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input' this_run_dir = get_experiment_path_for_this_run( entry_point, args.num_timesteps, args.run_num, args.seed, learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice, result_dir=result_dir, network_size=network_size, metric_param=metric_param) traj_params_dir_name = get_full_params_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis" final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final") final_params = pd.read_csv(final_file, header=None).values[0] args.env = f'{experiment_label}_{entry_point}-v1' register(id=args.env, entry_point=entry_point, max_episode_steps=1000, kwargs={ 'linear_global_dict': linear_global_dict, 'non_linear_global_dict': non_linear_global_dict, 'top_to_include_slice': top_num_to_include_slice, 'aug_plot_dir': aug_plot_dir, "lagrangian_values": lagrangian_values, "layers_values": layers_values }) def make_env(): env_out = gym.make(args.env) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) walker_env = env.envs[0].env.env walker_env.disableViewer = False if args.normalize: env = VecNormalize(env) set_global_seeds(args.seed) walker_env.seed(args.seed) model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed) model.set_pi_from_flat(final_params) if args.normalize: env.load_running_average(save_dir) sk = env.venv.envs[0].env.env.robot_skeleton lagrangian_values = {} obs = np.zeros((env.num_envs, ) + env.observation_space.shape) obs[:] = env.reset() env = VecVideoRecorder(env, aug_plot_dir, record_video_trigger=lambda x: x == 0, video_length=3000, name_prefix="vis_this_policy") lagrangian_values["M"] = [sk.M.reshape((-1, 1))] lagrangian_values["COM"] = [sk.C.reshape((-1, 1))] lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))] lagrangian_values["q"] = [sk.q.reshape((-1, 1))] lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))] contact_values = {} neuron_values = model.give_neuron_values(obs) raw_layer_values_list = [[neuron_value.reshape((-1, 1))] for 
neuron_value in neuron_values] env.render() ep_infos = [] steps_to_first_done = 0 first_done = False # epi_rew = 0 for _ in range(3000): actions = model.step(obs)[0] # yield neuron_values obs, rew, done, infos = env.step(actions) # epi_rew+= rew[0] if done and not first_done: first_done = True if not first_done: steps_to_first_done += 1 neuron_values = model.give_neuron_values(obs) for i, layer in enumerate(neuron_values): raw_layer_values_list[i].append(layer.reshape((-1, 1))) # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values) lagrangian_values["M"].append(sk.M.reshape((-1, 1))) lagrangian_values["q"].append(sk.q.reshape((-1, 1))) lagrangian_values["dq"].append(sk.dq.reshape((-1, 1))) lagrangian_values["COM"].append(sk.C.reshape((-1, 1))) lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1))) # env.render() # time.sleep(1) for info in infos: maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_infos.append(maybe_ep_info) env.render() done = done.any() if done: episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos]) print(f'episode_rew={episode_rew}') # print(f'episode_rew={epi_rew}') # epi_rew = 0 obs = env.reset() #Hstack into a big matrix lagrangian_values["M"] = np.hstack(lagrangian_values["M"]) lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"]) lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"]) lagrangian_values["q"] = np.hstack(lagrangian_values["q"]) lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"]) # for contact_body_name, l in contact_values.items(): # body_contact_dict = contact_values[contact_body_name] # for name, l in body_contact_dict.items(): # body_contact_dict[name] = np.hstack(body_contact_dict[name]) input_values = np.hstack(raw_layer_values_list[0]) layers_values = [ np.hstack(layer_list) for layer_list in raw_layer_values_list ][1:-2] # drop variance and inputs for i, com in enumerate(lagrangian_values["COM"]): plt.figure() plt.plot(np.arange(len(com)), com) plt.xlabel("time") plt.ylabel(f"COM{i}") plt.savefig(f"{aug_plot_dir}/COM{i}.jpg") plt.close()
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    log_str = sys.argv[1]
    env_id = 'ROAMHandGraspCube-v1'
    model = PPO2.load("logs/{}/trained_model".format(log_str))

    # render trained agent
    env = VecNormalize(DummyVecEnv([lambda: gym.make(env_id)]), norm_reward=False)
    env.load_running_average("logs/{}".format(log_str))

    obs = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
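# A hedged variant of the rendering setup above that freezes the loaded statistics, so the
# evaluation rollout does not keep updating them (the `training` flag is the one used elsewhere
# in this corpus via VecNormalize(..., training=False)).
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make(env_id)]),
                        training=False, norm_reward=False)
eval_env.load_running_average("logs/{}".format(log_str))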
def main(_):
    p_dic = getattr(conf.dic.path_dic, FLAGS.env_name)
    register(id=FLAGS.env_id,
             entry_point='env.env_ep:Env',
             kwargs={'env_name': FLAGS.env_name, 'done_step': 8760})

    def make_env():
        env_out = gym.make(FLAGS.env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {'cnn': CnnPolicy,
              'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[FLAGS.policy]
    model = PPO2(policy=policy, env=env, n_steps=FLAGS.n_steps, nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam, gamma=FLAGS.gamma, noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef, learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange, verbose=FLAGS.verbose)

    with model.graph.as_default():
        tf_util.load_state(fname=tf.train.latest_checkpoint(p_dic.get('agent_log_dir')),
                           sess=model.sess)

    epenv = EnergyPlusEnv(
        energyplus_file="/usr/local/EnergyPlus-8-8-0/energyplus",
        model_file=p_dic.get('idf_path'),
        weather_file="/usr/local/EnergyPlus-8-8-0/WeatherData/USA_IL_Chicago-OHare.Intl.AP.725300_TMY3.epw",
        log_dir=p_dic.get('eplog_dir'))

    os.environ['ENERGYPLUS'] = "/usr/local/EnergyPlus-8-8-0/energyplus"
    os.environ['ENERGYPLUS_MODEL'] = p_dic.get('idf_path')
    os.environ['ENERGYPLUS_WEATHER'] = \
        "/usr/local/EnergyPlus-8-8-0/WeatherData/USA_IL_Chicago-OHare.Intl.AP.725300_TMY3.epw"
    os.environ['ENERGYPLUS_LOG'] = p_dic.get('eplog_dir')
    epenv.start_instance()

    def signal_handler(signal, frame):
        # stop_instance must actually be called, not just referenced
        epenv.stop_instance()
        print('=====Energy plus terminated=====')
        print('==========Pipe closed==========')
        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)

    state = epenv.reset()
    env = Env('ep', 8760)
    for i in range(10000000000000):
        # state = np.array([[state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7], state[8]]])
        state = np.array([[state[0], state[1], state[2], state[3], state[4], state[5]]])
        action, _, _, _ = model.step(state)
        action = env.set_action(action)
        action = action.reshape([-1])
        state, done = epenv.step(action)

    epenv.stop_instance()
env_path = args.surrogate_model mimic_model_path = args.mimic_model_path env = gym.make(env_name) venv = SubprocVecEnv([ lambda: make_adv_multi2single_env(env_name, adv_agent_path, adv_agent_norm_path, False) for i in range(n_cpu) ]) venv = Monitor(venv, 0) rew_shape_venv = apply_reward_wrapper(single_env=venv, scheduler=scheduler, agent_idx=0, shaping_params=rew_shape_params) venv = VecNormalize(rew_shape_venv, norm_obs=False) # makedir output out_dir, logger = setup_logger(args.root_dir, args.exp_name) model = MyPPO2(MlpPolicy, venv, ent_coef=ent_coef, nminibatches=nminibatches, noptepochs=noptepochs, learning_rate=learning_rate, verbose=1, n_steps=n_steps, gamma=gamma, tensorboard_log=out_dir, model_saved_loc=out_dir, env_name=env_name, env_path=env_path,