def run_task(*_):
    # Enter the session so algo.train() can reuse it as the default session.
    sess = tf.Session()
    sess.__enter__()

    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]

    inner_env = SimpleReacherEnv(
        goal_position=(0.65, 0.3, 0.3),
        control_method="position_control",
        completion_bonus=30)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
def run_task(*_):
    # Enter the session so algo.train() can reuse it as the default session.
    sess = tf.Session()
    sess.__enter__()

    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        std_share_network=False,
        adaptive_std=True)

    baseline = GaussianMLPBaseline(
        env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)
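# A minimal sketch of how the two run_task functions above are typically
# launched. It assumes the old garage launcher API
# (garage.experiment.run_experiment); latent_policy_pkl is a free variable
# in the snippets above, so the path shown here is purely hypothetical.
from garage.experiment import run_experiment

latent_policy_pkl = "/path/to/latent_policy.pkl"  # hypothetical snapshot path

run_experiment(
    run_task,
    snapshot_mode="last",
    seed=1,
)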
def run_task(*_): """ Wrap PPO training task in the run_task function. :param _: :return: """ env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2"))) policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=(64, 64)) baseline = GaussianMLPBaseline(env_spec=env.spec) algo = PPO( env=env, policy=policy, baseline=baseline, batch_size=2048, max_path_length=100, n_itr=488, discount=0.99, step_size=0.01, optimizer_args=dict(batch_size=32, max_epochs=10), plot=False) algo.train()
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(
        goal_position=GOALS[0],
        control_method="position_control",
        completion_bonus=5)
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(
        goal_position=GOALS[0],
        control_method="position_control",
        # control_cost_coeff=1.0,
        action_scale=0.04,
        randomize_start_jpos=True,
        completion_bonus=0.1,
        # terminate_on_collision=True,
        collision_penalty=0.0,
    )
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="Policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        std_share_network=True,
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(64, 64)),
    )
    # baseline = CollisionAwareBaseline(
    #     env_spec=env.spec,
    #     regressor_args=dict(hidden_sizes=(64, 64)),
    # )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=10000,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=0.0,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
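# Hedged sketch: the run_task(v) variants above unpack a dict of
# hyperparameters via SimpleNamespace. A launch along these lines would
# supply that dict through the launcher's variant argument; the exact
# run_experiment signature, the exp_prefix, and all values below are
# assumptions for illustration, not taken from the snippets.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    exp_prefix="ppo_simple_reacher",  # hypothetical experiment name
    variant=dict(
        policy_init_std=1.0,    # assumed value
        batch_size=4096,        # assumed value
        max_path_length=100,    # assumed value
    ),
    seed=1,
)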
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    logger._tensorboard = TensorBoardOutput()
    env = TfEnv(normalize(gym.make("Pendulum-v0")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > -1000
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicyWithModel(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
    env.close()
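# The two tests above rely on a self.sess provided by a test fixture. A
# minimal sketch of such a fixture, assuming only plain TensorFlow 1.x;
# garage's real suite uses its own TfGraphTestCase fixture, and this is
# only an approximation of that role.
import unittest
import tensorflow as tf

class TfGraphTestCase(unittest.TestCase):
    def setUp(self):
        # Fresh graph and session per test so variables do not leak across tests.
        self.graph = tf.Graph().as_default()
        self.graph.__enter__()
        self.sess = tf.Session()
        self.sess.__enter__()

    def tearDown(self):
        self.sess.__exit__(None, None, None)
        self.sess.close()
        self.graph.__exit__(None, None, None)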
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimplePusherEnv(
        action_scale=0.04,
        control_method="position_control",
        completion_bonus=0.1,
        collision_penalty=0.05)
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(256, 128),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(256, 128)),
    )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=2000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file.
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=488,
            discount=0.99,
            gae_lambda=0.95,
            clip_range=0.1,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(
                    learning_rate=3e-4,
                    epsilon=1e-5,
                ),
            ),
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
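# Example invocation of run_garage as defined above. The environment id and
# log directory are placeholders; any continuous-control gym environment
# compatible with normalize() should work the same way.
import gym

log_file = run_garage(
    gym.make("InvertedDoublePendulum-v2"),  # assumed benchmark task
    seed=1,
    log_dir="/tmp/garage_ppo",  # hypothetical output directory
)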