def test_ddpg_double_pendulum(self):
    """Test DDPG with the InvertedDoublePendulum-v2 environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)
    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)
    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9)
    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45
    env.close()
def test_ppo_pendulum(self):
    """Test MAML-PPO with the Pendulum environment."""
    deterministic.set_seed(0)
    episodes_per_task = 5
    max_episode_length = self.env.spec.max_episode_length
    runner = LocalRunner(snapshot_config)
    algo = MAMLPPO(env=self.env,
                   policy=self.policy,
                   value_function=self.value_function,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1)
    runner.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=episodes_per_task *
                                max_episode_length)
    assert last_avg_ret > -5
def test_ppo_pendulum(self):
    """Test MAML-PPO with the Pendulum environment."""
    deterministic.set_seed(0)
    rollouts_per_task = 5
    max_path_length = 100
    runner = LocalRunner(snapshot_config)
    algo = MAMLPPO(env=self.env,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=max_path_length,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1)
    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=rollouts_per_task * max_path_length)
    assert last_avg_ret > -5
class TestVPG:
    """Test class for VPG."""

    @classmethod
    def setup_class(cls):
        """Setup method which is called once before all tests in this class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Setup method which is called before every test."""
        self._env = GymEnv('InvertedDoublePendulum-v2')
        self._runner = LocalRunner(snapshot_config)
        self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=torch.tanh,
                                         output_nonlinearity=None)
        self._params = {
            'env_spec': self._env.spec,
            'policy': self._policy,
            'value_function': GaussianMLPValueFunction(env_spec=self._env.spec),
            'max_episode_length': 100,
            'discount': 0.99,
        }

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self._env.close()

    @pytest.mark.mujoco
    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env, sampler_cls=LocalSampler)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Test VPG with invalid entropy config."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
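# NOTE: INVALID_ENTROPY_CONFIG is referenced by the parametrized test above but
# is not defined in this excerpt. A minimal sketch of such a fixture is shown
# below purely for illustration; in the source module it would sit above the
# test class, and the exact parameter combinations and error-message patterns
# are assumptions, not garage's actual definition.
INVALID_ENTROPY_CONFIG = [
    # An unknown entropy method should be rejected.
    ({'entropy_method': 'INVALID_ENTROPY_METHOD'}, ValueError,
     'entropy_method'),
    # 'max' entropy is typically expected to be combined with center_adv=False.
    ({'entropy_method': 'max', 'center_adv': True}, ValueError, 'center_adv'),
]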
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         num_test_tasks=5,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    ML_train_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_train_tasks().all_task_names
    ]
    ML_test_envs = [
        GarageEnv(normalize(mwb.ML10.from_task(task_name)))
        for task_name in mwb.ML10.get_test_tasks().all_task_names
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env_sampler.grow_pool(num_train_tasks)
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = EnvPoolSampler(ML_test_envs)
    test_env_sampler.grow_pool(num_test_tasks)

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])
    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])
    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
    for i in range(50):
        train_env = normalize(GymEnv(mwb.ML1.get_train_tasks('pick-place-v1')),
                              normalize_reward=True)
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
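# Sanity check of the epoch arithmetic above (plain arithmetic on the values in
# this launcher, no new API):
#   batch_size   = 150 * 50 task copies      = 7500 environment steps per cycle
#   epochs       = 10_000_000 // 7500        = 1333
#   epoch_cycles = 1333 // 500               = 2
#   epochs       = 1333 // 2                 = 666
# so runner.train() runs 666 epochs, each doing steps_per_epoch=2 sampling
# cycles of 7500 steps, i.e. roughly 10M total environment steps with on the
# order of the targeted 500 evaluation points.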
def diayn_half_cheetah_vel_batch_for_pearl(ctxt=None, seed=1):
    """Train DIAYN on HalfCheetahVelEnv and return the discriminator and algo.

    Note: `skills_num` is expected to be defined at module level.
    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(HalfCheetahVelEnv()))
    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)
    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=300,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable the video recording func
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    diayn.to()
    worker_args = {"skills_num": skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)  # 1000
    # runner.restore(from_dir=os.path.join(
    #     os.getcwd(), 'data/local/experiment/diayn_half_cheetah_batch_50'))
    # diayn = runner.get_algo()
    runner.save(999)  # saves the last episode
    return discriminator, diayn
def pearl_half_cheetah(ctxt=None,
                       seed=1,
                       num_epochs=param_num_epoches,
                       num_train_tasks=param_train_tasks_num,
                       num_test_tasks=param_test_tasks_num,
                       latent_size=param_latent_size,
                       encoder_hidden_size=param_encoder_hidden_size,
                       net_size=param_net_size,
                       meta_batch_size=param_meta_batch_size,
                       num_steps_per_epoch=param_num_steps_per_epoch,
                       num_initial_steps=param_num_initial_steps,
                       num_tasks_sample=param_num_tasks_sample,
                       num_steps_prior=param_num_steps_prior,
                       num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                       batch_size=param_batch_size,
                       embedding_batch_size=param_embedding_batch_size,
                       embedding_mini_batch_size=param_embedding_mini_batch_size,
                       max_path_length=param_max_path_length,
                       reward_scale=param_reward_scale,
                       use_gpu=param_use_gpu):
    """Train PEARL on HalfCheetahVelEnv; defaults come from module-level param_* globals."""
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])
    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])
    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)
    return average_returns
def diayn_pearl_half_cheeth(ctxt=None,
                            seed=1,
                            num_epochs=param_num_epoches,
                            num_train_tasks=param_train_tasks_num,
                            num_test_tasks=param_test_tasks_num,
                            latent_size=param_latent_size,
                            encoder_hidden_size=param_encoder_hidden_size,
                            net_size=param_net_size,
                            meta_batch_size=param_meta_batch_size,
                            num_steps_per_epoch=param_num_steps_per_epoch,
                            num_initial_steps=param_num_initial_steps,
                            num_tasks_sample=param_num_tasks_sample,
                            num_steps_prior=param_num_steps_prior,
                            num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                            batch_size=param_batch_size,
                            embedding_batch_size=param_embedding_batch_size,
                            embedding_mini_batch_size=param_embedding_mini_batch_size,
                            max_path_length=param_max_path_length,
                            reward_scale=param_reward_scale,
                            use_gpu=param_use_gpu):
    """Train PEARL on DIAYN-discovered tasks wrapped around HalfCheetahVelEnv."""
    if task_proposer is None:
        raise ValueError("Task proposer is empty")
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent)
    #                     for train_env in ML_train_envs]
    # ML_test_envs = [
    #     GarageEnv(normalize(
    #         DiaynEnvWrapper(env, task_proposer, skills_num, task_name)))
    #     for task_name in random.sample(range(skills_num), test_tasks_num)
    # ]
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])
    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])
    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)
    return average_returns
def run(ctxt=None):
    """Set up environment and algorithm and run the task.

    Note: this function reads its hyperparameters from the enclosing object
    via `self`; in the source it is defined inside a class or closure.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.

    """
    deterministic.set_seed(self.seed)
    runner = LocalRunner(snapshot_config=ctxt, max_cpus=32)
    env = GarageEnv(normalize(self.env_maker()))
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=self.policy_hidden_sizes,
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=self.qf_hidden_sizes,
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=self.qf_hidden_sizes,
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(
        capacity_in_transitions=self.buffer_capacity_in_transitions)
    algo = _SAC_(env_spec=env.spec,
                 policy=policy,
                 qf1=qf1,
                 qf2=qf2,
                 gradient_steps_per_itr=self.gradient_steps_per_itr,
                 max_path_length=self.max_path_length,
                 max_eval_path_length=self.max_eval_path_length,
                 replay_buffer=replay_buffer,
                 min_buffer_size=self.min_buffer_size,
                 target_update_tau=self.target_update_tau,
                 discount=self.discount,
                 buffer_batch_size=self.buffer_batch_size,
                 reward_scale=self.reward_scale,
                 steps_per_epoch=self.steps_per_epoch)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    algo.to()
    if self.parallel_sampling:
        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=RaySampler,
                     n_workers=self.n_workers)
    else:
        runner.setup(algo=algo, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=self.n_epochs, batch_size=self.batch_size)
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment."""
    params = dict(seed=1,
                  num_epochs=1,
                  num_train_tasks=5,
                  num_test_tasks=1,
                  latent_size=7,
                  encoder_hidden_sizes=[10, 10, 10],
                  net_size=30,
                  meta_batch_size=16,
                  num_steps_per_epoch=40,
                  num_initial_steps=40,
                  num_tasks_sample=15,
                  num_steps_prior=15,
                  num_extra_rl_steps_posterior=15,
                  batch_size=256,
                  embedding_batch_size=8,
                  embedding_mini_batch_size=8,
                  max_episode_length=50,
                  reward_scale=10.,
                  use_information_bottleneck=True,
                  use_next_obs_in_context=False,
                  use_gpu=False)

    net_size = params['net_size']
    set_seed(params['seed'])
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        encoder_hidden_sizes=params['encoder_hidden_sizes'],
        test_env_sampler=test_env_sampler,
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_episode_length=params['max_episode_length'],
        reward_scale=params['reward_scale'],
    )

    set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(
                     max_episode_length=params['max_episode_length']),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def meta_kant_cheetah_vel(ctxt=None,
                          seed=seed,
                          num_skills=skills_num,
                          num_epochs=param_num_epoches,
                          num_train_tasks=param_train_tasks_num,
                          num_test_tasks=param_test_tasks_num,
                          is_encoder_recurrent=False,
                          latent_size=param_latent_size,
                          encoder_hidden_size=param_encoder_hidden_size,
                          net_size=param_net_size,
                          meta_batch_size=param_meta_batch_size,
                          num_steps_per_epoch=param_num_steps_per_epoch,
                          num_initial_steps=param_num_initial_steps,
                          num_tasks_sample=param_num_tasks_sample,
                          num_steps_prior=param_num_steps_prior,
                          num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                          num_skills_sample=param_num_skills_sample,
                          num_skills_reason_steps=param_num_skills_reason_steps,
                          batch_size=param_batch_size,
                          embedding_batch_size=param_embedding_batch_size,
                          embedding_mini_batch_size=param_embedding_mini_batch_size,
                          max_path_length=param_max_path_length,
                          skills_reason_reward_scale=param_skills_reason_reward_scale,
                          tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
                          use_gpu=param_use_gpu):
    """Train MetaKant on DIAYN-discovered tasks wrapped around HalfCheetahVelEnv."""
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")
    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)
    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)
    return average_returns
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch PPO model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
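# Usage sketch (assumption, not part of the benchmark code above): the helper
# returns the path to dowel's progress.csv, which can be loaded for comparison
# plots. The environment passed in and the exact logged column names (anything
# containing 'AverageReturn' here) are assumptions and may differ by version.
import gym
import matplotlib.pyplot as plt
import pandas as pd

csv_path = run_garage_pytorch(gym.make('InvertedDoublePendulum-v2'),
                              seed=1,
                              log_dir='/tmp/garage_ppo_seed1')
progress = pd.read_csv(csv_path)
# Plot every logged column whose name mentions AverageReturn, one line each.
progress.filter(like='AverageReturn').plot()
plt.xlabel('epoch')
plt.savefig('ppo_average_return.png')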
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    th = 1.8
    g_max = 0.1
    # delta = 1e-7
    if args.env == 'CartPole':
        # CartPole
        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        # batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50
        # batchsize: 50
        lr = 0.75
        c = 3
        w = 2
        discount = 0.995
        path = './init/CartPole_policy.pth'
    if args.env == 'Walker':
        # Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6
        discount = 0.999
        name = 'Walk'
        path = './init/Walk_policy.pth'
    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06
        discount = 0.999
        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'
    if args.env == 'Hopper':
        # Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999
        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)
        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta
                       # decay_learning_rate=d_lr,
                       )
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        train_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
class TestVPG:

    @classmethod
    def setup_class(cls):
        deterministic.set_seed(0)

    def setup_method(self):
        self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        self._runner = LocalRunner(snapshot_config)
        policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                   hidden_sizes=[64, 64],
                                   hidden_nonlinearity=torch.tanh,
                                   output_nonlinearity=None)
        self._params = {
            'env_spec': self._env.spec,
            'policy': policy,
            'optimizer': torch.optim.Adam,
            'baseline': LinearFeatureBaseline(env_spec=self._env.spec),
            'max_path_length': 100,
            'discount': 0.99,
            'policy_lr': 1e-2
        }

    def teardown_method(self):
        self._env.close()

    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'
        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 30

    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
def diayn_point_mass_multigoal(ctxt=None, seed=1):
    """Train DIAYN with 6 skills on the 2D MultiGoalEnv point-mass environment."""
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = MultiGoalEnv()
    skills_num = 6
    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)
    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=500,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable the video recording func
        is_gym_render=False,
        media_save_path='diayn_2d_multigoal/',
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    diayn.to()
    worker_args = {"skills_num": skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)