# NOTE: the import paths below are assumed from the metarl (a garage fork),
# dowel, PyTorch, and Meta-World APIs this script uses; adjust them to match
# your tree. `snapshot_config`, `params`, `get_ML1_envs`, `get_ML1_envs_test`,
# `MTMetaWorldWrapper`, and `IgnoreDoneWrapper` are repo-local helpers that
# are referenced but not defined in this snippet.
import os.path as osp

import dowel
from dowel import logger as dowel_logger
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from metarl.envs import MetaRLEnv, normalize
from metarl.experiment import LocalRunner, deterministic
from metarl.replay_buffer import SACReplayBuffer
from metarl.sampler import SimpleSampler
from metarl.torch.algos import MTSAC, SAC
from metarl.torch.policies import TanhGaussianMLPPolicy2
from metarl.torch.q_functions import ContinuousMLPQFunction
import metarl.torch.utils as tu
from metaworld.benchmarks import MT50
from metaworld.envs.mujoco.env_dict import (EASY_MODE_ARGS_KWARGS,
                                            EASY_MODE_CLS_DICT)


def run_metarl(env, seed, log_dir):
    """Create a metarl SAC model and run training.

    Replace the SAC construction below with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config)

    # Set up networks and hyperparameters for SAC.
    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=params['policy_hidden_sizes'],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None)
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=params['qf_hidden_sizes'],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=params['qf_hidden_sizes'],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SACReplayBuffer(env_spec=env.spec,
                                    max_size=params['replay_buffer_size'])
    sampler_args = {
        'agent': policy,
        'max_path_length': 1000,
    }
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=params['gradient_steps_per_itr'],
              replay_buffer=replay_buffer,
              buffer_batch_size=params['buffer_batch_size'])

    # Set up the logger manually since we are not using run_experiment.
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    tensorboard_log_dir = log_dir
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)
    runner.train(n_epochs=params['n_epochs'],
                 batch_size=params['gradient_steps_per_itr'])

    dowel_logger.remove_all()

    return tabular_log_file
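# A minimal sketch of the hyperparameter dict run_metarl() expects to find
# at module scope as `params`. `_EXAMPLE_PARAMS` and its values are
# hypothetical placeholders for illustration, not the benchmark's tuned
# settings; bind a dict like this to `params` before calling run_metarl().
_EXAMPLE_PARAMS = {
    'policy_hidden_sizes': [256, 256],
    'qf_hidden_sizes': [256, 256],
    'replay_buffer_size': int(1e6),
    'gradient_steps_per_itr': 1000,
    'buffer_batch_size': 256,
    'n_epochs': 500,
}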
def ml1_push_v1_sac(ctxt=None, seed=1):
    """Set up environment and algorithm and run the ML1 push-v1 task.

    :param ctxt: Experiment context supplied by the launcher.
    :param seed: Random seed for the trial.
    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    # Note: the variable names say 'reach', but these are the push-v1 envs.
    Ml1_reach_envs = get_ML1_envs("push-v1")
    Ml1_reach_test_envs = get_ML1_envs_test("push-v1")
    env = MTMetaWorldWrapper(Ml1_reach_envs)

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    # Spread the total env-step budget across ~500 evaluation points: each
    # epoch runs epoch_cycles cycles, each collecting batch_size steps.
    timesteps = 100000000
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles

    sac = MTSAC(env=env,
                eval_env_dict=Ml1_reach_test_envs,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=250,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=7500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=6400)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')
    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)
    runner.train(n_epochs=epochs, batch_size=batch_size)
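# The launchers above and below inline the same epoch bookkeeping. A hedged
# sketch of it as a standalone helper (`_epoch_schedule` is hypothetical,
# shown only to make the arithmetic explicit):
def _epoch_schedule(timesteps, batch_size, num_evaluation_points=500):
    """Return (n_epochs, epoch_cycles) for a total env-step budget.

    Chosen so that n_epochs * epoch_cycles * batch_size is approximately
    `timesteps`, and evaluation (once per epoch) happens approximately
    `num_evaluation_points` times over the run.
    """
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    return epochs // epoch_cycles, epoch_cycles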
def mt10_sac(ctxt=None, seed=1):
    """Set up environment and algorithm and run the MT10 task.

    :param ctxt: Experiment context supplied by the launcher.
    :param seed: Random seed for the trial.
    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    MT10_envs_by_id = {}
    MT10_envs_test = {}
    # Python 3.6+ dicts preserve insertion order, so the train and test
    # dicts stay aligned task-by-task.
    for (task, env) in EASY_MODE_CLS_DICT.items():
        MT10_envs_by_id[task] = MetaRLEnv(
            env(*EASY_MODE_ARGS_KWARGS[task]['args'],
                **EASY_MODE_ARGS_KWARGS[task]['kwargs']))
        MT10_envs_test[task] = MetaRLEnv(
            env(*EASY_MODE_ARGS_KWARGS[task]['args'],
                **EASY_MODE_ARGS_KWARGS[task]['kwargs']))
    env = IgnoreDoneWrapper(MTMetaWorldWrapper(MT10_envs_by_id))

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    timesteps = 20000000
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles

    sac = MTSAC(env=env,
                eval_env_dict=MT10_envs_test,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=150,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=1500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=1280)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')
    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)
    runner.train(n_epochs=epochs, batch_size=batch_size)
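# Worked example of the schedule in mt10_sac above (MT10 has 10 tasks):
#   batch_size   = 150 * 10           = 1500 env steps per cycle
#   epochs       = 20_000_000 // 1500 = 13333
#   epoch_cycles = 13333 // 500       = 26
#   n_epochs     = 13333 // 26        = 512
# so training runs 512 epochs of 26 cycles each (512 * 26 * 1500, which is
# roughly the 20M-step budget), evaluating once per epoch.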
def mt50_sac_normalize_all(ctxt=None, seed=1):
    """Set up environment and algorithm and run the MT50 task with
    normalized observations and rewards.

    :param ctxt: Experiment context supplied by the launcher.
    :param seed: Random seed for the trial.
    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    envs = MT50.get_train_tasks(sample_all=True)
    test_envs = MT50.get_test_tasks(sample_all=True)
    MT50_envs_by_id = {
        name: MetaRLEnv(
            normalize(env,
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=False))
        for (name, env) in zip(envs._task_names, envs._task_envs)
    }
    # Test envs normalize observations only, so evaluation rewards stay on
    # the original scale.
    MT50_envs_test = {
        name: MetaRLEnv(
            normalize(env, normalize_obs=True, flatten_obs=False))
        for (name, env) in zip(test_envs._task_names, test_envs._task_envs)
    }
    env = MTMetaWorldWrapper(MT50_envs_by_id)

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    timesteps = 100000000
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles

    sac = MTSAC(env=env,
                eval_env_dict=MT50_envs_test,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=250,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=7500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=6400)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')
    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)
    runner.train(n_epochs=epochs, batch_size=batch_size)
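# How a launcher like the ones above is typically invoked, assuming metarl
# mirrors garage's wrap_experiment API (an assumption; this repo may instead
# drive these functions through its own benchmark harness):
if __name__ == '__main__':
    from metarl import wrap_experiment

    # Wrapping supplies the `ctxt` experiment context (log dir, snapshotter)
    # when the wrapped function is called.
    mt10_sac_experiment = wrap_experiment(mt10_sac)
    mt10_sac_experiment(seed=1)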