def mtsac_metaworld_mt10(ctxt=None, *, experiment_name, config_pth, seed,
                         timesteps, use_wandb, wandb_project_name, gpu):
    """Train MTSAC on the MetaWorld MT10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        experiment_name (str): Name of the experiment (used for wandb logging).
        config_pth (str): Path to the json file with experiment parameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        timesteps (int): Number of timesteps to run.
        use_wandb (str): "True" to enable logging to Weights & Biases.
        wandb_project_name (str): Name of the wandb project to log to.
        gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
    """
    print(f"Initiation took {time() - t0:.2f} secs")

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project=wandb_project_name,
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = 10
    deterministic.set_seed(seed)
    trainer = CustomTrainer(ctxt)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10, "train", add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"

    mt10_train_envs = train_task_sampler.sample(num_tasks)
    env = mt10_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])

    replay_buffer = PathBuffer(capacity_in_transitions=int(
        params["general_setting"]["num_buffer_transitions"]))
    max_episode_length = env.spec.max_episode_length
    # Note: is the episode length the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=DefaultWorker)

    test_sampler = RaySampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=EvalWorker)

    # Note: sampler and test_sampler differ only in the worker class. EvalWorker
    # differs from DefaultWorker by a single line: it acts with the mean action,
    # a = agent_info['mean'], instead of sampling. Could a single worker contain
    # both rules? (A possible unification sketch follows this function.)
    # Number of transitions before a set of gradient updates
    steps_between_updates = int(max_episode_length * num_tasks)

    # epoch: 1 cycle of data collection + gradient updates
    epochs = timesteps // steps_between_updates

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=int(max_episode_length *
                                   params["training"]["num_grad_steps_scale"]),
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"])

    if gpu is not None:
        set_gpu_mode(True, gpu)
    mtsac.to()

    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
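
# One possible answer to the unified-worker question noted above: instead of a
# second worker class, evaluation could reuse DefaultWorker with the policy
# wrapped so that get_action returns the distribution mean. This is only a
# sketch: DeterministicPolicyWrapper is a hypothetical helper (not a garage
# API), and it assumes the policy's agent_info dict exposes the mean under the
# 'mean' key, which is the same key EvalWorker reads.
class DeterministicPolicyWrapper:
    """Delegate to a policy but act with agent_info['mean'] (sketch only)."""

    def __init__(self, policy):
        self._policy = policy

    def get_action(self, observation):
        action, agent_info = self._policy.get_action(observation)
        # Fall back to the sampled action if the policy reports no mean.
        return agent_info.get('mean', action), agent_info

    def __getattr__(self, name):
        # Forward everything else (reset, parameters, ...) to the wrapped policy.
        return getattr(self._policy, name)

# Usage sketch: pass the wrapped policy to the evaluation sampler, e.g.
# RaySampler(agents=DeterministicPolicyWrapper(policy), envs=mt10_train_envs,
#            max_episode_length=max_episode_length, n_workers=num_tasks,
#            worker_class=DefaultWorker).
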
def mtppo_metaworld_mt10(ctxt, experiment_name, config_pth, seed, n_workers,
                         n_tasks, use_wandb, wandb_username, use_gpu):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        experiment_name (str): Name of the experiment (used for wandb logging).
        config_pth (str): Path to the json file with experiment parameters.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        use_wandb (str): "True" to enable logging to Weights & Biases.
        wandb_username (str): wandb entity (username) to log under.
        use_gpu (bool): Whether to train on the GPU.
    """
    params = get_params(config_pth)
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              "train",
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            entity=wandb_username,
            project="mt10",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    assert n_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert n_tasks <= 500, "Number of tasks must be less than or equal to 500"
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = envs[0]

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    value_function = create_vf_net(env_spec=env.spec, net_params=params["net"])

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers,
                         worker_class=DefaultWorker)

    gpu_training = bool(use_gpu)

    algo = CustomMTPPO(
        env_spec=env.spec,
        policy=policy,
        value_function=value_function,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        num_tasks=n_tasks,
        task_update_frequency=params["training"]["task_update_frequency"],
        num_eval_eps=params["general_setting"]["eval_episodes"],
        policy_lr=params["training"]["policy_lr"],
        vf_lr=params["training"]["vf_lr"],
        ppo_eps=params["training"]["ppo_eps"],
        minibatch_size=params["training"]["minibatch_size"],
        ppo_epochs=params["training"]["ppo_epochs"],
        num_train_per_epoch=params["training"]["num_train_per_epoch"],
        discount=params["general_setting"]["discount"],
        gae_lambda=params["training"]["gae_lambda"],
        center_adv=False,
        wandb_logging=use_wandb,
        eval_freq=params["general_setting"]["eval_freq"],
        stop_entropy_gradient=True,
        entropy_method="max",
        gpu_training=gpu_training)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=params["training"]["epochs"],
                  batch_size=params["training"]["batch_episodes_per_task"],
                  plot=False)
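
# For reference, get_params(config_pth) is expected to return a dict with the
# keys read by the two launchers above. The structure below is reconstructed
# from those lookups; the values are illustrative placeholders only, and the
# "net" sub-dict may carry additional fields consumed by create_policy_net,
# create_qf_net and create_vf_net.
EXAMPLE_PARAMS = {
    "net": {
        "policy_min_log_std": -20.0,   # used by the MTSAC launcher
        "policy_max_log_std": 2.0,     # used by the MTSAC launcher
        # ... architecture fields used by the net factory helpers ...
    },
    "training": {
        "policy_lr": 3e-4,
        "task_update_frequency": 1,
        # MTSAC-specific
        "num_grad_steps_scale": 0.5,
        "target_update_tau": 5e-3,
        "buffer_batch_size": 1280,
        "qf_lr": 3e-4,
        "reward_scale": 1.0,
        # MTPPO-specific
        "vf_lr": 3e-4,
        "ppo_eps": 0.2,
        "minibatch_size": 64,
        "ppo_epochs": 10,
        "num_train_per_epoch": 1,
        "gae_lambda": 0.95,
        "epochs": 500,
        "batch_episodes_per_task": 10,
    },
    "general_setting": {
        "discount": 0.99,
        "eval_episodes": 10,
        "num_buffer_transitions": 1_000_000,  # MTSAC replay buffer capacity
        "evaluation_frequency": 25,           # MTSAC
        "eval_freq": 25,                      # MTPPO
    },
}
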
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000,
                        max_episode_length=100):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.
        max_episode_length (int): Max number of timesteps in an episode.
    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('MemorizeDigits-v0',
                   is_image=True,
                   max_episode_length=max_episode_length))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                          (32, (5, 5)),
                                          (64, (3, 3)),
                                          (64, (2, 2)),
                                      ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
            strides=(4, 2, 1),
            padding='VALID',
            hidden_sizes=(256, ),
            use_trust_region=True)  # yapf: disable

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=1000, batch_size=batch_size)

def maml_trpo_metaworld_ml45(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.
    """
    set_seed(seed)
    ml45 = metaworld.ML45()

    # pylint: disable=missing-return-doc,missing-return-type-doc
    def wrap(env, _):
        return normalize(env, expected_action_scale=10.0)

    train_task_sampler = MetaWorldTaskSampler(ml45, 'train', wrap)
    test_env = wrap(MetaWorldSetTaskEnv(ml45, 'test'), None)
    test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                       env=test_env,
                                       wrapper=wrap)
    env = train_task_sampler.sample(45)[0]()

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    task_sampler=train_task_sampler,
                    policy=policy,
                    sampler=sampler,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)

def maml_vpg_half_cheetah_dir(ctxt, seed, epochs, episodes_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.
    """
    set_seed(seed)
    env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100),
                    expected_action_scale=10.)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_episode_length = env.spec.max_episode_length

    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(GymEnv(
            env, max_episode_length=max_episode_length),
                                         expected_action_scale=10.))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   n_test_tasks=1,
                                   n_test_episodes=10)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    trainer = Trainer(ctxt)
    algo = MAMLVPG(env=env,
                   policy=policy,
                   sampler=sampler,
                   task_sampler=task_sampler,
                   value_function=value_function,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * max_episode_length)
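
# These launchers are normally decorated with garage's @wrap_experiment, which
# builds the ExperimentContext (ctxt) and snapshot directory before calling the
# function. A minimal invocation sketch, assuming that decoration is applied
# programmatically as below; the config path and hyperparameter values are
# placeholders, not recommendations.
if __name__ == "__main__":
    from garage import wrap_experiment

    launcher = wrap_experiment(snapshot_mode="none")(mtppo_metaworld_mt10)
    launcher(experiment_name="mtppo_mt10_seed0",
             config_pth="configs/mt10.json",  # hypothetical config file
             seed=0,
             n_workers=10,
             n_tasks=10,
             use_wandb="False",               # the launcher expects a string
             wandb_username="",
             use_gpu=False)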