def trpo_garage_pytorch_experiment(ctxt, env_id, seed):
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)

    deterministic.set_seed(seed)

    runner = LocalRunner(ctxt)

    env = GarageEnv(normalize(gym.make(env_id)))

    # using gaussian policy
    policy = Pytorch_GMP(env.spec,
                         hidden_sizes=[256, 256],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    # using MLP for value approximator
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[256, 256],
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = Pytorch_TRPO(env_spec=env.spec,
                        policy=policy,
                        value_function=value_function,
                        max_episode_length=100,
                        discount=0.99,
                        gae_lambda=0.97)

    runner.setup(algo, env)
    runner.train(n_epochs=999, batch_size=1024)
def alg_train(ctxt=None):
    get_args(parser)
    args = parser.parse_args()
    args.prefix = use_prefix

    set_seed(args.seed)

    env = GymEnv(args.env_name)
    if args.env_norm:
        env = normalize(env)

    trainer = Trainer(ctxt)

    logger.remove_all()
    logger.add_output(StdLogger(args.log_interval))
    if not args.no_wb:
        wb_logger = WbOutput(args.log_interval, base_args)
        logger.add_output(wb_logger)

    algo = get_algo(env, trainer, args)

    if args.cuda:
        set_gpu_mode(True)
        algo.to()
    else:
        set_gpu_mode(False)

    trainer.train(n_epochs=args.n_epochs, batch_size=args.batch_size)
def sac_half_cheetah_batch(ctxt=None, seed=1): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ deterministic.set_seed(seed) trainer = Trainer(snapshot_config=ctxt) env = normalize(GymEnv('HalfCheetah-v2')) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=1000, max_episode_length_eval=1000, replay_buffer=replay_buffer, min_buffer_size=1e4, target_update_tau=5e-3, discount=0.99, buffer_batch_size=256, reward_scale=1., steps_per_epoch=1) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) sac.to() trainer.setup(algo=sac, env=env) trainer.train(n_epochs=1000, batch_size=1000)
def trpo_garage_pytorch(env_id):
    env = GarageEnv(normalize(gym.make(env_id)))

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)

    # using gaussian policy
    policy = Pytorch_GMP(env.spec,
                         hidden_sizes=[32, 32],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    # using MLP for value approximator
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=[32, 32],
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = Pytorch_TRPO(env_spec=env.spec,
                        policy=policy,
                        value_function=value_function,
                        max_episode_length=100,
                        discount=0.99,
                        gae_lambda=0.97)
def test_fixed_alpha(): """Test if using fixed_alpha ensures that alpha is non differentiable.""" env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2'] task_envs = [GymEnv(name, max_episode_length=100) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) test_envs = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) trainer = Trainer(snapshot_config=snapshot_config) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 128 sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, eval_env=[test_envs], env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=1, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size, fixed_alpha=np.exp(0.5)) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) mtsac.to() assert torch.allclose(torch.Tensor([0.5] * num_tasks), mtsac._log_alpha.to('cpu')) trainer.setup(mtsac, env) trainer.train(n_epochs=1, batch_size=128, plot=False) assert torch.allclose(torch.Tensor([0.5] * num_tasks), mtsac._log_alpha.to('cpu')) assert not mtsac._use_automatic_entropy_tuning
def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy and q-functions are moved to the GPU if one is
    available.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac.policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device
def test_sac_inverted_double_pendulum():
    """Test SAC performance on the inverted double pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = trainer.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that a gradient actually reached alpha; this does not verify
    # that the path from the temperature objective is correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that the policy has learned beyond a predefined return threshold
    assert ret > 80
def test_utils_set_gpu_mode():
    """Test setting gpu mode to False to force CPU."""
    if torch.cuda.is_available():
        set_gpu_mode(mode=True)
        assert global_device() == torch.device('cuda:0')
        assert tu._USE_GPU
    else:
        set_gpu_mode(mode=False)
        assert global_device() == torch.device('cpu')
        assert not tu._USE_GPU
    assert not tu._GPU_ID
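For reference, here is a minimal sketch (not part of the snippets in this listing) of the garage device helpers the test above exercises; it assumes only that garage.torch exposes set_gpu_mode and global_device, as used throughout these examples.

import torch
from garage.torch import global_device, set_gpu_mode

# Select GPU 0 when CUDA is present, otherwise fall back to the CPU.
set_gpu_mode(torch.cuda.is_available(), gpu_id=0)
device = global_device()  # torch.device('cuda:0') or torch.device('cpu')

# Any torch module can then be moved onto the selected device.
layer = torch.nn.Linear(4, 2).to(device)
assert next(layer.parameters()).device == device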
def train(self, trainer):
    """Obtain samples and run training for each epoch.

    Args:
        trainer (Trainer): Gives the algorithm access to
            :meth:`~Trainer.step_epochs()`, which provides services such as
            snapshotting and sampler control.

    Returns:
        float: The average return in the last epoch cycle.

    """
    last_return = None
    for i, _ in enumerate(trainer.step_epochs()):
        if not self._multitask:
            trainer.step_path = trainer.obtain_episodes(trainer.step_itr)
        else:
            env_updates = None
            assert self._train_task_sampler is not None
            if (not i % self._task_update_frequency) or (
                    self._task_update_frequency == 1):
                env_updates = self._train_task_sampler.sample(
                    self._num_tasks)
            trainer.step_path = self.obtain_exact_trajectories(
                trainer, env_update=env_updates)

        # do training on GPU
        if self._gpu_training:
            prefer_gpu()
            self.to(device=global_device())

        log_dict, last_return = self._train_once(trainer.step_itr,
                                                 trainer.step_path)

        # move back to CPU for collection
        set_gpu_mode(False)
        self.to(device=global_device())

        if self._wandb_logging:
            # log_dict should be a dict, not None
            log_dict['total_env_steps'] = trainer.total_env_steps
            wandb.log(log_dict)

        trainer.step_itr += 1

    return last_return
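The train() loop above alternates devices: gradient updates run on the GPU, then the networks are moved back to the CPU so the CPU-based sampler can collect the next batch. The sketch below illustrates that round-trip using garage's prefer_gpu/set_gpu_mode/global_device helpers; `algo` and its `train_once` method are hypothetical stand-ins, not a fixed API.

from garage.torch import global_device, prefer_gpu, set_gpu_mode

def gpu_train_cpu_sample_step(algo, samples):
    # Hypothetical helper mirroring the per-epoch pattern above.
    prefer_gpu()                      # cuda:0 when available, else cpu
    algo.to(device=global_device())   # move networks to the training device
    stats = algo.train_once(samples)  # gradient updates on that device
    set_gpu_mode(False)               # switch the global device back to cpu
    algo.to(device=global_device())   # move networks back for CPU sampling
    return stats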
def test_sac_to(): """Test moving Sac between CPU and GPU.""" env = normalize(GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) trainer = Trainer(snapshot_config=snapshot_config) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=2) trainer.setup(sac, env) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) sac.to() trainer.setup(algo=sac, env=env) trainer.train(n_epochs=1, batch_size=100) log_alpha = torch.clone(sac._log_alpha).cpu() set_gpu_mode(False) sac.to() assert torch.allclose(log_alpha, sac._log_alpha)
def pearl_metaworld_ml10(ctxt=None, seed=1, num_epochs=1000, num_train_tasks=10, latent_size=7, encoder_hidden_size=200, net_size=300, meta_batch_size=16, num_steps_per_epoch=4000, num_initial_steps=4000, num_tasks_sample=15, num_steps_prior=750, num_extra_rl_steps_posterior=750, batch_size=256, embedding_batch_size=64, embedding_mini_batch_size=64, reward_scale=10., use_gpu=False): """Train PEARL with ML10 environments. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. num_epochs (int): Number of training epochs. num_train_tasks (int): Number of tasks for training. latent_size (int): Size of latent context vector. encoder_hidden_size (int): Output dimension of dense layer of the context encoder. net_size (int): Output dimension of a dense layer of Q-function and value function. meta_batch_size (int): Meta batch size. num_steps_per_epoch (int): Number of iterations per epoch. num_initial_steps (int): Number of transitions obtained per task before training. num_tasks_sample (int): Number of random tasks to obtain data for each iteration. num_steps_prior (int): Number of transitions to obtain per task with z ~ prior. num_extra_rl_steps_posterior (int): Number of additional transitions to obtain per task with z ~ posterior that are only used to train the policy and NOT the encoder. batch_size (int): Number of transitions in RL batch. embedding_batch_size (int): Number of transitions in context batch. embedding_mini_batch_size (int): Number of transitions in mini context batch; should be same as embedding_batch_size for non-recurrent encoder. reward_scale (int): Reward scale. use_gpu (bool): Whether or not to use GPU for training. 
""" set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) ml10 = metaworld.ML10() train_env = MetaWorldSetTaskEnv(ml10, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(num_train_tasks) test_env = MetaWorldSetTaskEnv(ml10, 'test') test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) trainer = Trainer(ctxt) # instantiate networks augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, reward_scale=reward_scale, ) set_gpu_mode(use_gpu, gpu_id=0) if use_gpu: pearl.to() trainer.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, n_workers=1, worker_class=PEARLWorker) trainer.train(n_epochs=num_epochs, batch_size=batch_size)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """Train MTSAC with MT10 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). n_tasks (int): Number of tasks to use. Should be a multiple of 10. timesteps (int): Number of timesteps to run. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) mt10 = metaworld.MT10() mt10_test = metaworld.MT10() # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): return normalize(env) train_task_sampler = MetaWorldTaskSampler(mt10, 'train', wrap, add_env_onehot=True) test_task_sampler = MetaWorldTaskSampler(mt10_test, 'train', wrap, add_env_onehot=True) assert n_tasks % 10 == 0 assert n_tasks <= 500 mt10_train_envs = train_task_sampler.sample(n_tasks) env = mt10_train_envs[0]() mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)] policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) meta_batch_size = 10 batch_size = int(env.spec.max_episode_length * meta_batch_size) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, eval_env=mt10_test_envs, env_spec=env.spec, num_tasks=10, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() trainer.setup(algo=mtsac, env=mt10_train_envs, sampler_cls=LocalSampler, n_workers=meta_batch_size) trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None): """Train MTSAC with the ML1 pick-place-v1 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). """ deterministic.set_seed(seed) runner = LocalRunner(ctxt) train_envs = [] test_envs = [] env_names = [] for i in range(50): train_env = normalize( GymEnv(mwb.ML1.get_train_tasks('pick-place-v1'), normalize_reward=True)) test_env = pickle.loads(pickle.dumps(train_env)) env_names.append('pick_place_{}'.format(i)) train_envs.append(train_env) test_envs.append(test_env) ml1_train_envs = MultiEnvWrapper(train_envs, sample_strategy=round_robin_strategy, env_names=env_names) ml1_test_envs = MultiEnvWrapper(test_envs, sample_strategy=round_robin_strategy, env_names=env_names) policy = TanhGaussianMLPPolicy( env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) timesteps = 10000000 batch_size = int(150 * ml1_train_envs.num_tasks) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_episode_length=150, eval_env=ml1_test_envs, env_spec=ml1_train_envs.spec, num_tasks=50, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler) runner.train(n_epochs=epochs, batch_size=batch_size)
def __init__( self, experiment_name, use_gpu, trainer_args ): """Train MTSAC with metaworld_experiments environment. Args: experiment_name: expeirment name to be used for logging and checkpointing use_wandb: boolean, defines whether or not to log to wandb use_gpu: boolean, defines whether or not to use to GPU for training trainer_args: named tuple with args given by config """ # Define log and checkpoint dir self.checkpoint_dir = os.path.join( trainer_args.log_dir, f"{experiment_name}-{trainer_args.project_id}" ) print(f"Checkpoint dir: {self.checkpoint_dir}") self.state_path = os.path.join(self.checkpoint_dir, "experiment_state.p") self.env_state_path = os.path.join(self.checkpoint_dir, "env_state.p") self.config_path = os.path.join(self.checkpoint_dir, "config.json") self.experiment_name = experiment_name # Only define viz_save_path if required to save visualizations local self.viz_save_path = None if trainer_args.save_visualizations_local: self.viz_save_path = os.path.join(self.checkpoint_dir, "viz") # Check if loading from existing experiment self.loading_from_existing = os.path.exists(self.checkpoint_dir) os.makedirs(self.checkpoint_dir, exist_ok=True) # Save arguments for later retrieval self.init_config(trainer_args) num_tasks = trainer_args.num_tasks # TODO: do we have to fix which GPU to use? run distributed across multiGPUs if use_gpu: set_gpu_mode(True, 0) if trainer_args.seed is not None: deterministic.set_seed(trainer_args.seed) # Note: different classes whether it uses 10 or 50 tasks. Why? mt_env = ( metaworld.MT10(seed=trainer_args.env_seed) if num_tasks <= 10 else metaworld.MT50(seed=trainer_args.env_seed) ) train_task_sampler = MetaWorldTaskSampler( mt_env, "train", add_env_onehot=True ) # TODO: add some clarifying comments of why these asserts are required assert num_tasks % 10 == 0, "Number of tasks have to divisible by 10" assert num_tasks <= 500, "Number of tasks should be less or equal 500" # TODO: do we have guarantees that in case seed is set, the tasks being sampled # are the same? 
mt_train_envs = train_task_sampler.sample(num_tasks) env = mt_train_envs[0]() if trainer_args.params_seed is not None: torch.manual_seed(trainer_args.params_seed) policy = create_policy_net(env_spec=env.spec, net_params=trainer_args) qf1 = create_qf_net(env_spec=env.spec, net_params=trainer_args) qf2 = create_qf_net(env_spec=env.spec, net_params=trainer_args) if trainer_args.params_seed is not None: calculate_mean_param("policy", policy) calculate_mean_param("qf1", qf1) calculate_mean_param("qf2", qf2) if trainer_args.override_weight_initialization: logging.warn("Overriding dendritic layer weight initialization") self.override_weight_initialization([policy, qf1, qf2]) replay_buffer = PathBuffer( capacity_in_transitions=trainer_args.num_buffer_transitions ) max_episode_length = env.spec.max_episode_length self.env_steps_per_epoch = int(max_episode_length * num_tasks) self.num_epochs = trainer_args.timesteps // self.env_steps_per_epoch sampler = RaySampler( agent=policy, envs=mt_train_envs, max_episode_length=max_episode_length, cpus_per_worker=trainer_args.cpus_per_worker, gpus_per_worker=trainer_args.gpus_per_worker, workers_per_env=trainer_args.workers_per_env, seed=trainer_args.seed, ) self._algo = CustomMTSAC( env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, sampler=sampler, train_task_sampler=train_task_sampler, gradient_steps_per_itr=int( max_episode_length * trainer_args.num_grad_steps_scale ), task_update_frequency=trainer_args.task_update_frequency, num_tasks=num_tasks, min_buffer_size=max_episode_length * num_tasks, target_update_tau=trainer_args.target_update_tau, discount=trainer_args.discount, buffer_batch_size=trainer_args.buffer_batch_size, policy_lr=trainer_args.policy_lr, qf_lr=trainer_args.qf_lr, reward_scale=trainer_args.reward_scale, num_evaluation_episodes=trainer_args.eval_episodes, fp16=trainer_args.fp16 if use_gpu else False, log_per_task=trainer_args.log_per_task, share_train_eval_env=trainer_args.share_train_eval_env ) # Override with loaded networks if existing experiment self.current_epoch = 0 if self.loading_from_existing: self.load_experiment_state() # Move all networks within the model on device self._algo.to()
def tcl_pearl_ml1(ctxt=None, seed=1, num_epochs=200, num_train_tasks=50, num_test_tasks=10, latent_size=7, encoder_hidden_size=200, net_size=300, meta_batch_size=16, num_steps_per_epoch=4000, num_initial_steps=4000, num_tasks_sample=15, num_steps_prior=750, num_extra_rl_steps_posterior=750, batch_size=256, embedding_batch_size=64, embedding_mini_batch_size=64, max_path_length=200, reward_scale=10., replay_buffer_size=1000000, use_next_obs=False, in_sequence_path_aug=True, emphasized_network=False, use_kl_loss=True, use_q_loss=True, encoder_common_net=True, single_alpha=False, use_task_index_label=False, use_wasserstein_distance=True, gpu_id=0, name='push-v1', prefix='curl_fine_tune', use_gpu=True): """Train TCL-PEARL with ML1 environments. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. num_epochs (int): Number of training epochs. num_train_tasks (int): Number of tasks for training. num_test_tasks (int): Number of tasks for testing. latent_size (int): Size of latent context vector. encoder_hidden_size (int): Output dimension of dense layer of the context encoder. net_size (int): Output dimension of a dense layer of Q-function and value function. meta_batch_size (int): Meta batch size. num_steps_per_epoch (int): Number of iterations per epoch. num_initial_steps (int): Number of transitions obtained per task before training. num_tasks_sample (int): Number of random tasks to obtain data for each iteration. num_steps_prior (int): Number of transitions to obtain per task with z ~ prior. num_extra_rl_steps_posterior (int): Number of additional transitions to obtain per task with z ~ posterior that are only used to train the policy and NOT the encoder. batch_size (int): Number of transitions in RL batch. embedding_batch_size (int): Number of transitions in context batch. embedding_mini_batch_size (int): Number of transitions in mini context batch; should be same as embedding_batch_size for non-recurrent encoder. max_path_length (int): Maximum path length. reward_scale (int): Reward scale. use_gpu (bool): Whether or not to use GPU for training. 
""" set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) print("Running experiences on {}/{}".format(prefix, name)) # create multi-task environment and sample tasks ml1 = metaworld.ML1(name) train_env = MetaWorldSetTaskEnv(ml1, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(num_train_tasks) test_env = MetaWorldSetTaskEnv(ml1, 'test') test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) sampler = LocalSampler(agents=None, envs=env[0](), max_episode_length=max_path_length, n_workers=1, worker_class=TCLPEARLWorker) trainer = Trainer(ctxt) # instantiate networks augmented_env = TCLPEARL.augment_env_spec(env[0](), latent_size) qf_1 = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) qf_2 = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) tcl_pearl = TCLPEARL( env=env, policy_class=TCLPolicy, encoder_class=ContrastiveEncoder, inner_policy=inner_policy, qf1=qf_1, qf2=qf_2, sampler=sampler, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, max_path_length=max_path_length, reward_scale=reward_scale, replay_buffer_size=replay_buffer_size, use_next_obs_in_context=use_next_obs, embedding_batch_in_sequence=in_sequence_path_aug, use_kl_loss=use_kl_loss, use_q_loss=use_q_loss, encoder_common_net=encoder_common_net, single_alpha=single_alpha, use_task_index_label=use_task_index_label, use_wasserstein_distance=use_wasserstein_distance) set_gpu_mode(use_gpu, gpu_id=gpu_id) if use_gpu: tcl_pearl.to() trainer.setup(algo=tcl_pearl, env=env[0]()) trainer.train(n_epochs=num_epochs, batch_size=batch_size)
def sparse_mlp_mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps, _gpu): """Train MTSAC with the MT1 pick-place-v1 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). timesteps (int): Number of timesteps to run. """ deterministic.set_seed(seed) mt1 = metaworld.MT1('pick-place-v1') mt1_test = metaworld.MT1('pick-place-v1') train_task_sampler = MetaWorldTaskSampler(mt1, 'train', lambda env, _: normalize(env)) test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train', lambda env, _: normalize(env)) n_tasks = 50 train_envs = train_task_sampler.sample(n_tasks) env = train_envs[0]() test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)] trainer = Trainer(ctxt) policy = TanhGaussianSparseMLPPolicy( env_spec=env.spec, hidden_sizes=[400, 400, 400], linear_activity_percent_on=(0.1, 0.1, 0.1), linear_weight_percent_on=(0.4, 0.4, 0.4), mean_nonlinearity=None, std_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousSparseMLPQFunction( env_spec=env.spec, hidden_sizes=(400, 400, 400), linear_activity_percent_on=(0.1, 0.1, 0.1,), linear_weight_percent_on=(0.4, 0.4, 0.4,), ) qf2 = ContinuousSparseMLPQFunction( env_spec=env.spec, hidden_sizes=(400, 400, 400), linear_activity_percent_on=(0.1, 0.1, 0.1), linear_weight_percent_on=(0.4, 0.4, 0.4), ) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) sampler = LocalSampler(agents=policy, envs=train_envs, max_episode_length=env.spec.max_episode_length, n_workers=n_tasks, worker_class=FragmentWorker) batch_size = int(env.spec.max_episode_length * n_tasks) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = SparseWeightsMTSAC( policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=150, eval_env=test_envs, env_spec=env.spec, num_tasks=1, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() trainer.setup(algo=mtsac, env=train_envs) trainer.train(n_epochs=epochs, batch_size=batch_size)
while True:
    sys.stdout.write(question + prompt)
    choice = input().lower()
    if default is not None and choice == '':
        return valid[default]
    elif choice in valid:
        return valid[choice]
    else:
        sys.stdout.write("Please respond with 'yes' or 'no' "
                         "(or 'y' or 'n').\n")


if __name__ == '__main__':
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--max_path_length',
                        type=int,
                        default=1000,
                        help='Max length of rollout')
    parser.add_argument('--speedup', type=float, default=1, help='Speedup')
    args = parser.parse_args()

    # If the snapshot file uses TensorFlow, do:
    # import tensorflow as tf
    # with tf.compat.v1.Session():
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0): """Train MTSAC with MT50 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. use_gpu (bool): Used to enable ussage of GPU in training. _gpu (int): The ID of the gpu (used on multi-gpu machines). """ deterministic.set_seed(seed) runner = LocalRunner(ctxt) task_names = mwb.MT50.get_train_tasks().all_task_names train_envs = [] test_envs = [] for task_name in task_names: train_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)), normalize_reward=True) test_env = normalize(GarageEnv(mwb.MT50.from_task(task_name))) train_envs.append(train_env) test_envs.append(test_env) mt50_train_envs = MultiEnvWrapper(train_envs, sample_strategy=round_robin_strategy, mode='vanilla') mt50_test_envs = MultiEnvWrapper(test_envs, sample_strategy=round_robin_strategy, mode='vanilla') policy = TanhGaussianMLPPolicy( env_spec=mt50_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) timesteps = 100000000 batch_size = int(150 * mt50_train_envs.num_tasks) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_episode_length=250, eval_env=mt50_test_envs, env_spec=mt50_train_envs.spec, num_tasks=10, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=7500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=6400) set_gpu_mode(use_gpu, _gpu) mtsac.to() runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler) runner.train(n_epochs=epochs, batch_size=batch_size)
def run(ctxt=None):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.

    """
    deterministic.set_seed(self.seed)
    runner = LocalRunner(snapshot_config=ctxt, max_cpus=32)
    env = GarageEnv(normalize(self.env_maker()))
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=self.policy_hidden_sizes,
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=self.qf_hidden_sizes,
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=self.qf_hidden_sizes,
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(
        capacity_in_transitions=self.buffer_capacity_in_transitions)
    algo = _SAC_(env_spec=env.spec,
                 policy=policy,
                 qf1=qf1,
                 qf2=qf2,
                 gradient_steps_per_itr=self.gradient_steps_per_itr,
                 max_path_length=self.max_path_length,
                 max_eval_path_length=self.max_eval_path_length,
                 replay_buffer=replay_buffer,
                 min_buffer_size=self.min_buffer_size,
                 target_update_tau=self.target_update_tau,
                 discount=self.discount,
                 buffer_batch_size=self.buffer_batch_size,
                 reward_scale=self.reward_scale,
                 steps_per_epoch=self.steps_per_epoch)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    algo.to()
    if self.parallel_sampling:
        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=RaySampler,
                     n_workers=self.n_workers)
    else:
        runner.setup(algo=algo, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=self.n_epochs, batch_size=self.batch_size)
def mtsac_metaworld_mt50( ctxt=None, *, config_pth, seed, timesteps, use_wandb, wandb_project_name, gpu ): """Train MTSAC with MT50 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). num_tasks (int): Number of tasks to use. Should be a multiple of 10. timesteps (int): Number of timesteps to run. """ """Train MTSAC with metaworld_experiments environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). timesteps (int): Number of timesteps to run. """ print(f"Initiation took {time() - t0:.2f} secs") # Get experiment parameters (e.g. hyperparameters) and save the json file params = get_params(config_pth) with open(ctxt.snapshot_dir + "/params.json", "w") as json_file: json.dump(params, json_file) if use_wandb == "True": use_wandb = True wandb.init( name=params["experiment_name"], project=wandb_project_name, group="Baselines{}".format("mt50"), reinit=True, config=params, ) else: use_wandb = False num_tasks = 50 timesteps = timesteps deterministic.set_seed(seed) trainer = CustomTrainer(ctxt) mt10 = metaworld.MT50() train_task_sampler = MetaWorldTaskSampler(mt10, "train", add_env_onehot=True) assert num_tasks % 10 == 0, "Number of tasks have to divisible by 10" assert num_tasks <= 500, "Number of tasks should be less or equal 500" mt50_train_envs = train_task_sampler.sample(num_tasks) env = mt50_train_envs[0]() params["net"]["policy_min_std"] = np.exp(params["net"]["policy_min_log_std"]) params["net"]["policy_max_std"] = np.exp(params["net"]["policy_max_log_std"]) policy = create_policy_net(env_spec=env.spec, net_params=params["net"]) qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"]) qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"]) replay_buffer = PathBuffer( capacity_in_transitions=int(params["general_setting"]["num_buffer_transitions"]) ) max_episode_length = env.spec.max_episode_length # Note: are the episode length the same among all tasks? 
sampler = RaySampler( agents=policy, envs=mt50_train_envs, max_episode_length=max_episode_length, # 1 sampler worker for each environment n_workers=num_tasks, worker_class=DefaultWorker ) test_sampler = RaySampler( agents=policy, envs=mt50_train_envs, max_episode_length=max_episode_length, # 1 sampler worker for each environment n_workers=num_tasks, worker_class=EvalWorker ) # Number of transitions before a set of gradient updates steps_between_updates = int(max_episode_length * num_tasks) # epoch: 1 cycle of data collection + gradient updates epochs = timesteps // steps_between_updates mtsac = CustomMTSAC( env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, sampler=sampler, train_task_sampler=train_task_sampler, test_sampler=test_sampler, gradient_steps_per_itr=int(max_episode_length * params["training"]["num_grad_steps_scale"]), num_tasks=num_tasks, min_buffer_size=max_episode_length * num_tasks, target_update_tau=params["training"]["target_update_tau"], discount=params["general_setting"]["discount"], buffer_batch_size=params["training"]["buffer_batch_size"], policy_lr=params["training"]["policy_lr"], qf_lr=params["training"]["qf_lr"], reward_scale=params["training"]["reward_scale"], num_evaluation_episodes=params["general_setting"]["eval_episodes"], task_update_frequency=params["training"]["task_update_frequency"], wandb_logging=use_wandb, evaluation_frequency=params["general_setting"]["evaluation_frequency"] ) if gpu is not None: set_gpu_mode(True, gpu) mtsac.to() trainer.setup(algo=mtsac, env=mt50_train_envs) trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
def dqn_atari(ctxt=None, env=None, seed=24, n_workers=psutil.cpu_count(logical=False), max_episode_length=None, **kwargs): """Train DQN with PongNoFrameskip-v4 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env (str): Name of the atari environment, eg. 'PongNoFrameskip-v4'. seed (int): Used to seed the random number generator to produce determinism. n_workers (int): Number of workers to use. Defaults to the number of CPU cores available. max_episode_length (int): Max length of an episode. If None, defaults to the timelimit specific to the environment. Used by integration tests. kwargs (dict): hyperparameters to be saved to variant.json. """ assert n_workers > 0 assert env is not None env = gym.make(env) env = Noop(env, noop_max=30) env = MaxAndSkip(env, skip=4) env = EpisodicLife(env) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireReset(env) env = Grayscale(env) env = Resize(env, 84, 84) env = ClipReward(env) env = StackFrames(env, 4, axis=0) env = GymEnv(env, max_episode_length=max_episode_length, is_image=True) set_seed(seed) trainer = Trainer(ctxt) n_epochs = hyperparams['n_epochs'] steps_per_epoch = hyperparams['steps_per_epoch'] sampler_batch_size = hyperparams['sampler_batch_size'] num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size replay_buffer = PathBuffer( capacity_in_transitions=hyperparams['buffer_size']) qf = DiscreteCNNQFunction( env_spec=env.spec, image_format='NCHW', hidden_channels=hyperparams['hidden_channels'], kernel_sizes=hyperparams['kernel_sizes'], strides=hyperparams['strides'], hidden_w_init=( lambda x: torch.nn.init.orthogonal_(x, gain=np.sqrt(2))), hidden_sizes=hyperparams['hidden_sizes']) policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) exploration_policy = EpsilonGreedyPolicy( env_spec=env.spec, policy=policy, total_timesteps=num_timesteps, max_epsilon=hyperparams['max_epsilon'], min_epsilon=hyperparams['min_epsilon'], decay_ratio=hyperparams['decay_ratio']) sampler = LocalSampler(agents=exploration_policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker, n_workers=n_workers) algo = DQN(env_spec=env.spec, policy=policy, qf=qf, exploration_policy=exploration_policy, replay_buffer=replay_buffer, sampler=sampler, steps_per_epoch=steps_per_epoch, qf_lr=hyperparams['lr'], clip_gradient=hyperparams['clip_gradient'], discount=hyperparams['discount'], min_buffer_size=hyperparams['min_buffer_size'], n_train_steps=hyperparams['n_train_steps'], target_update_freq=hyperparams['target_update_freq'], buffer_batch_size=hyperparams['buffer_batch_size']) set_gpu_mode(False) torch.set_num_threads(1) if torch.cuda.is_available(): set_gpu_mode(True) algo.to() trainer.setup(algo, env) trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) env.close()
def test_pearl_ml1_push(self): """Test PEARL with ML1 Push environment.""" params = dict(seed=1, num_epochs=1, num_train_tasks=5, latent_size=7, encoder_hidden_sizes=[10, 10, 10], net_size=30, meta_batch_size=16, num_steps_per_epoch=40, num_initial_steps=40, num_tasks_sample=15, num_steps_prior=15, num_extra_rl_steps_posterior=15, batch_size=256, embedding_batch_size=8, embedding_mini_batch_size=8, reward_scale=10., use_information_bottleneck=True, use_next_obs_in_context=False, use_gpu=False) net_size = params['net_size'] set_seed(params['seed']) # create multi-task environment and sample tasks ml1 = metaworld.ML1('push-v1') train_env = MetaWorldSetTaskEnv(ml1, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(params['num_train_tasks']) test_env = MetaWorldSetTaskEnv(ml1, 'test') test_env_sampler = SetTaskSampler( MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size']) qf = ContinuousMLPQFunction( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf') vf = ContinuousMLPQFunction( env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=params['num_train_tasks'], latent_dim=params['latent_size'], encoder_hidden_sizes=params['encoder_hidden_sizes'], test_env_sampler=test_env_sampler, meta_batch_size=params['meta_batch_size'], num_steps_per_epoch=params['num_steps_per_epoch'], num_initial_steps=params['num_initial_steps'], num_tasks_sample=params['num_tasks_sample'], num_steps_prior=params['num_steps_prior'], num_extra_rl_steps_posterior=params[ 'num_extra_rl_steps_posterior'], batch_size=params['batch_size'], embedding_batch_size=params['embedding_batch_size'], embedding_mini_batch_size=params['embedding_mini_batch_size'], reward_scale=params['reward_scale'], ) set_gpu_mode(params['use_gpu'], gpu_id=0) if params['use_gpu']: pearl.to() trainer = Trainer(snapshot_config) trainer.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, n_workers=1, worker_class=PEARLWorker) trainer.train(n_epochs=params['num_epochs'], batch_size=params['batch_size'])
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """Train MTSAC with MT10 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). n_tasks (int): Number of tasks to use. Should be a multiple of 10. timesteps (int): Number of timesteps to run. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) mt10 = metaworld.MT10() # pylint: disable=no-member mt10_test = metaworld.MT10() # pylint: disable=no-member # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): return normalize(env, normalize_reward=True) train_task_sampler = MetaWorldTaskSampler(mt10, 'train', wrap, add_env_onehot=True) test_task_sampler = MetaWorldTaskSampler(mt10_test, 'train', add_env_onehot=True) assert n_tasks % 10 == 0 assert n_tasks <= 500 mt10_train_envs = train_task_sampler.sample(n_tasks) env = mt10_train_envs[0]() mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)] policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) meta_batch_size = 10 sampler = LocalSampler( agents=policy, envs=mt10_train_envs, max_episode_length=env.spec.max_episode_length, # 1 sampler worker for each environment n_workers=meta_batch_size, worker_class=FragmentWorker, # increasing n_envs increases the vectorization of a sampler worker # which improves runtime performance, but you will need to adjust this # depending on your memory constraints. For reference, each worker by # default uses n_envs=8. Each environment is approximately ~50mb large # so creating 50 envs with 8 copies comes out to 20gb of memory. Many # users want to be able to run multiple seeds on 1 machine, so I have # reduced this to n_envs = 2 for 2 copies in the meantime. worker_args=dict(n_envs=2)) batch_size = int(env.spec.max_episode_length * meta_batch_size) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=env.spec.max_episode_length, eval_env=mt10_test_envs, env_spec=env.spec, num_tasks=10, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() trainer.setup(algo=mtsac, env=mt10_train_envs) trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt10(ctxt=None, *, experiment_name, config_pth, seed, use_wandb, gpu): """Train MTSAC with MT10 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). timesteps (int): Number of timesteps to run. """ print(f"Initiation took {time()-t0:.2f} secs") device = torch.device("cuda") if gpu else torch.device("cpu") print(f"Using GPU: {gpu}, Device: {device}") # maybe overring other things - this is required, why? if gpu: set_gpu_mode(True) else: set_gpu_mode(False) # Get experiment parameters (e.g. hyperparameters) and save the json file params = get_params(config_pth) with open(ctxt.snapshot_dir + "/params.json", "w") as json_file: json.dump(params, json_file) if use_wandb == "True": use_wandb = True wandb.init( name=experiment_name, project="mt10_debug", group="Baselines{}".format("mt10"), reinit=True, config=params, ) else: use_wandb = False num_tasks = params["net"]["num_tasks"] timesteps = 15000000 deterministic.set_seed(seed) trainer = Trainer(ctxt) # Note: different classes whether it uses 10 or 50 tasks. Why? if num_tasks <= 10: mt_env = metaworld.MT10() else: mt_env = metaworld.MT50() train_task_sampler = MetaWorldTaskSampler(mt_env, "train", add_env_onehot=True) assert num_tasks % 10 == 0, "Number of tasks have to divisible by 10" assert num_tasks <= 500, "Number of tasks should be less or equal 500" mt_train_envs = train_task_sampler.sample(num_tasks) env = mt_train_envs[0]() params["net"]["policy_min_std"] = np.exp( params["net"]["policy_min_log_std"]) params["net"]["policy_max_std"] = np.exp( params["net"]["policy_max_log_std"]) policy = create_policy_net(env_spec=env.spec, net_params=params["net"]) print("Created policy") qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"]) qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"]) print("Created value functions") replay_buffer = PathBuffer(capacity_in_transitions=int( params["general_setting"]["num_buffer_transitions"])) max_episode_length = env.spec.max_episode_length # Note: are the episode length the same among all tasks? sampler = RaySampler( agents=policy, envs=mt_train_envs, max_episode_length=max_episode_length, cpus_per_worker=params["sampler"]["cpus_per_worker"], gpus_per_worker=params["sampler"]["gpus_per_worker"], seed=None, # set to get_seed() to make it deterministic ) # will probably still need the sampler test_sampler = sampler # test_sampler = RaySampler( # agents=policy, # envs=mt_train_envs, # max_episode_length=max_episode_length, # # 1 sampler worker for each environment # n_workers=num_tasks, # worker_class=EvalWorker # ) # Note: difference between sampler and test sampler is only the worker # difference is one line in EvalWorker, uses average: a = agent_info["mean"] # can we create a unified worker that cointais both rules? # Number of transitions before a set of gradient updates # Note: should we use avg episode length, if they are not same for all tasks? batch_size = int(max_episode_length * num_tasks) # TODO: this whole block seems unnecessary, it is not doing anything. 
# Number of times policy is evaluated (also the # of epochs) num_evaluation_points = timesteps // batch_size epochs = timesteps // batch_size # number of times new batch of samples + gradient updates are done per epoch epoch_cycles = epochs // num_evaluation_points # this will always be equal to 1 epochs = epochs // epoch_cycles mtsac = CustomMTSAC( env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, replay_buffer=replay_buffer, sampler=sampler, train_task_sampler=train_task_sampler, test_sampler=test_sampler, gradient_steps_per_itr=1, num_tasks=num_tasks, steps_per_epoch=epoch_cycles, min_buffer_size=max_episode_length * num_tasks, target_update_tau=params["training"]["target_update_tau"], discount=params["general_setting"]["discount"], buffer_batch_size=params["training"]["buffer_batch_size"], policy_lr=params["training"]["policy_lr"], qf_lr=params["training"]["qf_lr"], reward_scale=params["training"]["reward_scale"], num_evaluation_episodes=params["general_setting"]["eval_episodes"], task_update_frequency=params["training"]["task_update_frequency"], wandb_logging=use_wandb, evaluation_frequency=params["general_setting"]["evaluation_frequency"], ) print("Created algo") mtsac.to(device=device) print("Moved networks to device") trainer.setup(algo=mtsac, env=mt_train_envs) print("Setup trainer") trainer.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """Train MTSAC with MT10 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). n_tasks (int): Number of tasks to use. Should be a multiple of 10. timesteps (int): Number of timesteps to run. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) mt10 = metaworld.MT10() # pylint: disable=no-member mt10_test = metaworld.MT10() # pylint: disable=no-member # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): return normalize(env, normalize_reward=True) train_task_sampler = MetaWorldTaskSampler(mt10, 'train', wrap, add_env_onehot=True) test_task_sampler = MetaWorldTaskSampler(mt10_test, 'train', add_env_onehot=True) assert n_tasks % 10 == 0 assert n_tasks <= 500 mt10_train_envs = train_task_sampler.sample(n_tasks) env = mt10_train_envs[0]() mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)] policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) meta_batch_size = 10 # ray.init(local_mode=True, log_to_driver=False, ignore_reinit_error=True) sampler = SingleVecWorkSampler( agents=policy, envs=mt10_train_envs, n_workers=meta_batch_size, max_episode_length=env.spec.max_episode_length, # 1 sampler worker for each environment worker_class=FragmentWorker, # increasing n_envs increases the vectorization of a sampler worker # which improves runtime performance, but you will need to adjust this # depending on your memory constraints. For reference, each worker by # default uses n_envs=8. Each environment is approximately ~50mb large # so creating 50 envs with 8 copies comes out to 20gb of memory. Many # users want to be able to run multiple seeds on 1 machine, so I have # reduced this to n_envs = 2 for 2 copies in the meantime. 
worker_args=dict(n_envs=10)) # one episode for each task between gradient steps batch_size = int(env.spec.max_episode_length * meta_batch_size) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=env.spec.max_episode_length, eval_env=mt10_test_envs, env_spec=env.spec, num_tasks=10, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) trainer.setup(algo=mtsac, env=mt10_train_envs) import time s = time.time() mtsac.to() print(time.time() - s) s = time.time() # trainer.step_episode = trainer.obtain_samples(0, 1500, None, None) trainer.step_episode = trainer.obtain_samples(0, 2000, None, None) print((time.time() - s)) a = 2 from garage import StepType path_returns = [] for path in trainer.step_episode: mtsac.replay_buffer.add_path( dict(observation=path['observations'], action=path['actions'], reward=path['rewards'].reshape(-1, 1), next_observation=path['next_observations'], terminal=np.array([ step_type == StepType.TERMINAL for step_type in path['step_types'] ]).reshape(-1, 1))) path_returns.append(sum(path['rewards'])) s = time.time() for _ in range(10): trainer._algo.train_once() print((time.time() - s) / 10)
def pearl_half_cheetah_vel(ctxt=None, seed=1, num_epochs=500, num_train_tasks=100, num_test_tasks=30, latent_size=5, encoder_hidden_size=200, net_size=300, meta_batch_size=16, num_steps_per_epoch=2000, num_initial_steps=2000, num_tasks_sample=5, num_steps_prior=400, num_extra_rl_steps_posterior=600, batch_size=256, embedding_batch_size=100, embedding_mini_batch_size=100, max_path_length=200, reward_scale=5., use_gpu=False): """Train PEARL with HalfCheetahVel environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. num_epochs (int): Number of training epochs. num_train_tasks (int): Number of tasks for training. num_test_tasks (int): Number of tasks for testing. latent_size (int): Size of latent context vector. encoder_hidden_size (int): Output dimension of dense layer of the context encoder. net_size (int): Output dimension of a dense layer of Q-function and value function. meta_batch_size (int): Meta batch size. num_steps_per_epoch (int): Number of iterations per epoch. num_initial_steps (int): Number of transitions obtained per task before training. num_tasks_sample (int): Number of random tasks to obtain data for each iteration. num_steps_prior (int): Number of transitions to obtain per task with z ~ prior. num_extra_rl_steps_posterior (int): Number of additional transitions to obtain per task with z ~ posterior that are only used to train the policy and NOT the encoder. batch_size (int): Number of transitions in RL batch. embedding_batch_size (int): Number of transitions in context batch. embedding_mini_batch_size (int): Number of transitions in mini context batch; should be same as embedding_batch_size for non-recurrent encoder. max_path_length (int): Maximum path length. reward_scale (int): Reward scale. use_gpu (bool): Whether or not to use GPU for training. 
""" set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) # create multi-task environment and sample tasks env_sampler = SetTaskSampler( lambda: GarageEnv(normalize(HalfCheetahVelEnv()))) env = env_sampler.sample(num_train_tasks) test_env_sampler = SetTaskSampler( lambda: GarageEnv(normalize(HalfCheetahVelEnv()))) runner = LocalRunner(ctxt) # instantiate networks augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, max_path_length=max_path_length, reward_scale=reward_scale, ) set_gpu_mode(use_gpu, gpu_id=0) if use_gpu: pearl.to() runner.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, sampler_args=dict(max_path_length=max_path_length), n_workers=1, worker_class=PEARLWorker) runner.train(n_epochs=num_epochs, batch_size=batch_size)