def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: The tabular log file path.
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(env)

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            plot=False,
            target_update_tau=params['tau'],
            n_epochs=params['n_epochs'],
            n_epoch_cycles=params['n_epoch_cycles'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(StdOutput())
        logger.add_output(CsvOutput(tabular_log_file))
        logger.add_output(TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(
            n_epochs=params['n_epochs'],
            n_epoch_cycles=params['n_epoch_cycles'],
            batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
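# A minimal sketch of how run_garage above might be driven. The `params` dict
# and the FetchReach-v1 environment are illustrative assumptions, not the
# benchmark's actual configuration; run_garage only requires that `params`
# defines the keys referenced in its body, that the environment is
# goal-conditioned (it passes env.compute_reward to HerReplayBuffer), and that
# the garage imports used inside run_garage are already in scope.
import tempfile

import gym

params = {
    'sigma': 0.2,                       # OU exploration noise scale
    'policy_hidden_sizes': (256, 256, 256),
    'qf_hidden_sizes': (256, 256, 256),
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 100,             # also used as HER time_horizon and batch_size
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'tau': 0.05,
    'n_epochs': 50,
    'n_epoch_cycles': 20,
    'n_train_steps': 40,
    'discount': 0.9,
}

if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as log_dir:
        run_garage(gym.make('FetchReach-v1'), seed=1, log_dir=log_dir)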
def pearl_half_cheetah_vel(ctxt=None,
                           seed=1,
                           num_epochs=500,
                           num_train_tasks=100,
                           num_test_tasks=30,
                           latent_size=5,
                           encoder_hidden_size=200,
                           net_size=300,
                           meta_batch_size=16,
                           num_steps_per_epoch=2000,
                           num_initial_steps=2000,
                           num_tasks_sample=5,
                           num_steps_prior=400,
                           num_extra_rl_steps_posterior=600,
                           batch_size=256,
                           embedding_batch_size=100,
                           embedding_mini_batch_size=100,
                           max_path_length=200,
                           reward_scale=5.,
                           use_gpu=False):
    """Train PEARL with HalfCheetahVel environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # Create multi-task environment and sample tasks.
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # Instantiate networks.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def test_singleton_pool(self):
    max_cpus = 8
    with LocalRunner(max_cpus=max_cpus):
        assert max_cpus == singleton_pool.n_parallel, (
            'LocalRunner(max_cpu) should set up singleton_pool.')
def test_external_sess(self):
    with tf.Session() as sess:
        with LocalRunner(sess=sess):
            pass
        # sess should still be the default session here.
        tf.no_op().run()
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # count = 1
    th = 1.8
    g_max = 0.05
    star_version = args.IS_MBPG_star

    if args.env == 'CartPole':
        # CartPole
        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        # grad_factor = 5
        grad_factor = 100
        th = 1.2

        # # batch size: 1
        # lr = 0.1
        # w = 1.5
        # c = 15

        # batch size: 50
        lr = 0.75
        c = 1
        w = 1

        # for MBPG+:
        # lr = 1.2
        # g_max = 0.03

        discount = 0.995
        path = './init/CartPole_policy.pth'

    if args.env == 'Walker':
        # Walker2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        th = 1.2
        n_timestep = 1e7
        n_counts = 5

        lr = 0.75
        w = 2
        c = 5
        grad_factor = 10
        # for MBPG+:
        # lr = 0.9

        discount = 0.999
        name = 'Walk'
        path = './init/Walk_policy.pth'

    if args.env == 'Hopper':
        # Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5

        lr = 0.75
        w = 1
        c = 3
        grad_factor = 10
        g_max = 0.15

        discount = 0.999
        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 10000
        # batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5

        lr = 0.6
        w = 3
        c = 7
        grad_factor = 10
        th = 1.2
        g_max = 0.06

        discount = 0.999
        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    for i in range(n_counts):
        print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)
        policy.load_state_dict(torch.load(path))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_IM(
            env_spec=env.spec,
            env=env,
            env_name=name,
            policy=policy,
            baseline=baseline,
            max_path_length=max_length,
            discount=discount,
            grad_factor=grad_factor,
            policy_lr=lr,
            c=c,
            w=w,
            n_timestep=n_timestep,
            # count=count,
            th=th,
            batch_size=batch_size,
            center_adv=True,
            g_max=g_max,
            # decay_learning_rate=d_lr,
            star_version=star_version)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
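# run_task above relies on module-level globals: `args` (with .env and
# .IS_MBPG_star) and the MBPG_IM algorithm class. A hypothetical command-line
# setup that would satisfy those references might look like this; the flag
# names mirror the attributes used above, everything else is an assumption.
import argparse

parser = argparse.ArgumentParser(description='MBPG / MBPG+ launcher')
parser.add_argument('--env', default='CartPole',
                    choices=['CartPole', 'Walker', 'Hopper', 'HalfCheetah'])
parser.add_argument('--IS_MBPG_star', action='store_true',
                    help='Run the MBPG* (star) variant of the algorithm.')
args = parser.parse_args()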
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the GPU (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        train_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
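# Worked example of the epoch bookkeeping in mtsac_metaworld_mt50 above,
# assuming mt50_train_envs.num_tasks == 50 (one environment per MT50 task).
# This is a standalone sketch; the helper name is ours and none of the garage
# objects are needed.
def _mt50_epoch_arithmetic(num_tasks=50, timesteps=100_000_000,
                           num_evaluation_points=500):
    batch_size = int(150 * num_tasks)               # 7,500 transitions per cycle
    epochs = timesteps // batch_size                # 13,333 raw cycles
    epoch_cycles = epochs // num_evaluation_points  # 26 cycles per evaluation point
    epochs = epochs // epoch_cycles                 # 512 training epochs
    return epochs, epoch_cycles, batch_size


# runner.train() therefore runs ~512 epochs of 26 cycles each, giving roughly
# the 500 evaluation points requested by num_evaluation_points.
assert _mt50_epoch_arithmetic() == (512, 26, 7500)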
def diayn_point_mass_multigoal(ctxt=None, seed=1):
    """Train DIAYN on the 2D multi-goal point-mass environment."""
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = MultiGoalEnv()
    skills_num = 6

    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=500,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable the video recording function
        is_gym_render=False,
        media_save_path='diayn_2d_multigoal/',
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)

    # Use the GPU whenever one is available.
    tu.set_gpu_mode(torch.cuda.is_available())
    diayn.to()

    worker_args = {'skills_num': skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment."""
    params = dict(seed=1,
                  num_epochs=1,
                  num_train_tasks=5,
                  num_test_tasks=1,
                  latent_size=7,
                  encoder_hidden_sizes=[10, 10, 10],
                  net_size=30,
                  meta_batch_size=16,
                  num_steps_per_epoch=40,
                  num_initial_steps=40,
                  num_tasks_sample=15,
                  num_steps_prior=15,
                  num_extra_rl_steps_posterior=15,
                  batch_size=256,
                  embedding_batch_size=8,
                  embedding_mini_batch_size=8,
                  max_path_length=50,
                  reward_scale=10.,
                  use_information_bottleneck=True,
                  use_next_obs_in_context=False,
                  use_gpu=False)

    net_size = params['net_size']
    set_seed(params['seed'])
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
    qf = ContinuousMLPQFunction(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
    vf = ContinuousMLPQFunction(
        env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        encoder_hidden_sizes=params['encoder_hidden_sizes'],
        test_env_sampler=test_env_sampler,
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params[
            'num_extra_rl_steps_posterior'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(
        algo=pearl,
        env=env[0](),
        sampler_cls=LocalSampler,
        sampler_args=dict(max_path_length=params['max_path_length']),
        n_workers=1,
        worker_class=PEARLWorker)

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def kant_cheetah_hurdle(
        ctxt=None,
        seed=seed,
        num_skills=skills_num,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        is_encoder_recurrent=False,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        num_skills_sample=param_num_skills_sample,
        num_skills_reason_steps=param_num_skills_reason_steps,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        skills_reason_reward_scale=param_skills_reason_reward_scale,
        tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
        use_gpu=param_use_gpu):
    """Train Kant on the HalfCheetah hurdle environment."""
    # One training task per skill is assumed; compare values, not identity.
    assert num_train_tasks == skills_num
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahEnv_Hurdle()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    runner = LocalRunner(ctxt)

    qf_env = Kant.get_env_spec(env[0](), latent_size, num_skills, 'qf')
    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = Kant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = Kant.get_env_spec(env[0](),
                                              latent_size,
                                              module='controller_policy',
                                              num_skills=num_skills)
    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    kant = Kant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        kant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=kant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns