def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    # ensemble of MLP forward-dynamics models
    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'],
    )

    # MPC policy that plans over the learned dynamics ensemble
    policy = MPCController(
        name="policy",
        env=env,
        dynamics_model=dynamics_model,
        discount=vv['discount'],
        n_candidates=vv['n_candidates'],
        horizon=vv['horizon'],
    )

    algo = ModelMPCBatchPolopt(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        batch_size_env_samples=vv['batch_size_env_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_max_epochs=vv['dynamic_model_epochs'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reinit_model_cycle=vv['reinit_model_cycle'],
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        parallel_sampler=vv['parallel_sampler'],
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = vv['policy'](
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(max_epochs=vv['max_epochs'])

    algo = vv['algo'](
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        target_inner_step=vv['target_inner_step'],
        init_kl_penalty=vv['init_kl_penalty'],
        optimizer_args=optimizer_args,
    )
    algo.train()
def run_train_task(vv): env = TfEnv(normalize(vv['env']( log_scale_limit=vv['log_scale_limit'], target_velocity=vv['target_velocity'], ))) dynamics_model = MLPDynamicsEnsemble( name="dyn_model", env_spec=env.spec, hidden_sizes=vv['hidden_sizes_model'], weight_normalization=vv['weight_normalization_model'], num_models=vv['num_models'], optimizer=vv['optimizer_model'], valid_split_ratio=vv['valid_split_ratio'], rolling_average_persitency=vv['rolling_average_persitency'] ) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=vv['hidden_sizes_policy'], hidden_nonlinearity=vv['hidden_nonlinearity_policy'], grad_step_size=vv['fast_lr'], trainable_step_size=vv['trainable_step_size'], bias_transform=vv['bias_transform'], num_tasks=vv['meta_batch_size'] ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = ModelMAMLTRPO( env=env, policy=policy, dynamics_model=dynamics_model, baseline=baseline, n_itr=vv['n_itr'], n_iter=vv['n_itr'], batch_size_env_samples=vv['batch_size_env_samples'], batch_size_dynamics_samples=vv['batch_size_dynamics_samples'], meta_batch_size=vv['meta_batch_size'], initial_random_samples=vv['initial_random_samples'], num_maml_steps_per_iter=vv['num_maml_steps_per_iter'], reset_from_env_traj=vv.get('reset_from_env_traj', False), max_path_length_env=vv['path_length_env'], max_path_length_dyn=vv.get('path_length_dyn', None), dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)), discount=vv['discount'], step_size=vv["meta_step_size"], num_grad_updates=1, retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'], reset_policy_std=vv['reset_policy_std'], reinit_model_cycle=vv['reinit_model_cycle'], frac_gpu=vv.get('frac_gpu', 0.85), log_real_performance=True, clip_obs=vv.get('clip_obs', True), entropy_bonus=vv['entropy_bonus'], tailored_exploration=vv['tailored_exploration'] ) algo.train()
def run_eval_task(vv):
    # load policy and baseline - warning: this resets the tf graph;
    # it also returns the TensorFlow session, which must be used in the subsequent code
    baseline, env, params_pickle_file = eval.load_baseline_and_env(vv)

    tf.reset_default_graph()

    # fix the mujoco parameters
    env_class = eval.get_env_class(env)
    env = TfEnv(normalize(env_class(
        log_scale_limit=vv["log_scale_limit"],
        fix_params=True,
        random_seed=vv['env_param_seed'],
    )))

    step_size = vv['fast_lr']
    policy = None

    algo = VPG(
        env=env,
        policy=policy,
        load_policy=params_pickle_file,
        baseline=baseline,
        batch_size=20000,
        max_path_length=100,
        n_itr=5,
        optimizer_args={
            'init_learning_rate': step_size,
            'tf_optimizer_args': {'learning_rate': step_size},
            'tf_optimizer_cls': tf.train.GradientDescentOptimizer,
        },
    )
    algo.train()
def run_train_task(vv):
    import sys
    print(vv['exp_prefix'])

    # redirect stdout to a per-experiment log file
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'],
    )

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        clip_obs=vv.get('clip_obs', True),
    )
    algo.train()

    sysout_log_file.close()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    # deliberately degraded dynamics ensemble (biased, noisy predictions)
    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        optimizer=vv['optimizer_model'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        param_noise_std=vv['param_noise_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv['reset_from_env_traj'],
        max_path_length_env=vv['path_length_env'],
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        resample_output_bias=vv['resample_output_bias'],
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
        reward_type=vv['reward_type'],
        init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
        init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
        puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
        puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
    )))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_args = dict(
        max_epochs=vv['max_epochs'],
        batch_size=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
    )

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        parallel_sampler=vv['parallel_sampler'],
        optimizer_args=optimizer_args,
    )
    algo.train()
def run_train_task(vv):
    import sys
    print(vv['exp_prefix'])

    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    sys.stdout = sysout_log_file

    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'],
    )

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        initial_random_samples=vv['initial_random_samples'],
        num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
    )
    algo.train()

    sysout_log_file.close()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = BadDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        output_bias_range=vv['output_bias_range'],
        gaussian_noise_output_std=vv['output_noise_std'],
    )

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_epochs=vv['dynamic_model_epochs'],
        num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        resample_output_bias=vv['resample_output_bias'],
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env']()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        num_tasks=vv['meta_batch_size'],
        hidden_sizes=vv['hidden_sizes'],
        grad_step_size=vv['fast_lr'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLPPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['fast_batch_size'],  # number of trajs for grad update
        max_path_length=vv['path_length'],
        meta_batch_size=vv['meta_batch_size'],
        num_grad_updates=vv['num_grad_updates'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        entropy_bonus=vv['entropy_bonus'],
        clip_eps=vv['clip_eps'],
        clip_outer=vv['clip_outer'],
        target_outer_step=vv['target_outer_step'],
        target_inner_step=vv['target_inner_step'],
        init_outer_kl_penalty=vv['init_outer_kl_penalty'],
        init_inner_kl_penalty=vv['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=vv['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=vv['adaptive_inner_kl_penalty'],
        max_epochs=vv['max_epochs'],
        num_batches=vv['num_batches'],
        tf_optimizer_args=dict(learning_rate=vv['outer_lr']),
        parallel_sampler=vv['parallel_sampler'],
        multi_adam=vv['multi_adam'],
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env'](
        init_sampling_boundaries=vv['point_env_setup']['init_sampling_boundaries'],
        goal=vv['point_env_setup']['goal'],
    )))

    dynamics_model = PointEnvFakeModelEnsemble(
        env_spec=env.spec,
        num_models=vv['num_models'],
        error_range_around_goal=vv['fake_model_setup']['error_range_around_goal'],
        bias_range=vv['fake_model_setup']['bias_range'],
        error_std=vv['fake_model_setup']['error_std'],
        goal=vv['point_env_setup']['goal'],
        error_at_goal=vv['fake_model_setup']['error_at_goal'],
        smooth_error=vv['smooth_error'],
    )

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_policy'],
        hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        grad_step_size=vv['fast_lr'],
        trainable_step_size=vv['trainable_step_size'],
        bias_transform=vv['bias_transform'],
        num_tasks=vv['meta_batch_size'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ModelMAMLTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        n_itr=vv['n_itr'],
        n_iter=vv['n_itr'],
        batch_size_env_samples=vv['batch_size_env_samples'],
        batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
        meta_batch_size=vv['meta_batch_size'],
        initial_random_samples=vv['initial_random_samples'],
        num_maml_steps_per_iter=vv['num_maml_steps_per_iter'],
        reset_from_env_traj=vv.get('reset_from_env_traj', False),
        max_path_length_env=vv['path_length_env'],
        max_path_length_dyn=vv.get('path_length_dyn', None),
        dynamic_model_max_epochs=vv.get('dynamic_model_max_epochs', (500, 500)),
        discount=vv['discount'],
        step_size=vv["meta_step_size"],
        num_grad_updates=1,
        retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
        reset_policy_std=vv['reset_policy_std'],
        reinit_model_cycle=vv['reinit_model_cycle'],
        frac_gpu=vv.get('frac_gpu', 0.85),
        log_real_performance=True,
        clip_obs=vv.get('clip_obs', True),
        entropy_bonus=vv['entropy_bonus'],
    )
    algo.train()
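# ---------------------------------------------------------------------------
# Hypothetical launcher sketch (not part of the original scripts): each
# run_train_task / run_eval_task above expects a single variant dict `vv`.
# In rllab-style experiment scripts such functions are usually driven by
# VariantGenerator and run_experiment_lite, roughly as sketched below.
# Assumptions: a recent rllab fork whose run_experiment_lite accepts a plain
# callable plus a `variant` dict; the variant keys shown are illustrative
# placeholders, not the full set required by any particular task above.
# ---------------------------------------------------------------------------
from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add('seed', [1, 2, 3])        # one experiment per seed
vg.add('discount', [0.99])
vg.add('n_itr', [100])
vg.add('path_length', [200])

for v in vg.variants():
    run_experiment_lite(
        run_train_task,          # invoked as run_train_task(vv) with vv == v
        exp_prefix='example_exp',
        seed=v['seed'],
        snapshot_mode='last',
        mode='local',
        use_cloudpickle=True,    # serialize the raw callable
        variant=v,
    )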