def run_experiment(**kwargs):
    """Assemble a TRPO-MAML training pipeline from ``kwargs`` and train it.

    The logger is configured to write into ``<cwd>/data/<EXP_NAME>`` and the
    full ``kwargs`` mapping is dumped to ``params.json`` in that directory
    before any component is built.

    Keys read from ``kwargs``:
        seed, baseline, env, meta_batch_size, hidden_sizes, learn_std,
        hidden_nonlinearity, output_nonlinearity, rollouts_per_meta_task,
        max_path_length, parallel, discount, gae_lambda, normalize_adv,
        positive_adv, step_size, experiment_tuple, inner_lr,
        num_inner_grad_steps, n_itr.

    ``kwargs['baseline']`` and ``kwargs['env']`` are called without arguments
    to create the baseline and the environment. ``kwargs['experiment_tuple']``
    is indexed at ``[1]`` (passed as ``inner_type``) and ``[2]`` (passed as
    ``exploration``).

    Raises:
        KeyError: if any of the keys above is missing from ``kwargs``.
    """
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(
        dir=exp_dir,
        format_strs=['stdout', 'log', 'csv'],
        snapshot_mode='last_gap',
        snapshot_gap=50,
    )

    # Use a context manager so the params file is flushed and closed
    # before training starts, instead of leaking the handle.
    with open(exp_dir + '/params.json', 'w') as params_file:
        json.dump(kwargs, params_file, indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=1,
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['experiment_tuple'][1],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['experiment_tuple'][2],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
True, # whether to learn the standard deviation of the gaussian policy # ProMP config 'inner_lr': 0.1, # adaptation step size 'learning_rate': 1e-3, # meta-policy gradient step size 'num_promp_steps': 5, # number of ProMp steps without re-sampling 'clip_eps': 0.3, # clipping range 'target_inner_step': 0.01, 'init_inner_kl_penalty': 5e-4, 'adaptive_inner_kl_penalty': False, # whether to use an adaptive or fixed KL-penalty coefficient 'n_itr': 1001, # number of overall training iterations 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps } # configure logger logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last_gap') # dump run configuration before starting training json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) # start the actual algorithm main(config)
def main(args=None):
    """Parse arguments, build the ProMP training pipeline and train it.

    ``args`` is forwarded to ``parse_args``; the parsed result supplies every
    hyper-parameter collected in ``config``. The logger is configured to
    write into ``args.dump_path`` and ``config`` is dumped to ``params.json``
    in that directory before any component is built.

    Args:
        args: value handed to ``parse_args`` (defaults to ``None``).
            NOTE(review): presumably ``None`` means "read from the command
            line" -- confirm against ``parse_args``.

    Raises:
        Exception: if the environment's action space is neither a
            ``gym.spaces.Box`` nor a ``gym.spaces.Discrete``.
    """
    args = parse_args(args)

    config = {
        'seed': args.seed,
        'baseline': 'LinearFeatureBaseline',
        'env': 'ReachWorld',  # not used
        'rollouts_per_meta_task': args.rollout_per_meta_task,
        'max_path_length': args.max_path_length,  # 100
        'parallel': not args.seq,
        'discount': args.discount,
        'gae_lambda': args.gae_lambda,
        'normalize_adv': True,
        'hidden_sizes': args.hidden_sizes,
        'inner_lr': args.inner_lr,  # adaptation step size
        'learning_rate': args.learning_rate,  # meta-policy gradient step size
        'num_promp_steps': args.num_promp_steps,  # number of ProMp steps without re-sampling
        'clip_eps': args.clip_eps,  # clipping range
        'target_inner_step': args.target_inner_step,
        'init_inner_kl_penalty': args.init_inner_kl_penalty,
        # whether to use an adaptive or fixed KL-penalty coefficient
        'adaptive_inner_kl_penalty': args.adaptive_inner_kl_penalty,
        'n_itr': args.n_itr,  # number of overall training iterations
        'meta_batch_size': args.meta_batch_size,  # number of sampled meta-tasks per iterations
        'num_inner_grad_steps': args.num_inner_grad_steps,  # number of inner / adaptation gradient steps
    }

    # configure logger
    logger.configure(
        dir=args.dump_path,
        format_strs=['stdout', 'log', 'csv'],
        snapshot_mode='last_gap',
    )

    # dump run configuration before starting training; the context manager
    # guarantees the file is flushed and closed rather than leaking the handle
    with open(args.dump_path + '/params.json', 'w') as params_file:
        json.dump(config, params_file, cls=ClassEncoder)

    set_seed(config['seed'])

    # instantiate baseline: the class is looked up by name among module globals
    baseline = globals()[config['baseline']]()

    env = get_env()
    # env = normalize(env)  # apply normalize wrapper to env (currently disabled)

    # Box spaces contribute the product of their shape; Discrete spaces
    # contribute their number of actions.
    if isinstance(env.action_space, gym.spaces.Box):
        action_dim = np.prod(env.action_space.shape)
    elif isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n
    else:
        raise Exception('unknown action space, cannot get action dim')

    policy = MetaCategoricalMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=action_dim,
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()