def main(config):
    set_seed(config['seed'])
    # experiment_log_dir = setup_logger(config['env'], variant=config, exp_id=None,
    #                                   base_log_dir=config['base_log_dir'])

    baseline = globals()[config['baseline']]()  # instantiate baseline
    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
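# Hedged sketch (not from the original scripts): an illustrative config dict containing
# exactly the keys main() above reads. The concrete values and the env/baseline class
# names are assumptions for demonstration; 'env' and 'baseline' must resolve to classes
# importable in this module, since they are looked up via globals().
example_config = {
    'seed': 1,
    'baseline': 'LinearFeatureBaseline',
    'env': 'HalfCheetahRandDirecEnv',      # assumed env class name
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'hidden_sizes': (64, 64),
    'inner_lr': 0.1,                       # adaptation step size
    'learning_rate': 1e-3,                 # meta-policy gradient step size
    'num_promp_steps': 5,                  # ProMP steps without re-sampling
    'clip_eps': 0.3,                       # PPO clipping range
    'target_inner_step': 0.01,
    'init_inner_kl_penalty': 5e-4,
    'adaptive_inner_kl_penalty': False,    # adaptive vs. fixed KL-penalty coefficient
    'n_itr': 1001,                         # number of overall training iterations
    'meta_batch_size': 40,                 # sampled meta-tasks per iteration
    'num_inner_grad_steps': 1,             # inner / adaptation gradient steps
}
# main(example_config)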
def main(config):
    # config['seed'] = 4
    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)
    # experiment.log_parameter("task limit size", 3)
    # experiment.log_metric("seed", config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline
    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    # load a previously pickled policy instead of constructing a fresh one
    with open('/saved_policies/mjvel.policy', 'rb') as policy_file:
        policy = pickle.load(policy_file)
    print("policy loaded")

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = RLTrainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        experiment=experiment,
    )

    trainer.train()
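# Hedged sketch (assumption, not part of the original scripts): the loader above expects
# a policy object serialized to '/saved_policies/mjvel.policy'. Assuming the policy
# object is picklable, a compatible file could be written like this; the function name
# and default path are illustrative only.
def save_policy(policy, path='/saved_policies/mjvel.policy'):
    import pickle
    with open(path, 'wb') as policy_file:
        pickle.dump(policy, policy_file)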
def main(config):
    set_seed(config['seed'])

    sess = tf.Session()
    with sess.as_default() as sess:
        # restore a snapshot produced by a previous training run
        data = joblib.load(load_path + "/params.pkl")
        policy = data['policy']
        env = data['env']
        baseline = data['baseline']
        # config['meta_batch_size'] = env.NUM_EVAL
        # policy.meta_batch_size = env.NUM_EVAL

        sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=config['rollouts_per_meta_task'],  # Will be modified later
            meta_batch_size=config['meta_batch_size'],
            max_path_length=config['max_path_length'],
            parallel=config['parallel'],
        )

        sample_processor = MetaSampleProcessor(
            baseline=baseline,
            discount=config['discount'],
            gae_lambda=config['gae_lambda'],
            normalize_adv=config['normalize_adv'],
        )

        algo = ProMP(
            policy=policy,
            inner_lr=config['inner_lr'],
            meta_batch_size=config['meta_batch_size'],
            num_inner_grad_steps=config['num_inner_grad_steps'],
            learning_rate=config['learning_rate'],
            num_ppo_steps=config['num_promp_steps'],
            clip_eps=config['clip_eps'],
            target_inner_step=config['target_inner_step'],
            init_inner_kl_penalty=config['init_inner_kl_penalty'],
            adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        )

        tester = Tester(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            # n_itr=config['n_itr'],
            eff=config['eff'],
            num_inner_grad_steps=config['num_inner_grad_steps'],
        )

        tester.train()

    sess.close()
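# Hedged sketch (assumption, not the repo's snapshot code): the Tester script above
# expects a joblib snapshot at <load_path>/params.pkl containing at least 'policy',
# 'env', and 'baseline'. A minimal way to produce a compatible file is shown here;
# the function name and dump_dir argument are illustrative.
def save_snapshot(dump_dir, policy, env, baseline):
    import os
    import joblib
    joblib.dump(
        {'policy': policy, 'env': env, 'baseline': baseline},
        os.path.join(dump_dir, 'params.pkl'),
    )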
def setUp(self):
    self.env = env = MetaPointEnv()

    self.baseline = baseline = LinearFeatureBaseline()

    self.policy = policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=10,
        hidden_sizes=(16, 16),
        learn_std=True,
        hidden_nonlinearity=tf.tanh,
        output_nonlinearity=None,
    )

    self.sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=2,
        meta_batch_size=10,
        max_path_length=50,
        parallel=False,
    )

    self.sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=0.99,
        gae_lambda=1.0,
        normalize_adv=True,
        positive_adv=False,
    )

    self.algo = ProMP(
        policy=policy,
        inner_lr=0.1,
        meta_batch_size=10,
        num_inner_grad_steps=2,
        learning_rate=1e-3,
        num_ppo_steps=5,
        num_minibatches=1,
        clip_eps=0.5,
        target_inner_step=2e-2,
        init_inner_kl_penalty=1e-3,
    )
def main(config):
    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline
    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    # let TensorFlow grow GPU memory on demand instead of reserving it all up front
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=config['keep_checkpoint_every_n_hours'],
        max_to_keep=config['max_checkpoints_to_keep'])
    save_path = os.path.join(args.dump_path, 'model.ckpt')  # relies on a module-level `args`

    if config['restore_path'] is not None:
        logger.log('Restoring parameters from {}'.format(config['restore_path']))
        saver.restore(sess, config['restore_path'])
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        save_steps=config['save_steps'],
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        sess=sess,
    )

    trainer.train()
def main(config):
    # config['seed'] = 4
    # experiment.set_name("pos task only, size = 15, logging vel")
    set_seed(config['seed'])
    # experiment.log_parameters(config)
    # experiment.log_parameter("task limit size", 3)
    # experiment.log_metric("seed", config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline
    env = globals()[config['env']]()  # instantiate env
    print("env: ", env.sample_tasks)

    # pin the environment to a fixed, hand-picked task set before wrapping it
    TASKSL1 = np.array([0, -0.3])
    env.set_tasks(TASKSL1)
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
def main(args=None):
    idx = int(time.time())
    args = parse_args(args)

    config = {
        'seed': args.seed,
        'baseline': 'LinearFeatureBaseline',
        'env': 'ReachWorld',  # not used
        'rollouts_per_meta_task': args.rollout_per_meta_task,
        'max_path_length': args.max_path_length,  # 100
        'parallel': not args.seq,
        'discount': args.discount,
        'gae_lambda': args.gae_lambda,
        'normalize_adv': True,
        'hidden_sizes': args.hidden_sizes,
        'inner_lr': args.inner_lr,  # adaptation step size
        'learning_rate': args.learning_rate,  # meta-policy gradient step size
        'num_promp_steps': args.num_promp_steps,  # number of ProMP steps without re-sampling
        'clip_eps': args.clip_eps,  # clipping range
        'target_inner_step': args.target_inner_step,
        'init_inner_kl_penalty': args.init_inner_kl_penalty,
        'adaptive_inner_kl_penalty': args.adaptive_inner_kl_penalty,  # whether to use an adaptive or fixed KL-penalty coefficient
        'n_itr': args.n_itr,  # number of overall training iterations
        'meta_batch_size': args.meta_batch_size,  # number of sampled meta-tasks per iteration
        'num_inner_grad_steps': args.num_inner_grad_steps,  # number of inner / adaptation gradient steps
    }

    # configure logger
    logger.configure(dir=args.dump_path,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap')

    # dump run configuration before starting training
    json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder)

    set_seed(config['seed'])

    baseline = globals()[config['baseline']]()  # instantiate baseline

    env = get_env()
    # env = normalize(env)  # apply normalize wrapper to env

    # support both continuous and discrete action spaces
    if isinstance(env.action_space, gym.spaces.Box):
        action_dim = np.prod(env.action_space.shape)
    elif isinstance(env.action_space, gym.spaces.Discrete):
        action_dim = env.action_space.n
    else:
        raise Exception('unknown action space, cannot get action dim')

    policy = MetaCategoricalMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=action_dim,
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
def main(config): experiment.set_name("short meta saving test") set_seed(config['seed']) experiment.log_parameters(config) baseline = globals()[config['baseline']]() #instantiate baseline env = terrainRLSim.getEnv(env_name=None, render=False) # env = normalize(env) # apply normalize wrapper to env policy = MetaGaussianMLPPolicy( name="meta-policy", obs_dim=np.prod((104, )), action_dim=np.prod((11, )), meta_batch_size=config['meta_batch_size'], hidden_sizes=config['hidden_sizes'], ) sampler = MetaSampler( env=('terrianrlSim', config['env']), policy=policy, rollouts_per_meta_task=config[ 'rollouts_per_meta_task'], # This batch_size is confusing meta_batch_size=config['meta_batch_size'], max_path_length=config['max_path_length'], parallel=config['parallel'], ) env = terrainRLSim.getEnv(env_name=config['env'], render=False) # env = globals()[config['env']]() # instantiate env env = normalize(env) # apply normalize wrapper to env print("env.observation_space.shape: ", env.observation_space.shape) print("env.action_space.shape: ", env.action_space.shape) sampler.set_env(env) sample_processor = MetaSampleProcessor( baseline=baseline, discount=config['discount'], gae_lambda=config['gae_lambda'], normalize_adv=config['normalize_adv'], ) algo = ProMP( policy=policy, inner_lr=config['inner_lr'], meta_batch_size=config['meta_batch_size'], num_inner_grad_steps=config['num_inner_grad_steps'], learning_rate=config['learning_rate'], num_ppo_steps=config['num_promp_steps'], clip_eps=config['clip_eps'], target_inner_step=config['target_inner_step'], init_inner_kl_penalty=config['init_inner_kl_penalty'], adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'], ) trainer = Trainer(algo=algo, policy=policy, env=env, sampler=sampler, sample_processor=sample_processor, n_itr=config['n_itr'], num_inner_grad_steps=config['num_inner_grad_steps'], experiment=experiment) trainer.train()
        parallel=params['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=params['discount'],
        gae_lambda=params['gae_lambda'],
        normalize_adv=params['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=params['inner_lr'],
        meta_batch_size=params['meta_batch_size'],
        num_inner_grad_steps=params['num_inner_grad_steps'],
        learning_rate=params['learning_rate'],
        num_ppo_steps=params['num_promp_steps'],
        clip_eps=params['clip_eps'],
        target_inner_step=params['target_inner_step'],
        init_inner_kl_penalty=params['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=params['adaptive_inner_kl_penalty'],
    )

    saver = tf.train.Saver()

    if args.restore_path is not None:
        logger.log('Restoring parameters from {}'.format(args.restore_path))
        saver.restore(sess, args.restore_path)
        logger.log('Restored')

    uninit_vars = [
        var for var in tf.global_variables()