def main(config):
    """Assemble and run a ProMP meta-RL training loop from a config dict."""
    set_seed(config['seed'])

    # The config names the baseline/env classes; look them up and instantiate.
    baseline = globals()[config['baseline']]()
    env = normalize(globals()[config['env']]())  # normalize-wrapped env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    # Collects rollouts for every task in the meta-batch.
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    # Turns raw paths into advantage-annotated training samples.
    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def main(config):
    """Assemble and run a TMAML training loop from a config dict."""
    set_seed(config['seed'])

    reward_baseline = LinearTimeBaseline()       # the usual reward baseline
    return_baseline = LinearFeatureBaseline()    # additional baseline for DICE

    # Instantiate the configured environment and wrap it with normalize.
    env = normalize(globals()[config['env']]())

    # Meta-learned baseline sized to the (flat) observation vector.
    meta_baseline = MetaNNBaseline(input_size=env.observation_space.shape[0])

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    # TMAML's processor needs all three baselines.
    sample_processor = TMAMLMetaSampleProcessor(
        baseline=reward_baseline,
        max_path_length=config['max_path_length'],
        discount=config['discount'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
        return_baseline=return_baseline,
        metabaseline=meta_baseline,
    )

    algo = TMAML(
        policy=policy,
        max_path_length=config['max_path_length'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
        learning_rate=config['learning_rate'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def main(config):
    """Run ProMP training starting from a pickled pre-trained policy,
    logging the run to the (module-level) comet experiment."""
    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)

    # Look up and instantiate the configured baseline and environment.
    baseline = globals()[config['baseline']]()
    env = normalize(globals()[config['env']]())

    # Restore a previously saved policy instead of building a fresh one.
    with open('/saved_policies/mjvel.policy', 'rb') as policy_file:
        policy = pickle.load(policy_file)
    print("policy loaded")

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    # RLTrainer variant also receives the experiment for logging.
    trainer = RLTrainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        experiment=experiment,
    )
    trainer.train()
def main(config):
    """Assemble and run a vanilla-policy-gradient MAML (VPGMAML) training loop."""
    set_seed(config['seed'])

    # Instantiate the baseline/env classes named in the config.
    baseline = globals()[config['baseline']]()
    env = normalize(globals()[config['env']]())  # normalize-wrapped env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = VPGMAML(
        policy=policy,
        learning_rate=config['learning_rate'],
        inner_type=config['inner_type'],
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        exploration=False,  # exploration bonus disabled for this launcher
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def __init__(self, env, meta_batch_size, envs_per_task, max_path_length):
    """Build the pool of environments used for (vectorized) sampling.

    Fix: removed a leftover debugging ``print(...)`` followed by an
    unconditional ``sys.exit()`` which made the whole constructor body
    unreachable and aborted the process on instantiation.

    Args:
        env: template environment to deep-copy for every slot, or None to
            construct terrainRLSim humanoid environments instead.
        meta_batch_size (int): number of meta-tasks per batch.
        envs_per_task (int): environments allocated per task; the pool size
            is meta_batch_size * envs_per_task.
        max_path_length (int): step cap after which an episode is cut off.
    """
    self.envs = []
    if env is None:
        # No template supplied: create fresh terrainRLSim environments.
        for _ in range(meta_batch_size * envs_per_task):
            env = terrainRLSim.getEnv(
                env_name="PD_Humanoid_3D_GRF_Mixed_1Sub_Imitate_30FPS_DenseState_v0",
                render=True)
            env = normalize(env)  # apply normalize wrapper to env
            self.envs.append(env)
    else:
        # Deep-copy the template so every slot owns an independent env.
        self.envs = np.asarray(
            [copy.deepcopy(env) for _ in range(meta_batch_size * envs_per_task)])
    self.ts = np.zeros(len(self.envs), dtype='int')  # per-env time-step counters
    self.max_path_length = max_path_length
def run_experiment(**kwargs):
    """Configure logging, dump the hyperparameters, and run a TRPO-MAML
    training loop from keyword arguments.

    Fix: the params.json dump previously used a bare ``open(...)`` whose
    file handle was never closed; it now uses a ``with`` block.
    """
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    # Persist the full hyperparameter dict next to the logs.
    with open(exp_dir + '/params.json', 'w') as params_file:
        json.dump(kwargs, params_file, indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=1,
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    # experiment_tuple packs (name, inner_type, exploration) — indices 1 and 2
    # select the inner-loop objective and the exploration flag.
    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['experiment_tuple'][1],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['experiment_tuple'][2],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )
    trainer.train()
def main(config):
    """Run ProMP training with TensorFlow checkpointing: builds a Session,
    a Saver, optionally restores parameters, then trains."""
    set_seed(config['seed'])

    # Instantiate the configured baseline and environment.
    baseline = globals()[config['baseline']]()
    env = normalize(globals()[config['env']]())

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    # GPU memory is grabbed on demand rather than all at once.
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=gpu_config)

    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=config['keep_checkpoint_every_n_hours'],
        max_to_keep=config['max_checkpoints_to_keep'])
    # NOTE(review): `args` is not defined in this function — presumably a
    # module-level argparse namespace; confirm it exists at call time.
    save_path = os.path.join(args.dump_path, 'model.ckpt')

    if config['restore_path'] is not None:
        logger.log('Restoring parameters from {}'.format(config['restore_path']))
        saver.restore(sess, config['restore_path'])
        logger.log('Restored')

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        saver=saver,
        save_path=save_path,
        save_steps=config['save_steps'],
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        sess=sess,
    )
    trainer.train()
def main(config):
    """Run ProMP training on an environment whose task set is overridden
    with a fixed, hard-coded task array."""
    set_seed(config['seed'])

    # Instantiate the configured baseline and environment.
    baseline = globals()[config['baseline']]()
    env = globals()[config['env']]()
    print("env: ", env.sample_tasks)

    # Pin the environment to a fixed task set before wrapping it.
    TASKSL1 = np.array([0, -0.3])
    env.set_tasks(TASKSL1)
    env = normalize(env)  # apply normalize wrapper to env

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def main(config):
    """Run ProMP training on terrainRLSim environments, logging to the
    (module-level) comet experiment.

    The sampler is first built with a (tag, env_name) tuple so its workers
    can construct their own terrainRLSim envs; a locally created env is then
    attached via set_env for shape inspection and training.
    """
    experiment.set_name("short meta saving test")
    set_seed(config['seed'])
    experiment.log_parameters(config)

    baseline = globals()[config['baseline']]()  # instantiate baseline

    # Placeholder env; the real one is created below and set on the sampler.
    env = terrainRLSim.getEnv(env_name=None, render=False)

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod((104, )),   # hard-coded terrainRLSim observation size
        action_dim=np.prod((11, )),  # hard-coded terrainRLSim action size
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    # Workers receive this tuple and build their own envs from the name.
    sampler = MetaSampler(
        env=('terrianrlSim', config['env']),
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    env = terrainRLSim.getEnv(env_name=config['env'], render=False)
    env = normalize(env)  # apply normalize wrapper to env
    print("env.observation_space.shape: ", env.observation_space.shape)
    print("env.action_space.shape: ", env.action_space.shape)
    sampler.set_env(env)

    sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_promp_steps'],
        clip_eps=config['clip_eps'],
        target_inner_step=config['target_inner_step'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        experiment=experiment,
    )
    trainer.train()
def worker(remote, parent_remote, env_pickle, n_envs, max_path_length, seed):
    """
    Instantiation of a parallel worker for collecting samples. It loops
    continually checking the task that the remote sends to it.

    Fixes: replaced the non-idiomatic ``type(env_pickle) is tuple`` check
    with ``isinstance`` and removed dead commented-out debug code.

    Args:
        remote (multiprocessing.Connection): worker end of the pipe
        parent_remote (multiprocessing.Connection): parent end, closed here
        env_pickle (pkl or tuple): pickled environment, or a
            ('terrianrlSim', env_name) tuple for terrainRLSim envs
        n_envs (int): number of environments per worker
        max_path_length (int): maximum path length of the task
        seed (int): random seed for the worker
    """
    parent_remote.close()

    if isinstance(env_pickle, tuple):
        # Tuple form: (tag, env_name) — build terrainRLSim envs by name.
        envs = []
        for _ in range(n_envs):
            if (env_pickle[0] == 'terrianrlSim'):
                env = terrainRLSim.getEnv(env_name=env_pickle[1], render=False)
                env = normalize(env)  # apply normalize wrapper to env
                envs.append(env)
    else:
        # Plain pickled env: unpickle one independent copy per slot.
        envs = [pickle.loads(env_pickle) for _ in range(n_envs)]

    np.random.seed(seed)
    ts = np.zeros(n_envs, dtype='int')  # per-env step counters

    while True:
        # receive command and data from the remote
        cmd, data = remote.recv()

        # do a step in each of the environment of the worker
        if cmd == 'step':
            all_results = [env.step(a) for (a, env) in zip(data, envs)]
            obs, rewards, dones, infos = map(list, zip(*all_results))
            ts += 1
            for i in range(n_envs):
                # Force termination once the path-length cap is reached.
                if dones[i] or (ts[i] >= max_path_length):
                    dones[i] = True
                    obs[i] = envs[i].reset()
                    ts[i] = 0
            remote.send((obs, rewards, dones, infos))

        # reset all the environments of the worker
        elif cmd == 'reset':
            obs = [env.reset() for env in envs]
            ts[:] = 0
            remote.send(obs)

        # set the specified task for each of the environments of the worker
        elif cmd == 'set_task':
            for env in envs:
                env.set_task(data)
            remote.send(None)

        # close the remote and stop the worker
        elif cmd == 'close':
            remote.close()
            break
        else:
            raise NotImplementedError
parser.add_argument('--video_filename', default=None) parser.add_argument('--num_trajs', type=int, default=10) args = parser.parse_args(sys.argv[1:]) params_path = os.path.join( os.path.split(args.restore_path)[0], 'params.json') with open(params_path, 'r') as f: params = json.load(f) params.update(args.overrides) baseline = LinearFeatureBaseline() env = globals()[params['env']]() # instantiate env env = normalize(env) # apply normalize wrapper to env gpu_config = tf.ConfigProto() gpu_config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=gpu_config) policy = MetaGaussianMLPPolicy( name="meta-policy", obs_dim=np.prod(env.observation_space.shape), action_dim=np.prod(env.action_space.shape), meta_batch_size=params['meta_batch_size'], hidden_sizes=params['hidden_sizes'], cell_size=params['cell_size'], rollouts_per_meta_task=params['rollouts_per_meta_task'], max_path_length=params['max_path_length'], use_betas=params['use_betas'],