def main(config):
    baseline = LinearFeatureBaseline()

    env = normalize(HopperRandParamsEnv())
    obs_dim = np.prod(env.observation_space.shape)

    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=5,
    )

    sample_processor = SingleSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )

    trainer.train()
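# A minimal sketch of how `main` might be driven, covering every key it reads.
# All values below are illustrative assumptions, not tuned hyperparameters from this repo.
if __name__ == "__main__":
    example_config = {
        'meta_batch_size': 20,
        'hidden_sizes': (64, 64),
        'rollouts_per_meta_task': 2,
        'max_path_length': 200,
        'parallel': False,
        'discount': 0.99,
        'gae_lambda': 1.0,
        'normalize_adv': True,
        'positive_adv': False,
        'learning_rate': 1e-3,
        'max_epochs': 5,
        'n_itr': 1000,
    }
    main(example_config)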
def construct_from_feed_dict(self, policy_pickle, env_pickle, baseline_pickle,
                             dynamics_model_pickle, feed_dict):
    from meta_mb.samplers.metrpo_samplers.metrpo_sampler import METRPOSampler
    from meta_mb.samplers.bptt_samplers.bptt_sampler import BPTTSampler
    from meta_mb.samplers.base import SampleProcessor
    from meta_mb.algos.ppo import PPO
    from meta_mb.algos.trpo import TRPO

    env = pickle.loads(env_pickle)
    policy = pickle.loads(policy_pickle)
    baseline = pickle.loads(baseline_pickle)
    dynamics_model = pickle.loads(dynamics_model_pickle)

    self.policy = policy
    self.baseline = baseline

    if self.sampler_str == 'metrpo':
        self.model_sampler = METRPOSampler(env=env, policy=policy, dynamics_model=dynamics_model,
                                           **feed_dict['model_sampler'])
    elif self.sampler_str == 'bptt':
        self.model_sampler = BPTTSampler(env=env, policy=policy, dynamics_model=dynamics_model,
                                         **feed_dict['model_sampler'])
    else:
        raise NotImplementedError

    self.model_sample_processor = SampleProcessor(baseline=baseline,
                                                  **feed_dict['model_sample_processor'])

    if self.algo_str == 'meppo':
        self.algo = PPO(policy=policy, **feed_dict['algo'])
    elif self.algo_str == 'metrpo':
        self.algo = TRPO(policy=policy, **feed_dict['algo'])
    else:
        raise NotImplementedError('algo_str must be meppo or metrpo')
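# `feed_dict` is expected to hold one kwargs dict per component, keyed
# 'model_sampler', 'model_sample_processor', and 'algo' (see the lookups above).
# A sketch of its shape; the individual values are assumed placeholders:
example_feed_dict = {
    'model_sampler': dict(num_rollouts=20, max_path_length=200),    # extra METRPOSampler/BPTTSampler kwargs
    'model_sample_processor': dict(discount=0.99, gae_lambda=1.0),  # extra SampleProcessor kwargs
    'algo': dict(learning_rate=1e-3),                               # extra PPO/TRPO kwargs
}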
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())  # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )

        trainer.train()
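# The Trainer arguments above suggest the usual model-based loop: env_sampler
# collects real rollouts, dynamics_sample_processor turns them into training data
# for the dynamics ensemble (fit for up to dynamics_model_max_epochs), model_sampler
# then generates imagined rollouts from the learned model, and PPO updates the
# policy on those imagined samples for steps_per_iter steps per iteration. This is
# an inference from the constructor arguments, not a verbatim description of
# Trainer.train().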
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    set_seed(config['seed'])

    config_sess = tf.ConfigProto()
    config_sess.gpu_options.allow_growth = True
    config_sess.gpu_options.per_process_gpu_memory_fraction = config.get('gpu_frac', 0.95)
    sess = tf.Session(config=config_sess)

    with sess.as_default() as sess:
        baseline = config['baseline']()

        # timeskip = config['timeskip']
        # log_rand = config['log_rand']
        # env = rl2env(normalize(config['env'](log_rand=log_rand)))  # timeskip=timeskip)))
        env = rl2env(normalize(HalfCheetahRandVelEnv()))
        obs_dim = np.prod(env.observation_space.shape) + np.prod(env.action_space.shape) + 1 + 1  # obs + act + rew + done

        policy = GaussianRNNPolicy(
            name="meta-policy",
            obs_dim=obs_dim,
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=config['meta_batch_size'],
            hidden_sizes=config['hidden_sizes'],
            cell_type=config['cell_type'],
        )

        sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=config['rollouts_per_meta_task'],
            meta_batch_size=config['meta_batch_size'],
            max_path_length=config['max_path_length'],
            parallel=config['parallel'],
            envs_per_task=1,
        )

        sample_processor = RL2SampleProcessor(
            baseline=baseline,
            discount=config['discount'],
            gae_lambda=config['gae_lambda'],
            normalize_adv=config['normalize_adv'],
            positive_adv=config['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=config['learning_rate'],
            max_epochs=config['max_epochs'],
            backprop_steps=config['backprop_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=config['n_itr'],
            sess=sess,
        )

        trainer.train()
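# For RL^2 the recurrent policy conditions on the previous transition, so its input
# concatenates observation, previous action, previous reward, and a done flag; hence
# the "+ 1 + 1" in obs_dim above. As a worked example, the standard Gym HalfCheetah
# has a 17-dim observation and 6-dim action, giving 17 + 6 + 1 + 1 = 25; the
# rand-vel variant used here may expose different dimensions.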
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
        )

        # Load policy here

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()
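# The "# Load policy here" placeholder above presumably stands for restoring a
# previously trained policy before sampling. A minimal sketch of what that step
# might look like, assuming a hypothetical kwargs['policy_path'] pointing at a
# pickled policy (illustration only, not code from this repo):
def _maybe_load_policy(policy, kwargs):
    import pickle
    path = kwargs.get('policy_path')  # hypothetical key
    if path:
        # replace the freshly constructed policy with the saved one
        with open(path, 'rb') as f:
            policy = pickle.load(f)
    return policy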