def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:

        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())  # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dynamics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dynamics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            rolling_average_persitency=kwargs['rolling_average_persitency'],
        )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPO(
            policy=policy,
            step_size=kwargs['step_size'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=kwargs['sample_from_buffer'],
            sess=sess,
        )

        trainer.train()
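# --------------------------------------------------------------------------
# Illustrative only: a sketch of the kwargs dict consumed by run_experiment
# above. The keys mirror exactly what the function reads; every value below is
# a placeholder assumption (not a tuned or canonical setting), and the
# baseline/env classes are left as parameters since run_experiment only
# requires that they be callable with no arguments.
# --------------------------------------------------------------------------
import tensorflow as tf


def _example_metrpo_config(baseline_cls, env_cls):
    """Hypothetical helper returning example kwargs for the METRPO run_experiment."""
    return dict(
        seed=1,
        baseline=baseline_cls,                  # assumed baseline class from the codebase
        env=env_cls,                            # assumed environment class from the codebase
        policy_hidden_sizes=(64, 64),
        policy_learn_std=True,
        policy_hidden_nonlinearity=tf.tanh,
        policy_output_nonlinearity=None,
        num_models=5,
        dynamics_hidden_nonlinearity=tf.nn.relu,
        dynamics_hidden_sizes=(512, 512),
        dynamics_output_nonlinearity=None,
        dynamics_learning_rate=1e-3,
        dynamics_batch_size=256,
        dynamics_buffer_size=25000,
        rolling_average_persitency=0.99,
        num_rollouts=5,
        imagined_num_rollouts=50,
        max_path_length=200,
        n_parallel=1,
        deterministic=False,
        discount=0.99,
        gae_lambda=1.0,
        normalize_adv=True,
        positive_adv=False,
        step_size=0.01,
        n_itr=100,
        dynamics_max_epochs=50,
        log_real_performance=True,
        steps_per_iter=1,
        sample_from_buffer=False,
        gpu_frac=0.95,
    )

# Usage sketch (assumes concrete baseline/env classes from the surrounding codebase):
# run_experiment(**_example_metrpo_config(MyBaseline, MyEnv))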
class WorkerData(Worker):
    def __init__(self, simulation_sleep, video=False):
        if video:
            super().__init__(snapshot_mode='gap',
                             snapshot_gap=int(30/1250/simulation_sleep))  # FIXME
        else:
            super().__init__()
        self.simulation_sleep = simulation_sleep
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        self.samples_data_arr = []

    def construct_from_feed_dict(
            self,
            policy_pickle,
            env_pickle,
            baseline_pickle,
            dynamics_model_pickle,
            feed_dict,
    ):
        from asynch_mb.samplers.sampler import Sampler
        from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

        env = pickle.loads(env_pickle)
        policy = pickle.loads(policy_pickle)
        baseline = pickle.loads(baseline_pickle)

        self.env = env
        self.env_sampler = Sampler(env=env, policy=policy, **feed_dict['env_sampler'])
        self.dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            **feed_dict['dynamics_sample_processor']
        )

    def prepare_start(self):
        initial_random_samples = self.queue.get()
        self.step(initial_random_samples)
        self.push()

    def step(self, random=False):
        time_step = time.time()

        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random,
            log_prefix='Data-EnvSampler-',
        )

        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing environment samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        self.samples_data_arr.append(samples_data)
        time_step = time.time() - time_step
        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)

        # save snapshot
        params = self.get_itr_snapshot()
        logger.save_itr_params(self.itr_counter, params)

    def _synch(self, policy_state_pickle):
        time_synch = time.time()
        policy_state = pickle.loads(policy_state_pickle)
        assert isinstance(policy_state, dict)
        self.env_sampler.policy.set_shared_params(policy_state)
        time_synch = time.time() - time_synch

        logger.logkv('Data-TimeSynch', time_synch)

    def push(self):
        time_push = time.time()
        self.queue_next.put(pickle.dumps(self.samples_data_arr))
        self.samples_data_arr = []
        time_push = time.time() - time_push

        logger.logkv('Data-TimePush', time_push)

    def set_stop_cond(self):
        if self.itr_counter >= self.n_itr:
            self.stop_cond.set()

    def get_itr_snapshot(self):
        """
        Gets the current policy and env for storage
        """
        return dict(itr=self.itr_counter, policy=self.env_sampler.policy, env=self.env)
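# --------------------------------------------------------------------------
# Illustrative only: the queue-based WorkerData above relies on plumbing it
# inherits from the Worker base class (self.queue, self.queue_next,
# self.stop_cond, self.itr_counter, self.n_itr, self.verbose). The stub below
# is a hypothetical sketch of that contract using multiprocessing primitives;
# it is not the project's actual Worker implementation.
# --------------------------------------------------------------------------
from multiprocessing import Queue, Event


class _WorkerContractSketch:
    """Hypothetical view of the attributes and hooks WorkerData assumes."""

    def __init__(self, snapshot_mode='last', snapshot_gap=1):
        self.queue = Queue()        # receives pickled inputs from the upstream worker
        self.queue_next = Queue()   # feeds pickled outputs to the downstream worker
        self.stop_cond = Event()    # set by set_stop_cond() to end the worker loop
        self.itr_counter = 0
        self.n_itr = 0
        self.verbose = False

    def prepare_start(self):
        raise NotImplementedError   # pull initial inputs, take a first step, push

    def step(self, *args, **kwargs):
        raise NotImplementedError   # one sampling/processing iteration

    def _synch(self, state_pickle):
        raise NotImplementedError   # refresh shared parameters sent from upstream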
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:

        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()

        env = normalize(kwargs['env']())

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
            squashed=kwargs['squashed'],
        )

        # Load policy here

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
            entropy_bonus=kwargs['entropy_bonus'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()
class WorkerData(Worker):
    def __init__(self, policy_ps, data_buffers, time_sleep, name, exp_dir, n_itr, stop_cond):
        super().__init__(name, exp_dir, n_itr, stop_cond)
        self.policy_ps = policy_ps
        self.data_buffers = data_buffers
        self.time_sleep = time_sleep
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None

    def prepare_start(self, policy_pickle, env_pickle, baseline_pickle, feed_dict,
                      config, initial_random_samples):
        import tensorflow as tf
        self.sess = sess = tf.Session(config=config)
        with sess.as_default():

            """ --------------------- Construct instances -------------------"""

            from asynch_mb.samplers.sampler import Sampler
            from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

            env = pickle.loads(env_pickle)
            policy = pickle.loads(policy_pickle)
            baseline = pickle.loads(baseline_pickle)

            sess.run(tf.initializers.global_variables())

            self.env = env
            self.env_sampler = Sampler(env=env, policy=policy, **feed_dict['env_sampler'])
            self.dynamics_sample_processor = ModelSampleProcessor(
                baseline=baseline,
                **feed_dict['dynamics_sample_processor'])

            """ ------------------- Step and Push ------------------"""

            samples_data = self.step(random=initial_random_samples)
            self.push(samples_data)

        return 1

    def step_wrapper(self):
        self.pull()
        samples_data = self.step()
        self.push(samples_data)
        return 1, 1

    def step(self, random=False):
        time_step = time.time()

        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random,
            log_prefix='Data-EnvSampler-',
        )

        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing environment samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        time_step = time.time() - time_step
        time_sleep = max(self.time_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)

        return samples_data

    def pull(self):
        time_synch = time.time()
        policy_params = ray.get(self.policy_ps.pull.remote())
        assert isinstance(policy_params, dict)
        self.env_sampler.policy.set_shared_params(policy_params)

        logger.logkv('Data-TimePull', time.time() - time_synch)

    def push(self, samples_data):
        time_push = time.time()
        # broadcast samples to all data buffers
        samples_data_id = ray.put(samples_data)
        for data_buffer in self.data_buffers:
            # ray.get(data_buffer.push.remote(samples_data))
            data_buffer.push.remote(samples_data_id)

        logger.logkv('Data-TimePush', time.time() - time_push)

    def set_stop_cond(self):
        if self.step_counter >= self.n_itr:
            ray.get(self.stop_cond.set.remote())
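# --------------------------------------------------------------------------
# Illustrative only: the Ray-based WorkerData above assumes a policy parameter
# server exposing pull() and data buffers exposing push(). The actor stubs
# below are hypothetical sketches of that interface (names and bodies are
# assumptions), not the project's actual implementations.
# --------------------------------------------------------------------------
import ray


@ray.remote
class PolicyParamServerStub:
    """Hypothetical parameter server: stores the latest policy params as a dict."""

    def __init__(self):
        self._params = {}

    def push(self, params):
        self._params = params

    def pull(self):
        return self._params


@ray.remote
class DataBufferStub:
    """Hypothetical data buffer: accumulates samples broadcast by WorkerData.push."""

    def __init__(self):
        self._samples = []

    def push(self, samples_data):
        # Ray resolves the ObjectRef created by ray.put() before this method runs,
        # so one serialized copy of the samples is shared across all buffers.
        self._samples.append(samples_data)

    def get_all(self):
        return self._samples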
class WorkerData(Worker):
    def __init__(self, simulation_sleep):
        super().__init__()
        self.simulation_sleep = simulation_sleep
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        self.samples_data_arr = []

    def construct_from_feed_dict(
            self,
            policy_pickle,
            env_pickle,
            baseline_pickle,  # UNUSED
            dynamics_model_pickle,
            feed_dict,
    ):
        from asynch_mb.samplers.sampler import Sampler
        from asynch_mb.samplers.mb_sample_processor import ModelSampleProcessor

        env = pickle.loads(env_pickle)
        policy = pickle.loads(policy_pickle)

        self.env = env
        self.env_sampler = Sampler(env=env, policy=policy, **feed_dict['sampler'])
        self.dynamics_sample_processor = ModelSampleProcessor(
            **feed_dict['sample_processor'])

    def prepare_start(self):
        random_sinusoid = self.queue.get()
        self.step(random_sinusoid)
        self.push()

    def step(self, random_sinusoid=(False, False)):
        time_step = time.time()

        if self.itr_counter == 1 and self.env_sampler.policy.dynamics_model.normalization is None:
            if self.verbose:
                logger.log('Data starts first step...')
            self.env_sampler.policy.dynamics_model = pickle.loads(self.queue.get())
            if self.verbose:
                logger.log('Data first step done...')

        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random_sinusoid[0],
            sinusoid=random_sinusoid[1],
            log_prefix='Data-EnvSampler-',
        )

        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )

        self.samples_data_arr.append(samples_data)
        time_step = time.time() - time_step
        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)

    def _synch(self, dynamics_model_state_pickle):
        time_synch = time.time()
        dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
        assert isinstance(dynamics_model_state, dict)
        self.env_sampler.policy.dynamics_model.set_shared_params(dynamics_model_state)
        time_synch = time.time() - time_synch

        logger.logkv('Data-TimeSynch', time_synch)

    def push(self):
        time_push = time.time()
        self.queue_next.put(pickle.dumps(self.samples_data_arr))
        self.samples_data_arr = []
        time_push = time.time() - time_push

        logger.logkv('Data-TimePush', time_push)

    def set_stop_cond(self):
        if self.itr_counter >= self.n_itr:
            self.stop_cond.set()