def _init(self):
    """Build the policy, optimizer, noise table and remote workers.

    One environment is created through ``self.env_creator`` and used to
    look up the preprocessor and to give the policy its observation and
    action spaces. The shared noise table is created remotely once; the
    same object id is resolved locally and handed to every worker.

    Reads ``self.config["stepsize"]`` and ``self.config["num_workers"]``.
    """
    policy_kwargs = {"ac_noise_std": 0.01}

    # Local environment instance, used here only for its spec id and spaces.
    probe_env = self.env_creator()
    obs_shape = probe_env.observation_space.shape
    preprocessor = ModelCatalog.get_preprocessor(probe_env.spec.id, obs_shape)
    preprocessed_shape = preprocessor.transform_shape(
        probe_env.observation_space.shape)

    # The session is made before the policy is built, and variables are
    # initialized right after, ahead of constructing the optimizer.
    self.sess = utils.make_session(single_threaded=False)
    self.policy = policies.GenericPolicy(
        probe_env.observation_space,
        probe_env.action_space,
        preprocessor,
        **policy_kwargs)
    tf_util.initialize()
    self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
    self.ob_stat = utils.RunningStat(preprocessed_shape, eps=1e-2)

    # Create the shared noise table.
    print("Creating shared noise table.")
    noise_ref = create_shared_noise.remote()
    self.noise = SharedNoiseTable(ray.get(noise_ref))

    # Create the actors; each receives the same noise object id.
    print("Creating actors.")
    remote_workers = []
    for _ in range(self.config["num_workers"]):
        remote_workers.append(
            Worker.remote(
                self.config, policy_kwargs, self.env_creator, noise_ref))
    self.workers = remote_workers

    # Progress counters, plus the wall-clock time at which set-up finished.
    self.episodes_so_far = 0
    self.timesteps_so_far = 0
    self.tstart = time.time()
def do_rollouts(self, params, ob_mean, ob_std, timestep_limit=None):
    """Run mirrored noisy rollouts around ``params`` and collect results.

    The policy is loaded with ``params`` (and with the observation
    statistics when ``self.policy.needs_ob_stat`` is set). Then, at least
    once and until ``self.min_task_runtime`` seconds have elapsed, a noise
    index is sampled and two rollouts are run: one with
    ``params + perturbation`` and one with ``params - perturbation``.

    Args:
        params: Flat parameter vector passed to
            ``self.policy.set_trainable_flat``.
        ob_mean: Observation mean forwarded to ``self.policy.set_ob_stat``.
        ob_std: Observation std forwarded to ``self.policy.set_ob_stat``.
        timestep_limit: Forwarded unchanged to
            ``self.rollout_and_update_ob_stat``.

    Returns:
        A ``Result`` holding the sampled noise indices, per-pair summed
        rewards, per-pair summed reward signs, per-pair episode lengths,
        and the observation sum / sum of squares / count accumulated
        during these rollouts (sum and sum of squares are ``None`` when
        the count is zero). The eval fields are always ``None``.

    Raises:
        NotImplementedError: If ``self.config["eval_prob"]`` is non-zero.
    """
    # Set the network weights.
    self.policy.set_trainable_flat(params)
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(ob_mean, ob_std)

    if self.config["eval_prob"] != 0:
        raise NotImplementedError("Eval rollouts are not implemented.")

    sampled_indices = []
    pair_returns = []
    pair_sign_returns = []
    pair_lengths = []

    # We set eps=0 because we're incrementing only.
    task_ob_stat = utils.RunningStat(self.preprocessor_shape, eps=0)

    # Perform some rollouts with noise: always one pair, then keep going
    # while the elapsed time is still below the minimum task runtime.
    started_at = time.time()
    while True:
        noise_idx = self.noise.sample_index(self.rs, self.policy.num_params)
        noise_slice = self.noise.get(noise_idx, self.policy.num_params)
        perturbation = self.config["noise_stdev"] * noise_slice

        # These two sampling steps could be done in parallel on different
        # actors letting us update twice as frequently.
        rewards_pair = []
        lengths_pair = []
        for candidate in (params + perturbation, params - perturbation):
            self.policy.set_trainable_flat(candidate)
            rewards, episode_len = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)
            rewards_pair.append(rewards)
            lengths_pair.append(episode_len)

        sampled_indices.append(noise_idx)
        pair_returns.append([rews.sum() for rews in rewards_pair])
        pair_sign_returns.append(
            [np.sign(rews).sum() for rews in rewards_pair])
        pair_lengths.append(lengths_pair)

        if not (time.time() - started_at < self.min_task_runtime):
            break

    # Only report observation statistics if something was accumulated.
    if task_ob_stat.count == 0:
        ob_sum = None
        ob_sumsq = None
    else:
        ob_sum = task_ob_stat.sum
        ob_sumsq = task_ob_stat.sumsq

    return Result(
        noise_inds_n=np.array(sampled_indices),
        returns_n2=np.array(pair_returns, dtype=np.float32),
        sign_returns_n2=np.array(pair_sign_returns, dtype=np.float32),
        lengths_n2=np.array(pair_lengths, dtype=np.int32),
        eval_return=None,
        eval_length=None,
        ob_sum=ob_sum,
        ob_sumsq=ob_sumsq,
        ob_count=task_ob_stat.count)