def __init__(self, env_name, config, upload_dir=None):
    config.update({"alg": "EvolutionStrategies"})

    Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

    policy_params = {
        "ac_noise_std": 0.01
    }

    env = gym.make(env_name)
    utils.make_session(single_threaded=False)
    self.policy = policies.GenericPolicy(
        env.observation_space, env.action_space, **policy_params)
    tf_util.initialize()
    self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
    self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)

    # Create the shared noise table.
    print("Creating shared noise table.")
    noise_id = create_shared_noise.remote()
    self.noise = SharedNoiseTable(ray.get(noise_id))

    # Create the actors.
    print("Creating actors.")
    self.workers = [
        Worker.remote(config, policy_params, env_name, noise_id)
        for _ in range(config["num_workers"])]

    self.episodes_so_far = 0
    self.timesteps_so_far = 0
    self.tstart = time.time()
    self.iteration = 0
def __init__(self, env_name, config):
    Algorithm.__init__(self, env_name, config)

    policy_params = {
        "ac_bins": "continuous:",
        "ac_noise_std": 0.01,
        "nonlin_type": "tanh",
        "hidden_dims": [256, 256],
        "connection_type": "ff"
    }

    # Create the shared noise table.
    print("Creating shared noise table.")
    noise_id = create_shared_noise.remote()
    self.noise = SharedNoiseTable(ray.get(noise_id))

    # Create the actors.
    print("Creating actors.")
    self.workers = [
        Worker.remote(config, policy_params, env_name, noise_id)
        for _ in range(config.num_workers)]

    env = gym.make(env_name)
    utils.make_session(single_threaded=False)
    self.policy = policies.MujocoPolicy(
        env.observation_space, env.action_space, **policy_params)
    tf_util.initialize()
    self.optimizer = optimizers.Adam(self.policy, config.stepsize)
    self.ob_stat = utils.RunningStat(env.observation_space.shape, eps=1e-2)

    self.episodes_so_far = 0
    self.timesteps_so_far = 0
    self.tstart = time.time()
    self.iteration = 0
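# A minimal sketch of how the driver built above might hand work to its
# actors and block on the results. The method name `collect_results` is
# illustrative and not part of the code above; it assumes the policy's
# get_trainable_flat() and the RunningStat .mean/.std accessors. A full
# training step would also fold the returned observation statistics
# (ob_sum, ob_sumsq, ob_count) back into self.ob_stat and apply a
# parameter update.
def collect_results(self):
    theta = self.policy.get_trainable_flat()
    # Launch one batch of rollouts on every actor; each remote call
    # returns an object ID immediately without blocking.
    rollout_ids = [
        worker.do_rollouts.remote(
            theta, self.ob_stat.mean, self.ob_stat.std)
        for worker in self.workers]
    # Block until every actor has produced its Result.
    return ray.get(rollout_ids)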
def do_rollouts(self, params, ob_mean, ob_std, timestep_limit=None):
    # Set the network weights.
    self.policy.set_trainable_flat(params)

    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(ob_mean, ob_std)

    if self.config["eval_prob"] != 0:
        raise NotImplementedError("Eval rollouts are not implemented.")

    noise_inds, returns, sign_returns, lengths = [], [], [], []
    # We set eps=0 because we're incrementing only.
    task_ob_stat = utils.RunningStat(self.env.observation_space.shape, eps=0)

    # Perform some rollouts with noise.
    task_tstart = time.time()
    while (len(noise_inds) == 0 or
           time.time() - task_tstart < self.min_task_runtime):
        noise_idx = self.noise.sample_index(self.rs, self.policy.num_params)
        perturbation = self.config["noise_stdev"] * self.noise.get(
            noise_idx, self.policy.num_params)

        # These two sampling steps could be done in parallel on different
        # actors, letting us update twice as frequently.
        self.policy.set_trainable_flat(params + perturbation)
        rews_pos, len_pos = self.rollout_and_update_ob_stat(
            timestep_limit, task_ob_stat)

        self.policy.set_trainable_flat(params - perturbation)
        rews_neg, len_neg = self.rollout_and_update_ob_stat(
            timestep_limit, task_ob_stat)

        noise_inds.append(noise_idx)
        returns.append([rews_pos.sum(), rews_neg.sum()])
        sign_returns.append(
            [np.sign(rews_pos).sum(), np.sign(rews_neg).sum()])
        lengths.append([len_pos, len_neg])

    return Result(
        noise_inds_n=np.array(noise_inds),
        returns_n2=np.array(returns, dtype=np.float32),
        sign_returns_n2=np.array(sign_returns, dtype=np.float32),
        lengths_n2=np.array(lengths, dtype=np.int32),
        eval_return=None,
        eval_length=None,
        ob_sum=(None if task_ob_stat.count == 0 else task_ob_stat.sum),
        ob_sumsq=(None if task_ob_stat.count == 0 else task_ob_stat.sumsq),
        ob_count=task_ob_stat.count)
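# A minimal sketch of how a driver could combine the Result objects returned
# by do_rollouts into an ES gradient estimate, written with plain numpy
# (assumed imported as np, as in the code above). The helper name
# `estimate_gradient` is illustrative and not part of the code above; the
# subsequent driver-side update (optimizer step, weight decay) is omitted.
def estimate_gradient(results, noise, num_params):
    noise_inds = np.concatenate([r.noise_inds_n for r in results])
    returns = np.concatenate([r.returns_n2 for r in results])  # shape (n, 2)

    # Rank-transform the returns into [-0.5, 0.5] so the update is invariant
    # to the scale of the rewards.
    ranks = np.empty(returns.size, dtype=np.float32)
    ranks[returns.ravel().argsort()] = np.arange(returns.size)
    ranks = ranks.reshape(returns.shape) / (returns.size - 1) - 0.5

    # Weight each perturbation by the difference between its positively and
    # negatively perturbed (antithetic) returns.
    weights = ranks[:, 0] - ranks[:, 1]
    grad = np.zeros(num_params, dtype=np.float32)
    for w, idx in zip(weights, noise_inds):
        grad += w * noise.get(idx, num_params)
    return grad / returns.size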