Exemple #1
0
    def __init__(self, env_name, config, upload_dir=None):
        """Build the Evolution Strategies driver state.

        Tags ``config`` with the algorithm name, initializes the ``Algorithm``
        base, builds the local policy with its Adam optimizer and running
        observation statistics, then creates the shared noise table and the
        remote workers.

        Args:
            env_name: Environment id handed to ``gym.make`` and to each worker.
            config: Mapping of settings; updated in place with an ``"alg"``
                entry, and read here for ``"stepsize"`` and ``"num_workers"``.
            upload_dir: Forwarded to ``Algorithm.__init__``.
        """
        config.update({"alg": "EvolutionStrategies"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        policy_params = dict(ac_noise_std=0.01)

        # Local policy, optimizer and observation statistics. The call order
        # (make_session -> policy -> initialize) is deliberately left as is.
        # NOTE(review): presumably the policy's variables need the session to
        # exist before they are built -- confirm before reordering.
        env = gym.make(env_name)
        utils.make_session(single_threaded=False)
        ob_space = env.observation_space
        self.policy = policies.GenericPolicy(
            ob_space, env.action_space, **policy_params)
        tf_util.initialize()
        self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
        self.ob_stat = utils.RunningStat(ob_space.shape, eps=1e-2)

        # The noise table is created remotely; the same object id is handed
        # to every worker below and is also fetched for local use.
        print("Creating shared noise table.")
        noise_id = create_shared_noise.remote()
        shared_noise = ray.get(noise_id)
        self.noise = SharedNoiseTable(shared_noise)

        # One remote worker per configured slot.
        print("Creating actors.")
        actors = []
        for _ in range(config["num_workers"]):
            actors.append(
                Worker.remote(config, policy_params, env_name, noise_id))
        self.workers = actors

        # Progress counters and the wall-clock start time.
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()
        self.iteration = 0
Exemple #2
0
    def __init__(self, env_name, config):
        """Build the driver state: noise table, workers, policy, optimizer.

        Initializes the ``Algorithm`` base, creates the shared noise table and
        the remote workers, then builds the local ``MujocoPolicy`` with its
        Adam optimizer and running observation statistics.

        Args:
            env_name: Environment id handed to ``gym.make`` and to each worker.
            config: Settings object; ``num_workers`` and ``stepsize`` are read
                from it as attributes, and it is passed on to every worker.
        """
        Algorithm.__init__(self, env_name, config)

        policy_params = dict(
            ac_bins="continuous:",
            ac_noise_std=0.01,
            nonlin_type="tanh",
            hidden_dims=[256, 256],
            connection_type="ff",
        )

        # The noise table is created remotely; the same object id is handed
        # to every worker below and is also fetched for local use.
        print("Creating shared noise table.")
        noise_id = create_shared_noise.remote()
        shared_noise = ray.get(noise_id)
        self.noise = SharedNoiseTable(shared_noise)

        # One remote worker per configured slot.
        print("Creating actors.")
        actors = []
        for _ in range(config.num_workers):
            actors.append(
                Worker.remote(config, policy_params, env_name, noise_id))
        self.workers = actors

        # Local policy, optimizer and observation statistics. The call order
        # (make_session -> policy -> initialize) is deliberately left as is.
        # NOTE(review): presumably the policy's variables need the session to
        # exist before they are built -- confirm before reordering.
        env = gym.make(env_name)
        utils.make_session(single_threaded=False)
        ob_space = env.observation_space
        self.policy = policies.MujocoPolicy(
            ob_space, env.action_space, **policy_params)
        tf_util.initialize()
        self.optimizer = optimizers.Adam(self.policy, config.stepsize)
        self.ob_stat = utils.RunningStat(ob_space.shape, eps=1e-2)

        # Progress counters and the wall-clock start time.
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()
        self.iteration = 0
Exemple #3
0
    def do_rollouts(self, params, ob_mean, ob_std, timestep_limit=None):
        """Run paired noisy rollouts around ``params`` and collect the results.

        Each loop iteration samples one index into the shared noise table,
        scales that noise by ``config["noise_stdev"]``, and performs two
        rollouts: one with the perturbation added to ``params`` and one with
        it subtracted. At least one pair is always run; further pairs are run
        until ``self.min_task_runtime`` seconds have elapsed since the loop
        started.

        Args:
            params: Flat parameter vector handed to
                ``self.policy.set_trainable_flat``.
            ob_mean: Observation mean, given to the policy only when
                ``self.policy.needs_ob_stat`` is true.
            ob_std: Observation standard deviation, used alongside
                ``ob_mean``.
            timestep_limit: Forwarded unchanged to
                ``self.rollout_and_update_ob_stat``.

        Returns:
            A ``Result`` with one row per sampled noise index:
            ``noise_inds_n`` (the indices), ``returns_n2`` (summed rewards for
            the +/- rollouts, float32), ``sign_returns_n2`` (summed reward
            signs, float32) and ``lengths_n2`` (the lengths reported by the
            rollouts, int32), plus the observation sum, sum of squares and
            count gathered during this call (``ob_sum`` / ``ob_sumsq`` are
            ``None`` when the count is zero). ``eval_return`` and
            ``eval_length`` are always ``None``.

        Raises:
            NotImplementedError: If ``config["eval_prob"]`` is non-zero.
        """
        # Set the network weights.
        self.policy.set_trainable_flat(params)

        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(ob_mean, ob_std)

        if self.config["eval_prob"] != 0:
            raise NotImplementedError("Eval rollouts are not implemented.")

        noise_inds, returns, sign_returns, lengths = [], [], [], []
        # We set eps=0 because we're incrementing only.
        task_ob_stat = utils.RunningStat(self.env.observation_space.shape,
                                         eps=0)

        # Perform some rollouts with noise. The first clause guarantees at
        # least one perturbation pair even if min_task_runtime is zero.
        task_tstart = time.time()
        while (not noise_inds
               or time.time() - task_tstart < self.min_task_runtime):
            noise_idx = self.noise.sample_index(self.rs,
                                                self.policy.num_params)
            perturbation = self.config["noise_stdev"] * self.noise.get(
                noise_idx, self.policy.num_params)

            # These two sampling steps could be done in parallel on different
            # actors letting us update twice as frequently.
            self.policy.set_trainable_flat(params + perturbation)
            rews_pos, len_pos = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            self.policy.set_trainable_flat(params - perturbation)
            rews_neg, len_neg = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            noise_inds.append(noise_idx)
            returns.append([rews_pos.sum(), rews_neg.sum()])
            sign_returns.append(
                [np.sign(rews_pos).sum(),
                 np.sign(rews_neg).sum()])
            lengths.append([len_pos, len_neg])

        # Return only after the loop has finished: returning from inside the
        # loop body would end the task after the first pair and make the
        # min_task_runtime condition above dead code.
        no_obs = task_ob_stat.count == 0
        return Result(
            noise_inds_n=np.array(noise_inds),
            returns_n2=np.array(returns, dtype=np.float32),
            sign_returns_n2=np.array(sign_returns, dtype=np.float32),
            lengths_n2=np.array(lengths, dtype=np.int32),
            eval_return=None,
            eval_length=None,
            ob_sum=(None if no_obs else task_ob_stat.sum),
            ob_sumsq=(None if no_obs else task_ob_stat.sumsq),
            ob_count=task_ob_stat.count)