Example #1
    def _init(self):

        policy_params = {"ac_noise_std": 0.01}

        # Create the environment and its observation preprocessor.
        env = self.env_creator()
        preprocessor = ModelCatalog.get_preprocessor(
            env.spec.id, env.observation_space.shape)
        preprocessor_shape = preprocessor.transform_shape(
            env.observation_space.shape)

        # Build the TensorFlow session, policy, optimizer, and running
        # observation statistics.
        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(env.observation_space,
                                             env.action_space, preprocessor,
                                             **policy_params)
        tf_util.initialize()
        self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
        self.ob_stat = utils.RunningStat(preprocessor_shape, eps=1e-2)

        # Create the shared noise table.
        print("Creating shared noise table.")
        noise_id = create_shared_noise.remote()
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # Create the actors.
        print("Creating actors.")
        self.workers = [
            Worker.remote(self.config, policy_params, self.env_creator,
                          noise_id) for _ in range(self.config["num_workers"])
        ]

        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.tstart = time.time()
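The snippet above only consumes the shared noise table. As a rough, hypothetical sketch of what create_shared_noise and SharedNoiseTable could look like (the table size, seed, and exact method bodies here are assumptions, not taken from this page), the idea is to allocate one large float32 noise array once, put it in Ray's object store, and let every worker slice perturbations out of it by index instead of shipping full perturbation vectors between processes:

import numpy as np
import ray


@ray.remote
def create_shared_noise(count=250 * 1000 * 1000, seed=123):
    # One big block of Gaussian noise, created once and shared read-only by
    # the driver and all workers through Ray's object store.
    return np.random.RandomState(seed).randn(count).astype(np.float32)


class SharedNoiseTable(object):
    def __init__(self, noise):
        assert noise.dtype == np.float32
        self.noise = noise

    def get(self, i, dim):
        # A perturbation is just a length-`dim` slice starting at index `i`.
        return self.noise[i:i + dim]

    def sample_index(self, stream, dim):
        # Sample a valid start index using a per-worker RandomState `stream`.
        return stream.randint(0, len(self.noise) - dim + 1)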
Example #2
    def do_rollouts(self, params, ob_mean, ob_std, timestep_limit=None):
        # Set the network weights.
        self.policy.set_trainable_flat(params)

        if self.policy.needs_ob_stat:
            self.policy.set_ob_stat(ob_mean, ob_std)

        if self.config["eval_prob"] != 0:
            raise NotImplementedError("Eval rollouts are not implemented.")

        noise_inds, returns, sign_returns, lengths = [], [], [], []
        # We set eps=0 because we're incrementing only.
        task_ob_stat = utils.RunningStat(self.preprocessor_shape, eps=0)

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_inds) == 0
               or time.time() - task_tstart < self.min_task_runtime):
            noise_idx = self.noise.sample_index(self.rs,
                                                self.policy.num_params)
            perturbation = self.config["noise_stdev"] * self.noise.get(
                noise_idx, self.policy.num_params)

            # These two sampling steps could be done in parallel on different
            # actors, which would let us update twice as frequently.
            self.policy.set_trainable_flat(params + perturbation)
            rews_pos, len_pos = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            self.policy.set_trainable_flat(params - perturbation)
            rews_neg, len_neg = self.rollout_and_update_ob_stat(
                timestep_limit, task_ob_stat)

            noise_inds.append(noise_idx)
            returns.append([rews_pos.sum(), rews_neg.sum()])
            sign_returns.append(
                [np.sign(rews_pos).sum(),
                 np.sign(rews_neg).sum()])
            lengths.append([len_pos, len_neg])

        # Return the results only after the rollout loop finishes, so a single
        # task can contain multiple antithetic rollout pairs.
        return Result(
            noise_inds_n=np.array(noise_inds),
            returns_n2=np.array(returns, dtype=np.float32),
            sign_returns_n2=np.array(sign_returns, dtype=np.float32),
            lengths_n2=np.array(lengths, dtype=np.int32),
            eval_return=None,
            eval_length=None,
            ob_sum=(None if task_ob_stat.count == 0 else task_ob_stat.sum),
            ob_sumsq=(None
                      if task_ob_stat.count == 0 else task_ob_stat.sumsq),
            ob_count=task_ob_stat.count)
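Each row of returns_n2 pairs the return of the +perturbation rollout with the return of the matching -perturbation rollout (mirrored, or antithetic, sampling). On the driver side, one common way to turn these pairs into an update is to rank-normalize the returns and weight each noise vector by the difference between its positive and negative rollout scores. The sketch below is an illustrative, simplified version of that step; the function names and the exact rank transform are assumptions, not code from this page:

import numpy as np


def centered_ranks(x):
    # Map returns to ranks in [-0.5, 0.5] so the update is insensitive to the
    # absolute scale of the rewards.
    ranks = np.empty(x.size, dtype=np.float32)
    ranks[x.ravel().argsort()] = np.arange(x.size, dtype=np.float32)
    return (ranks / (x.size - 1) - 0.5).reshape(x.shape)


def es_gradient(noise_table, noise_inds, returns_n2, num_params, noise_stdev):
    # returns_n2[i] holds [R(theta + sigma * eps_i), R(theta - sigma * eps_i)].
    proc = centered_ranks(returns_n2)
    # Antithetic estimate: weight each noise vector by the difference between
    # the processed returns of its +/- rollouts.
    weights = proc[:, 0] - proc[:, 1]
    grad = np.zeros(num_params, dtype=np.float32)
    for w, idx in zip(weights, noise_inds):
        grad += w * noise_table.get(idx, num_params)
    return grad / (len(noise_inds) * noise_stdev)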