Example #1
    def train_step(self):
        """ 
        Perform one update step of the policy weights.
        """

        # g_hat = self.aggregate_rollouts()
        # print("Euclidean norm of update step:", np.linalg.norm(g_hat))
        # self.w_policy -= self.optimizer._compute_step(g_hat).reshape(self.w_policy.shape)

        deltas_idx, rollout_rewards = self.aggregate_rollouts()
        rollout_rewards = self.compute_centered_ranks(rollout_rewards)
        t1 = time.time()
        # aggregate rollouts to form g_hat, the gradient used to compute SGD step
        for name in deltas_idx.keys():
            g_hat, count = utils.batched_weighted_sum(
                rollout_rewards[:, 0] - rollout_rewards[:, 1],
                (self.deltas.get(idx, self.w_policy[name].size)
                 for idx in deltas_idx[name]),
                batch_size=500)
            g_hat /= len(deltas_idx[name])
            print("{0}-Euclidean norm of update step:".format(name),
                  np.linalg.norm(g_hat * self.step_size))
            self.w_policy[name] += g_hat.reshape(
                self.w_policy[name].shape) * self.step_size
        self.policy.update_weights(self.w_policy)
        t2 = time.time()
        print('time to aggregate rollouts', t2 - t1)

        return
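All of these snippets form the same antithetic-sampling estimate, g_hat ≈ (1/N) * sum_k (R_k+ - R_k-) * delta_k, through utils.batched_weighted_sum. As a reference, here is a minimal sketch of that utility as it appears in the ARS and evolution-strategies-starter codebases these examples derive from; the exact helper may differ from project to project.

import numpy as np

def itergroups(items, group_size):
    """Yield consecutive tuples of at most group_size items."""
    assert group_size >= 1
    group = []
    for x in items:
        group.append(x)
        if len(group) == group_size:
            yield tuple(group)
            del group[:]
    if group:
        yield tuple(group)

def batched_weighted_sum(weights, vecs, batch_size):
    """Accumulate sum_k weights[k] * vecs[k] in batches, so the full
    (num_deltas, num_params) matrix never has to be materialized at once."""
    total = 0.0
    num_items_summed = 0
    for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
                                         itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
        # dot of a (b,) weight vector with a (b, num_params) matrix of deltas
        total += np.dot(np.asarray(batch_weights, dtype=np.float64),
                        np.asarray(batch_vecs, dtype=np.float64))
        num_items_summed += len(batch_weights)
    return total, num_items_summed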
Example #2
    def learn(self, noisy_rewards, noises):
        """ Update weights of the model in the numpy level.

        Compute the grident and take a step.

        Args:
            noisy_rewards(np.float32): [batch_size, 2]
            noises(np.float32): [batch_size, weights_total_size]
        """

        g, count = utils.batched_weighted_sum(
            # mirrored sampling: evaluate pairs of perturbations \epsilon, −\epsilon
            noisy_rewards[:, 0] - noisy_rewards[:, 1],
            noises,
            batch_size=500)
        g /= noisy_rewards.size

        latest_flat_weights = self.get_flat_weights()
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(
            latest_flat_weights,
            -g + self.config["l2_coeff"] * latest_flat_weights)
        self.set_flat_weights(theta)
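A small usage sketch against the batched_weighted_sum sketched under Example #1, with hypothetical shapes matching the docstring (8 mirrored pairs, 10 flat weights). With everything in a single batch the utility reduces to a plain matrix-vector product, which is an easy sanity check on the update direction.

import numpy as np

# Hypothetical sizes: batch_size=8 mirrored pairs, weights_total_size=10.
noisy_rewards = np.random.randn(8, 2).astype(np.float32)
noises = np.random.randn(8, 10).astype(np.float32)

g, count = batched_weighted_sum(noisy_rewards[:, 0] - noisy_rewards[:, 1],
                                noises,
                                batch_size=500)
g /= noisy_rewards.size
# Everything fits in one batch here, so g is just (R+ - R-) @ noises / (2 * 8).
assert np.allclose(
    g, (noisy_rewards[:, 0] - noisy_rewards[:, 1]) @ noises / noisy_rewards.size)
assert count == 8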
Example #3
    def compute_error(self, deltas_idx, rollout_rewards, w_policy):
        # select top performing directions if deltas_used < num_deltas
        max_rewards = np.max(rollout_rewards, axis=1)

        if self.deltas_used > self.num_deltas:
            self.deltas_used = self.num_deltas

        idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
            max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
        deltas_idx = deltas_idx[idx]
        rollout_rewards = rollout_rewards[idx, :]

        # normalize rewards by their standard deviation
        if np.std(rollout_rewards) > 0:
            rollout_rewards /= np.std(rollout_rewards)

        t1 = time.time()
        # aggregate rollouts to form g_hat, the gradient used to compute SGD step
        g_hat, count = utils.batched_weighted_sum(
            rollout_rewards[:, 0] - rollout_rewards[:, 1],
            (self.deltas.get(idx, w_policy.size) for idx in deltas_idx),
            batch_size=500)
        g_hat /= deltas_idx.size
        return g_hat
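The percentile filter above keeps only the deltas_used best of num_deltas directions, judged by the better reward of each +delta/-delta pair. A tiny standalone illustration of that selection with hypothetical numbers (deltas_used=2, num_deltas=4):

import numpy as np

deltas_used, num_deltas = 2, 4
rollout_rewards = np.array([[1.0, 0.5],
                            [3.0, 0.2],
                            [0.1, 0.4],
                            [2.0, 2.5]])
max_rewards = np.max(rollout_rewards, axis=1)   # [1.0, 3.0, 0.4, 2.5]
threshold = np.percentile(max_rewards,
                          100 * (1 - deltas_used / num_deltas))
idx = np.arange(max_rewards.size)[max_rewards >= threshold]
print(idx)  # [1 3] -- only the two best-performing directions survive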
Example #4
File: ars.py Project: zhan0903/ARS
    def aggregate_rollouts(self, num_rollouts = None, evaluate = False):
        """ 
        Aggregate update step from rollouts generated in parallel.
        """

        if num_rollouts is None:
            num_deltas = self.num_deltas
        else:
            num_deltas = num_rollouts
            
        # put policy weights in the object store
        policy_id = ray.put(self.w_policy)

        t1 = time.time()
        num_rollouts = int(num_deltas / self.num_workers)
            
        # parallel generation of rollouts
        rollout_ids_one = [worker.do_rollouts.remote(policy_id,
                                                 num_rollouts = num_rollouts,
                                                 shift = self.shift,
                                                 evaluate=evaluate) for worker in self.workers]

        rollout_ids_two = [worker.do_rollouts.remote(policy_id,
                                                 num_rollouts = 1,
                                                 shift = self.shift,
                                                 evaluate=evaluate) for worker in self.workers[:(num_deltas % self.num_workers)]]

        # gather results 
        results_one = ray.get(rollout_ids_one)
        results_two = ray.get(rollout_ids_two)

        rollout_rewards, deltas_idx = [], [] 

        for result in results_one:
            if not evaluate:
                self.timesteps += result["steps"]
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']

        for result in results_two:
            if not evaluate:
                self.timesteps += result["steps"]
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']

        deltas_idx = np.array(deltas_idx)
        rollout_rewards = np.array(rollout_rewards, dtype = np.float64)
        
        print('Maximum reward of collected rollouts:', rollout_rewards.max())
        t2 = time.time()

        print('Time to generate rollouts:', t2 - t1)

        if evaluate:
            return rollout_rewards

        # select top performing directions if deltas_used < num_deltas
        max_rewards = np.max(rollout_rewards, axis = 1)
        if self.deltas_used > self.num_deltas:
            self.deltas_used = self.num_deltas
            
        idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(max_rewards, 100*(1 - (self.deltas_used / self.num_deltas)))]
        deltas_idx = deltas_idx[idx]
        rollout_rewards = rollout_rewards[idx,:]
        
        # normalize rewards by their standard deviation
        rollout_rewards /= np.std(rollout_rewards)

        t1 = time.time()
        # aggregate rollouts to form g_hat, the gradient used to compute SGD step
        g_hat, count = utils.batched_weighted_sum(rollout_rewards[:,0] - rollout_rewards[:,1],
                                                  (self.deltas.get(idx, self.w_policy.size)
                                                   for idx in deltas_idx),
                                                  batch_size = 500)
        g_hat /= deltas_idx.size
        t2 = time.time()
        print('time to aggregate rollouts', t2 - t1)
        return g_hat
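The work split at the top of aggregate_rollouts gives every worker int(num_deltas / num_workers) directions and hands the remainder, one each, to the first num_deltas % num_workers workers via the second round of do_rollouts calls. A quick arithmetic check with hypothetical sizes:

# Hypothetical sizes: 13 directions spread over 4 workers.
num_deltas, num_workers = 13, 4
per_worker = int(num_deltas / num_workers)   # 3 directions on every worker
remainder = num_deltas % num_workers         # the first worker runs 1 extra
assert per_worker * num_workers + remainder == num_deltas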
Example #5
    def aggregate_rollouts(self, num_rollouts = None, evaluate = False):
        """ 
        Aggregate update step from rollouts generated in parallel.
        """

        if num_rollouts is None:
            num_deltas = self.num_deltas
            #print("TRAIN")
        else:
            num_deltas = num_rollouts
            #print("TEST")
            
        # put policy weights in the object store
        policy_id = ray.put(self.w_policy)

        t1 = time.time()
        num_rollouts = int(num_deltas / self.num_workers)
        #print("NUM_ROLLOUTS {}".format(num_rollouts))
            
        # parallel generation of rollouts
        rollout_ids_one = [worker.do_rollouts.remote(policy_id,
                                                 num_rollouts = num_rollouts,
                                                 shift = self.shift,
                                                 evaluate=evaluate) for worker in self.workers]

        rollout_ids_two = [worker.do_rollouts.remote(policy_id,
                                                 num_rollouts = 1,
                                                 shift = self.shift,
                                                 evaluate=evaluate) for worker in self.workers[:(num_deltas % self.num_workers)]]

        # gather results 
        results_one = ray.get(rollout_ids_one)
        results_two = ray.get(rollout_ids_two)

        rollout_rewards, deltas_idx = [], [] 
        all_transitions = []

        for result in results_one:
            if not evaluate:
                self.timesteps += result["steps"]
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']
            all_transitions += result['transitions']

        for result in results_two:
            if not evaluate:
                self.timesteps += result["steps"]
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']
            all_transitions += result['transitions']

        deltas_idx = np.array(deltas_idx)
        rollout_rewards = np.array(rollout_rewards, dtype = np.float64)
        

        # Push all collected transitions into the replay buffer; from the
        # indexing, each transition tuple appears to hold
        # (state, action, reward, next_state).
        for tran in all_transitions:
            self.memory.push(
                torch.from_numpy(tran[0]).unsqueeze(0).float().to(device),   # state
                torch.tensor([[tran[1]]], device=device, dtype=torch.long),  # action
                torch.from_numpy(tran[3]).unsqueeze(0).float().to(device),   # next state
                torch.tensor([tran[2]], device=device))                      # reward


        print('Maximum reward of collected rollouts:', rollout_rewards.max())
        t2 = time.time()

        print('Time to generate rollouts:', t2 - t1)

        if evaluate:
            return rollout_rewards

        # select top performing directions if deltas_used < num_deltas
        max_rewards = np.max(rollout_rewards, axis = 1)
        if self.deltas_used > self.num_deltas:
            self.deltas_used = self.num_deltas
            
        idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(max_rewards, 100*(1 - (self.deltas_used / self.num_deltas)))]
        deltas_idx = deltas_idx[idx]
        rollout_rewards = rollout_rewards[idx,:]
        
        # normalize rewards by their standard deviation
        if np.std(rollout_rewards) != 0:
            rollout_rewards /= np.std(rollout_rewards)

        t1 = time.time()
        # aggregate rollouts to form g_hat, the gradient used to compute SGD step
        g_hat, count = utils.batched_weighted_sum(rollout_rewards[:,0] - rollout_rewards[:,1],
                                                  (self.deltas.get(idx, self.w_policy.size)
                                                   for idx in deltas_idx),
                                                  batch_size = 500)
        g_hat /= deltas_idx.size
        t2 = time.time()
        print('time to aggregate rollouts', t2 - t1)
        return g_hat
Example #6
def population_update(master, params):

    timesteps = 0
    rwds, embeddings, agent_deltas, data = [], [], [], []
    num_rollouts = int(params['num_sensings'] / params['num_workers'])
    params['num_sensings'] = int(num_rollouts * params['num_workers'])

    # get rewards/trajectory info
    for i in range(params['num_agents']):

        filter_id = ray.put(master.agents[i].observation_filter)
        setting_filters_ids = [
            worker.sync_filter.remote(filter_id) for worker in master.workers
        ]
        ray.get(setting_filters_ids)
        increment_filters_ids = [
            worker.stats_increment.remote() for worker in master.workers
        ]
        ray.get(increment_filters_ids)

        use_states = 1 if params['embedding'] == 'a_s' else 0
        policy_id = ray.put(master.agents[i].params)
        rollout_ids = [
            worker.do_rollouts.remote(policy_id, num_rollouts, master.selected,
                                      use_states) for worker in master.workers
        ]
        results = ray.get(rollout_ids)

        for j in range(params['num_workers']):
            master.agents[i].observation_filter.update(
                ray.get(master.workers[j].get_filter.remote()))
        master.agents[i].observation_filter.stats_increment()
        master.agents[i].observation_filter.clear_buffer()

        # harvest the per-worker results (rewards, delta indices, sparsities, data, embeddings)
        rollout_rewards, deltas_idx, sparsities, emb_selected = [], [], [], []
        for result in results:
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']
            timesteps += result['steps']
            sparsities += result['sparsities']
            data += result['data']
            emb_selected += result['embedding']

        rwds.append(np.array(rollout_rewards))
        embeddings.append(emb_selected)
        agent_deltas.append(np.array(deltas_idx))

    # Get the corresponding determinants
    if params['w_nov'] > 0:
        dets = np.zeros(np.array(rollout_rewards).shape)
        for i in range(num_rollouts * params['num_workers']):
            pop = np.concatenate(
                [x[i][0].reshape(embeddings[0][0][0].size, 1) for x in embeddings],
                axis=1).T
            pop = normalize(pop, pop)
            dets[i, 0] = get_det(pop, params)

            pop = np.concatenate(
                [x[i][1].reshape(embeddings[0][0][0].size, 1) for x in embeddings],
                axis=1).T
            pop = normalize(pop, pop)
            dets[i, 1] = get_det(pop, params)

        dets = (dets - np.mean(dets)) / (np.std(dets) + 1e-8)
    else:
        dets = np.zeros(np.array(rollout_rewards).shape)

    # pass all the aggregate info to the master policy
    master.buffer = data

    # add a random sample of the states to a state buffer, then only keep last 10 iterations
    master.states = [x[0] for t in data
                     for x in t[0]] + [x[0] for t in data for x in t[1]]

    # individually update the policies
    g_hat = []
    for i in range(params['num_agents']):
        deltas_idx = np.array(agent_deltas[i])
        rollout_rewards = np.array(rwds[i], dtype=np.float64)
        rollout_rewards = (rollout_rewards - np.mean(rollout_rewards)) / (
            np.std(rollout_rewards) + 1e-8)
        rollout_rewards = params['w_nov'] * dets + (
            1 - params['w_nov']) * rollout_rewards

        g, count = batched_weighted_sum(
            rollout_rewards[:, 0] - rollout_rewards[:, 1],
            (master.deltas.get(idx, master.policy.params.size)
             for idx in deltas_idx),
            batch_size=500)
        g /= deltas_idx.size
        g_hat.append(g)
    g_hat = np.concatenate(g_hat)

    return (g_hat, timesteps)
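Inside the per-agent loop, the shaped reward is a convex combination of the standardized novelty scores (dets) and the standardized task rewards, weighted by w_nov. A small standalone sketch of that mixing with hypothetical, already-standardized values:

import numpy as np

w_nov = 0.3  # hypothetical novelty weight
rollout_rewards = np.array([[1.2, -0.4], [-0.8, 0.9]])   # standardized task rewards
dets = np.array([[0.5, -0.5], [-1.0, 1.0]])              # standardized novelty scores
mixed = w_nov * dets + (1 - w_nov) * rollout_rewards
# mixed then feeds batched_weighted_sum exactly like the plain rewards above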
Example #7
    def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
        """
        Aggregate update step from rollouts generated in parallel.
        """

        if num_rollouts is None:
            num_deltas = self.num_deltas
        else:
            num_deltas = num_rollouts

        results_one = []  #rollout_ids_one
        results_two = []  #rollout_ids_two

        t1 = time.time()
        num_rollouts = int(num_deltas / self.num_workers)
        #     if num_rollouts > 0:
        #       with futures.ThreadPoolExecutor(
        #           max_workers=self.num_workers) as executor:
        #         workers = [
        #             executor.submit(
        #                 worker.do_rollouts,
        #                 self.w_policy,
        #                 num_rollouts=num_rollouts,
        #                 shift=self.shift,
        #                 evaluate=evaluate) for worker in self.workers
        #         ]
        #         for worker in futures.as_completed(workers):
        #           results_one.append(worker.result())
        #
        #       workers = [
        #           executor.submit(
        #               worker.do_rollouts,
        #               self.w_policy,
        #               num_rollouts=1,
        #               shift=self.shift,
        #               evaluate=evaluate)
        #           for worker in self.workers[:(num_deltas % self.num_workers)]
        #       ]
        #       for worker in futures.as_completed(workers):
        #         results_two.append(worker.result())

        # parallel generation of rollouts
        rollout_ids_one = [
            worker.do_rollouts(self.w_policy,
                               num_rollouts=num_rollouts,
                               shift=self.shift,
                               evaluate=evaluate) for worker in self.workers
        ]

        rollout_ids_two = [
            worker.do_rollouts(self.w_policy,
                               num_rollouts=1,
                               shift=self.shift,
                               evaluate=evaluate)
            for worker in self.workers[:(num_deltas % self.num_workers)]
        ]
        results_one = rollout_ids_one
        results_two = rollout_ids_two
        # gather results

        rollout_rewards, deltas_idx = [], []

        for result in results_one:
            if not evaluate:
                self.timesteps += result['steps']
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']

        for result in results_two:
            if not evaluate:
                self.timesteps += result['steps']
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']

        deltas_idx = np.array(deltas_idx)
        rollout_rewards = np.array(rollout_rewards, dtype=np.float64)

        print('Maximum reward of collected rollouts:', rollout_rewards.max())
        info_dict = {"max_reward": rollout_rewards.max()}
        t2 = time.time()

        print('Time to generate rollouts:', t2 - t1)

        if evaluate:
            return rollout_rewards

        # select top performing directions if deltas_used < num_deltas
        max_rewards = np.max(rollout_rewards, axis=1)
        if self.deltas_used > self.num_deltas:
            self.deltas_used = self.num_deltas

        idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
            max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
        deltas_idx = deltas_idx[idx]
        rollout_rewards = rollout_rewards[idx, :]

        # normalize rewards by their standard deviation
        rollout_rewards /= np.std(rollout_rewards)

        t1 = time.time()
        # aggregate rollouts to form g_hat, the gradient used to compute SGD step
        g_hat, count = utils.batched_weighted_sum(
            rollout_rewards[:, 0] - rollout_rewards[:, 1],
            (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx),
            batch_size=500)
        g_hat /= deltas_idx.size
        t2 = time.time()
        print('time to aggregate rollouts', t2 - t1)
        return g_hat, info_dict
Example #8
        # Assemble the results.
        noise_inds_n = np.concatenate(
            [r.noise_inds_n for r in curr_task_results])
        returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
        lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
        assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
                lengths_n2.shape[0])
        # Process the returns.
        if config.return_proc_mode == "centered_rank":
            proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
        else:
            raise NotImplementedError(config.return_proc_mode)

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (noise.get(idx, policy.num_params) for idx in noise_inds_n),
            batch_size=500)
        g /= returns_n2.size
        assert (g.shape == (policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        update_ratio = optimizer.update(-g + config.l2coeff * theta)

        # Update ob stat (we're never running the policy in the master, but we
        # might be snapshotting the policy).
        if policy.needs_ob_stat:
            policy.set_ob_stat(ob_stat.mean, ob_stat.std)

        step_tend = time.time()
        tlogger.record_tabular("EpRewMean", returns_n2.mean())
        tlogger.record_tabular("EpRewStd", returns_n2.std())
        tlogger.record_tabular("EpLenMean", lengths_n2.mean())