def train_step(self):
    """
    Perform one update step of the policy weights.
    """
    # g_hat = self.aggregate_rollouts()
    # print("Euclidean norm of update step:", np.linalg.norm(g_hat))
    # self.w_policy -= self.optimizer._compute_step(g_hat).reshape(self.w_policy.shape)
    deltas_idx, rollout_rewards = self.aggregate_rollouts()
    rollout_rewards = self.compute_centered_ranks(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form g_hat, the gradient used to compute the SGD step
    for name in deltas_idx.keys():
        g_hat, count = utils.batched_weighted_sum(
            rollout_rewards[:, 0] - rollout_rewards[:, 1],
            (self.deltas.get(idx, self.w_policy[name].size)
             for idx in deltas_idx[name]),
            batch_size=500)
        g_hat /= len(deltas_idx[name])
        print("{0}-Euclidean norm of update step:".format(name),
              np.linalg.norm(g_hat * self.step_size))
        self.w_policy[name] += g_hat.reshape(
            self.w_policy[name].shape) * self.step_size
    self.policy.update_weights(self.w_policy)
    t2 = time.time()
    print('time to aggregate rollouts', t2 - t1)
    return
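# The rank transform above maps raw returns to values in [-0.5, 0.5] before
# the weighted sum, making the update invariant to the scale of rewards.
# A minimal sketch of what `compute_centered_ranks` might look like, modeled
# on the widely used evolution-strategies formulation (an assumption; the
# actual helper is not shown in this snippet):

import numpy as np

def compute_ranks(x):
    """Return ranks in [0, len(x) - 1] (unlike scipy.stats.rankdata,
    which ranks from 1)."""
    assert x.ndim == 1
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks

def compute_centered_ranks(x):
    """Map rewards to centered ranks in [-0.5, 0.5]."""
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y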
def learn(self, noisy_rewards, noises):
    """Update the weights of the model at the numpy level.

    Compute the gradient and take one optimizer step.

    Args:
        noisy_rewards(np.float32): [batch_size, 2]
        noises(np.float32): [batch_size, weights_total_size]
    """
    # mirrored sampling: each row of `noises` was evaluated as a pair of
    # perturbations (+epsilon, -epsilon)
    g, count = utils.batched_weighted_sum(
        noisy_rewards[:, 0] - noisy_rewards[:, 1],
        noises,
        batch_size=500)
    g /= noisy_rewards.size

    latest_flat_weights = self.get_flat_weights()
    # compute the new weights theta
    theta, update_ratio = self.optimizer.update(
        latest_flat_weights,
        -g + self.config["l2_coeff"] * latest_flat_weights)
    self.set_flat_weights(theta)
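# `utils.batched_weighted_sum` is called throughout these snippets: it sums
# reward-weighted perturbation vectors in fixed-size batches so the full
# [num_deltas, num_params] matrix never has to be materialized at once.
# A sketch of a compatible implementation (returning both the weighted sum
# and the item count, as the `g, count = ...` call sites assume):

import numpy as np

def itergroups(items, group_size):
    """Yield tuples of at most `group_size` consecutive items."""
    assert group_size >= 1
    group = []
    for x in items:
        group.append(x)
        if len(group) == group_size:
            yield tuple(group)
            del group[:]
    if group:
        yield tuple(group)

def batched_weighted_sum(weights, vecs, batch_size):
    """Compute sum_i weights[i] * vecs[i] in batches of `batch_size`."""
    total = 0.0
    num_items_summed = 0
    for batch_weights, batch_vecs in zip(
            itergroups(weights, batch_size), itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
        total += np.dot(
            np.asarray(batch_weights, dtype=np.float64),
            np.asarray(batch_vecs, dtype=np.float64))
        num_items_summed += len(batch_weights)
    return total, num_items_summed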
def compute_error(self, deltas_idx, rollout_rewards, w_policy):
    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
        max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    if np.std(rollout_rewards) > 0:
        rollout_rewards /= np.std(rollout_rewards)

    # aggregate rollouts to form g_hat, the gradient used to compute the SGD step
    g_hat, count = utils.batched_weighted_sum(
        rollout_rewards[:, 0] - rollout_rewards[:, 1],
        (self.deltas.get(idx, w_policy.size) for idx in deltas_idx),
        batch_size=500)
    g_hat /= deltas_idx.size
    return g_hat
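# The percentile filter above keeps only the top `deltas_used` of
# `num_deltas` directions, ranked by the better reward of each mirrored
# pair. A tiny standalone worked example of that selection (hypothetical
# numbers, same formula):

import numpy as np

rollout_rewards = np.array([[1.0, 3.0],
                            [5.0, 2.0],
                            [0.5, 0.2],
                            [4.0, 4.5]])
deltas_used, num_deltas = 2, 4

max_rewards = np.max(rollout_rewards, axis=1)   # [3.0, 5.0, 0.5, 4.5]
cutoff = np.percentile(max_rewards,
                       100 * (1 - deltas_used / num_deltas))  # 3.75
idx = np.arange(max_rewards.size)[max_rewards >= cutoff]
print(idx)  # [1 3]: only the two best directions survive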
def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
    """
    Aggregate update step from rollouts generated in parallel.
    """
    if num_rollouts is None:
        num_deltas = self.num_deltas
    else:
        num_deltas = num_rollouts

    # put policy weights in the object store
    policy_id = ray.put(self.w_policy)

    t1 = time.time()
    num_rollouts = int(num_deltas / self.num_workers)

    # parallel generation of rollouts
    rollout_ids_one = [
        worker.do_rollouts.remote(
            policy_id,
            num_rollouts=num_rollouts,
            shift=self.shift,
            evaluate=evaluate) for worker in self.workers
    ]
    rollout_ids_two = [
        worker.do_rollouts.remote(
            policy_id, num_rollouts=1, shift=self.shift, evaluate=evaluate)
        for worker in self.workers[:(num_deltas % self.num_workers)]
    ]

    # gather results
    results_one = ray.get(rollout_ids_one)
    results_two = ray.get(rollout_ids_two)

    rollout_rewards, deltas_idx = [], []
    for result in results_one:
        if not evaluate:
            self.timesteps += result["steps"]
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']
    for result in results_two:
        if not evaluate:
            self.timesteps += result["steps"]
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']

    deltas_idx = np.array(deltas_idx)
    rollout_rewards = np.array(rollout_rewards, dtype=np.float64)
    print('Maximum reward of collected rollouts:', rollout_rewards.max())

    t2 = time.time()
    print('Time to generate rollouts:', t2 - t1)

    if evaluate:
        return rollout_rewards

    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
        max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    # (guarded against a zero std, which would produce NaNs)
    if np.std(rollout_rewards) > 0:
        rollout_rewards /= np.std(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form g_hat, the gradient used to compute the SGD step
    g_hat, count = utils.batched_weighted_sum(
        rollout_rewards[:, 0] - rollout_rewards[:, 1],
        (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx),
        batch_size=500)
    g_hat /= deltas_idx.size
    t2 = time.time()
    print('time to aggregate rollouts', t2 - t1)
    return g_hat
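# `self.deltas.get(idx, size)` reads a perturbation direction back out of a
# shared noise table by index, so workers ship integer indices instead of
# full weight-sized vectors. A minimal sketch of such a table (the interface
# is assumed from the call sites; real implementations typically back
# `noise` with shared memory so all Ray workers read the same array):

import numpy as np

class SharedNoiseTable(object):
    def __init__(self, noise, seed=11):
        self.rg = np.random.RandomState(seed)
        self.noise = noise  # large 1-D array of standard normal samples
        assert self.noise.dtype == np.float64

    def sample_index(self, dim):
        # random offset such that a slice of length `dim` fits in the table
        return self.rg.randint(0, len(self.noise) - dim + 1)

    def get(self, i, dim):
        # reconstruct a perturbation from its integer index
        return self.noise[i:i + dim]

    def get_delta(self, dim):
        idx = self.sample_index(dim)
        return idx, self.get(idx, dim)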
def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
    """
    Aggregate update step from rollouts generated in parallel.
    """
    if num_rollouts is None:
        num_deltas = self.num_deltas
    else:
        num_deltas = num_rollouts

    # put policy weights in the object store
    policy_id = ray.put(self.w_policy)

    t1 = time.time()
    num_rollouts = int(num_deltas / self.num_workers)

    # parallel generation of rollouts
    rollout_ids_one = [
        worker.do_rollouts.remote(
            policy_id,
            num_rollouts=num_rollouts,
            shift=self.shift,
            evaluate=evaluate) for worker in self.workers
    ]
    rollout_ids_two = [
        worker.do_rollouts.remote(
            policy_id, num_rollouts=1, shift=self.shift, evaluate=evaluate)
        for worker in self.workers[:(num_deltas % self.num_workers)]
    ]

    # gather results
    results_one = ray.get(rollout_ids_one)
    results_two = ray.get(rollout_ids_two)

    rollout_rewards, deltas_idx = [], []
    all_transitions = []
    for result in results_one:
        if not evaluate:
            self.timesteps += result["steps"]
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']
        all_transitions += result['transitions']
    for result in results_two:
        if not evaluate:
            self.timesteps += result["steps"]
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']
        all_transitions += result['transitions']

    deltas_idx = np.array(deltas_idx)
    rollout_rewards = np.array(rollout_rewards, dtype=np.float64)

    # push all collected transitions (state, action, reward, next_state)
    # into the replay buffer
    for tran in all_transitions:
        self.memory.push(
            torch.from_numpy(tran[0]).unsqueeze(0).to(device).float(),
            torch.tensor([[tran[1]]], device=device, dtype=torch.long),
            torch.from_numpy(tran[3]).unsqueeze(0).float().to(device),
            torch.tensor([tran[2]], device=device))

    print('Maximum reward of collected rollouts:', rollout_rewards.max())

    t2 = time.time()
    print('Time to generate rollouts:', t2 - t1)

    if evaluate:
        return rollout_rewards

    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
        max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    if np.std(rollout_rewards) != 0:
        rollout_rewards /= np.std(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form g_hat, the gradient used to compute the SGD step
    g_hat, count = utils.batched_weighted_sum(
        rollout_rewards[:, 0] - rollout_rewards[:, 1],
        (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx),
        batch_size=500)
    g_hat /= deltas_idx.size
    t2 = time.time()
    print('time to aggregate rollouts', t2 - t1)
    return g_hat
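# The `self.memory.push(state, action, next_state, reward)` call above
# assumes a replay buffer in the style of the PyTorch DQN tutorial. A
# minimal sketch of a compatible buffer (the actual class is not shown in
# this snippet, so names and capacity handling are assumptions):

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Save a transition, overwriting the oldest one when full."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)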
def population_update(master, params):
    timesteps = 0
    rwds, embeddings, agent_deltas, data = [], [], [], []
    num_rollouts = int(params['num_sensings'] / params['num_workers'])
    params['num_sensings'] = int(num_rollouts * params['num_workers'])

    # get rewards/trajectory info
    for i in range(params['num_agents']):
        filter_id = ray.put(master.agents[i].observation_filter)
        setting_filters_ids = [
            worker.sync_filter.remote(filter_id) for worker in master.workers
        ]
        ray.get(setting_filters_ids)
        increment_filters_ids = [
            worker.stats_increment.remote() for worker in master.workers
        ]
        ray.get(increment_filters_ids)

        use_states = 1 if params['embedding'] == 'a_s' else 0
        policy_id = ray.put(master.agents[i].params)
        rollout_ids = [
            worker.do_rollouts.remote(policy_id, num_rollouts,
                                      master.selected, use_states)
            for worker in master.workers
        ]
        results = ray.get(rollout_ids)

        for j in range(params['num_workers']):
            master.agents[i].observation_filter.update(
                ray.get(master.workers[j].get_filter.remote()))
        master.agents[i].observation_filter.stats_increment()
        master.agents[i].observation_filter.clear_buffer()

        # harvest the worker data: rewards, indices, sparsities, embeddings
        rollout_rewards, deltas_idx, sparsities, emb_selected = [], [], [], []
        for result in results:
            deltas_idx += result['deltas_idx']
            rollout_rewards += result['rollout_rewards']
            timesteps += result['steps']
            sparsities += result['sparsities']
            data += result['data']
            emb_selected += result['embedding']
        rwds.append(np.array(rollout_rewards))
        embeddings.append(emb_selected)
        agent_deltas.append(np.array(deltas_idx))

    # get the corresponding determinants
    if params['w_nov'] > 0:
        dets = np.zeros(np.array(rollout_rewards).shape)
        for i in range(num_rollouts * params['num_workers']):
            pop = np.concatenate(
                ([x[i][0].reshape(embeddings[0][0][0].size, 1)
                  for x in embeddings]), axis=1).T
            pop = normalize(pop, pop)
            dets[i, 0] = get_det(pop, params)

            pop = np.concatenate(
                ([x[i][1].reshape(embeddings[0][0][0].size, 1)
                  for x in embeddings]), axis=1).T
            pop = normalize(pop, pop)
            dets[i, 1] = get_det(pop, params)
        dets = (dets - np.mean(dets)) / (np.std(dets) + 1e-8)
    else:
        dets = np.zeros(np.array(rollout_rewards).shape)

    # pass all the aggregate info to the master policy
    master.buffer = data
    # add a random sample of the states to a state buffer, then only keep
    # the last 10 iterations
    master.states = [x[0] for t in data for x in t[0]] + \
                    [x[0] for t in data for x in t[1]]

    # individually update the policies
    g_hat = []
    for i in range(params['num_agents']):
        deltas_idx = np.array(agent_deltas[i])
        rollout_rewards = np.array(rwds[i], dtype=np.float64)
        rollout_rewards = (rollout_rewards - np.mean(rollout_rewards)) / (
            np.std(rollout_rewards) + 1e-8)
        # blend novelty (determinant) scores with task reward
        rollout_rewards = params['w_nov'] * dets + (
            1 - params['w_nov']) * rollout_rewards
        g, count = batched_weighted_sum(
            rollout_rewards[:, 0] - rollout_rewards[:, 1],
            (master.deltas.get(idx, master.policy.params.size)
             for idx in deltas_idx),
            batch_size=500)
        g /= deltas_idx.size
        g_hat.append(g)
    g_hat = np.concatenate(g_hat)
    return (g_hat, timesteps)
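# `get_det` scores the diversity of the population of behavioural embeddings
# via the determinant of a kernel matrix: the more spread out the rows of
# `pop`, the larger the determinant. A plausible sketch assuming a
# squared-exponential kernel and a hypothetical 'kernel_l' bandwidth entry
# in `params` (both assumptions; the real helper is not shown here):

import numpy as np

def get_det(pop, params):
    """Determinant of the RBF kernel matrix over row-wise embeddings."""
    sq = np.sum(pop * pop, axis=1)
    sq_dists = sq[:, None] + sq[None, :] - 2.0 * pop @ pop.T
    bandwidth = params.get('kernel_l', 1.0)  # assumed hyperparameter name
    K = np.exp(-sq_dists / (2.0 * bandwidth ** 2))
    # small jitter keeps the kernel matrix numerically positive definite
    return np.linalg.det(K + 1e-6 * np.eye(K.shape[0]))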
def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
    """
    Aggregate update step from rollouts generated in parallel.
    """
    if num_rollouts is None:
        num_deltas = self.num_deltas
    else:
        num_deltas = num_rollouts

    results_one = []  # rollout_ids_one
    results_two = []  # rollout_ids_two

    t1 = time.time()
    num_rollouts = int(num_deltas / self.num_workers)

    # if num_rollouts > 0:
    #     with futures.ThreadPoolExecutor(
    #             max_workers=self.num_workers) as executor:
    #         workers = [
    #             executor.submit(
    #                 worker.do_rollouts,
    #                 self.w_policy,
    #                 num_rollouts=num_rollouts,
    #                 shift=self.shift,
    #                 evaluate=evaluate) for worker in self.workers
    #         ]
    #         for worker in futures.as_completed(workers):
    #             results_one.append(worker.result())
    #
    #         workers = [
    #             executor.submit(
    #                 worker.do_rollouts,
    #                 self.w_policy,
    #                 num_rollouts=1,
    #                 shift=self.shift,
    #                 evaluate=evaluate)
    #             for worker in self.workers[:(num_deltas % self.num_workers)]
    #         ]
    #         for worker in futures.as_completed(workers):
    #             results_two.append(worker.result())

    # sequential generation of rollouts (the workers run in-process here;
    # the thread-pool variant is kept above for reference)
    rollout_ids_one = [
        worker.do_rollouts(
            self.w_policy,
            num_rollouts=num_rollouts,
            shift=self.shift,
            evaluate=evaluate) for worker in self.workers
    ]
    rollout_ids_two = [
        worker.do_rollouts(
            self.w_policy, num_rollouts=1, shift=self.shift,
            evaluate=evaluate)
        for worker in self.workers[:(num_deltas % self.num_workers)]
    ]
    results_one = rollout_ids_one
    results_two = rollout_ids_two

    # gather results
    rollout_rewards, deltas_idx = [], []
    for result in results_one:
        if not evaluate:
            self.timesteps += result['steps']
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']
    for result in results_two:
        if not evaluate:
            self.timesteps += result['steps']
        deltas_idx += result['deltas_idx']
        rollout_rewards += result['rollout_rewards']

    deltas_idx = np.array(deltas_idx)
    rollout_rewards = np.array(rollout_rewards, dtype=np.float64)
    print('Maximum reward of collected rollouts:', rollout_rewards.max())
    info_dict = {"max_reward": rollout_rewards.max()}

    t2 = time.time()
    print('Time to generate rollouts:', t2 - t1)

    if evaluate:
        return rollout_rewards

    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    idx = np.arange(max_rewards.size)[max_rewards >= np.percentile(
        max_rewards, 100 * (1 - (self.deltas_used / self.num_deltas)))]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    rollout_rewards /= np.std(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form g_hat, the gradient used to compute the SGD step
    g_hat, count = utils.batched_weighted_sum(
        rollout_rewards[:, 0] - rollout_rewards[:, 1],
        (self.deltas.get(idx, self.w_policy.size) for idx in deltas_idx),
        batch_size=500)
    g_hat /= deltas_idx.size
    t2 = time.time()
    print('time to aggregate rollouts', t2 - t1)
    return g_hat, info_dict
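# Each worker's `do_rollouts` is expected to return a dict with
# 'deltas_idx', 'rollout_rewards' (one [+delta, -delta] reward pair per
# row), and 'steps'. A minimal sketch of a compatible worker method,
# assuming `self.rollout(w)` returns (reward, steps) for a flat weight
# vector and `self.delta_std` scales the noise (both names hypothetical):

def do_rollouts(self, w_policy, num_rollouts=1, shift=1, evaluate=False):
    rollout_rewards, deltas_idx, steps = [], [], 0
    for _ in range(num_rollouts):
        if evaluate:
            deltas_idx.append(-1)  # sentinel: no perturbation applied
            reward, r_steps = self.rollout(w_policy)
            rollout_rewards.append(reward)
        else:
            idx, delta = self.deltas.get_delta(w_policy.size)
            delta = self.delta_std * delta.reshape(w_policy.shape)
            deltas_idx.append(idx)
            # mirrored sampling: evaluate +delta and -delta on the same index
            pos_reward, pos_steps = self.rollout(w_policy + delta)
            neg_reward, neg_steps = self.rollout(w_policy - delta)
            rollout_rewards.append([pos_reward, neg_reward])
            steps += pos_steps + neg_steps
    return {'deltas_idx': deltas_idx,
            'rollout_rewards': rollout_rewards,
            'steps': steps}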
# Assemble the results.
noise_inds_n = np.concatenate(
    [r.noise_inds_n for r in curr_task_results])
returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
assert noise_inds_n.shape[0] == returns_n2.shape[0] == lengths_n2.shape[0]

# Process the returns.
if config.return_proc_mode == "centered_rank":
    proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
else:
    raise NotImplementedError(config.return_proc_mode)

# Compute and take a step.
g, count = utils.batched_weighted_sum(
    proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
    (noise.get(idx, policy.num_params) for idx in noise_inds_n),
    batch_size=500)
g /= returns_n2.size
assert (g.shape == (policy.num_params, ) and g.dtype == np.float32
        and count == len(noise_inds_n))
update_ratio = optimizer.update(-g + config.l2coeff * theta)

# Update ob stat (we're never running the policy in the master, but we
# might be snapshotting the policy).
if policy.needs_ob_stat:
    policy.set_ob_stat(ob_stat.mean, ob_stat.std)

step_tend = time.time()
tlogger.record_tabular("EpRewMean", returns_n2.mean())
tlogger.record_tabular("EpRewStd", returns_n2.std())
tlogger.record_tabular("EpLenMean", lengths_n2.mean())
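# `optimizer.update(-g + config.l2coeff * theta)` applies the negated
# gradient estimate plus L2 regularisation through a stateful flat-vector
# optimizer and returns |step| / |theta| as a diagnostic. A sketch of that
# interface with SGD-plus-momentum, modeled on the OpenAI evolution-
# strategies reference code (details assumed, not taken from this snippet):

import numpy as np

class Optimizer(object):
    def __init__(self, pi):
        self.pi = pi
        self.dim = pi.num_params
        self.t = 0

    def update(self, globalg):
        """Apply one step; return the update ratio |step| / |theta|."""
        self.t += 1
        step = self._compute_step(globalg)
        theta = self.pi.get_trainable_flat()
        ratio = np.linalg.norm(step) / np.linalg.norm(theta)
        self.pi.set_trainable_flat(theta + step)
        return ratio

class SGD(Optimizer):
    def __init__(self, pi, stepsize, momentum=0.9):
        Optimizer.__init__(self, pi)
        self.v = np.zeros(self.dim, dtype=np.float32)
        self.stepsize, self.momentum = stepsize, momentum

    def _compute_step(self, globalg):
        # exponential moving average of the gradient, scaled by -stepsize
        self.v = self.momentum * self.v + (1. - self.momentum) * globalg
        return -self.stepsize * self.v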