def __init__(self, observation_spaces, action_spaces, shared=False,
             hyperparameters=None, name=None):
    '''
    Build a group of policies, optionally sharing a single policy.

    :param observation_spaces: (dict) Maps agent names to observation spaces.
    :param action_spaces: (dict) Maps agent names to action spaces.
    :param shared: (bool) If True, all agents share one policy network.
    :param hyperparameters: (dict) Optional hyperparameters for the policies.
    :param name: (str) Optional name for the group.
    '''
    name = 'policy_group' if name is None else name
    self.hyperparameters = hyperparameters if hyperparameters else {}
    self.shared = next(iter(observation_spaces.keys())) if shared else None
    if shared:
        shared_obs_space = observation_spaces[self.shared]
        shared_act_space = action_spaces[self.shared]
        shared_policy = Policy(shared_obs_space, shared_act_space,
                               name='shared_policy')
        policies = {}
        for pname, (obs_space, act_space) in zip_map(observation_spaces,
                                                     action_spaces):
            # A shared policy requires every agent to have identical spaces.
            assert shared_obs_space == obs_space
            assert shared_act_space == act_space
            policies[pname] = shared_policy
    else:
        policies = {
            pname: Policy(obs_space, act_space, name=pname)
            for pname, (obs_space, act_space) in zip_map(observation_spaces,
                                                         action_spaces)
        }
    super().__init__(policies, shared=self.shared, name=name)

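# Hypothetical usage sketch for the constructor above. The class name
# `PolicyGroup` is an assumption (suggested by the default name
# 'policy_group' and the PolicyGroupFunc return type used elsewhere), and
# the gym Box spaces are illustrative only.
from gym.spaces import Box

obs_spaces = {'agent_0': Box(0, 1, shape=(4,)), 'agent_1': Box(0, 1, shape=(4,))}
act_spaces = {'agent_0': Box(0, 1, shape=(2,)), 'agent_1': Box(0, 1, shape=(2,))}
# With shared=True every agent name maps to the same Policy instance, which
# is why the constructor asserts that all observation/action spaces match.
group = PolicyGroup(obs_spaces, act_spaces, shared=True)
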
def _build(self, observations):
    '''
    Build the agents in the group.

    :param observations: (dict) A dictionary of tensors that maps names to
        observations.
    :return: (PolicyGroupFunc) A namedtuple containing maps of actions,
        target actions, and update functions.
    '''
    if self.shared:
        names, observations = list(zip(*list(observations.items())))
        length = len(observations)
        # Run the shared policy once on the concatenated observations, then
        # split the outputs back out per agent.
        observations = tf.concat(observations, 0)
        policy = self.group[self.shared](observations)
        policies = {name: policy for name in names}
        actions = tf.split(policy.predict, length)
        target_actions = tf.split(policy.predict_target, length)
        entropy = tf.split(policy.entropy, length)
        noisy_target = tf.split(policy.noisy_target, length)
        actions = {name: action for name, action in zip(names, actions)}
        target_actions = {
            name: action for name, action in zip(names, target_actions)
        }
        entropy = {name: ent for name, ent in zip(names, entropy)}
        noisy_target = {
            name: target for name, target in zip(names, noisy_target)
        }
        update = policy.update_target
    else:
        policies = {
            name: policy(obs)
            for name, (policy, obs) in zip_map(self.group, observations)
        }
        actions = {}
        target_actions = {}
        entropy = {}
        noisy_target = {}
        update = []
        for name, policy in policies.items():
            actions[name] = policy.predict
            target_actions[name] = policy.predict_target
            entropy[name] = policy.entropy
            noisy_target[name] = policy.noisy_target
            update.append(policy.update_target)
        update = tf.group(*update)
    return PolicyGroupFunc(policies, actions, target_actions, update,
                           entropy, noisy_target)

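# A NumPy illustration of the concatenate-then-split pattern used in the
# shared branch above: per-agent batches are stacked along axis 0, the single
# shared network is run once, and the outputs are split back into equally
# sized per-agent pieces. This mirrors tf.split with an integer count and
# assumes every agent contributes the same batch size.
import numpy as np

batches = {'agent_0': np.ones((8, 4)), 'agent_1': np.zeros((8, 4))}
names, obs = zip(*batches.items())
stacked = np.concatenate(obs, axis=0)            # shape (16, 4)
outputs = np.split(stacked, len(obs), axis=0)    # two pieces of shape (8, 4)
per_agent = dict(zip(names, outputs))
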
def test_convert_spaces_to_placeholders():
    '''Test convert_spaces_to_placeholders function.'''
    spaces = {str(i): Box(0, 1, shape=(4,)) for i in range(10)}
    placeholders = utils.convert_spaces_to_placeholders(spaces, False)
    assert spaces.keys() == placeholders.keys()
    for _, (space, placeholder) in utils.zip_map(spaces, placeholders):
        assert list(space.shape) == placeholder.shape.as_list()

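# convert_spaces_to_placeholders is defined elsewhere; the sketch below is
# one implementation that would satisfy the assertions above. The `batch`
# parameter name is an assumption: the test passes False, and the shape
# check only holds if no batch dimension is prepended in that case.
import tensorflow as tf

def convert_spaces_to_placeholders_sketch(spaces, batch):
    shape_prefix = (None,) if batch else ()
    return {
        key: tf.placeholder(tf.float32, name=key,
                            shape=shape_prefix + tuple(space.shape))
        for key, space in spaces.items()
    }
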
def compute_qvalue(self, observations, actions, rewards, dones, gamma):
    '''Compute the TD target for each agent: r + gamma * (1 - done) * Q_target.'''
    target = self.critic_group(observations, actions).target_values
    return {
        name: tf.stop_gradient(R + gamma * (1. - D) * Q)
        for name, (Q, R, D) in zip_map(target, rewards, dones)
    }

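# A small numeric check of the target formula used above,
# y = r + gamma * (1 - done) * Q_target(s', a'), written with NumPy instead
# of TensorFlow purely for illustration.
import numpy as np

rewards = np.array([1.0, 0.5])
dones = np.array([0.0, 1.0])      # the second transition ends its episode
q_target = np.array([2.0, 3.0])
gamma = 0.95
y = rewards + gamma * (1.0 - dones) * q_target
# y == [2.9, 0.5]: the terminal transition keeps only its immediate reward.
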
def __init__(self, observation_spaces, action_spaces, shared=False,
             hyperparameters=None, name=None):
    '''
    Build a group of critics, optionally sharing a single critic.

    :param observation_spaces: (dict) Maps agent names to observation spaces.
    :param action_spaces: (dict) Maps agent names to action spaces.
    :param shared: (bool) If True, all agents share one critic network.
    :param hyperparameters: (dict) Optional hyperparameters for the critics.
    :param name: (str) Optional name for the group.
    '''
    name = 'critic_group' if name is None else name
    self.hyperparameters = hyperparameters if hyperparameters else {}
    self.shared = next(iter(observation_spaces.keys())) if shared else None
    if shared:
        obs_space = observation_spaces[self.shared]
        act_space = action_spaces[self.shared]
        shared_critic = Critic(obs_space, act_space, name='shared_critic')
        critics = {}
        for key, (obs, act) in zip_map(observation_spaces, action_spaces):
            # A shared critic requires every agent to have identical spaces.
            assert obs_space == obs
            assert act_space == act
            critics[key] = shared_critic
    else:
        critics = {
            key: Critic(obs, act, name=key)
            for key, (obs, act) in zip_map(observation_spaces, action_spaces)
        }
    super().__init__(critics, shared=self.shared, name=name)

def _encode_sample(self, idxes):
    '''Collect the sampled transitions into per-agent batches.'''
    obses_t = defaultdict(list)
    actions = defaultdict(list)
    rewards = defaultdict(list)
    obses_tp1 = defaultdict(list)
    dones = defaultdict(list)
    for i in idxes:
        # Each stored transition is a tuple of per-agent dicts:
        # (obs_t, action, reward, obs_tp1, done).
        data = self._storage[i]
        for key, (obs_t, action, reward, obs_tp1, done) in zip_map(*data):
            obses_t[key].append(obs_t)
            actions[key].append(action)
            rewards[key].append(reward)
            obses_tp1[key].append(obs_tp1)
            dones[key].append(done)
    return obses_t, actions, rewards, obses_tp1, dones

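# For reference, a sketch of the transition layout that _encode_sample above
# expects: each storage entry is a 5-tuple of dicts that share the same agent
# keys. The agent names and values here are illustrative only.
example_transition = (
    {'agent_0': [0.1, 0.2], 'agent_1': [0.3, 0.4]},   # obs_t
    {'agent_0': [0.0], 'agent_1': [1.0]},             # action
    {'agent_0': 1.0, 'agent_1': 1.0},                 # reward
    {'agent_0': [0.2, 0.3], 'agent_1': [0.4, 0.5]},   # obs_tp1
    {'agent_0': 0.0, 'agent_1': 0.0},                 # done
)
# zip_map(*example_transition) then yields, for each agent name, the tuple
# (obs_t, action, reward, obs_tp1, done) that the inner loop unpacks.
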
def create_optimizers(self, values):
    '''Create optimizers from the group.'''
    losses = {}
    opts = {}
    learning_rate = self.hyperparameters.get('learning_rate', 1e-4)
    if self.shared:
        # Only one underlying policy exists, so build a single optimizer and
        # report the same loss for every agent name.
        policy = self.group[self.shared]
        values = values[self.shared]
        opts, loss = policy.create_optimizer(values,
                                             learning_rate=learning_rate)
        losses = {name: loss for name in self.group}
    else:
        for name, (policy, value) in zip_map(self.group, values):
            opts[name], losses[name] = policy.create_optimizer(
                value, learning_rate=learning_rate)
        opts = tf.group(*list(opts.values()))
    return opts, losses

def _build(self, observation):
    '''
    Build the networks needed for prediction.

    :param observation: (dict) Observation tensors keyed by agent name.
    :return: (ComaFunc) A tuple of functions used for evaluation; only the
        prediction entry is populated.
    '''
    if self.normalize.get('observation'):
        observation = {
            key: norm(obs, False)
            for key, (obs, norm) in zip_map(observation,
                                            self.normalize['observation'])
        }
    policies = self.policy_group(observation)
    predict = policies.actions
    return ComaFunc(None, None, None, None, predict, None, None, None)

def _build(self, observations, actions):
    '''
    Build the critics in the group.

    :param observations: (dict) Observation tensors keyed by agent name.
    :param actions: (dict) Action tensors keyed by agent name.
    :return: (CriticGroupFunc) A namedtuple of critics, values, target
        values, and the target-update operation.
    '''
    if self.shared:
        # Sort both maps by name so observations and actions line up, run the
        # shared critic once on the concatenated batch, then split per agent.
        observations = sorted(list(observations.items()), key=lambda x: x[0])
        actions = sorted(list(actions.items()), key=lambda x: x[0])
        names, observations = list(zip(*observations))
        _, actions = list(zip(*actions))
        length = len(observations)
        observations = tf.concat(observations, 0)
        actions = tf.concat(actions, 0)
        critic = self.group[self.shared](observations, actions)
        critics = {name: critic for name in names}
        values = tf.split(critic.predict, length)
        target_values = tf.split(critic.predict_target, length)
        values = {name: value for name, value in zip(names, values)}
        target_values = {
            name: value for name, value in zip(names, target_values)
        }
        update = critic.update_target
    else:
        critics = {
            name: critic(obs, act)
            for name, (critic, obs, act) in zip_map(self.group, observations,
                                                    actions)
        }
        values = {}
        target_values = {}
        update = []
        for name, critic in critics.items():
            values[name] = critic.predict
            target_values[name] = critic.predict_target
            update.append(critic.update_target)
        update = tf.group(*update)
    return CriticGroupFunc(critics, values, target_values, update)

def test_map_zip_all_equal():
    '''Test zip_map with mappings that all share the same keys.'''
    mappings = [{i: i for i in range(10)} for j in range(10)]
    for _, values in utils.zip_map(*mappings):
        assert len(set(values)) == 1

def test_map_zip_all_descending():
    '''Test that zip_map raises KeyError when later mappings lack keys.'''
    mappings = [{i: i for i in range(10 - j)} for j in range(10)]
    with pytest.raises(KeyError):
        for _, values in utils.zip_map(*mappings):
            assert len(set(values)) == 1

def test_map_zip_all_ascending():
    '''Test zip_map when later mappings contain extra keys.'''
    mappings = [{i: i for i in range(10 + j)} for j in range(10)]
    for _, values in utils.zip_map(*mappings):
        assert len(set(values)) == 1

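# zip_map itself is defined elsewhere; the sketch below is one implementation
# consistent with the tests above, assuming it iterates the keys of the first
# mapping and looks each key up in every other mapping, so a key missing from
# a later mapping raises KeyError while extra keys in later mappings are
# ignored.
def zip_map_sketch(first, *rest):
    for key, value in first.items():
        yield key, (value,) + tuple(mapping[key] for mapping in rest)
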
def main(batch_size=1):
    '''Train MADDPG agents on the MultiOptLRs environment.'''
    multi_env = MultiOptLRs(data_set='mnist', max_batches=100,
                            batch_size=128, max_history=25)
    agents = Maddpg(multi_env.observation_space, multi_env.action_space,
                    shared_policy=True, shared_critic=True)
    print_tqdm('Starting...')
    exp_replay = {name: ReplayBuffer(int(1e6))
                  for name in multi_env.action_space.spaces}
    global_step = 0
    last_info = defaultdict(lambda: None)
    for _ in trange(60000):
        total_reward = 0
        states_last = states = multi_env.reset()
        done = False
        all_actions = []
        while not done:
            actions = agents.predict(states)
            actions = {key: np.squeeze(act) for key, act in actions.items()}
            states, reward, done, info = multi_env.step(actions)
            if done:
                last_info = info
            total_reward += reward
            # Broadcast the scalar reward and done flag to every agent.
            rewards = {key: reward for key in states}
            dones = {key: done for key in states}
            all_results = states_last, actions, rewards, states, dones
            all_results = {
                name: values for name, values in zip_map(*all_results)
            }
            for key, (replay, results) in zip_map(exp_replay, all_results):
                replay.add(*results)
            if global_step > batch_size and global_step % 100 == 0:
                # Sample a batch from every agent's replay buffer.
                states_feed = {}
                actions_feed = {}
                rewards_feed = {}
                states_n_feed = {}
                dones_feed = {}
                for key, replay in exp_replay.items():
                    stat, actio, rewar, stat_n, don = replay.sample(batch_size)
                    states_feed[key] = stat
                    actions_feed[key] = actio
                    rewards_feed[key] = rewar
                    states_n_feed[key] = stat_n
                    dones_feed[key] = don
                loss_before = agents.compute_loss(states_feed, actions_feed,
                                                  rewards_feed, states_n_feed,
                                                  dones_feed)
                agents.train_step(states_feed, actions_feed, rewards_feed,
                                  states_n_feed, dones_feed)
                losses = agents.compute_loss(states_feed, actions_feed,
                                             rewards_feed, states_n_feed,
                                             dones_feed)
                agents.update_targets()
                actor_loss_before = np.mean(
                    list(loss_before['actor'].values()))
                critic_loss_before = np.mean(
                    list(loss_before['critic'].values()))
                actor_loss = np.mean(list(losses['actor'].values()))
                critic_loss = np.mean(list(losses['critic'].values()))
                print_tqdm('*' * 80)
                print_tqdm('Training:')
                print_tqdm('Total Reward:', total_reward)
                print_tqdm('Stats:', last_info['episode'])
                print_tqdm('Grads Sum:', last_info['grads_sum'])
                print_tqdm('Action Mean:', last_info['actions_mean'])
                print_tqdm('Action Std:', last_info['actions_std'])
                print_tqdm('Network Loss:', last_info['loss'])
                print_tqdm('Network Accu:', last_info['accuracy'])
                print_tqdm('Actor Loss Before:', actor_loss_before)
                print_tqdm('Critic Loss Before:', critic_loss_before)
                print_tqdm('Actor Loss:', actor_loss)
                print_tqdm('Critic Loss:', critic_loss)
                print_tqdm('*' * 80)
            states_last = states
            global_step += 1
            all_actions.append(actions)
        agents.save('optimizer/model.ckpt')

def _build(self, observation, actions, rewards, observation_n, dones,
           gamma=0.95):
    '''
    Build the networks needed for training.

    :param observation: (dict) Observation tensors keyed by agent name.
    :param actions: (dict) Action tensors keyed by agent name.
    :param rewards: (dict) Reward tensors keyed by agent name.
    :param observation_n: (dict) Next-observation tensors keyed by agent
        name.
    :param dones: (dict) Tensors of boolean-like values that denote whether
        an episode completed, such that if the ith done in dones is 1 then
        the ith step was the last step.
    :param gamma: (float) The discount factor to use.
    :return: (ComaFunc) A tuple of functions used for evaluating and
        training.
    '''
    if self.normalize.get('observation'):
        observation = {
            key: norm(obs, False)
            for key, (obs, norm) in zip_map(observation,
                                            self.normalize['observation'])
        }
        observation_n = {
            key: norm(obs, False)
            for key, (obs, norm) in zip_map(observation_n,
                                            self.normalize['observation'])
        }
    if self.normalize.get('reward'):
        rewards = {
            key: norm(rew, False)
            for key, (rew, norm) in zip_map(rewards, self.normalize['reward'])
        }
    obs_n_concat = U.concat_map(observation_n)
    obs_n_concat = {name: obs_n_concat for name in observation}
    global_critics = self.global_critic_group
    # Target values under the worst (adversarial) policies.
    worst_qactions = self.worst_policy_group(observation_n).actions
    worst_qactions = U.concat_map(worst_qactions)
    worst_qactions = {name: worst_qactions for name in observation}
    worst_qvalues = global_critics(obs_n_concat,
                                   worst_qactions).target_values
    # TD targets under the best policies.
    best_qactions = self.best_policy_group(observation_n).actions
    best_qactions = U.concat_map(best_qactions)
    best_qactions = {name: best_qactions for name in observation}
    best_qvalues = self.compute_global_qvalue(obs_n_concat, best_qactions,
                                              rewards, dones, gamma)
    all_actions = U.concat_map(actions)
    all_actions = {name: all_actions for name in self.action_spaces}
    obs_concat = U.concat_map(observation)
    obs_concat = {name: obs_concat for name in observation}
    global_values = global_critics(obs_concat, all_actions).values
    global_opts = global_critics.create_optimizers(global_values,
                                                   best_qvalues)
    # Per-agent reward: the global value minus the value obtained under the
    # worst policies.
    personal_reward = {
        name: tf.stop_gradient(gval - wval)
        for name, (gval, wval) in zip_map(global_values, worst_qvalues)
    }
    personal_critics = self.personal_critic_group
    personal_values = personal_critics(obs_concat, all_actions).values
    personal_qvalue = self.compute_personal_qvalue(obs_n_concat,
                                                   best_qactions,
                                                   personal_reward, dones,
                                                   gamma)
    personal_critic = personal_critics.create_optimizers(personal_values,
                                                         personal_qvalue)
    predict = self.best_policy_group(observation).actions
    all_actions = U.concat_map(predict)
    all_actions = {name: all_actions for name in self.action_spaces}
    target_vals = personal_critics(obs_concat, all_actions).target_values
    worst_predict = self.worst_policy_group(observation).actions
    worst_predict = U.concat_map(worst_predict)
    worst_predict = {name: worst_predict for name in self.action_spaces}
    worst_vals = personal_critics(obs_concat, worst_predict).target_values
    worst_vals = {name: -v for name, v in worst_vals.items()}
    best_policy = self.best_policy_group.create_optimizers(target_vals)
    worst_policy = self.worst_policy_group.create_optimizers(worst_vals)
    critic_opts = [global_opts[0], personal_critic[0]]
    critic_losses = [global_opts[1], personal_critic[1]]
    po_opts = [best_policy[0], worst_policy[0]]
    po_losses = [best_policy[1], worst_policy[1]]
    critic_opts = tf.group(critic_opts)
    critic_losses = {
        name: tf.reduce_mean(tf.stack(losses, -1), -1)
        for name, losses in zip_map(*critic_losses)
    }
    update_critic = tf.group([
        global_critics.update_targets(5e-3),
        self.personal_critic_group.update_targets(5e-3)
    ])
    po_opts = tf.group(po_opts)
    po_losses = {
        name: tf.math.reduce_std(tf.stack(losses, -1), -1)
        for name, losses in zip_map(*po_losses)
    }
    update_policy = tf.group([
        self.worst_policy_group.update_targets(5e-3),
        self.best_policy_group.update_targets(5e-3)
    ])
    return ComaFunc(po_opts, critic_opts, po_losses, critic_losses, predict,
                    target_vals, update_policy, update_critic)

def _build(self, observation, actions, rewards, observation_n, dones,
           gamma=0.95):
    '''
    Build the networks needed for the MADDPG.

    :param observation: (dict) Observation tensors keyed by agent name.
    :param actions: (dict) Action tensors keyed by agent name.
    :param rewards: (dict) Reward tensors keyed by agent name.
    :param observation_n: (dict) Next-observation tensors keyed by agent
        name.
    :param dones: (dict) Tensors of boolean-like values that denote whether
        an episode completed, such that if the ith done in dones is 1 then
        the ith step was the last step.
    :param gamma: (float) The discount factor to use.
    :return: (MaddpgFunc) A tuple of functions used for evaluating and
        training.
    '''
    if self.normalize.get('observation'):
        observation = {
            key: norm(obs, False)
            for key, (obs, norm) in zip_map(observation,
                                            self.normalize['observation'])
        }
        observation_n = {
            key: norm(obs, False)
            for key, (obs, norm) in zip_map(observation_n,
                                            self.normalize['observation'])
        }
    if self.normalize.get('reward'):
        rewards = {
            key: norm(rew, False)
            for key, (rew, norm) in zip_map(rewards, self.normalize['reward'])
        }
    # Every agent's critic sees the concatenation of all next observations.
    obs_n_concat = U.concat_map(observation_n)
    obs_n_concat = {name: obs_n_concat for name in observation}
    qactions = self.policy_group(observation_n).target_actions
    qactions = U.concat_map(qactions)
    qactions = {name: qactions for name in observation}
    qvalues = self.compute_qvalue(obs_n_concat, qactions, rewards, dones,
                                  gamma)
    actions = U.concat_map(actions)
    actions = {name: actions for name in self.action_spaces}
    obs_concat = U.concat_map(observation)
    obs_concat = {name: obs_concat for name in observation}
    values = self.critic_group(obs_concat, actions).values
    critic_opts, critic_losses = self.critic_group.create_optimizers(values,
                                                                     qvalues)
    predict = self.policy_group(observation).actions
    actions = U.concat_map(predict)
    actions = {name: actions for name in self.action_spaces}
    target_vals = self.critic_group(obs_concat, actions).target_values
    po_opts, po_losses = self.get_policy_optimizer(target_vals)
    update_critic = self.critic_group.update_targets(5e-3)
    return MaddpgFunc(po_opts, critic_opts, po_losses, critic_losses,
                      predict, target_vals,
                      self.policy_group.update_targets(5e-3), update_critic)