import numpy as np

# BaseDQN, ReplayBuffer, and the hyperparameters sup_len, rb_capacity,
# minibatch_size, and target_update are module-level names defined elsewhere.

class DQN:
    def __init__(self, sess, state_dim, action_dim, n_agents, nn_id):
        self.sess = sess
        self.action = BaseDQN(sess, state_dim, action_dim, n_agents, nn_id,
                              use_as_peer=True)

    def training_target_qnet(self):
        # Run the op that syncs the slow (target) network with the online network.
        self.sess.run([self.action.update_slow_target_dqn])

    def training_peer_qnet(self):
        # Run the op that syncs the peer network.
        self.sess.run(self.action.update_peer_dqn)

    def training_a_qnet(self, *params):
        return self.action.training_qnet(*params)

    def get_aq_values(self, *params):
        return self.action.get_q_values(*params)

    def get_aq_tmq_values(self, state_ph):
        # Evaluate the online and peer Q-networks on the same batch of states.
        return self.sess.run(
            [self.action.concat_dqns, self.action.concat_peer_dqns],
            feed_dict={self.action.state_ph: state_ph,
                       self.action.next_state_ph: state_ph})
    # Constructor of a DQN variant (its class header is not shown here) that
    # keeps a separate "mission" network, prefixed "lng", next to the action network.
    def __init__(self, sess, state_dim, action_dim, n_agents, nn_id):
        self.sess = sess
        self.action = BaseDQN(sess, state_dim, action_dim, n_agents, nn_id)
        self.mission = BaseDQN(sess, state_dim, action_dim, n_agents,
                               "lng" + nn_id, use_as_peer=True)
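# --- Hedged sketch (not part of the original code) ---------------------------
# The update ops used above (update_slow_target_dqn, update_peer_dqn) live
# inside BaseDQN, which is not listed here.  The snippet below is a minimal,
# framework-free illustration of the underlying idea only: periodically copying
# the online network's parameters into a slowly-updated target copy.  All names
# (OnlineNet, hard_sync) are hypothetical and do not appear in the original code.
import numpy as np

class OnlineNet:
    """Toy 'network' whose parameters are a flat numpy vector."""
    def __init__(self, n_params):
        self.params = np.random.randn(n_params)

def hard_sync(target, online):
    # Overwrite the target parameters with the current online parameters.
    target.params = online.params.copy()

online, target = OnlineNet(8), OnlineNet(8)
for step in range(1, 101):
    online.params -= 0.01 * np.random.randn(8)   # stand-in for a gradient step
    if step % 20 == 0:                           # analogous to syncing every target_update steps
        hard_sync(target, online)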
class Agent(object):
    def __init__(self, obs_space, act_space, sess, n_agents, name):
        self.act_space = act_space
        self.n_agents = n_agents
        self.dqn = DQN(sess, obs_space, sup_len, act_space, n_agents, name)
        self.rb = ReplayBuffer(capacity=rb_capacity)
        self.train_cnt = 0
        self.sns_q = None

    def act_multi(self, obs, random):
        # `random` is a 0/1 mask per agent: 1 selects a uniformly random action,
        # 0 selects the greedy (argmax-Q) action.
        q_values = self.dqn.get_q_values([obs])[0]
        r_action = np.random.randint(self.act_space, size=(len(obs)))
        action_n = ((random + 1) % 2) * (q_values.argmax(axis=1)) \
            + (random) * r_action
        return action_n

    def incentivize_multi(self, info):
        # Returns the environment reward unchanged here.
        state, action, reward, next_state, done = info
        return reward

    def add_to_memory(self, exp):
        self.rb.add_to_memory(exp)

    def sync_target(self):
        self.dqn.training_target_qnet()

    def train(self, use_rx):
        data = self.rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        base_reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        not_done = (done + 1) % 2
        if use_rx:
            # Add the received incentives to the environment reward.
            rx_inc = np.asarray([x[5] for x in data])
            reward = base_reward + rx_inc
        else:
            reward = base_reward
        td_error, _ = self.dqn.training_qnet(state, action, reward,
                                             not_done, next_state)
        self.train_cnt += 1
        if self.train_cnt % target_update == 0:
            self.dqn.training_target_qnet()
        return td_error
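# --- Hedged sketch (not part of the original code) ---------------------------
# act_multi mixes greedy and random actions with the arithmetic-mask trick
# ((random + 1) % 2) * greedy + random * r_action.  The standalone snippet
# below reproduces that selection with made-up Q-values and mask so the masking
# is easy to inspect; none of these values come from the original code.
import numpy as np

q_values = np.array([[0.1, 0.9, 0.0],    # agent 0 -> greedy action 1
                     [0.7, 0.2, 0.1],    # agent 1 -> greedy action 0
                     [0.3, 0.3, 0.4]])   # agent 2 -> greedy action 2
random_mask = np.array([0, 1, 0])        # 1 = explore, 0 = exploit
r_action = np.random.randint(q_values.shape[1], size=len(q_values))

greedy = q_values.argmax(axis=1)
action_n = ((random_mask + 1) % 2) * greedy + random_mask * r_action
print(action_n)  # agents 0 and 2 act greedily, agent 1 takes its random action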