# Assumes TensorFlow 1.x and project-local helpers (ReplayBuffer, lstm_model, mlp_model,
# make_update_exp, epsilon_greedy) plus a tf_util-style module imported as U.
import tensorflow as tf


class I3QLearner:
    def __init__(self, num_features, num_actions, timestep, action_space, scope):
        self.scope = scope
        self._lr = 0.5
        self.discount = 1.
        self.replay_buffer = ReplayBuffer(1e4)

        with tf.variable_scope(self.scope):
            # Placeholders: opponent action trajectory, TD target, and the taken action.
            self.act_trajectory = tf.placeholder(tf.float32, shape=(None, timestep, action_space))
            self.target = tf.placeholder(tf.float32, shape=(None,))
            self.act = tf.placeholder(tf.int32, shape=(None,))

            # LSTM encoder of the action trajectory; its output feeds both Q networks.
            self.tau = lstm_model(self.act_trajectory, num_actions, scope="tau_model_{}".format(scope))
            self.q_input = self.tau

            # Train network.
            self.q = mlp_model(self.q_input, 2, scope="q_model_{}".format(scope))
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_model_{}".format(scope)))

            # Target network.
            self.target_q = mlp_model(self.q_input, 2, scope="target_q_model_{}".format(scope))
            target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_model_{}".format(scope)))

            # Action selection from the target network.
            self.softmax = tf.nn.softmax(self.target_q)
            self.pred = tf.argmax(self.softmax, axis=1)

            # TD loss: Q-value of the taken action against the (stop-gradient) target.
            # reduce_sum picks out the single one-hot-selected entry per row.
            self.q_t_selected = tf.reduce_sum(self.q * tf.one_hot(self.act, num_actions), 1)
            q_tp1_best = tf.reduce_max(self.q, 1)
            q_tp1_best_masked = q_tp1_best  # currently unused
            td_error = self.q_t_selected - tf.stop_gradient(self.target)
            self.errors = U.huber_loss(td_error)
            self.q_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.errors, var_list=q_func_vars)

            # Supervised loss for the trajectory encoder: predict the next action.
            self.tau_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.tau, labels=self.act))
            self.tau_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.tau_loss)

            self.get_pred = U.function(inputs=[self.act_trajectory], outputs=[self.softmax])
            self.train_q = U.function(inputs=[self.act_trajectory, self.target, self.act],
                                      outputs=[self.errors, self.q], updates=[self.q_opt_op])
            self.train_tau = U.function(inputs=[self.act, self.act_trajectory],
                                        outputs=[self.tau_loss], updates=[self.tau_opt_op])
            self.update_model = make_update_exp(q_func_vars, target_q_func_vars)

    def experience(self, action1, act_tra1, reward1):
        self.replay_buffer.add(action1, act_tra1, reward1)

    def get_act(self, act_trajectory):
        # Exploration policy: epsilon-greedy over the softmax action probabilities.
        # (A Boltzmann policy with temperature parameter 1 is left commented out.)
        acpd = self.get_pred(act_trajectory)[0][0]
        # action = np.random.choice([0, 1], p=acpd)
        action = epsilon_greedy(acpd, 0.1)
        return action

    def supervise_tau(self, a_next, action_trajectory):
        loss = self.train_tau(*([a_next] + [action_trajectory]))[0]
        return loss

    def update_target(self):
        self.update_model()

    def learn(self, batch_size):
        replay_sample_index = self.replay_buffer.make_index(batch_size)
        act, act_tra, reward = self.replay_buffer.sample_index(replay_sample_index)
        loss, q = self.train_q(*([act_tra] + [reward] + [act]))
        return loss, q
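# Illustrative only: a minimal sketch of how this learner might be driven, assuming a
# two-action repeated game, that U.function binds to the default TF session, and that
# the environment returns a (1, timestep, action_space) one-hot opponent-action trajectory.
# `env` and its API are hypothetical placeholders, not part of the original code.
def sketch_training_loop(learner, env, num_episodes=1000, batch_size=32):
    with tf.Session().as_default():
        tf.get_default_session().run(tf.global_variables_initializer())
        for _ in range(num_episodes):
            opponent_history = env.reset()                  # hypothetical env API
            action = learner.get_act(opponent_history)      # epsilon-greedy over softmax
            next_history, reward = env.step(action)         # hypothetical env API
            learner.experience(action, opponent_history, reward)
            if len(learner.replay_buffer) >= batch_size:    # assumes ReplayBuffer defines __len__
                learner.learn(batch_size)                   # TD update of the Q network
                learner.update_target()                     # sync the target network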
# Assumes PyTorch plus project-local helpers (ReplayBuffer, OrnsteinUhlenbeckProcess,
# onehot_from_logits, gumbel_softmax) and a global `device`.
import numpy as np
import torch


class Agent:
    def __init__(self, pos, actor, critic, actor_target, critic_target,
                 train_mode, discrete_action, args, alg_mode='MADDPG'):
        self.pos = pos
        self.BATCH_SIZE = args.batch_size
        self.GAMMA = args.GAMMA
        self.args = args
        self.train_mode = train_mode
        self.discrete_action = discrete_action
        self.algorithm = alg_mode

        self.critic = critic
        self.critic_target = critic_target
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(5,))
        self.actor = actor
        self.actor_target = actor_target

        # Start the target networks as exact copies of the online networks.
        self.actor_target.hard_copy(actor)
        self.critic_target.hard_copy(critic)

        self.replay_buffer = ReplayBuffer(int(1e6))
        self.max_replay_buffer_len = self.BATCH_SIZE * 25

    def preupdate(self):
        self.replay_sample_index = None

    def step(self, agents, t, terminal):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # Replay buffer is not large enough yet.
            return
        if not t % 100 == 0:
            # Only update every 100 steps.
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.BATCH_SIZE)
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index

        # Collect every agent's sampled observations and actions (needed by the centralized critic).
        for agent in agents:
            obs, act, rew, obs_next, done = agent.replay_buffer.sample_index(index)
            obs_n.append(torch.FloatTensor(obs).to(device))
            obs_next_n.append(torch.FloatTensor(obs_next).to(device))
            act_n.append(torch.FloatTensor(act).to(device))

        state_batch, action_batch, reward_batch, state_next_batch, t_batch = \
            self.replay_buffer.sample_index(index)
        state_batch = torch.FloatTensor(state_batch).to(device)
        action_batch = torch.FloatTensor(action_batch).to(device)
        reward_batch = torch.FloatTensor(reward_batch).to(device)
        t_batch = torch.FloatTensor(t_batch).to(device)
        state_next_batch = torch.FloatTensor(state_next_batch).to(device)
        reward_batch = torch.reshape(reward_batch, (-1, 1))  # was hard-coded to (1024, 1)
        t_batch = torch.reshape(t_batch, (-1, 1))

        # Train the critic network.
        if self.algorithm == 'MADDPG':
            if self.discrete_action:
                target_actions = [onehot_from_logits(agent.actor_target(nobs))
                                  for agent, nobs in zip(agents, obs_next_n)]
            else:
                target_actions = [agent.actor_target(nobs)
                                  for agent, nobs in zip(agents, obs_next_n)]
            obs_next_concat = torch.cat(obs_next_n, dim=-1)
            target_actions = torch.cat(target_actions, dim=-1)
        else:
            # Get actions in DDPG mode.
            if self.discrete_action:
                target_actions = onehot_from_logits(self.actor_target(state_next_batch))
            else:
                target_actions = self.actor_target(state_next_batch)
            obs_next_concat = state_next_batch

        predicted_q_value = self.critic_target(obs_next_concat, target_actions)
        Q_targets = reward_batch + ((1 - t_batch) * self.GAMMA * predicted_q_value).detach()

        if self.algorithm == 'MADDPG':
            obs_concat = torch.cat(obs_n, dim=-1)
            action_concat = torch.cat(act_n, dim=-1)
        else:
            obs_concat = state_batch
            action_concat = action_batch
        self.critic.train_step(obs_concat, action_concat, Q_targets)

        # Train the actor network.
        all_actions = []
        if self.discrete_action:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True)
        else:
            curr_pol_out = self.actor(state_batch)
            curr_pol_vf_in = curr_pol_out

        if self.algorithm == 'MADDPG':
            # Get the actions of all actors in MADDPG mode.
            for i, agent, obs in zip(range(len(agents)), agents, obs_n):
                if i == self.pos:
                    all_actions.append(curr_pol_vf_in)
                elif self.discrete_action:
                    all_actions.append(onehot_from_logits(agent.actor(obs)))
                else:
                    all_actions.append(agent.actor(obs))
            actions_concatenated = torch.cat(all_actions, dim=-1)
        else:
            # Get ONLY the action of the current actor in DDPG mode.
            actions_concatenated = curr_pol_vf_in

        self.actor.train_step(self.critic, obs_concat, actions_concatenated, curr_pol_out)

        # Soft-update the target networks towards the online networks.
        self.soft_update(self.actor, self.actor_target, tau=self.args.tau)
        self.soft_update(self.critic, self.critic_target, tau=self.args.tau)

    def experience(self, obs, act, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, done)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        noise = self.noise()
        noise = torch.FloatTensor(noise).unsqueeze(0).to(device)
        action = self.actor(state)
        if self.discrete_action:
            if add_noise:
                action = gumbel_softmax(action, hard=True)
            else:
                action = onehot_from_logits(action)
        else:
            if add_noise:
                action = action + noise
            action = action.clamp(-1, 1)
        action = action.cpu().detach().numpy()[0]
        return action

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
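# Illustrative only: a minimal sketch of a MADDPG interaction loop over a list of Agent
# instances, assuming an MPE-style multi-agent env that takes/returns per-agent lists.
# `env`, its step/reset API, and `max_steps` are hypothetical placeholders, not part of
# the original code; `t` passed to Agent.step is a global step counter, since step()
# only updates every 100 calls.
def sketch_maddpg_loop(env, agents, num_episodes=10000, max_steps=25):
    train_step = 0
    for episode in range(num_episodes):
        obs_n = env.reset()
        for agent in agents:
            agent.reset()  # reset the OU noise process
        for t in range(max_steps):
            act_n = [agent.act(obs, add_noise=True) for agent, obs in zip(agents, obs_n)]
            next_obs_n, rew_n, done_n, _ = env.step(act_n)  # hypothetical env API
            for i, agent in enumerate(agents):
                agent.experience(obs_n[i], act_n[i], rew_n[i], next_obs_n[i], done_n[i])
            train_step += 1
            for agent in agents:
                agent.preupdate()
                agent.step(agents, train_step, terminal=(t == max_steps - 1))
            obs_n = next_obs_n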