def train(self, obs0, actions, rewards, obs1, dones, importance_weights):
    with tf.GradientTape() as tape:
        # Q(s_t, a) for the actions actually taken.
        q_t = self.q_network(obs0)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions, self.num_actions, dtype=tf.float32), 1)

        # Q(s_{t+1}, .) from the target network.
        q_tp1 = self.target_q_network(obs1)

        if self.double_q:
            # Double DQN: select the argmax action with the online network,
            # evaluate it with the target network.
            q_tp1_using_online_net = self.q_network(obs1)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, self.num_actions, dtype=tf.float32), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        # Zero out the bootstrap term for terminal transitions.
        dones = tf.cast(dones, q_tp1_best.dtype)
        q_tp1_best_masked = (1.0 - dones) * q_tp1_best
        q_t_selected_target = rewards + self.gamma * q_tp1_best_masked

        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights * errors)

    grads = tape.gradient(weighted_error, self.q_network.trainable_variables)
    if self.grad_norm_clipping:
        clipped_grads = []
        for grad in grads:
            clipped_grads.append(tf.clip_by_norm(grad, self.grad_norm_clipping))
        grads = clipped_grads  # use the clipped gradients (the original overwrote them)
    grads_and_vars = zip(grads, self.q_network.trainable_variables)
    self.optimizer.apply_gradients(grads_and_vars)

    return td_error
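# The train and loss functions in this file all call a `huber_loss` helper that
# is not defined here. A minimal sketch, assuming the conventional DQN-style
# elementwise Huber loss with a threshold of 1.0 (the name `delta` and the
# default value are assumptions, not confirmed by the source):

def huber_loss(x, delta=1.0):
    """Elementwise Huber loss: quadratic for |x| <= delta, linear beyond it."""
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta),
    )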
def nstep_loss(self, obses_t, actions, rewards, weights, agent_id):
    # Flatten the leading (batch, n_step) dimensions into a single batch axis.
    s = obses_t.shape
    obses_t = tf.reshape(obses_t, (s[0] * s[1], *s[2:]))
    s = actions.shape
    actions = tf.reshape(actions, (s[0] * s[1], *s[2:]))
    s = rewards.shape
    rewards = tf.reshape(rewards, (s[0] * s[1], *s[2:]))
    s = weights.shape
    weights = tf.reshape(weights, (s[0] * s[1], *s[2:]))

    inputs = {0: obses_t,
              1: tf.tile(self.one_hot_agents[agent_id], (s[0] * s[1], 1))}
    fc_values = self.model(inputs)
    q_t = self.agent_heads[agent_id](fc_values)
    q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions, self.config.num_actions, dtype=tf.float32), 1)

    # `rewards` is used directly as the target, i.e. it is assumed to already
    # hold the summed n-step return.
    td_error = q_t_selected - tf.stop_gradient(rewards)
    errors = huber_loss(td_error)
    weighted_loss = tf.reduce_mean(weights * errors)
    return weighted_loss, td_error
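# The n-step loss above regresses the selected Q-value directly onto `rewards`,
# which is assumed to already contain the discounted n-step return. A minimal
# sketch of how such a target could be precomputed from a reward sequence; the
# `gamma` and `n_step` defaults are illustrative assumptions, not values taken
# from this code.
import numpy as np

def nstep_return(rewards, gamma=0.99, n_step=5):
    """Discounted sum of the next `n_step` rewards for every time index."""
    returns = np.zeros_like(rewards, dtype=np.float32)
    for t in range(len(rewards)):
        ret, discount = 0.0, 1.0
        for k in range(t, min(t + n_step, len(rewards))):
            ret += discount * rewards[k]
            discount *= gamma
        returns[t] = ret
    return returns

# Example: nstep_return(np.array([1., 0., 0., 1.]), gamma=0.9, n_step=2)
# -> [1.0, 0.0, 0.9, 1.0]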
def nstep_train(self, obs0, actions, rewards, obs1, dones, importance_weights, fps, extra_datas):
    # Assumes `import numpy as np` and `import tensorflow as tf` at module level.
    batch_size = obs0.shape[0]
    losses = []
    td_errors = []
    with tf.GradientTape() as tape:
        for a in self.agent_ids:
            fc_values = self.value_network({
                0: obs0[:, a, :],
                1: tf.tile(self.one_hot_agents[a], (batch_size, 1)),
                2: fps[:, a, :],
                3: extra_datas[:, a, :]
            })
            q_t = self.q_fc_list[a](fc_values)
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(actions[:, a], self.num_actions, dtype=tf.float32), 1)

            q_t_selected_target = rewards[:, a]  # n-step rewards sum
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            td_errors.append(td_error.numpy())

            errors = huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights[:, a] * errors)
            losses.append(weighted_error)

        mean_loss = tf.reduce_mean(losses)

    # Shared trunk plus every agent's Q head.
    param = self.value_network.trainable_variables
    for a in self.agent_ids:
        param += self.q_fc_list[a].trainable_variables

    grads = tape.gradient(mean_loss, param)
    self.optimizer.apply_gradients(zip(grads, param))
    return np.mean(td_errors)
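# The `importance_weights` argument matches the correction weights produced by
# prioritized experience replay; how they are computed is not shown in this
# code. A sketch of the standard formulation w_i = (N * P(i))^(-beta),
# normalized by the maximum weight. The function name, the `beta` default and
# the example priorities are illustrative assumptions.
import numpy as np

def importance_weights(priorities, beta=0.4):
    """Prioritized-replay correction weights for a sampled batch."""
    probs = priorities / priorities.sum()   # sampling probabilities P(i)
    n = len(priorities)
    weights = (n * probs) ** (-beta)        # (N * P(i))^-beta
    return weights / weights.max()          # normalize so the largest weight is 1

# Example: importance_weights(np.array([1.0, 2.0, 4.0]))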
def nstep_loss(self, obses_t_a, actions_a, rewards_a, dones_a, weights_a, fps_a, agent_id):
    s = obses_t_a.shape
    obses_t_a = tf.reshape(obses_t_a, (s[0] * s[1], *s[2:]))
    s = actions_a.shape
    actions_a = tf.reshape(actions_a, (s[0], s[1], *s[2:]))
    s = rewards_a.shape
    rewards_a = tf.reshape(rewards_a, (s[0], s[1], *s[2:]))
    s = dones_a.shape
    dones_a = tf.reshape(dones_a, (s[0], s[1], 1))
    s = weights_a.shape
    weights_a = tf.reshape(weights_a, (s[0], s[1], *s[2:]))
    s = fps_a.shape
    fps_a = tf.reshape(fps_a, (s[0] * s[1], *s[2:]))

    inputs_a = {
        '0': obses_t_a,
        '1': tf.tile(self.one_hot_agents[agent_id], (s[0] * s[1], 1)),
        '2': fps_a,
        '3': dones_a
    }
    fc_values = self.model(inputs_a)
    q_t = self.agent_heads[agent_id](fc_values)
    q_t_selected = tf.reduce_sum(
        q_t * tf.one_hot(actions_a[:, -1], self.config.num_actions, dtype=tf.float32), 1)

    td_error = q_t_selected - tf.stop_gradient(rewards_a[:, -1])
    errors = huber_loss(td_error)
    weighted_loss = tf.reduce_mean(weights_a[:, -1] * errors)
    return weighted_loss, td_error
def train(self, obs0, actions, rewards, obs1, dones, importance_weights, fps, extra_datas):
    batch_size = obs0.shape[0]
    td_error_ = tf.Variable(initial_value=tf.zeros(shape=batch_size))
    for a in self.agent_ids:
        with tf.GradientTape() as tape:
            fc_values = self.value_network({0: obs0[:, a, :],
                                            1: tf.ones(shape=(batch_size, 1)) * a})
            q_t = self.q_fc_list[a](fc_values)
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(actions[:, a], self.num_actions, dtype=tf.float32), 1)

            fc_tp1 = self.target_network({0: obs1[:, a, :],
                                          1: tf.ones(shape=(batch_size, 1)) * a})
            q_tp1 = self.target_q_fc_list[a](fc_tp1)

            if self.double_q:
                # Double DQN: argmax action from the online network, value from the target network.
                fc_tp1_using_online_net = self.value_network({0: obs1[:, a, :],
                                                              1: tf.ones(shape=(batch_size, 1)) * a})
                q_tp1_using_online_net = self.q_fc_list[a](fc_tp1_using_online_net)
                q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 * tf.one_hot(q_tp1_best_using_online_net, self.num_actions, dtype=tf.float32), 1)
            else:
                q_tp1_best = tf.reduce_max(q_tp1, 1)

            dones = tf.cast(dones, q_tp1_best.dtype)
            q_tp1_best_masked = (1.0 - dones) * q_tp1_best
            q_t_selected_target = rewards[:, a] + self.gamma * q_tp1_best_masked

            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            td_error_.assign_add(td_error)
            errors = huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights[:, a] * errors)

        param = tape.watched_variables()
        grads = tape.gradient(weighted_error, param)
        # Heads of the other agents are watched but unused for this loss,
        # so their gradients come back as None; replace them with zeros.
        grads = [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(param, grads)]
        self.optimizer.apply_gradients(zip(grads, param))

    # Return the TD errors accumulated over all agents (the original returned
    # only the last agent's td_error, leaving the accumulator unused).
    return td_error_
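# The per-agent train step above reads `self.target_network` and
# `self.target_q_fc_list`, but the code that refreshes those target networks
# from the online ones is not shown here. A minimal sketch of a hard parameter
# copy between two Keras models; the function name is an assumption.
def update_target(online_model: tf.keras.Model, target_model: tf.keras.Model):
    """Hard update: copy every online weight into the target network."""
    for online_var, target_var in zip(online_model.variables, target_model.variables):
        target_var.assign(online_var)

# Typically called every fixed number of training steps, e.g.
# update_target(self.value_network, self.target_network).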
def nstep_loss(self, obses_t_a, actions_a, rewards_a, dones_a, weights_a, fps_a, agent_id):
    q_t = self.network.value(obses_t_a, fps_a, agent_id)
    q_t_selected = tf.reduce_sum(
        q_t * tf.one_hot(actions_a, self.config.num_actions, dtype=tf.float32), 1)

    td_error = q_t_selected - tf.stop_gradient(rewards_a)
    errors = huber_loss(td_error)
    weighted_loss = tf.reduce_mean(weights_a * errors)
    return weighted_loss, td_error
def train(batch_x, batch_y, shared_network, opt):
    with tf.GradientTape() as tape:
        q_eval_arr = shared_network(batch_x)['agent_0']
        one_hot = tf.one_hot(batch_y, 4)
        q_t_selected = tf.reduce_sum(q_eval_arr * one_hot, 1)

        # "Target" Q-values come from the same network applied to a
        # noise-perturbed copy of the batch.
        target_q_values = shared_network(batch_x + np.random.normal(0, 1, batch_x.shape))['agent_0']
        max_target_q_values = tf.reduce_max(target_q_values, axis=1)

        td_error = q_t_selected - max_target_q_values
        errors = huber_loss(td_error)
        loss = tf.reduce_mean(errors)

    # Only update the variables belonging to the 'agent_0' head.
    param = [v for v in shared_network.trainable_variables if 'agent_0' in v.name]
    gradients_of_network = tape.gradient(loss, param)
    opt.apply_gradients(zip(gradients_of_network, param))
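# A hedged sketch of how the standalone train(batch_x, batch_y, shared_network, opt)
# step above could be driven. The toy network, its 'agent_0' output head, the
# batch shapes, and the optimizer choice are illustrative assumptions; it only
# requires that the network's call returns a dict keyed by 'agent_0' and that
# `huber_loss` is defined at module level.
class ToySharedNetwork(tf.keras.Model):
    def __init__(self, num_actions=4):
        super().__init__()
        self.body = tf.keras.layers.Dense(32, activation='relu')
        self.head = tf.keras.layers.Dense(num_actions, name='agent_0_head')

    def call(self, x):
        # Return a dict of per-agent Q-value heads, as train() expects.
        return {'agent_0': self.head(self.body(x))}

shared_network = ToySharedNetwork()
opt = tf.keras.optimizers.Adam(1e-3)

batch_x = np.random.randn(8, 16).astype(np.float32)  # 8 observations, 16 features
batch_y = np.random.randint(0, 4, size=8)            # actions in [0, 4)

train(batch_x, batch_y, shared_network, opt)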
def nstep_train(self, obs0, actions, rewards, obs1, dones, importance_weights, fps, extra_datas):
    batch_size = obs0.shape[0]
    td_error_ = tf.Variable(initial_value=tf.zeros(shape=batch_size // self.n_step))
    losses = []
    with tf.GradientTape() as tape:
        for a in self.agent_ids:
            fc_values = self.value_network({0: obs0[:, a, :],
                                            1: tf.tile(self.one_hot_agents[a], (batch_size, 1)),
                                            2: fps[:, a, :],
                                            3: tf.expand_dims(extra_datas[:, a, :], axis=1),
                                            4: dones})
            q_t = self.q_fc_list[a](fc_values)
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(actions[-1, a], self.num_actions, dtype=tf.float32), 1)

            q_t_selected_target = rewards[-1, a]  # n-step rewards sum
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            td_error_.assign_add(td_error)

            errors = huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights[-1, a] * errors)
            losses.append(weighted_error)

        # Accumulating the loss in a tf.Variable via assign_add breaks the tape
        # connection; summing the per-agent loss tensors keeps the gradients intact.
        loss = tf.add_n(losses)

    param = tape.watched_variables()
    grads = tape.gradient(loss, param)
    grads = [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(param, grads)]
    self.optimizer.apply_gradients(zip(grads, param))
    return td_error_
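# The fix in nstep_train above relies on the fact that gradients do not flow
# through tf.Variable.assign_add. A minimal, self-contained check of that
# behaviour with toy values (nothing here comes from the original code):
x = tf.Variable(3.0)

with tf.GradientTape() as tape:
    acc = tf.Variable(0.0)
    acc.assign_add(x * x)         # accumulate the loss through a Variable
print(tape.gradient(acc, x))      # None: the tape connection is lost

with tf.GradientTape() as tape:
    loss = tf.add_n([x * x])      # accumulate by summing tensors instead
print(tape.gradient(loss, x))     # tf.Tensor(6.0, ...): gradient is preserved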