def eps_greedy_sampling(self, scores, mask, eps):
    scores = scores * mask
    scores_padded = layers.squeeze(
        fluid_sequence_pad(scores, 0, maxlen=128),
        [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
    mask_padded = layers.squeeze(
        fluid_sequence_pad(mask, 0, maxlen=128), [2])
    seq_lens = fluid_sequence_get_seq_len(scores)

    def get_greedy_prob(scores_padded, mask_padded):
        s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
        max_value = layers.reduce_max(s, dim=1, keep_dim=True)
        greedy_prob = layers.cast(s >= max_value, 'float32')
        return greedy_prob

    greedy_prob = get_greedy_prob(scores_padded, mask_padded)
    eps_prob = mask_padded * eps / layers.reduce_sum(
        mask_padded, dim=1, keep_dim=True)

    final_prob = (greedy_prob + eps_prob) * mask_padded
    final_prob = final_prob / layers.reduce_sum(
        final_prob, dim=1, keep_dim=True)

    sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
    max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
    sampled_id = layers.elementwise_min(sampled_id, max_id)
    return layers.cast(sampled_id, 'int64')
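# Illustrative sketch (not part of the model above): the eps-greedy mixture in
# plain NumPy, assuming a single padded row of scores and a 0/1 validity mask.
# The greedy distribution puts all mass on the valid maximum, the epsilon mass
# is spread uniformly over valid positions, and the mix is renormalized.
import numpy as np

def eps_greedy_probs(scores, mask, eps, big_value=1e8):
    s = scores - (1.0 - mask) * big_value          # push masked slots far below the rest
    greedy = (s >= s.max()).astype('float32')      # one-hot on the valid argmax
    eps_prob = mask * eps / mask.sum()             # uniform eps mass over valid slots
    probs = (greedy + eps_prob) * mask
    return probs / probs.sum()

scores = np.array([0.2, 0.9, 0.4, 0.0], dtype='float32')
mask = np.array([1.0, 1.0, 1.0, 0.0], dtype='float32')  # last slot is padding
print(eps_greedy_probs(scores, mask, eps=0.1))  # ~[0.03, 0.94, 0.03, 0.0]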
def normalize(scores_padded, mask_padded):
    mean_S = layers.reduce_sum(
        scores_padded, dim=1, keep_dim=True) / layers.reduce_sum(
            mask_padded, dim=1, keep_dim=True)
    S = scores_padded - mean_S
    std_S = layers.sqrt(
        layers.reduce_sum(
            layers.square(S * mask_padded), dim=1, keep_dim=True))
    return S / (std_S + self.SAFE_EPS)
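# Illustrative sketch (assumption: NumPy stand-ins for the padded tensors above)
# of the masked normalization: the mean is taken over valid positions only, and
# the divisor is the L2 norm of the centered, masked scores (as written above,
# not a per-element standard deviation), guarded by SAFE_EPS.
import numpy as np

SAFE_EPS = 1e-8
scores_padded = np.array([[1.0, 3.0, 5.0, 0.0]])
mask_padded = np.array([[1.0, 1.0, 1.0, 0.0]])     # last slot is padding

mean_S = scores_padded.sum(axis=1, keepdims=True) / mask_padded.sum(axis=1, keepdims=True)
S = scores_padded - mean_S
std_S = np.sqrt(np.square(S * mask_padded).sum(axis=1, keepdims=True))
print(S / (std_S + SAFE_EPS))  # padded column is meaningless and masked downstream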
def get_seq_len_mask(input, atten_input, max_seq_len, max_atten_seq_len):
    ones = layers.reduce_sum(
        input, dim=1, keep_dim=True) * 0 + 1  # (batch*seq_len, 1)
    atten_ones = layers.reduce_sum(
        atten_input, dim=1, keep_dim=True) * 0 + 1
    ones_padded = fluid_sequence_pad(
        ones, 0, max_seq_len)  # (batch, seq_len, 1)
    atten_ones_padded = fluid_sequence_pad(atten_ones, 0, max_atten_seq_len)
    seq_len_mask = layers.matmul(
        ones_padded, layers.transpose(atten_ones_padded, perm=[0, 2, 1]))
    seq_len_mask.stop_gradient = True
    return seq_len_mask  # (batch, seq_len, atten_seq_len)
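# Illustrative sketch (assumption: plain NumPy stand-ins for the padded LoD
# tensors above). The (seq_len x atten_seq_len) mask is just the batched outer
# product of two padded 0/1 "ones" vectors, one per side.
import numpy as np

ones_padded = np.array([[1., 1., 0.]])            # (batch=1, max_seq_len=3), valid length 2
atten_ones_padded = np.array([[1., 1., 1., 0.]])  # (batch=1, max_atten_seq_len=4), valid length 3
seq_len_mask = ones_padded[:, :, None] @ atten_ones_padded[:, None, :]
print(seq_len_mask.shape)  # (1, 3, 4); entry [b, i, j] == 1 iff both i and j are valid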
def _ensemble_predict(self, obs):
    actor_outputs = []
    for i in range(self.ensemble_num):
        actor_outputs.append(self.actors[i].predict(obs))
    batch_actions = layers.concat(actor_outputs, axis=0)
    batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

    critic_outputs = []
    for i in range(self.ensemble_num):
        critic_output = self.critics[i].predict(batch_obs, batch_actions)
        critic_output = layers.unsqueeze(critic_output, axes=[1])
        critic_outputs.append(critic_output)
    score_matrix = layers.concat(critic_outputs, axis=1)

    # Normalize scores given by each critic
    sum_critic_score = layers.reduce_sum(
        score_matrix, dim=0, keep_dim=True)
    sum_critic_score = layers.expand(
        sum_critic_score, expand_times=[self.ensemble_num, 1])
    norm_score_matrix = score_matrix / sum_critic_score

    actions_mean_score = layers.reduce_mean(
        norm_score_matrix, dim=1, keep_dim=True)
    best_score_id = layers.argmax(actions_mean_score, axis=0)
    best_score_id = layers.cast(best_score_id, dtype='int32')
    ensemble_predict_action = layers.gather(batch_actions, best_score_id)
    ensemble_predict_action = layers.squeeze(
        ensemble_predict_action, axes=[0])
    return ensemble_predict_action
def define_learn(self, obs, action, reward, next_obs, terminal, weight):
    # Q(s,a|θ)
    pred_value = self.model.value(obs)
    # Q(s',a'|θ')
    targetQ_predict_value = self.target_model.value(next_obs)
    # Q(s',a'|θ)
    next_s_predict_value = self.model.value(next_obs)
    # argMax[Q(s',a'|θ)]
    greedy_action = fluid_argmax(next_s_predict_value)
    predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
    # Q(s', argMax[Q(s',a'|θ)] | θ')
    best_v = fluid.layers.reduce_sum(
        fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value),
        dim=1)
    best_v.stop_gradient = True
    # TD target: R + γ * Q(s', argMax[Q(s',a'|θ)] | θ')
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.action_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # Compute the new TD error
    newTd = layers.abs(target - pred_action_value)
    cost = layers.square_error_cost(pred_action_value, target)
    # weight is the per-sample weight; it scales how strongly each sample
    # contributes to the cost update
    cost = weight * cost
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, newTd
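# Illustrative sketch (assumption: NumPy arrays instead of fluid tensors) of the
# double-DQN target computed above: the online network picks argmax a', the
# target network evaluates it.
import numpy as np

gamma = 0.9
q_online_next = np.array([[1.0, 3.0, 2.0]])   # Q(s', a' | θ)
q_target_next = np.array([[0.5, 2.5, 4.0]])   # Q(s', a' | θ')
reward = np.array([1.0])
terminal = np.array([0.0])

greedy_a = q_online_next.argmax(axis=1)                     # argmax under θ -> [1]
best_v = q_target_next[np.arange(len(greedy_a)), greedy_a]  # evaluate under θ' -> [2.5]
target = reward + (1.0 - terminal) * gamma * best_v
print(target)  # [3.25]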
def learn(self, obs, action, reward, next_obs, terminal):
    """ Update self.model's value network with the DQN algorithm
    """
    # Get max Q' from target_model, used to compute target_Q
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True  # block gradient propagation
    terminal = layers.cast(terminal, dtype='float32')
    target = reward + (1.0 - terminal) * self.gamma * best_v

    pred_value = self.model.value(obs)  # predicted Q values
    # Convert action to a one-hot vector, e.g. 3 => [0, 0, 0, 1, 0]
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    # Element-wise multiplication picks out Q(s,a) of the taken action:
    # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
    # ==> pred_action_value = [[3.9]]
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # The loss is the mean squared error between Q(s,a) and target_Q
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # Adam optimizer
    optimizer.minimize(cost)
    return cost
def infer_onestep(self, inputs):
    """Run one inference step of the GRU unit."""
    prev_hidden = inputs['prev_hidden']
    first_step_mask = inputs['first_step_mask']
    item_embedding = self._build_embeddings(
        inputs, self.item_slot_names)  # (b*cand_len, dim), as candidates
    last_click_embedding = self._build_embeddings(
        inputs, self.last_click_slot_names)  # (b, dim)
    last_item_embedding = self._build_embeddings(
        inputs, self.last_item_slot_names)  # (b, dim)

    item_fc = self.item_fc_op(item_embedding)
    last_item_fc = self.item_fc_op(last_item_embedding) * first_step_mask
    item_hidden = self.simple_step_rnn(
        last_item_fc, last_click_embedding, h_0=prev_hidden)
    action_hat = self.actor_policy(item_hidden)

    # inner product
    expand_action_hat = layers.sequence_expand(
        action_hat, item_fc)  # (b*cand_len, dim)
    scores = layers.reduce_sum(expand_action_hat * item_fc, 1)

    output_dict = OrderedDict()
    output_dict['hidden'] = item_hidden
    output_dict['scores'] = scores
    return output_dict
def sampling_rnn(self, item_fc, h_0, pos_embed, forward_func,
                 sampling_type, eps=0, eta=1):
    mask = layers.reduce_sum(item_fc, dim=1, keep_dim=True) * 0 + 1
    drnn = fluid.layers.DynamicRNN()
    with drnn.block():
        # e.g. batch_size = 2
        _ = drnn.step_input(item_fc)
        cur_pos_embed = drnn.step_input(pos_embed)  # lod = []
        cur_h_0 = drnn.memory(init=h_0, need_reorder=True)  # lod = [0,1,2]
        item_fc = drnn.static_input(item_fc)
        mask = drnn.memory(init=mask, need_reorder=True)

        # step_input will remove lod info
        cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

        # expand
        expand_h_0 = layers.sequence_expand(
            cur_h_0, item_fc)  # lod = [0,1,2,3,4,5,6,7]
        expand_pos_embed = layers.sequence_expand(
            cur_pos_embed, item_fc)  # lod = [0,1,2,3,4,5,6,7]
        expand_item_fc = layers.lod_reset(item_fc, expand_h_0)

        # forward
        expand_next_h_0, expand_scores = forward_func(
            expand_item_fc, expand_h_0, expand_pos_embed)

        # reset result lod
        expand_next_h_0 = layers.lod_reset(
            expand_next_h_0, item_fc)  # lod = [0,4,7]
        expand_scores = layers.lod_reset(
            expand_scores, item_fc)  # lod = [0,4,7]

        if sampling_type == 'eps_greedy':
            selected_index = self.eps_greedy_sampling(
                expand_scores, mask, eps=eps)
        elif sampling_type == 'softmax':
            selected_index = self.softmax_sampling(
                expand_scores, mask, eta=eta)

        drnn.output(selected_index)

        next_h_0 = fluid_sequence_index(expand_next_h_0, selected_index)
        next_mask = fluid_sequence_scatter(
            mask, layers.reshape(selected_index, [-1]), 0.0)

        # update
        drnn.update_memory(cur_h_0, next_h_0)
        drnn.update_memory(mask, next_mask)

    drnn_output = drnn()
    return drnn_output
def train_rnn(self, item_fc, atten_item_fc, h_0, pos, pos_embed,
              output_type=''):
    shifted_item_fc = fluid_sequence_advance(item_fc, OOV=0)
    drnn = fluid.layers.DynamicRNN()
    with drnn.block():
        cur_item_fc = drnn.step_input(shifted_item_fc)
        cur_pos_embed = drnn.step_input(pos_embed)
        cur_h_0 = drnn.memory(init=h_0, need_reorder=True)

        # step_input will remove lod info
        cur_item_fc = layers.lod_reset(cur_item_fc, cur_h_0)
        cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

        next_h_0, hidden_fc = self.sampling_rnn_forward(
            cur_item_fc, cur_h_0, cur_pos_embed)

        if output_type == 'c_Q':
            cur_atten_item_fc = drnn.step_input(atten_item_fc)
            cur_atten_item_fc = layers.lod_reset(cur_atten_item_fc, cur_h_0)
            Q = layers.reduce_sum(
                hidden_fc * cur_atten_item_fc, dim=1, keep_dim=True)
            drnn.output(Q)

        elif output_type == 'max_Q':
            cur_pos = drnn.step_input(pos)
            pos = drnn.static_input(pos)
            atten_item_fc = drnn.static_input(atten_item_fc)

            expand_Q = self._dot_attention(hidden_fc, atten_item_fc)
            cur_step_id = layers.slice(
                cur_pos, axes=[0, 1], starts=[0, 0], ends=[1, 1])
            mask = layers.cast(pos >= cur_step_id, 'float32')
            expand_Q = expand_Q * mask
            max_Q = layers.sequence_pool(expand_Q, 'max')
            drnn.output(max_Q)

        else:
            raise NotImplementedError(output_type)

        # update
        drnn.update_memory(cur_h_0, next_h_0)

    drnn_output = drnn()
    return drnn_output
def _build_embeddings(self, inputs, list_names):
    list_embed = []
    for name in list_names:
        embed_name = self._get_embed_name(name)
        c_embed = self.dict_data_embed_op[embed_name](inputs[name])
        if len(c_embed.shape) == 3:  # squeeze (batch*num_items, None, 16)
            c_embed = layers.reduce_sum(c_embed, dim=1)
        list_embed.append(c_embed)  # (batch*num_items, 16)
    concated_embed = layers.concat(
        input=list_embed, axis=1)  # (batch*num_items, concat_dim)
    concated_embed = layers.softsign(concated_embed)
    return concated_embed
def learn(self, obs, action, reward):
    act_prob = self.model(obs)
    # log_prob = layers.cross_entropy(act_prob, action)
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(action, act_prob.shape[1]),
        dim=1)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
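# Illustrative sketch (assumption: NumPy, batch of 2) of the policy-gradient
# loss built above: the negative log-likelihood of the taken action, weighted
# by the reward signal, then averaged over the batch.
import numpy as np

act_prob = np.array([[0.7, 0.3],
                     [0.4, 0.6]])
action = np.array([0, 1])
reward = np.array([1.0, -2.0])

neg_log_prob = -np.log(act_prob[np.arange(len(action)), action])  # [0.357, 0.511]
cost = np.mean(neg_log_prob * reward)
print(cost)  # ~ -0.33: the second sample's action is pushed down, the first up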
def sampling_rnn(self, item_fc, atten_item_fc, h_0, pos_embed,
                 sampling_type, eps=0, eta=1):
    oov_item_fc = layers.fill_constant_batch_size_like(
        item_fc, shape=item_fc.shape, value=0, dtype='float32')
    oov_item_fc = layers.lod_reset(oov_item_fc, h_0)
    mask = layers.reduce_sum(item_fc, dim=1, keep_dim=True) * 0 + 1
    drnn = fluid.layers.DynamicRNN()
    with drnn.block():
        _ = drnn.step_input(item_fc)
        cur_pos_embed = drnn.step_input(pos_embed)
        cur_item_fc = drnn.memory(init=oov_item_fc, need_reorder=True)
        cur_h_0 = drnn.memory(init=h_0, need_reorder=True)
        mask = drnn.memory(init=mask, need_reorder=True)
        item_fc = drnn.static_input(item_fc)
        atten_item_fc = drnn.static_input(atten_item_fc)

        # step_input will remove lod info
        cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

        next_h_0, hidden_fc = self.sampling_rnn_forward(
            cur_item_fc, cur_h_0, cur_pos_embed)
        expand_Q = self._dot_attention(hidden_fc, atten_item_fc)

        if sampling_type == 'eps_greedy':
            selected_index = self.eps_greedy_sampling(expand_Q, mask, eps=eps)
        elif sampling_type == 'softmax':
            selected_index = self.softmax_sampling(expand_Q, mask, eta=eta)

        drnn.output(selected_index)

        next_item_fc = fluid_sequence_index(item_fc, selected_index)
        next_mask = fluid_sequence_scatter(
            mask, layers.reshape(selected_index, [-1]), 0.0)

        # update
        drnn.update_memory(cur_item_fc, next_item_fc)
        drnn.update_memory(cur_h_0, next_h_0)
        drnn.update_memory(mask, next_mask)

    drnn_output = drnn()
    return drnn_output
def learn(self, obs, action, reward):
    """ Update the policy model with the policy gradient algorithm
    """
    act_prob = self.model(obs)  # output action probabilities
    # log_prob = layers.cross_entropy(act_prob, action)  # cross entropy
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(action, act_prob.shape[1]),
        dim=1)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    print('====loss', cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
def __call__(self, input_Q, input_K, input_V, num_head, mask):
    """
    args:
        input_Q: (batch, max_num0, dim0)
        input_K: (batch, max_num1, dim1)
        input_V: (batch, max_num1, dim2)
        mask: (batch, max_num0, max_num1)
    returns:
        output: (batch, max_num0, dim)
    """
    # fcs
    Q = self.Q_fc_op(input_Q)
    K = self.K_fc_op(input_K)
    V = self.V_fc_op(input_V)

    # multi head
    dim = Q.shape[-1]
    assert dim % num_head == 0, (dim, num_head)
    list_output = []
    sub_Qs = fluid_split(Q, num_head, 2)
    sub_Ks = fluid_split(K, num_head, 2)
    sub_Vs = fluid_split(V, num_head, 2)
    for head_id in range(num_head):
        sub_Q = sub_Qs[head_id]  # (batch, max_num, dim/num_head)
        sub_K = sub_Ks[head_id]
        sub_V = sub_Vs[head_id]

        # matmul -> scale -> mask -> softmax -> mask -> /sum
        Q_K_T = layers.matmul(
            sub_Q,
            layers.transpose(sub_K, perm=[0, 2, 1]))  # (batch, max_num0, max_num1)
        Q_K_T = Q_K_T / np.sqrt(self._nf)
        Q_K_T = Q_K_T * mask
        Q_K_T = layers.softmax(Q_K_T)
        Q_K_T = Q_K_T * mask
        Q_K_T = Q_K_T / (
            layers.reduce_sum(Q_K_T, dim=2, keep_dim=True) + self._safe_eps)

        # weighted sum
        atten_out = layers.matmul(Q_K_T, sub_V)  # (batch, max_num0, dim/num_head)
        list_output.append(atten_out)
    output = layers.concat(list_output, 2)
    return output
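# Illustrative sketch (assumption: NumPy, one head, one batch row) of the
# mask -> softmax -> mask -> renormalize chain used per head above. Masking
# after the softmax leaks probability mass to padded slots, so each row is
# re-divided by its remaining sum.
import numpy as np

def masked_softmax(logits, mask, safe_eps=1e-8):
    x = logits * mask                              # zero out padded logits (as above)
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    p = e / e.sum(axis=-1, keepdims=True)          # plain softmax
    p = p * mask                                   # zero out padded columns
    return p / (p.sum(axis=-1, keepdims=True) + safe_eps)

logits = np.array([[2.0, 1.0, -1.0]])
mask = np.array([[1.0, 1.0, 0.0]])                 # last key is padding
print(masked_softmax(logits, mask))                # mass only on the first two keys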
def learn(self, obs, action, reward, next_obs, terminal):
    '''
    :param obs: St
    :param action: At
    :param reward: Rt+1
    :param next_obs: St+1
    :param terminal: done, True means the episode has ended
    :return: value of the loss function
    '''
    # Compute target_Q with the target network
    target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
    max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # row-wise max along dim=1
    max_Q.stop_gradient = True  # stop gradient updates

    # terminal is not a scalar, so it cannot be used directly in a condition
    terminal = layers.cast(terminal, dtype="float32")
    target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

    # Compute predict_Q with the main network
    predict_Q_tensor = self.model.value(obs)
    # Convert action to a one-hot vector and cast each entry to float
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    # Element-wise multiplication, then reduce the tensor rank:
    # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0],
    #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
    # the element-wise product is   [[0, 0, 0, 3.9, 0],
    #                                [0, 3.7, 0, 0, 0]]
    # and reduce_sum along dim=1 gives [3.9, 3.7]
    predict_Q = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, predict_Q_tensor), dim=1)

    # Average the per-sample loss over the batch
    cost = layers.square_error_cost(predict_Q, target_Q)
    cost = layers.reduce_mean(cost)

    # Declare the optimizer (Adam)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)  # specify the optimization target
    return cost
def learn(self, obs, action, reward, next_obs, terminal):
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=-1)
    best_v.stop_gradient = True
    terminal = layers.cast(terminal, dtype="float32")
    target = reward + (1.0 - terminal) * self.gamma * best_v

    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(pred_value, action_onehot), dim=-1)

    cost = layers.square_error_cost(target, pred_action_value)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)
    return cost
def _dot_attention(self, input, atten_items):
    """
    args:
        input: (batch, dim), lod_level = 0
        atten_items: (batch*seq_len, dim), lod_level = 1
    return:
        atten_weights: (batch*seq_len, 1), lod_level = 1
    """
    expand_input = layers.sequence_expand(
        input, atten_items)  # (batch*seq_len, dim), lod_level = 0
    expand_input = layers.lod_reset(
        expand_input, atten_items)  # (batch*seq_len, dim), lod_level = 1
    if self._attention_type == 'concat_fc':
        atten_weights = self.atten_fc_op(
            layers.concat([expand_input, atten_items], 1))
    elif self._attention_type == 'dot':
        atten_weights = layers.reduce_sum(
            expand_input * atten_items, dim=1,
            keep_dim=True)  # (batch*seq_len, 1), lod_level = 1
    return atten_weights
def learn(self, obs, action, reward):
    """
    :param obs: [B, 4]
    :param action: [B, 1]
    :param reward: [B,]
    :return:
    """
    act_prob = self.model(obs)  # [B, 2]
    # [B, 2] -> [B,]
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(
            action, depth=act_prob.shape[1]),
        dim=1,
        keep_dim=False)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
def ensemble_predict(self, obs):
    """ ensemble predict:
    1. Each critic scores the actions proposed by all actors and
       normalizes its scores;
    2. Each actor's score is the average of the scores given by all critics;
    3. Choose the action of the actor with the best score.
    """
    actor_outputs = []
    for i in range(self.ensemble_num):
        actor_outputs.append(self.models[i].policy(obs))
    batch_actions = layers.concat(actor_outputs, axis=0)
    batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

    critic_outputs = []
    for i in range(self.ensemble_num):
        critic_output = self.models[i].value(batch_obs, batch_actions)
        critic_output = layers.unsqueeze(critic_output, axes=[1])
        critic_outputs.append(critic_output)
    score_matrix = layers.concat(critic_outputs, axis=1)

    # Normalize scores given by each critic
    sum_critic_score = layers.reduce_sum(score_matrix, dim=0, keep_dim=True)
    sum_critic_score = layers.expand(
        sum_critic_score, expand_times=[self.ensemble_num, 1])
    norm_score_matrix = score_matrix / sum_critic_score

    actions_mean_score = layers.reduce_mean(
        norm_score_matrix, dim=1, keep_dim=True)
    best_score_id = layers.argmax(actions_mean_score, axis=0)
    best_score_id = layers.cast(best_score_id, dtype='int32')
    ensemble_predict_action = layers.gather(batch_actions, best_score_id)
    return ensemble_predict_action
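# Illustrative sketch (assumption: NumPy, 3 actors x 2 critics) of the ensemble
# selection above: each critic's column of scores is normalized by its sum,
# rows are averaged across critics, and the best-scoring actor's action wins.
import numpy as np

batch_actions = np.array([[0.1], [0.5], [0.9]])    # one candidate action per actor
score_matrix = np.array([[1.0, 2.0],               # rows: actors, cols: critics
                         [3.0, 5.0],
                         [2.0, 3.0]])

norm_score_matrix = score_matrix / score_matrix.sum(axis=0, keepdims=True)
actions_mean_score = norm_score_matrix.mean(axis=1)   # [0.183, 0.5, 0.317]
best_score_id = actions_mean_score.argmax()           # actor 1
print(batch_actions[best_score_id])                   # [0.5]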