def eps_greedy_sampling(self, scores, mask, eps):
    scores = scores * mask
    # (b*s, 1) -> (b, s, 1) -> (b, s)
    scores_padded = layers.squeeze(
        fluid_sequence_pad(scores, 0, maxlen=128), [2])
    mask_padded = layers.squeeze(
        fluid_sequence_pad(mask, 0, maxlen=128), [2])
    seq_lens = fluid_sequence_get_seq_len(scores)

    def get_greedy_prob(scores_padded, mask_padded):
        s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
        max_value = layers.reduce_max(s, dim=1, keep_dim=True)
        greedy_prob = layers.cast(s >= max_value, 'float32')
        return greedy_prob

    greedy_prob = get_greedy_prob(scores_padded, mask_padded)
    eps_prob = mask_padded * eps / layers.reduce_sum(
        mask_padded, dim=1, keep_dim=True)
    final_prob = (greedy_prob + eps_prob) * mask_padded
    final_prob = final_prob / layers.reduce_sum(
        final_prob, dim=1, keep_dim=True)

    sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
    max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
    sampled_id = layers.elementwise_min(sampled_id, max_id)
    return layers.cast(sampled_id, 'int64')
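# For intuition, the block below is a minimal NumPy sketch (an illustrative
# restatement, not the fluid graph above) of the same epsilon-greedy mixing:
# the greedy position gets weight 1, every valid position gets an extra
# eps / num_valid, and the result is renormalized before sampling.
import numpy as np

def eps_greedy_probs_np(scores, mask, eps, big_value=1e8):
    """scores, mask: float arrays of shape (batch, seq); mask is 1 for valid slots."""
    s = scores * mask - (1.0 - mask) * big_value        # push padded slots to -inf
    greedy = (s >= s.max(axis=1, keepdims=True)).astype(np.float32)
    eps_prob = mask * eps / mask.sum(axis=1, keepdims=True)
    prob = (greedy + eps_prob) * mask
    return prob / prob.sum(axis=1, keepdims=True)

# Example: one sequence of length 3 padded to 4; the last slot is masked out.
# scores = np.array([[0.2, 0.7, 0.1, 0.0]], dtype=np.float32)
# mask = np.array([[1.0, 1.0, 1.0, 0.0]], dtype=np.float32)
# probs = eps_greedy_probs_np(scores, mask, eps=0.1)
# sampled_id = np.array([np.random.choice(p.shape[0], p=p) for p in probs])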
def value(self, obs, act):
    concat = layers.concat([obs, act], axis=1)
    Q1 = self.q1(concat)
    Q1 = layers.squeeze(Q1, axes=[1])
    Q2 = self.q2(concat)
    Q2 = layers.squeeze(Q2, axes=[1])
    return Q1, Q2
def softmax_sampling(self, scores, mask, eta):
    scores = scores * mask
    # (b*s, 1) -> (b, s, 1) -> (b, s)
    scores_padded = layers.squeeze(
        fluid_sequence_pad(scores, 0, maxlen=128), [2])
    mask_padded = layers.squeeze(
        fluid_sequence_pad(mask, 0, maxlen=128), [2])
    seq_lens = fluid_sequence_get_seq_len(scores)

    def normalize(scores_padded, mask_padded):
        mean_S = layers.reduce_sum(
            scores_padded, dim=1, keep_dim=True) / layers.reduce_sum(
                mask_padded, dim=1, keep_dim=True)
        S = scores_padded - mean_S
        std_S = layers.sqrt(
            layers.reduce_sum(
                layers.square(S * mask_padded), dim=1, keep_dim=True))
        return S / (std_S + self.SAFE_EPS)

    norm_S = normalize(scores_padded, mask_padded)
    # set masked-out positions to a large negative value
    norm_S = norm_S * mask_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
    soft_prob = layers.softmax(norm_S / eta) * mask_padded

    sampled_id = layers.reshape(layers.sampling_id(soft_prob), [-1, 1])
    max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
    sampled_id = layers.elementwise_min(sampled_id, max_id)
    return layers.cast(sampled_id, 'int64')
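# Similarly, a rough NumPy sketch of the softmax sampling above (illustrative
# only): scores are centered over the valid slots, scaled by the root of the
# summed squares (as in the fluid code, not a true std), masked, and passed
# through a temperature-eta softmax.
import numpy as np

def softmax_probs_np(scores, mask, eta, big_value=1e8, safe_eps=1e-8):
    """scores, mask: float arrays of shape (batch, seq); eta is the temperature."""
    s = scores * mask
    n_valid = mask.sum(axis=1, keepdims=True)
    centered = s - s.sum(axis=1, keepdims=True) / n_valid
    scale = np.sqrt(np.square(centered * mask).sum(axis=1, keepdims=True))
    norm = centered / (scale + safe_eps)
    norm = norm * mask - (1.0 - mask) * big_value       # padded slots -> -inf
    logits = norm / eta
    e = np.exp(logits - logits.max(axis=1, keepdims=True))  # numerically stable softmax
    prob = e / e.sum(axis=1, keepdims=True) * mask
    return prob / prob.sum(axis=1, keepdims=True)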
def value(self, obs, act):
    x = self.fc1(obs)
    concat = layers.concat([x, act], axis=1)
    x = self.fc2(concat)
    Q1 = self.fc3(x)
    Q1 = layers.squeeze(Q1, axes=[1])

    y = self.fc4(obs)
    concat2 = layers.concat([y, act], axis=1)
    Q2 = self.fc5(concat2)
    Q2 = self.fc6(Q2)
    Q2 = layers.squeeze(Q2, axes=[1])
    return Q1, Q2
def value(self, obs, act):
    hid1 = self.fc1(obs)
    concat1 = layers.concat([hid1, act], axis=1)
    Q1 = self.fc2(concat1)
    Q1 = self.fc3(Q1)
    Q1 = layers.squeeze(Q1, axes=[1])

    hid2 = self.fc4(obs)
    concat2 = layers.concat([hid2, act], axis=1)
    Q2 = self.fc5(concat2)
    Q2 = self.fc6(Q2)
    Q2 = layers.squeeze(Q2, axes=[1])
    return Q1, Q2
def Q1(self, obs, act):
    hid1 = self.fc1(obs)
    concat1 = layers.concat([hid1, act], axis=1)
    Q1 = self.fc2(concat1)
    Q1 = self.fc3(Q1)
    Q1 = layers.squeeze(Q1, axes=[1])
    return Q1
def value(self, obs):
    hid1 = self.fc1(obs)
    hid2 = self.fc2(hid1)
    hid3 = self.fc3(hid2)
    V = self.fc4(hid3)
    V = layers.squeeze(V, axes=[1])
    return V
def value(self, obs, act):
    concat = layers.concat([obs, act], axis=1)
    o = self.obs_fc1(obs)
    o = self.obs_fc2(o)
    o = self.obs_fc3(o)
    a = self.act_fc1(act)
    a = self.act_fc2(a)
    a = self.act_fc3(a)
    c = self.total_fc1(concat)
    c = self.total_fc2(c)
    c = self.total_fc3(c)
    out = self.re_fc1(layers.concat([o, a, c], axis=1))
    out = self.re_fc2(out)
    out = self.re_fc3(out)
    out = self.re_fc4(out)
    return layers.squeeze(out, axes=[1])
def value(self, obs, act):
    x = self.fc1(obs)
    concat = layers.concat([x, act], axis=1)
    x = self.fc2(concat)
    Q = self.fc3(x)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs, act):
    concat = layers.concat([obs, act], axis=1)
    hidden1 = self.fc1(concat)
    hidden2 = self.fc2(hidden1)
    Q = self.fc3(hidden2)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def _ensemble_predict(self, obs):
    actor_outputs = []
    for i in range(self.ensemble_num):
        actor_outputs.append(self.actors[i].predict(obs))
    batch_actions = layers.concat(actor_outputs, axis=0)
    batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

    critic_outputs = []
    for i in range(self.ensemble_num):
        critic_output = self.critics[i].predict(batch_obs, batch_actions)
        critic_output = layers.unsqueeze(critic_output, axes=[1])
        critic_outputs.append(critic_output)
    score_matrix = layers.concat(critic_outputs, axis=1)

    # Normalize scores given by each critic
    sum_critic_score = layers.reduce_sum(
        score_matrix, dim=0, keep_dim=True)
    sum_critic_score = layers.expand(
        sum_critic_score, expand_times=[self.ensemble_num, 1])
    norm_score_matrix = score_matrix / sum_critic_score

    actions_mean_score = layers.reduce_mean(
        norm_score_matrix, dim=1, keep_dim=True)
    best_score_id = layers.argmax(actions_mean_score, axis=0)
    best_score_id = layers.cast(best_score_id, dtype='int32')
    ensemble_predict_action = layers.gather(batch_actions, best_score_id)
    ensemble_predict_action = layers.squeeze(
        ensemble_predict_action, axes=[0])
    return ensemble_predict_action
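# Conceptually, _ensemble_predict scores every actor's proposed action with
# every critic, normalizes each critic's column so no single critic dominates
# the vote, and returns the action with the best average normalized score.
# A rough eager-mode NumPy restatement of that selection step (names are
# illustrative, not the fluid graph above):
import numpy as np

def ensemble_select_np(batch_actions, score_matrix):
    """batch_actions: (ensemble_num, act_dim), one candidate action per actor.
    score_matrix: (ensemble_num, ensemble_num); entry [i, j] is critic j's
    score for actor i's action."""
    norm_scores = score_matrix / score_matrix.sum(axis=0, keepdims=True)
    mean_score = norm_scores.mean(axis=1)   # average over critics per action
    return batch_actions[int(np.argmax(mean_score))]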
def value(self, obs, act):
    hid1 = self.fc1(obs)
    concat = layers.concat([hid1, act], axis=1)
    hid2 = self.fc2(concat)
    Q = self.fc3(hid2)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs, act):
    # Input: state and action; output: the corresponding Q(s, a)
    concat = layers.concat([obs, act], axis=1)
    hid = self.fc1(concat)
    Q = self.fc2(hid)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def policy_and_value(self, obs):
    x = self.fc(obs)
    policy_logits = self.policy_fc(x)
    values = self.value_fc(x)
    values = layers.squeeze(values, axes=[1])
    return policy_logits, values
def value(self, obs_n, act_n):
    inputs = layers.concat(obs_n + act_n, axis=1)
    hid1 = self.fc1(inputs)
    hid2 = self.fc2(hid1)
    Q = self.fc3(hid2)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs):
    obs = layers.flatten(obs, axis=1)
    hid1 = self.fc1(obs)
    hid2 = self.fc2(hid1)
    V = self.value_fc(hid2)
    V = layers.squeeze(V, axes=[1])
    return V
def value(self, obs, act):
    # Both s and a are inputs to the network; multiple input vectors can be
    # fed in by concatenation, so we join them first.
    concat = layers.concat([obs, act], axis=1)
    hid = self.fc1(concat)
    Q = self.fc2(hid)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs, act):
    concat = layers.concat([obs, act], axis=1)
    hid = self.fc1(concat)
    Q = self.fc2(hid)
    Q = self.fc3(Q)
    Q = self.fc4(Q)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs):
    """
    Args:
        obs: A float32 tensor

    Returns:
        values: float32 tensor of shape [B]
    """
    h_1 = self._fc_1(obs)
    h_2 = self._fc_2(h_1)
    h_3 = self._fc_3(h_2)
    values = self.value_fc(h_3)
    values = layers.squeeze(values, axes=[1])
    return values
def value(self, obs, act):
    # Input: state and action; output: the corresponding Q(s, a)
    concat = layers.concat([obs, act], axis=1)
    hid0 = self.fc1(concat)
    hid1 = self.fc2(hid0)
    hid2 = self.fc3(hid1)
    Q = layers.squeeze(hid2, axes=[1])
    return Q
def predict(self, obs, action):
    real_obs = layers.slice(
        obs, axes=[1], starts=[0], ends=[self.obs_dim - self.vel_obs_dim])
    vel_obs = layers.slice(
        obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
    hid0 = self.fc0(real_obs)
    hid1 = self.fc1(hid0)
    vel_hid0 = self.vel_fc0(vel_obs)
    vel_hid1 = self.vel_fc1(vel_hid0)
    a1 = self.act_fc0(action)
    concat = layers.concat([hid1, a1, vel_hid1], axis=1)
    hid2 = self.fc2(concat)
    V = self.fc3(hid2)
    V = layers.squeeze(V, axes=[1])
    return V
def value(self, hidden, act):
    # Input: state feature and action; output: the corresponding Q(s, a)
    flatten_obs = layers.flatten(hidden, axis=1)
    concat = layers.concat([flatten_obs, act], axis=1)
    hid = self.fc1(concat)
    Q = self.fc2(hid)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs, act):
    conv1 = self.conv1(obs)
    conv2 = self.conv2(conv1)
    conv3 = self.conv3(conv2)
    conv4 = self.conv4(conv3)
    hid1 = self.fc1(conv4)
    concat1 = layers.concat([hid1, act], axis=1)
    Q1 = self.fc2(concat1)
    Q1 = self.fc3(Q1)
    Q1 = layers.squeeze(Q1, axes=[1])

    conv5 = self.conv1(obs)
    conv6 = self.conv2(conv5)
    conv7 = self.conv3(conv6)
    conv8 = self.conv4(conv7)
    hid2 = self.fc4(conv8)
    concat2 = layers.concat([hid2, act], axis=1)
    Q2 = self.fc5(concat2)
    Q2 = self.fc6(Q2)
    Q2 = layers.squeeze(Q2, axes=[1])
    return Q1, Q2
def policy_and_value(self, obs):
    """
    Args:
        obs: A float32 tensor

    Returns:
        policy_logits: float32 tensor of shape [B, ACT_DIM]
        values: float32 tensor of shape [B]
    """
    h_1 = self._fc_1(obs)
    h_2 = self._fc_2(h_1)
    h_3 = self._fc_3(h_2)
    policy_logits = self.policy_fc(h_3)
    values = self.value_fc(h_3)
    values = layers.squeeze(values, axes=[1])
    return policy_logits, values
def value(self, obs):
    """
    Args:
        obs: A float32 tensor of shape [B, C, H, W]

    Returns:
        value: float32 tensor of shape [B]
    """
    obs = obs / 255.0
    conv1 = self.conv1(obs)
    conv2 = self.conv2(conv1)
    conv3 = self.conv3(conv2)
    flatten = layers.flatten(conv3, axis=1)
    value = self.value_fc(flatten)
    value = layers.squeeze(value, axes=[1])
    return value
def value(self, obs, action):
    real_obs = layers.slice(
        obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim])
    # target-related features
    vel_obs = layers.slice(
        obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])
    hid0 = self.fc0(real_obs)
    hid1 = self.fc1(hid0)
    vel_hid0 = self.vel_fc0(vel_obs)
    vel_hid1 = self.vel_fc1(vel_hid0)
    a1 = self.act_fc0(action)
    concat = layers.concat([hid1, a1, vel_hid1], axis=1)
    hid2 = self.fc2(concat)
    Q = self.fc3(hid2)
    Q = layers.squeeze(Q, axes=[1])
    return Q
def value(self, obs):
    """
    Args:
        obs: input image, shape [N, C, H, W]

    Returns:
        values: float32 tensor of shape [N]
    """
    conv1 = self.conv1(obs)
    conv2 = self.conv2(conv1)
    conv3 = self.conv3(conv2)
    conv4 = self.conv4(conv3)
    flatten = layers.flatten(conv4, axis=1)
    fc_output = self.fc(flatten)
    values = self.value_fc(fc_output)
    values = layers.squeeze(values, axes=[1])
    return values
def policy_and_value(self, obs):
    """
    Args:
        obs: input image, shape [N, C, H, W]

    Returns:
        policy_logits: float32 tensor of shape [N, ACTION_DIM]
        values: float32 tensor of shape [N]
    """
    conv1 = self.conv1(obs)
    conv2 = self.conv2(conv1)
    conv3 = self.conv3(conv2)
    conv4 = self.conv4(conv3)
    flatten = layers.flatten(conv4, axis=1)
    fc_output = self.fc(flatten)
    policy_logits = self.policy_fc(fc_output)
    values = self.value_fc(fc_output)
    values = layers.squeeze(values, axes=[1])
    return policy_logits, values
def policy_and_value(self, obs):
    """
    Args:
        obs: A float32 tensor of shape [B, C, H, W]

    Returns:
        policy_logits: float32 tensor of shape [B, ACT_DIM]
        values: float32 tensor of shape [B]
    """
    obs = obs / 255.0
    conv1 = self.conv1(obs)
    conv2 = self.conv2(conv1)
    conv3 = self.conv3(conv2)
    flatten = layers.flatten(conv3, axis=1)
    fc_output = self.fc(flatten)
    policy_logits = self.policy_fc(fc_output)
    values = self.value_fc(fc_output)
    values = layers.squeeze(values, axes=[1])
    return policy_logits, values
def value(self, obs, act):
    # Input: state and action; output: the corresponding Q(s, a)
    concat = layers.concat([obs, act], axis=1)
    hid1 = self.fc1(concat)
    hid2 = self.fc2(hid1)
    Q = self.fc3(hid2)
    Q = self.fc4(Q)
    Q = layers.squeeze(Q, axes=[1])
    return Q