Example #1
0
    def eps_greedy_sampling(self, scores, mask, eps):
        """Draw one position per sequence from an epsilon-greedy distribution.

        The masked scores are padded per sequence (``maxlen=128``). Positions
        that attain the per-row maximum get weight 1, every position gets an
        extra ``mask * eps / row_sum(mask)``, and the renormalized weights are
        handed to ``layers.sampling_id``.

        Args:
            scores: Per-position scores; presumably a sequence (LoD) tensor of
                shape (b*s, 1) -- TODO confirm against caller.
            mask: Multiplied element-wise with ``scores``; 1 appears to mark
                selectable positions and 0 excluded ones -- TODO confirm.
            eps: Exploration weight spread over the masked-in positions.

        Returns:
            An int64 tensor reshaped to ``[-1, 1]`` with the sampled index,
            clipped so it never exceeds ``seq_len - 1``.
        """
        scores = scores * mask
        # (b*s, 1) -> (b, s, 1) -> (b, s)
        padded_scores = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128), [2])
        padded_mask = layers.squeeze(
            fluid_sequence_pad(mask, 0, maxlen=128), [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        # Push masked-out positions far below any real score so they can
        # never be the row maximum.
        excluded = padded_mask * (-1) + 1
        shifted = padded_scores - excluded * self.BIG_VALUE
        row_max = layers.reduce_max(shifted, dim=1, keep_dim=True)
        greedy_prob = layers.cast(shifted >= row_max, 'float32')

        # Exploration mass: eps shared across each row in proportion to mask.
        eps_weight = padded_mask * eps
        eps_prob = eps_weight / layers.reduce_sum(
            padded_mask, dim=1, keep_dim=True)

        # Combine, re-apply the mask and renormalize each row.
        mixed = (greedy_prob + eps_prob) * padded_mask
        mixed = mixed / layers.reduce_sum(mixed, dim=1, keep_dim=True)

        picked = layers.reshape(layers.sampling_id(mixed), [-1, 1])
        # Clip to the last valid index of each sequence.
        last_valid = layers.cast(
            layers.cast(seq_lens, 'float32') - 1, 'int64')
        picked = layers.elementwise_min(picked, last_valid)
        return layers.cast(picked, 'int64')
Example #2
0
 def value(self, obs, act):
     """Evaluate both Q heads on the concatenated (obs, act) input.

     ``obs`` and ``act`` are joined along axis 1, passed to ``self.q1`` and
     ``self.q2`` independently, and axis 1 of each output is squeezed.

     Returns:
         A ``(Q1, Q2)`` tuple of the two squeezed outputs.
     """
     joint_input = layers.concat([obs, act], axis=1)
     q1_out = layers.squeeze(self.q1(joint_input), axes=[1])
     q2_out = layers.squeeze(self.q2(joint_input), axes=[1])
     return q1_out, q2_out
Example #3
0
    def softmax_sampling(self, scores, mask, eta):
        """Draw one position per sequence from a masked softmax over scores.

        The masked scores are padded per sequence (``maxlen=128``), centred
        and rescaled per row, divided by ``eta``, pushed through a softmax
        with masked-out positions forced to a large negative value, and the
        result is handed to ``layers.sampling_id``.

        Args:
            scores: Per-position scores; presumably a sequence (LoD) tensor of
                shape (b*s, 1) -- TODO confirm against caller.
            mask: Multiplied element-wise with ``scores``; 1 appears to mark
                selectable positions and 0 excluded ones -- TODO confirm.
            eta: Divisor applied to the normalized scores before the softmax.

        Returns:
            An int64 tensor reshaped to ``[-1, 1]`` with the sampled index,
            clipped so it never exceeds ``seq_len - 1``.
        """
        scores = scores * mask
        # (b*s, 1) -> (b, s, 1) -> (b, s)
        padded_scores = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128), [2])
        padded_mask = layers.squeeze(
            fluid_sequence_pad(mask, 0, maxlen=128), [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        # Row mean: sum of scores divided by the row sum of the mask.
        row_total = layers.reduce_sum(padded_scores, dim=1, keep_dim=True)
        row_mean = row_total / layers.reduce_sum(
            padded_mask, dim=1, keep_dim=True)
        centred = padded_scores - row_mean
        # Scale: square root of the summed squared deviations over the
        # masked-in positions (the sum is not divided by the count).
        spread = layers.sqrt(
            layers.reduce_sum(
                layers.square(centred * padded_mask), dim=1, keep_dim=True))
        normalized = centred / (spread + self.SAFE_EPS)

        # Set masked-out positions to a large negative value.
        normalized = normalized * padded_mask - (
            padded_mask * (-1) + 1) * self.BIG_VALUE
        probs = layers.softmax(normalized / eta) * padded_mask

        picked = layers.reshape(layers.sampling_id(probs), [-1, 1])
        # Clip to the last valid index of each sequence.
        last_valid = layers.cast(
            layers.cast(seq_lens, 'float32') - 1, 'int64')
        picked = layers.elementwise_min(picked, last_valid)
        return layers.cast(picked, 'int64')
Example #4
0
    def value(self, obs, act):
        """Compute two Q estimates for (obs, act) with separate layer stacks.

        Each head encodes ``obs`` with its own first layer, concatenates the
        result with ``act`` along axis 1, applies two more layers and squeezes
        axis 1 of the output.

        Returns:
            A ``(Q1, Q2)`` tuple of the two squeezed outputs.
        """
        # Head 1: fc1 -> concat(act) -> fc2 -> fc3.
        obs_feat_a = self.fc1(obs)
        joined_a = layers.concat([obs_feat_a, act], axis=1)
        q_first = layers.squeeze(self.fc3(self.fc2(joined_a)), axes=[1])

        # Head 2: fc4 -> concat(act) -> fc5 -> fc6.
        obs_feat_b = self.fc4(obs)
        joined_b = layers.concat([obs_feat_b, act], axis=1)
        q_second = layers.squeeze(self.fc6(self.fc5(joined_b)), axes=[1])
        return q_first, q_second
Example #5
0
    def value(self, obs, act):
        """Return the pair of Q estimates produced by the two heads.

        Head 1 uses ``fc1``/``fc2``/``fc3`` and head 2 uses
        ``fc4``/``fc5``/``fc6``. In both, ``act`` is concatenated (axis 1)
        with the output of the head's first layer, and axis 1 of the final
        output is squeezed.

        Returns:
            ``(Q1, Q2)``.
        """
        encoded_1 = self.fc1(obs)
        merged_1 = layers.concat([encoded_1, act], axis=1)
        out_1 = self.fc2(merged_1)
        out_1 = self.fc3(out_1)
        out_1 = layers.squeeze(out_1, axes=[1])

        encoded_2 = self.fc4(obs)
        merged_2 = layers.concat([encoded_2, act], axis=1)
        out_2 = self.fc5(merged_2)
        out_2 = self.fc6(out_2)
        out_2 = layers.squeeze(out_2, axes=[1])

        return out_1, out_2
Example #6
0
 def Q1(self, obs, act):
     """Compute a single Q estimate via ``fc1`` -> concat(act) -> ``fc2`` -> ``fc3``.

     ``act`` is concatenated along axis 1 with the output of ``self.fc1(obs)``;
     axis 1 of the final output is squeezed before returning.
     """
     obs_feat = self.fc1(obs)
     merged = layers.concat([obs_feat, act], axis=1)
     q_out = self.fc3(self.fc2(merged))
     return layers.squeeze(q_out, axes=[1])
Example #7
0
 def value(self, obs):
     """Run ``obs`` through ``fc1`` -> ``fc2`` -> ``fc3`` -> ``fc4`` and squeeze.

     Returns:
         The output of ``self.fc4`` after ``layers.squeeze`` with an empty
         ``axes`` list.
     """
     out = self.fc4(self.fc3(self.fc2(self.fc1(obs))))
     # NOTE(review): axes=[] is passed as-is; presumably this squeezes every
     # size-1 dimension (which would include a batch of one) -- confirm
     # against the framework's squeeze documentation.
     return layers.squeeze(out, axes=[])
Example #8
0
    def value(self, obs, act):
        """Compute Q(obs, act) from three parallel branches merged by a head.

        One branch processes ``obs`` alone, one processes ``act`` alone and
        one processes their axis-1 concatenation; each branch is a stack of
        three layers. The three branch outputs are concatenated along axis 1
        and passed through ``re_fc1`` .. ``re_fc4``.

        Returns:
            The head output with axis 1 squeezed.
        """
        joint_input = layers.concat([obs, act], axis=1)

        # Observation-only branch.
        obs_branch = self.obs_fc3(self.obs_fc2(self.obs_fc1(obs)))

        # Action-only branch.
        act_branch = self.act_fc3(self.act_fc2(self.act_fc1(act)))

        # Joint (obs, act) branch.
        joint_branch = self.total_fc3(
            self.total_fc2(self.total_fc1(joint_input)))

        merged = layers.concat([obs_branch, act_branch, joint_branch], axis=1)
        head = self.re_fc1(merged)
        head = self.re_fc2(head)
        head = self.re_fc3(head)
        head = self.re_fc4(head)

        return layers.squeeze(head, axes=[1])
 def value(self, obs, act):
     """Compute Q(obs, act): ``fc1(obs)`` joined with ``act``, then ``fc2`` and ``fc3``.

     The concatenation is along axis 1; axis 1 of the final output is
     squeezed before returning.
     """
     obs_feat = self.fc1(obs)
     merged = layers.concat([obs_feat, act], axis=1)
     hidden = self.fc2(merged)
     q_out = self.fc3(hidden)
     return layers.squeeze(q_out, axes=[1])
 def value(self, obs, act):
     """Compute Q(obs, act) from the axis-1 concatenation of both inputs.

     The concatenated input goes through ``fc1`` -> ``fc2`` -> ``fc3``;
     axis 1 of the result is squeezed before returning.
     """
     joint_input = layers.concat([obs, act], axis=1)
     q_out = self.fc3(self.fc2(self.fc1(joint_input)))
     return layers.squeeze(q_out, axes=[1])
Example #11
0
    def _ensemble_predict(self, obs):
        """Pick the actor proposal that the critics rate highest on average.

        Every actor proposes an action for ``obs``; the proposals are stacked
        along axis 0 and ``obs`` is tiled ``ensemble_num`` times to match.
        Each critic scores every proposal, each critic's scores are divided
        by that critic's score sum, and the proposal with the largest mean
        normalized score is returned.

        Args:
            obs: Observation tensor; presumably a batch of one, since it is
                tiled ``ensemble_num`` times along axis 0 -- TODO confirm.

        Returns:
            The selected action with axis 0 squeezed.
        """
        proposals = [
            self.actors[k].predict(obs) for k in range(self.ensemble_num)
        ]
        batch_actions = layers.concat(proposals, axis=0)
        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

        # One column of scores per critic.
        score_columns = [
            layers.unsqueeze(
                self.critics[k].predict(batch_obs, batch_actions), axes=[1])
            for k in range(self.ensemble_num)
        ]
        score_matrix = layers.concat(score_columns, axis=1)

        # Normalize scores given by each critic
        column_sum = layers.reduce_sum(score_matrix, dim=0, keep_dim=True)
        column_sum = layers.expand(
            column_sum, expand_times=[self.ensemble_num, 1])
        normalized_scores = score_matrix / column_sum

        # Average over critics, then take the best-scoring proposal.
        mean_scores = layers.reduce_mean(
            normalized_scores, dim=1, keep_dim=True)
        winner = layers.cast(
            layers.argmax(mean_scores, axis=0), dtype='int32')
        chosen_action = layers.gather(batch_actions, winner)
        return layers.squeeze(chosen_action, axes=[0])
Example #12
0
 def value(self, obs, act):
     """Compute Q(obs, act) via ``fc1`` -> concat(act) -> ``fc2`` -> ``fc3``.

     ``act`` is concatenated along axis 1 with ``self.fc1(obs)``; axis 1 of
     the final output is squeezed before returning.
     """
     encoded_obs = self.fc1(obs)
     joined = layers.concat([encoded_obs, act], axis=1)
     q_value = self.fc3(self.fc2(joined))
     q_value = layers.squeeze(q_value, axes=[1])
     return q_value
Example #13
0
 def value(self, obs, act):
     """Take a state and an action and return the corresponding Q(s, a).

     ``obs`` and ``act`` are concatenated along axis 1 and passed through
     ``fc1`` and ``fc2``; axis 1 of the result is squeezed.
     """
     state_action = layers.concat([obs, act], axis=1)
     q_value = self.fc2(self.fc1(state_action))
     return layers.squeeze(q_value, axes=[1])
Example #14
0
    def policy_and_value(self, obs):
        """Compute policy logits and values from a shared ``self.fc`` feature.

        Returns:
            ``(policy_logits, values)``, where ``values`` is the output of
            ``self.value_fc`` with axis 1 squeezed.
        """
        shared = self.fc(obs)
        logits = self.policy_fc(shared)
        state_values = layers.squeeze(self.value_fc(shared), axes=[1])
        return logits, state_values
Example #15
0
 def value(self, obs_n, act_n):
     """Compute Q from all entries of ``obs_n`` and ``act_n`` joined on axis 1.

     Args:
         obs_n: Presumably a list of observation tensors, one per agent --
             TODO confirm; it is joined with ``act_n`` using ``+``.
         act_n: Presumably a list of action tensors, one per agent -- TODO
             confirm.

     Returns:
         The output of ``fc1`` -> ``fc2`` -> ``fc3`` with axis 1 squeezed.
     """
     joint_input = layers.concat(obs_n + act_n, axis=1)
     q_value = self.fc3(self.fc2(self.fc1(joint_input)))
     return layers.squeeze(q_value, axes=[1])
Example #16
0
 def value(self, obs):
     """Compute V(obs): flatten, then ``fc1`` -> ``fc2`` -> ``value_fc``.

     ``obs`` is flattened with ``axis=1`` first; axis 1 of the final output
     is squeezed before returning.
     """
     flat_obs = layers.flatten(obs, axis=1)
     hidden = self.fc2(self.fc1(flat_obs))
     state_value = self.value_fc(hidden)
     return layers.squeeze(state_value, axes=[1])
Example #17
0
 def value(self, obs, act):
     """Compute Q(s, a) from the concatenated state and action.

     Both s and a are inputs to the network; a network can take several
     vector inputs by concatenating them, so they are joined along axis 1
     before ``fc1`` and ``fc2`` are applied. Axis 1 of the result is
     squeezed.
     """
     state_action = layers.concat([obs, act], axis=1)
     hidden = self.fc1(state_action)
     q_value = layers.squeeze(self.fc2(hidden), axes=[1])
     return q_value
    def value(self, obs, act):
        """Compute Q(obs, act) with a four-layer stack on the joined input.

        ``obs`` and ``act`` are concatenated along axis 1 and passed through
        ``fc1`` -> ``fc2`` -> ``fc3`` -> ``fc4``; axis 1 of the result is
        squeezed.
        """
        out = layers.concat([obs, act], axis=1)
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            out = layer(out)
        return layers.squeeze(out, axes=[1])
Example #19
0
 def value(self, obs):
     """Compute state values for ``obs``.

     Args:
         obs: A float32 tensor.

     Returns:
         values: B -- the output of ``_fc_1`` -> ``_fc_2`` -> ``_fc_3`` ->
         ``value_fc`` with axis 1 squeezed.
     """
     features = self._fc_3(self._fc_2(self._fc_1(obs)))
     state_values = self.value_fc(features)
     return layers.squeeze(state_values, axes=[1])
Example #20
0
 def value(self, obs, act):
     """Take a state and an action and return the corresponding Q(s, a).

     ``obs`` and ``act`` are concatenated along axis 1 and passed through
     ``fc1`` -> ``fc2`` -> ``fc3``; axis 1 of the result is squeezed.
     """
     state_action = layers.concat([obs, act], axis=1)
     out = self.fc1(state_action)
     out = self.fc2(out)
     out = self.fc3(out)
     return layers.squeeze(out, axes=[1])
Example #21
0
 def predict(self, obs, action):
     """Score (obs, action) from separately encoded observation parts.

     ``obs`` is split along axis 1 into its leading
     ``obs_dim - vel_obs_dim`` columns and its trailing ``vel_obs_dim``
     columns. Each part and ``action`` get their own encoder; the three
     encodings are concatenated along axis 1 and passed through ``fc2`` and
     ``fc3``.

     Returns:
         The ``fc3`` output with axis 1 squeezed.
     """
     split_at = self.obs_dim - self.vel_obs_dim
     body_part = layers.slice(obs, axes=[1], starts=[0], ends=[split_at])
     vel_part = layers.slice(
         obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])

     body_code = self.fc1(self.fc0(body_part))
     vel_code = self.vel_fc1(self.vel_fc0(vel_part))
     act_code = self.act_fc0(action)

     merged = layers.concat([body_code, act_code, vel_code], axis=1)
     score = self.fc3(self.fc2(merged))
     return layers.squeeze(score, axes=[1])
Example #22
0
    def value(self, hidden, act):
        """Take a state and an action and return the corresponding Q(s, a).

        ``hidden`` is flattened with ``axis=1``, concatenated with ``act``
        along axis 1 and passed through ``fc1`` and ``fc2``; axis 1 of the
        result is squeezed.
        """
        # Step 5 of the exercise: assemble the Q network.
        flat_state = layers.flatten(hidden, axis=1)
        state_action = layers.concat([flat_state, act], axis=1)
        q_raw = self.fc2(self.fc1(state_action))
        return layers.squeeze(q_raw, axes=[1])
    def value(self, obs, act):
        """Compute two Q estimates for (obs, act) on top of a conv stack.

        Each head runs ``obs`` through ``conv1`` -> ``conv2`` -> ``conv3`` ->
        ``conv4``, applies its own first dense layer, concatenates the result
        with ``act`` along axis 1, applies two more dense layers and squeezes
        axis 1 of the output.

        Returns:
            A ``(Q1, Q2)`` tuple of the two squeezed outputs.
        """
        # Head 1: conv1..conv4 -> fc1 -> concat(act) -> fc2 -> fc3.
        conv1 = self.conv1(obs)
        conv2 = self.conv2(conv1)
        conv3 = self.conv3(conv2)
        conv4 = self.conv4(conv3)

        hid1 = self.fc1(conv4)
        concat1 = layers.concat([hid1, act], axis=1)
        Q1 = self.fc2(concat1)
        Q1 = self.fc3(Q1)
        Q1 = layers.squeeze(Q1, axes=[1])

        # Head 2: conv1..conv4 -> fc4 -> concat(act) -> fc5 -> fc6.
        conv5 = self.conv1(obs)
        conv6 = self.conv2(conv5)
        conv7 = self.conv3(conv6)
        # The last stage must be conv4, mirroring head 1; applying conv3 a
        # second time here would skip conv4 entirely for this head.
        conv8 = self.conv4(conv7)

        hid2 = self.fc4(conv8)
        concat2 = layers.concat([hid2, act], axis=1)
        Q2 = self.fc5(concat2)
        Q2 = self.fc6(Q2)
        Q2 = layers.squeeze(Q2, axes=[1])

        return Q1, Q2
Example #24
0
    def policy_and_value(self, obs):
        """Compute policy logits and state values from shared features.

        Args:
            obs: A float32 tensor.

        Returns:
            policy_logits: B * ACT_DIM
            values: B
        """
        # Shared trunk: _fc_1 -> _fc_2 -> _fc_3.
        trunk = self._fc_3(self._fc_2(self._fc_1(obs)))
        logits = self.policy_fc(trunk)
        state_values = layers.squeeze(self.value_fc(trunk), axes=[1])
        return logits, state_values
Example #25
0
    def value(self, obs):
        """Compute state values from image observations.

        Args:
            obs: A float32 tensor of shape [B, C, H, W].

        Returns:
            value: B
        """
        # Scale the input by 1/255 before the conv stack.
        scaled = obs / 255.0
        features = self.conv3(self.conv2(self.conv1(scaled)))

        flat = layers.flatten(features, axis=1)
        state_value = self.value_fc(flat)
        return layers.squeeze(state_value, axes=[1])
Example #26
0
    def value(self, obs, action):
        """Compute Q(obs, action) from separately encoded observation parts.

        ``obs`` is split along axis 1 into everything except its trailing
        ``vel_obs_dim`` columns and those trailing columns (the
        target-related features). Each part and ``action`` get their own
        encoder; the three encodings are concatenated along axis 1 and passed
        through ``fc2`` and ``fc3``.

        Returns:
            The ``fc3`` output with axis 1 squeezed.
        """
        body_part = layers.slice(
            obs, axes=[1], starts=[0], ends=[-self.vel_obs_dim])
        # Target-related features.
        vel_part = layers.slice(
            obs, axes=[1], starts=[-self.vel_obs_dim], ends=[self.obs_dim])

        body_code = self.fc1(self.fc0(body_part))
        vel_code = self.vel_fc1(self.vel_fc0(vel_part))
        act_code = self.act_fc0(action)

        merged = layers.concat([body_code, act_code, vel_code], axis=1)
        q_value = self.fc3(self.fc2(merged))
        return layers.squeeze(q_value, axes=[1])
    def value(self, obs):
        """Compute state values from image observations.

        Args:
            obs: Input images, shape [N, C, H, W].

        Returns:
            values: N
        """
        features = obs
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = conv(features)

        flat = layers.flatten(features, axis=1)
        hidden = self.fc(flat)

        state_values = self.value_fc(hidden)
        return layers.squeeze(state_values, axes=[1])
    def policy_and_value(self, obs):
        """Compute policy logits and state values from image observations.

        Args:
            obs: Input images, shape [N, C, H, W].

        Returns:
            policy_logits: N * ACTION_DIM
            values: N
        """
        features = obs
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = conv(features)

        # Shared feature feeding both output heads.
        shared = self.fc(layers.flatten(features, axis=1))

        logits = self.policy_fc(shared)
        state_values = layers.squeeze(self.value_fc(shared), axes=[1])

        return logits, state_values
Example #29
0
    def policy_and_value(self, obs):
        """Compute policy logits and state values from image observations.

        Args:
            obs: A float32 tensor of shape [B, C, H, W].

        Returns:
            policy_logits: B * ACT_DIM
            values: B
        """
        # Scale the input by 1/255 before the conv stack.
        scaled = obs / 255.0
        features = self.conv3(self.conv2(self.conv1(scaled)))

        # Shared feature feeding both output heads.
        shared = self.fc(layers.flatten(features, axis=1))

        logits = self.policy_fc(shared)
        state_values = layers.squeeze(self.value_fc(shared), axes=[1])

        return logits, state_values
    def value(self, obs, act):
        """Take a state and an action and return the corresponding Q(s, a).

        ``obs`` and ``act`` are concatenated along axis 1 and passed through
        ``fc1`` -> ``fc2`` -> ``fc3`` -> ``fc4``; axis 1 of the result is
        squeezed.

        Returns:
            The ``fc4`` output with axis 1 squeezed.
        """
        concat = layers.concat([obs, act], axis=1)
        hid1 = self.fc1(concat)
        hid2 = self.fc2(hid1)
        Q = self.fc3(hid2)
        Q = self.fc4(Q)
        Q = layers.squeeze(Q, axes=[1])
        return Q