Example no. 1
0
    def build_policy(self):
        """Build the hard action-selection head on the training graph.

        Feeds ``self.input`` through the model graph, mean-pools the
        sequence axis, projects the pooled state to two scores per action
        block, and draws hard 0/1 decisions with a straight-through
        Gumbel-softmax sample.

        Side effects:
            self.logits_check: softmax over the sigmoid scores, kept for
                inspection.
            self.action_predict: [B, action_num] tensor holding column 0
                of the sampled one-hot pairs.
        """
        hidden = self._expand_model_graph(self.input, train=True)

        # Collapse the sequence dimension: [B, S, C] -> [B, C].
        pooled = tf.reduce_mean(hidden, axis=1)

        # Linear projection to action_num * 2 scores, then squash.
        scores = tf.nn.bias_add(
            tf.matmul(pooled, self.softmax_w, transpose_b=True),
            self.output_bias,
        )
        scores = tf.sigmoid(tf.reshape(scores, [-1, self.action_num, 2]))

        self.logits_check = tf.nn.softmax(scores)

        # Straight-through Gumbel-softmax; keep the first of the two columns.
        sample = gumbel_softmax_(scores, hard=True, temperature=self.temp)
        self.action_predict = sample[:, :, 0]
Example no. 2
0
    def build_policy(self, input, train):
        """Build the policy head on a reused variable scope.

        Args:
            input: batch of input sequences fed to the hidden forward pass.
            train: flag forwarded to ``self._hidden_forward``.

        Returns:
            A pair ``(logits_check, action_predict)`` where ``logits_check``
            is the softmax over the sigmoid scores and ``action_predict`` is
            the [B, action_num] slice (column 0) of the hard Gumbel-softmax
            sample.
        """
        # Share weights with the already-built graph.
        tf.get_variable_scope().reuse_variables()

        hidden = self._hidden_forward(input, train=train)

        # Collapse the sequence dimension: [B, S, C] -> [B, C].
        pooled = tf.reduce_mean(hidden, axis=1)

        # Linear projection to action_num * 2 scores, then squash.
        scores = tf.nn.bias_add(
            tf.matmul(pooled, self.softmax_w, transpose_b=True),
            self.output_bias,
        )
        scores = tf.sigmoid(tf.reshape(scores, [-1, self.action_num, 2]))

        check = tf.nn.softmax(scores)

        # Straight-through Gumbel-softmax; keep the first of the two columns.
        sample = gumbel_softmax_(scores, hard=True, temperature=self.temp)
        return check, sample[:, :, 0]
Example no. 3
0
    def build_policy(self):
        """Build the hard action-selection head over a GRU-encoded context.

        Embeds all but the last position of ``self.input``, encodes the
        embedded sequence with ``self.gru``, projects to two scores per
        action block, and draws hard 0/1 decisions with a straight-through
        Gumbel-softmax sample.

        Side effects:
            self.logits_check: softmax over the sigmoid scores, kept for
                inspection.
            self.action_predict: [B, action_num] tensor holding column 0
                of the sampled one-hot pairs.
        """
        # Drop the final position: [B, S].
        context = self.input[:, 0:-1]
        embedded = tf.nn.embedding_lookup(
            self.item_embedding, context, name="context_embedding"
        )  # [B, S, C]

        # GRU encoder collapses the sequence: [B, C].
        pooled = self.gru(embedded)

        # Linear projection to action_num * 2 scores, then squash.
        scores = tf.nn.bias_add(
            tf.matmul(pooled, self.softmax_w, transpose_b=True),
            self.output_bias,
        )
        scores = tf.sigmoid(tf.reshape(scores, [-1, self.action_num, 2]))

        self.logits_check = tf.nn.softmax(scores)

        # Straight-through Gumbel-softmax; keep the first of the two columns.
        sample = gumbel_softmax_(scores, hard=True, temperature=self.temp)
        self.action_predict = sample[:, :, 0]
Example no. 4
0
    def build_policy(self, context, train):
        """Build a hard or soft policy head over the model graph.

        Args:
            context: batch of input sequences fed to the model graph.
            train: flag forwarded to ``self._expand_model_graph``.

        Returns:
            A pair ``(logits_check, action_predict)``. ``logits_check`` is
            a softmax over the scores (post-sigmoid on the hard path, raw
            on the soft path — matching construction order).
            ``action_predict`` is the [B, action_num] slice (column 0) of
            the chosen action tensor.
        """
        hidden = self._expand_model_graph(context, train=train)  # [B, L, C]

        # Collapse the sequence dimension: [B, C].
        pooled = tf.reduce_mean(hidden, axis=1)

        # Linear projection to action_num * 2 scores: [B, action_num, 2].
        scores = tf.nn.bias_add(
            tf.matmul(pooled, self.softmax_w, transpose_b=True),
            self.output_bias,
        )
        scores = tf.reshape(scores, [-1, self.action_num, 2])

        if self.hard_policy:
            # Squash first, then draw a straight-through Gumbel-softmax
            # one-hot sample: [B, action_num, 2].
            scores = tf.sigmoid(scores)
            action = gumbel_softmax_(scores, temperature=self.temp,
                                     hard=True)
        else:
            # Soft policy: the sigmoid probabilities themselves act as
            # the action tensor.
            action = tf.sigmoid(scores)

        return tf.nn.softmax(scores), action[:, :, 0]
Example no. 5
0
    def build_policy(self, context):
        """Build a hard or soft policy head over a GRU-encoded context.

        Args:
            context: batch of item-index sequences to embed and encode.

        Returns:
            A pair ``(logits_check, action_predict)``. ``logits_check`` is
            a softmax over the scores (post-sigmoid on the hard path, raw
            on the soft path — matching construction order).
            ``action_predict`` is the [B, action_num] slice (column 0) of
            the chosen action tensor.
        """
        embedded = tf.nn.embedding_lookup(self.item_embedding,
                                          context,
                                          name="context_embedding")  # [B, L, C]

        # GRU encoder collapses the sequence: [B, C].
        pooled = self.gru(embedded)

        # Linear projection to action_num * 2 scores: [B, action_num, 2].
        scores = tf.nn.bias_add(
            tf.matmul(pooled, self.softmax_w, transpose_b=True),
            self.output_bias,
        )
        scores = tf.reshape(scores, [-1, self.action_num, 2])

        if self.hard_policy:
            # Squash first, then draw a straight-through Gumbel-softmax
            # one-hot sample: [B, action_num, 2].
            scores = tf.sigmoid(scores)
            action = gumbel_softmax_(scores, temperature=self.temp,
                                     hard=True)
        else:
            # Soft policy: the sigmoid probabilities themselves act as
            # the action tensor.
            action = tf.sigmoid(scores)

        return tf.nn.softmax(scores), action[:, :, 0]