Example #1
def hash_strings(strings, hash_buckets, key=None, name=None):
  """Hash strings into buckets.

  Args:
    strings: a `Tensor` or `SparseTensor` of dtype `tf.string`.
    hash_buckets: the number of hash buckets.
    key: optional. An array of two Python `uint64`. If passed, output will be
      a deterministic function of `strings` and `key`. Note that hashing will be
      slower if this value is specified.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` of dtype `tf.int64` with the same shape as the
    input `strings`.

  Raises:
    TypeError: if `strings` is not a `Tensor` or `SparseTensor` of dtype
    `tf.string`.
  """
  if (not isinstance(strings, (tf.Tensor, tf.SparseTensor)) or
      strings.dtype != tf.string):
    raise TypeError(
        'Input to hash_strings must be a Tensor or SparseTensor of dtype '
        'string; got {}'.format(strings.dtype))
  if isinstance(strings, tf.SparseTensor):
    return tf.SparseTensor(indices=strings.indices,
                           values=hash_strings(
                               strings.values, hash_buckets, key),
                           dense_shape=strings.dense_shape)
  if name is None:
    name = 'hash_strings'
  if key is None:
    return tf.string_to_hash_bucket_fast(strings, hash_buckets, name=name)
  return tf.string_to_hash_bucket_strong(strings, hash_buckets, key, name=name)
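
A minimal usage sketch (assuming a TensorFlow 1.x session and that `hash_strings` is defined as above; the string values, bucket count, and key below are illustrative):

import tensorflow as tf

dense_strings = tf.constant(['apple', 'banana', 'cherry'])
sparse_strings = tf.SparseTensor(indices=[[0, 0], [1, 1]],
                                 values=['apple', 'cherry'],
                                 dense_shape=[2, 2])

# Fast, unkeyed hashing into 100 buckets.
dense_ids = hash_strings(dense_strings, hash_buckets=100)
# Keyed hashing: deterministic given the two-element key, but slower.
sparse_ids = hash_strings(sparse_strings, hash_buckets=100, key=[11, 42])

with tf.Session() as sess:
    print(sess.run(dense_ids))          # int64 bucket ids, shape [3]
    print(sess.run(sparse_ids.values))  # int64 bucket ids for the sparse values
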
Example #2
    def _graph_fn_apply(self, text_inputs):
        """
        Args:
            text_inputs (SingleDataOp): The Text input to generate a hash bucket for.

        Returns:
            tuple:
                - SingleDataOp: The hash lookup table (int64) that can be used as input to embedding-lookups.
                - SingleDataOp: The number of words in each string of the `text_inputs` batch.
        """
        if get_backend() == "tf":
            # Split the input string.
            split_text_inputs = tf.string_split(source=text_inputs, delimiter=self.delimiter)
            # Build a dense tensor with one row per item in `text_inputs`, padding missing words with empty strings.
            dense = tf.sparse_tensor_to_dense(sp_input=split_text_inputs, default_value="")

            length = tf.reduce_sum(input_tensor=tf.to_int32(x=tf.not_equal(x=dense, y="")), axis=-1)
            if self.hash_function == "fast":
                hash_bucket = tf.string_to_hash_bucket_fast(input=dense, num_buckets=self.num_hash_buckets)
            else:
                hash_bucket = tf.string_to_hash_bucket_strong(input=dense,
                                                              num_buckets=self.num_hash_buckets,
                                                              key=self.hash_keys)

            # Int64 is tf's default for `string_to_hash_bucket` operation: Can leave as is.
            if self.dtype != "int64":
                hash_bucket = tf.cast(x=hash_bucket, dtype=dtype_(self.dtype))

            # Hash-bucket output is always batch-major.
            hash_bucket._batch_rank = 0
            hash_bucket._time_rank = 1

            return hash_bucket, length
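
Outside the component, the same split, densify, hash, and embed pipeline can be sketched roughly as follows (assuming TensorFlow 1.x; the delimiter, bucket count, and embedding size are illustrative stand-ins for the component's `self.delimiter`, `self.num_hash_buckets`, and embedding configuration):

import tensorflow as tf

texts = tf.placeholder(tf.string, shape=[None])               # batch of sentences
words = tf.string_split(texts, delimiter=" ")                 # SparseTensor of words
dense = tf.sparse_tensor_to_dense(words, default_value="")    # pad rows with ""
lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(dense, "")), axis=-1)
ids = tf.string_to_hash_bucket_strong(dense, num_buckets=1000, key=[12, 34])

# The int64 bucket ids can feed an embedding lookup directly.
embeddings = tf.get_variable("word_embeddings", shape=[1000, 16])
vectors = tf.nn.embedding_lookup(embeddings, ids)             # [batch, max_words, 16]
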
Example #3
  def testStringToOneHashBucketStrongOneHashBucket(self):
    with self.test_session():
      input_string = tf.constant(['a', 'b', 'c'])
      output = tf.string_to_hash_bucket_strong(input_string,
                                               1,
                                               key=[123, 345])
      self.assertAllEqual([0, 0, 0], output.eval())

  def testStringToHashBucketsStrong(self):
    with self.test_session():
      input_string = tf.constant(['a', 'b', 'c'])
      output = tf.string_to_hash_bucket_strong(input_string,
                                               10,
                                               key=[98765, 132])
      # key = [98765, 132]
      # StrongKeyedHash(key, 'a') -> 7157389809176466784 -> mod 10 -> 4
      # StrongKeyedHash(key, 'b') -> 15805638358933211562 -> mod 10 -> 2
      # StrongKeyedHash(key, 'c') -> 18100027895074076528 -> mod 10 -> 8
      self.assertAllEqual([4, 2, 8], output.eval())
Example #5
    def build_inference(self, x, flag="train"):
        # Set up the regularizers, one for each of the four parts of the network
        regularizer1 = self.param_dict[
            "regulerizer1"] if flag == "train" else None
        regularizer2 = self.param_dict[
            "regulerizer2"] if flag == "train" else None
        regularizer3 = self.param_dict[
            "regulerizer3"] if flag == "train" else None
        regularizer4 = self.param_dict[
            "regulerizer4"] if flag == "train" else None
        is_train = True if flag == "train" else False
        # First fetch the parameters we need
        hash_size = self.param_dict['hash_size']
        no_hash = self.param_dict["no_hash"]
        embed_size = self.param_dict["embed_size"]
        # browse_nums = self.param_dict["browse_nums"] # browse_nums = [20, 10, 10]
        # Pick the activation function according to the configuration
        act_fn = self.get_activation_func(is_train)
        # Whether to enable mini-batch aware regularization
        is_mba_reg = self.param_dict["is_mba_reg"]
        lambda_reg_mba = self.param_dict["lambda_reg_mba"]
        is_action_mba_reg = self.param_dict["is_action_mba_reg"]

        # Split the input
        x_feature = x[:, :-3]
        x_action_lists = x[:, -3:]

        # First convert the sparse features into indices
        x_sparse = []
        for i in range(len(hash_size)):
            if i in no_hash:
                # These features can be used as indices directly; no conversion needed
                x_i = tf.string_to_number(x_feature[:, i], tf.int32)
                x_sparse.append(x_i)
            else:
                # These features are converted into indices via a hash function
                x_i = tf.string_to_hash_bucket_strong(
                    input=x_feature[:, i],
                    num_buckets=hash_size[i],
                    key=[679362, 964545],
                    name="sparse_feature_{}".format(i))
                x_sparse.append(x_i)
        # Convert the sparse data into embedding vectors
        x_embed = []
        w_action_embed = []
        x_action = []
        indice_sku_cate_brand = []
        sku_cate_brand_index = self.param_dict["sku_cate_brand_index"]
        for i in range(len(embed_size)):
            if i in sku_cate_brand_index:  # embedding vectors for skuid, cateid and brandid
                with tf.variable_scope("embedding_{}".format(i)):
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]],
                        regularizer1,
                        self.param_dict["initializer_embedding_w"](
                            [hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(10, 0))
                    w_action_embed.append(weights)
                    x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                    if is_train and is_mba_reg and not is_action_mba_reg:
                        # compute the mini-batch aware regularization term
                        self.calculate_mini_batch_aware_reg(
                            weights, x_sparse[i], lambda_reg_mba)

                    indice_sku_cate_brand.append(x_sparse[i])
                    x_embed.append(x_i)
                    x_action.append(x_i)
            else:
                if embed_size[i] != -1:
                    with tf.variable_scope("embedding_{}".format(i)):
                        if i == 0:
                            weights = self.get_weight_variable(
                                [hash_size[i], embed_size[i]],
                                regularizer1,
                                self.param_dict["initializer_embedding_w"](
                                    [hash_size[i], embed_size[i]]),
                                partitioner=tf.fixed_size_partitioner(10, 0))
                        else:
                            weights = self.get_weight_variable(
                                [hash_size[i], embed_size[i]], regularizer1,
                                self.param_dict["initializer_embedding_w"](
                                    [hash_size[i], embed_size[i]]))
                        x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                        if is_train and is_mba_reg:
                            # compute the mini-batch aware regularization term
                            self.calculate_mini_batch_aware_reg(
                                weights, x_sparse[i], lambda_reg_mba)
                        x_embed.append(x_i)
                else:
                    x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
                    x_embed.append(x_i)
        x_embed = tf.concat(x_embed, 1)
        x_deep_in = x_embed
        is_usingg_user_act_feature = self.param_dict[
            "is_usingg_user_act_feature"]
        if is_usingg_user_act_feature:
            pooling_method = self.param_dict["pooling_method"]
            # Model browsing behaviour and build the behaviour embedding vectors
            with tf.name_scope("user_behaviours"):
                x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [
                    -1,
                ])
                x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [
                    -1,
                ])
                x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [
                    -1,
                ])
                browse_lists = [
                    x_browse_skus_list, x_browse_cates_list,
                    x_browse_brand_list
                ]
                browse_names = ['skus', 'cates', 'brands']
                x_action_list_embeds = []
                for i in range(len(browse_names)):
                    with tf.name_scope("user_browse_{}_embedding".format(
                            browse_names[i])):
                        browse_w_embed = w_action_embed[i]
                        # x_ad_embedded = x_action[i]
                        x_browse_action = browse_lists[
                            i]  # shape of x_browse_action is [?,]
                        x_browse_action_list = tf.string_split(
                            x_browse_action, "#")
                        x_browse_action_list_indices = tf.SparseTensor(
                            x_browse_action_list.indices,
                            tf.string_to_hash_bucket_strong(
                                x_browse_action_list.values,
                                num_buckets=browse_w_embed.get_shape()
                                [0].value,
                                key=[679362, 964545],
                                name="sparse_user_browse_{}".format(
                                    browse_names[i])),
                            x_browse_action_list.dense_shape,
                        )
                        x_action_list_embed = tf.nn.embedding_lookup_sparse(
                            browse_w_embed,
                            sp_ids=x_browse_action_list_indices,
                            sp_weights=None,
                            combiner=pooling_method)
                        if is_train and is_action_mba_reg:
                            # compute the mini-batch aware regularization term
                            indice_action = tf.concat([
                                tf.string_to_hash_bucket_strong(
                                    x_browse_action_list.values,
                                    num_buckets=browse_w_embed.get_shape()
                                    [0].value,
                                    key=[679362, 964545]),
                                indice_sku_cate_brand[i]
                            ], 0)
                            self.calculate_mini_batch_aware_reg(
                                browse_w_embed, indice_action, lambda_reg_mba)
                        x_action_list_embeds.append(x_action_list_embed)
                x_deep_in = tf.concat(
                    [x_deep_in, tf.concat(x_action_list_embeds, 1)], 1)

        # Build the deep network block
        with tf.name_scope("deep_network"):
            deep_layers = self.param_dict["deep_layers"]
            for i in range(len(deep_layers)):
                with tf.variable_scope("dnn_layer_{}".format(i)):
                    weights = self.get_weight_variable(
                        [x_deep_in.shape[1].value, deep_layers[i]],
                        regularizer2, self.param_dict["initializer_dnn_w"](
                            [x_deep_in.shape[1].value, deep_layers[i]]))
                    biases = tf.get_variable(
                        "biases", [deep_layers[i]],
                        initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)
                    layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases,
                                     name="deep_mlp_{}".format(i))
                    x_deep_in = layer_i

        # Build the fully connected output block
        x_fc_in = x_deep_in
        with tf.name_scope("fc_layers"):
            fc_layers = self.param_dict['fc_layers']
            for i in range(len(fc_layers)):
                with tf.variable_scope("fc_layers_{}".format(i)):
                    weights = self.get_weight_variable(
                        [x_fc_in.shape[1].value, fc_layers[i]], regularizer4,
                        self.param_dict["initializer_fc_w"](
                            [x_fc_in.shape[1].value, fc_layers[i]]))
                    biases = tf.get_variable(
                        "biases", [fc_layers[i]],
                        initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)
                    layer_i = tf.nn.sigmoid(
                        tf.matmul(x_fc_in, weights) + biases)
                    x_fc_in = layer_i
        logit = x_fc_in
        return logit
Example #6
    def build_inference(self, x, flag="train"):
        # Set up the regularizers, one for each of the four parts of the network
        regularizer1 = self.param_dict[
            "regulerizer1"] if flag == "train" else None
        regularizer2 = self.param_dict[
            "regulerizer2"] if flag == "train" else None
        regularizer3 = self.param_dict[
            "regulerizer3"] if flag == "train" else None
        regularizer4 = self.param_dict[
            "regulerizer4"] if flag == "train" else None
        is_train = True if flag == "train" else False
        # First fetch the parameters we need
        hash_size = self.param_dict['hash_size']
        no_hash = self.param_dict["no_hash"]
        embed_size = self.param_dict["embed_size"]
        # Pick the activation function according to the configuration
        act_fn = self.get_activation_func(is_train)
        # Whether to enable mini-batch aware regularization
        is_mba_reg = self.param_dict["is_mba_reg"]
        lambda_reg_mba = self.param_dict["lambda_reg_mba"]
        is_action_mba_reg = self.param_dict["is_action_mba_reg"]

        # Split the input
        x_feature = x[:, :-3]
        x_action_lists = x[:, -3:]

        # First convert the sparse features into indices
        x_sparse = []
        for i in range(len(hash_size)):
            if i in no_hash:
                # These features can be used as indices directly; no conversion needed
                x_i = tf.string_to_number(x_feature[:, i], tf.int32)
                x_sparse.append(x_i)
            else:
                # These features are converted into indices via a hash function
                x_i = tf.string_to_hash_bucket_strong(
                    input=x_feature[:, i],
                    num_buckets=hash_size[i],
                    key=[679362, 964545],
                    name="sparse_feature_{}".format(i))
                x_sparse.append(x_i)
        # Convert the sparse data into embedding vectors
        x_embed = []
        w_action_embed = []
        x_action = []
        indice_sku_cate_brand = []
        sku_cate_brand_index = self.param_dict["sku_cate_brand_index"]
        for i in range(len(embed_size)):
            if embed_size[i] != -1:
                with tf.variable_scope("embedding_{}".format(i)):
                    if hash_size[i] <= 500000:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]], regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]))
                    elif hash_size[i] > 500000 and hash_size[i] <= 5000000:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]],
                            regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(5, 0))
                    elif hash_size[i] > 5000000 and hash_size[i] <= 10000000:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]],
                            regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(10, 0))
                    elif hash_size[i] > 10000000 and hash_size[i] <= 15000000:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]],
                            regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(15, 0))
                    elif hash_size[i] > 15000000 and hash_size[i] <= 20000000:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]],
                            regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(20, 0))
                    else:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]],
                            regularizer1,
                            self.param_dict["initializer_embedding_w"](
                                [hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(30, 0))
                x_i = tf.nn.embedding_lookup(weights, x_sparse[i])

                if i in sku_cate_brand_index:  # embedding vectors for skuid, cateid and brandid
                    w_action_embed.append(weights)
                    x_action.append(x_i)
                    indice_sku_cate_brand.append(x_sparse[i])
                    if is_train and is_mba_reg and not is_action_mba_reg:
                        # compute the mini-batch aware regularization term
                        self.calculate_mini_batch_aware_reg(
                            weights, x_sparse[i], lambda_reg_mba)
                else:
                    if is_train and is_mba_reg:
                        # compute the mini-batch aware regularization term
                        self.calculate_mini_batch_aware_reg(
                            weights, x_sparse[i], lambda_reg_mba)

            else:
                x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])

            x_embed.append(x_i)

            # if i in sku_cate_brand_index: # embedding vectors for skuid, cateid and brandid
            #     with tf.variable_scope("embedding_{}".format(i)):
            #         weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1,
            #                                             self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
            #                                             partitioner=tf.fixed_size_partitioner(20, 0))
            #         w_action_embed.append(weights)
            #         x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
            #         if is_train and is_mba_reg and not is_action_mba_reg:
            #             # compute the mini-batch aware regularization term
            #             self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
            #
            #         indice_sku_cate_brand.append(x_sparse[i])
            #         x_embed.append(x_i)
            #         x_action.append(x_i)
            # else:
            #     if embed_size[i] != -1:
            #         with tf.variable_scope("embedding_{}".format(i)):
            #             if i == 0:
            #                 weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1,
            #                                                    self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
            #                                                    partitioner=tf.fixed_size_partitioner(20, 0))
            #             else:
            #                 weights = self.get_weight_variable([hash_size[i], embed_size[i]], regularizer1,
            #                                                    self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]))
            #             x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
            #             if is_train and is_mba_reg:
            #                 # compute the mini-batch aware regularization term
            #                 self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
            #
            #             x_embed.append(x_i)
            #     else:
            #         x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
            #         x_embed.append(x_i)
        x_embed = tf.concat(x_embed, 1)

        # Model browsing behaviour and build the DIN (Deep Interest Network) block
        with tf.name_scope("user_behaviours"):
            x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [
                -1,
            ])
            x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [
                -1,
            ])
            x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [
                -1,
            ])
            browse_lists = [
                x_browse_skus_list, x_browse_cates_list, x_browse_brand_list
            ]
            browse_names = ['skus', 'cates', 'brands']
            browse_nums = self.param_dict["browse_nums"]
            x_action_list_embeds = []
            sum_poolings = []
            x_action_list_masks = []
            for i in range(len(browse_names)):
                # for i in [0]:
                with tf.name_scope("user_browse_{}_embedding".format(
                        browse_names[i])):
                    browse_w_embed = w_action_embed[i]
                    # x_ad_embedded = x_action[i]
                    x_browse_action = browse_lists[
                        i]  # shape of x_browse_action is [?,]
                    x_browse_action_list = tf.string_split(
                        x_browse_action, "#")
                    x_browse_action_list_indices = tf.sparse_to_dense(
                        x_browse_action_list.indices,
                        # x_browse_action_list.dense_shape,
                        [x_browse_action_list.dense_shape[0], browse_nums[i]],
                        tf.string_to_hash_bucket_strong(
                            x_browse_action_list.values,
                            num_buckets=browse_w_embed.get_shape()[0].value,
                            key=[679362, 964545],
                            name="sparse_user_browse_{}".format(
                                browse_names[i])),
                        -1)
                    indice_mask = tf.reshape(
                        tf.not_equal(x_browse_action_list_indices, -1),
                        [-1, browse_nums[i]])
                    x_action_list_masks.append(indice_mask)
                    x_action_list_embed = tf.reshape(
                        tf.nn.embedding_lookup(browse_w_embed,
                                               x_browse_action_list_indices),
                        [
                            -1, browse_nums[i],
                            browse_w_embed.get_shape()[1].value
                        ])
                    if is_train and is_action_mba_reg:
                        # compute the mini-batch aware regularization term
                        indice_action = tf.concat([
                            tf.string_to_hash_bucket_strong(
                                x_browse_action_list.values,
                                num_buckets=browse_w_embed.get_shape()
                                [0].value,
                                key=[679362, 964545]), indice_sku_cate_brand[i]
                        ], 0)
                        self.calculate_mini_batch_aware_reg(
                            browse_w_embed, indice_action, lambda_reg_mba)
                    x_action_list_embeds.append(x_action_list_embed)

            with tf.name_scope("activation_unit"):
                act_unit_hidden_layers = self.param_dict[
                    "act_unit_hidden_layers"]
                action_indexs = self.param_dict["action_indexs"]
                # for i in range(len(x_action_list_embeds)):
                for i in action_indexs:
                    x_action_list_embed = x_action_list_embeds[i]
                    x_ad_embedded = x_action[i]
                    indice_mask = x_action_list_masks[i]
                    # Outer product: flatten the Cartesian-product matrix into a vector
                    # out_product_list = tf.map_fn(lambda action_emb: tf.reshape(tf.matmul(tf.expand_dims(action_emb, 2), tf.expand_dims(x_ad_embedded, 1)), [-1, x_ad_embedded.shape[1].value ** 2]),
                    #                              tf.transpose(x_action_list_embed, [1, 0, 2]))

                    # Approximate outer product: concatenate the vector difference and the element-wise product

                    x_action_list_embed_new = tf.transpose(
                        x_action_list_embed, [1, 0, 2])

                    concat_list = [
                        tf.concat([
                            x_action_list_embed_new[ii],
                            x_action_list_embed_new[ii] - x_ad_embedded,
                            x_action_list_embed_new[ii] * x_ad_embedded,
                            x_ad_embedded
                        ], 1)
                        for ii in range(x_action_list_embed_new.shape[0].value)
                    ]

                    act_unit_in = concat_list[0].shape[1].value
                    act_in = concat_list
                    with tf.variable_scope("activation_unit_{}_list".format(
                            browse_names[i])):
                        for ii in range(len(act_unit_hidden_layers)):
                            weights_act_unit = self.get_weight_variable(
                                [act_unit_in, act_unit_hidden_layers[ii]],
                                regularizer3,
                                self.param_dict["initializer_act_unit_w"](
                                    [act_unit_in, act_unit_hidden_layers[ii]]),
                                name='_act_unit_w_{}'.format(ii))
                            biases_act_unit = tf.get_variable(
                                "biases_{}_act_unit".format(ii),
                                [act_unit_hidden_layers[ii]],
                                initializer=tf.constant_initializer(0.0),
                                dtype=tf.float32)

                            act_out = list(
                                map(
                                    lambda act_in_i: act_fn(
                                        tf.matmul(act_in_i[0], weights_act_unit
                                                  ) + biases_act_unit,
                                        name="act_func_{}_{}".format(
                                            ii, act_in_i[1])),
                                    zip(act_in, range(len(act_in)))))

                            # act_out = [tf.expand_dims(act_fn(tf.matmul(act_in[ii], weights_act_unit) + biases_act_unit, name="act_func_{}_{}".format(i, ii)), 0)
                            #                 for ii in range(act_in.shape[0].value)]
                            act_in = act_out
                            act_unit_in = act_in[0].shape[1].value
                        act_output_in = act_in
                        act_output_unit = act_unit_in
                        weights_act_unit_output = self.get_weight_variable(
                            [act_output_unit, 1],
                            regularizer3,
                            self.param_dict["initializer_act_unit_w"](
                                [act_output_unit, 1]),
                            name='_act_unit_output_w')
                        biases_act_unit_output = tf.get_variable(
                            "biases_act_unit_output", [1],
                            initializer=tf.constant_initializer(0.0),
                            dtype=tf.float32)

                        act_output_out = tf.concat(
                            list(
                                map(
                                    lambda act_output_i: tf.expand_dims(
                                        tf.matmul(act_output_i,
                                                  weights_act_unit_output) +
                                        biases_act_unit_output, 0),
                                    act_output_in)), 0)
                        # act_output_out = tf.concat([tf.expand_dims(tf.matmul(act_output_in[iii], weights_act_unit_output) + biases_act_unit_output, 0) for iii in range(act_output_in.shape[0].value)], 0)
                    active_weight_score = tf.transpose(act_output_out,
                                                       [1, 0, 2])
                    # Set the weights of missing (padded) behaviours to 0.0
                    padding = tf.zeros_like(active_weight_score)
                    active_weight_score_t = tf.where(
                        tf.expand_dims(indice_mask, 2), active_weight_score,
                        padding)
                    with tf.name_scope("weight_sum_pooling"):
                        sum_pooling = tf.reduce_sum(
                            x_action_list_embed * active_weight_score_t, 1)
                    sum_poolings.append(sum_pooling)
            x_deep_in = tf.concat([x_embed, tf.concat(sum_poolings, 1)], 1)

        # Build the deep network block
        with tf.name_scope("deep_network"):
            deep_layers = self.param_dict["deep_layers"]
            for i in range(len(deep_layers)):
                with tf.variable_scope("dnn_layer_{}".format(i)):
                    weights = self.get_weight_variable(
                        [x_deep_in.shape[1].value, deep_layers[i]],
                        regularizer2, self.param_dict["initializer_dnn_w"](
                            [x_deep_in.shape[1].value, deep_layers[i]]))
                    biases = tf.get_variable(
                        "biases", [deep_layers[i]],
                        initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)
                    layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases,
                                     name="deep_mlp_{}".format(i))
                    x_deep_in = layer_i

        # Build the fully connected output block
        x_fc_in = x_deep_in
        with tf.name_scope("fc_layers"):
            fc_layers = self.param_dict['fc_layers']
            for i in range(len(fc_layers)):
                with tf.variable_scope("fc_layers_{}".format(i)):
                    weights = self.get_weight_variable(
                        [x_fc_in.shape[1].value, fc_layers[i]], regularizer4,
                        self.param_dict["initializer_fc_w"](
                            [x_fc_in.shape[1].value, fc_layers[i]]))
                    biases = tf.get_variable(
                        "biases", [fc_layers[i]],
                        initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)
                    layer_i = tf.nn.sigmoid(
                        tf.matmul(x_fc_in, weights) + biases)
                    x_fc_in = layer_i
        logit = x_fc_in
        return logit
# Section 9: String operations

import tensorflow as tf

a = tf.constant('Hello,world!')
b = tf.constant('I love tensorflow.')

sess = tf.Session()

# Compute hash values
c = tf.string_to_hash_bucket_fast(a, 10000000)
d = tf.string_to_hash_bucket_strong(a, 10000000, key=[1, 3])
e = tf.string_to_hash_bucket(a, 10000000)
result = sess.run([c, d, e])

print('Hashing:\nfast:%s\nstrong:%s\nnormal:%s\n\n' %
      (result[0], result[1], result[2]))

# Join an array of strings into one string
c = tf.reduce_join([a, b], axis=0)
d = tf.string_join([a, b], '__')

result = sess.run([c, d])

print('Joining:\nc=%s\nd=%s\n\n' % (result[0], result[1]))

# Split strings
c = tf.string_split([a], ',')
d = tf.substr(a, 0, 5)

result = sess.run([c, d])
  def testStringToHashBucketsStrongInvalidKey(self):
    with self.test_session():
      input_string = tf.constant(['a', 'b', 'c'])
      with self.assertRaisesOpError('Key must have 2 elements'):
        tf.string_to_hash_bucket_strong(input_string, 10, key=[98765]).eval()

  def testStringToOneHashBucketStrongOneHashBucket(self):
    with self.test_session():
      input_string = tf.constant(['a', 'b', 'c'])
      output = tf.string_to_hash_bucket_strong(input_string, 1, key=[123, 345])
      self.assertAllEqual([0, 0, 0], output.eval())
Example #10
import numpy as np
import tensorflow as tf

num_hashes = 400
num_buckets = 2**32
cpu = "/cpu:0"
window_length = 1000
kmer_length = 8
prob_error = 0.2
num_trials = 100

# One argmin op per hash function; the placeholder list is filled in below.
minhash = [None] * num_hashes

with tf.device(cpu):
    r = np.random.randint(num_buckets, 2 * num_buckets, [num_hashes, 2])
    kmers = tf.placeholder(tf.string, shape=[None], name="kmers")
    for i in range(num_hashes):
        minhash[i] = tf.argmin(
            tf.string_to_hash_bucket_strong(kmers, num_buckets,
                                            [r[i, 0], r[i, 1]]))


def MinHash(A):
    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        my_minhash, A_strings = sess.run([minhash, kmers],
                                         feed_dict={kmers: A})
    return A_strings[my_minhash]


def Jaccard(A, B):
    hA = MinHash(list(A.keys()))
    hB = MinHash(list(B.keys()))
    return sum(hA == hB) * 1.0 / num_hashes
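
A rough usage sketch (hypothetical 8-mer dictionaries; the code above only assumes that each read's k-mers are the keys of a dict):

# Hypothetical k-mer dicts for two reads; the values are unused by Jaccard().
read_a = {"ACGTACGT": 1, "CGTACGTA": 1, "GTACGTAC": 1}
read_b = {"ACGTACGT": 1, "CGTACGTA": 1, "TTTTAAAA": 1}
print(Jaccard(read_a, read_b))  # MinHash estimate of the Jaccard similarity
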

import numpy as np
import tensorflow as tf
a = tf.constant([["1", "1"], ["2", "2"]], dtype=tf.string)
b = tf.constant([["2", "2"], ["3", "3"]], dtype=tf.string)

a_b = tf.string_join([a, b], separator="_")
a_b = tf.string_to_hash_bucket_strong(a_b, num_buckets=2**63 - 1, key=[0, 0])

c = tf.constant([1, 2], dtype=tf.int64)
c_ = tf.strings.as_string(c, precision=3)

if __name__ == '__main__':
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run([a_b, c_]))
Example #12
#! /usr/bin/env python

import tensorflow as tf

with tf.Session() as sess:
    mapping_string = tf.constant(["foo", "bar", "baz"])
    p = tf.string_to_hash_bucket_strong(mapping_string, 3, key=[123, 456])
    q = tf.string_to_hash_bucket_strong(tf.constant(["zoo", "bar", "baz"]),
                                        3,
                                        key=[123, 456])

    i = sess.run(p)
    j = sess.run(q)
    print(i)
    print(j)