def hash_strings(strings, hash_buckets, key=None, name=None):
    """Hash strings into buckets.

    Args:
        strings: a `Tensor` or `SparseTensor` of dtype `tf.string`.
        hash_buckets: the number of hash buckets.
        key: optional. An array of two Python `uint64`. If passed, output will be
            a deterministic function of `strings` and `key`. Note that hashing will
            be slower if this value is specified.
        name: (Optional) A name for this operation.

    Returns:
        A `Tensor` or `SparseTensor` of dtype `tf.int64` with the same shape as the
        input `strings`.

    Raises:
        TypeError: if `strings` is not a `Tensor` or `SparseTensor` of dtype
            `tf.string`.
    """
    if (not isinstance(strings, (tf.Tensor, tf.SparseTensor))) or strings.dtype != tf.string:
        raise TypeError(
            'Input to hash_strings must be a Tensor or SparseTensor of dtype '
            'string; got {}'.format(strings.dtype))
    if isinstance(strings, tf.SparseTensor):
        return tf.SparseTensor(
            indices=strings.indices,
            values=hash_strings(strings.values, hash_buckets, key),
            dense_shape=strings.dense_shape)
    if name is None:
        name = 'hash_strings'
    if key is None:
        return tf.string_to_hash_bucket_fast(strings, hash_buckets, name=name)
    return tf.string_to_hash_bucket_strong(strings, hash_buckets, key, name=name)
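# A minimal usage sketch (not part of the original module; the input strings and
# bucket counts below are illustrative). Without a `key`, `hash_strings` uses the
# fast, unkeyed hash; with a two-element `key`, it uses the slower but
# deterministic strong hash.
import tensorflow as tf

example_strings = tf.constant(['apple', 'banana', 'cherry'])
fast_ids = hash_strings(example_strings, hash_buckets=100)
strong_ids = hash_strings(example_strings, hash_buckets=100, key=[11, 42])

with tf.Session() as sess:
    print(sess.run([fast_ids, strong_ids]))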
def _graph_fn_apply(self, text_inputs):
    """
    Args:
        text_inputs (SingleDataOp): The text input to generate a hash bucket for.

    Returns:
        tuple:
            - SingleDataOp: The hash lookup table (int64) that can be used as input to embedding-lookups.
            - SingleDataOp: The lengths (number of words) of the strings in the `text_inputs` batch.
    """
    if get_backend() == "tf":
        # Split the input string.
        split_text_inputs = tf.string_split(source=text_inputs, delimiter=self.delimiter)
        # Build a dense tensor of n rows (number of items in text_inputs),
        # padded with empty strings up to the longest split string.
        dense = tf.sparse_tensor_to_dense(sp_input=split_text_inputs, default_value="")
        length = tf.reduce_sum(input_tensor=tf.to_int32(x=tf.not_equal(x=dense, y="")), axis=-1)
        if self.hash_function == "fast":
            hash_bucket = tf.string_to_hash_bucket_fast(input=dense, num_buckets=self.num_hash_buckets)
        else:
            hash_bucket = tf.string_to_hash_bucket_strong(input=dense, num_buckets=self.num_hash_buckets,
                                                          key=self.hash_keys)
        # Int64 is tf's default for the `string_to_hash_bucket` ops: can leave as is.
        if self.dtype != "int64":
            hash_bucket = tf.cast(x=hash_bucket, dtype=dtype_(self.dtype))
        # Hash-bucket output is always batch-major.
        hash_bucket._batch_rank = 0
        hash_bucket._time_rank = 1
        return hash_bucket, length
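# A standalone sketch of the same pattern outside the component (an illustrative
# addition, not part of the original class): split sentences into words, hash
# each word into a fixed number of buckets with a keyed (reproducible) hash, and
# feed the resulting int64 IDs into an embedding lookup. The bucket count and
# embedding size below are assumptions.
import tensorflow as tf

num_hash_buckets = 1000  # illustrative
embed_dim = 8            # illustrative

sentences = tf.constant(["the quick fox", "hello world"])
split = tf.string_split(sentences, delimiter=" ")
dense = tf.sparse_tensor_to_dense(split, default_value="")
lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(dense, "")), axis=-1)
word_ids = tf.string_to_hash_bucket_strong(dense, num_buckets=num_hash_buckets,
                                           key=[12345, 67890])
embeddings = tf.get_variable("word_embed_sketch", [num_hash_buckets, embed_dim])
embedded = tf.nn.embedding_lookup(embeddings, word_ids)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([word_ids, lengths, tf.shape(embedded)]))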
def testStringToOneHashBucketStrongOneHashBucket(self):
    with self.test_session():
        input_string = tf.constant(['a', 'b', 'c'])
        output = tf.string_to_hash_bucket_strong(input_string, 1, key=[123, 345])
        self.assertAllEqual([0, 0, 0], output.eval())
def testStringToHashBucketsStrong(self):
    with self.test_session():
        input_string = tf.constant(['a', 'b', 'c'])
        output = tf.string_to_hash_bucket_strong(input_string, 10, key=[98765, 132])
        # key = [98765, 132]
        # StrongKeyedHash(key, 'a') -> 7157389809176466784 -> mod 10 -> 4
        # StrongKeyedHash(key, 'b') -> 15805638358933211562 -> mod 10 -> 2
        # StrongKeyedHash(key, 'c') -> 18100027895074076528 -> mod 10 -> 8
        self.assertAllEqual([4, 2, 8], output.eval())
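# A small illustrative sketch (not an original test case): the strong hash is a
# deterministic function of (string, key), so re-running with the same key always
# reproduces the same buckets, while a different key generally produces a
# different assignment. No exact values are asserted because they depend on the key.
import tensorflow as tf

strings = tf.constant(['a', 'b', 'c'])
buckets_key_a = tf.string_to_hash_bucket_strong(strings, 10, key=[98765, 132])
buckets_key_b = tf.string_to_hash_bucket_strong(strings, 10, key=[1, 2])

with tf.Session() as sess:
    print(sess.run([buckets_key_a, buckets_key_b]))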
def build_inference(self, x, flag="train"):
    # Set up the regularizers, one for each of the four parts of the network.
    regularizer1 = self.param_dict["regulerizer1"] if flag == "train" else None
    regularizer2 = self.param_dict["regulerizer2"] if flag == "train" else None
    regularizer3 = self.param_dict["regulerizer3"] if flag == "train" else None
    regularizer4 = self.param_dict["regulerizer4"] if flag == "train" else None
    is_train = True if flag == "train" else False
    # Fetch the required hyper-parameters first.
    hash_size = self.param_dict['hash_size']
    no_hash = self.param_dict["no_hash"]
    embed_size = self.param_dict["embed_size"]
    # browse_nums = self.param_dict["browse_nums"]
    # browse_nums = [20, 10, 10]
    # Pick the activation function according to the config.
    act_fn = self.get_activation_func(is_train)
    # Whether to enable mini-batch aware (MBA) regularization.
    is_mba_reg = self.param_dict["is_mba_reg"]
    lambda_reg_mba = self.param_dict["lambda_reg_mba"]
    is_action_mba_reg = self.param_dict["is_action_mba_reg"]
    # Split the input into plain features and the three action lists.
    x_feature = x[:, :-3]
    x_action_lists = x[:, -3:]
    # First convert the sparse string features into indices.
    x_sparse = []
    for i in range(len(hash_size)):
        if i in no_hash:
            # These features already are indices; no hashing needed.
            x_i = tf.string_to_number(x_feature[:, i], tf.int32)
            x_sparse.append(x_i)
        else:
            # These features are mapped to indices via a keyed hash function.
            x_i = tf.string_to_hash_bucket_strong(
                input=x_feature[:, i],
                num_buckets=hash_size[i],
                key=[679362, 964545],
                name="sparse_feature_{}".format(i))
            x_sparse.append(x_i)
    # Convert the sparse indices into embedding vectors.
    x_embed = []
    w_action_embed = []
    x_action = []
    indice_sku_cate_brand = []
    sku_cate_brand_index = self.param_dict["sku_cate_brand_index"]
    for i in range(len(embed_size)):
        if i in sku_cate_brand_index:
            # Embedding vectors for skuid, cateid and brandid.
            with tf.variable_scope("embedding_{}".format(i)):
                weights = self.get_weight_variable(
                    [hash_size[i], embed_size[i]], regularizer1,
                    self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                    partitioner=tf.fixed_size_partitioner(10, 0))
                w_action_embed.append(weights)
                x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                if is_train and is_mba_reg and not is_action_mba_reg:
                    # Compute the MBA regularization term.
                    self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
                indice_sku_cate_brand.append(x_sparse[i])
                x_embed.append(x_i)
                x_action.append(x_i)
        else:
            if embed_size[i] != -1:
                with tf.variable_scope("embedding_{}".format(i)):
                    if i == 0:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]], regularizer1,
                            self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                            partitioner=tf.fixed_size_partitioner(10, 0))
                    else:
                        weights = self.get_weight_variable(
                            [hash_size[i], embed_size[i]], regularizer1,
                            self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]))
                    x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                    if is_train and is_mba_reg:
                        # Compute the MBA regularization term.
                        self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
                    x_embed.append(x_i)
            else:
                x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
                x_embed.append(x_i)
    x_embed = tf.concat(x_embed, 1)
    x_deep_in = x_embed

    is_usingg_user_act_feature = self.param_dict["is_usingg_user_act_feature"]
    if is_usingg_user_act_feature:
        pooling_method = self.param_dict["pooling_method"]
        # Model the browsing behaviours: build pooled behaviour embedding vectors.
        with tf.name_scope("user_behaviours"):
            x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [-1, ])
            x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [-1, ])
            x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [-1, ])
            browse_lists = [x_browse_skus_list, x_browse_cates_list, x_browse_brand_list]
            browse_names = ['skus', 'cates', 'brands']
            x_action_list_embeds = []
            for i in range(len(browse_names)):
                with tf.name_scope("user_browse_{}_embedding".format(browse_names[i])):
                    browse_w_embed = w_action_embed[i]
                    # x_ad_embedded = x_action[i]
                    x_browse_action = browse_lists[i]  # shape of x_browse_action is [?,]
                    x_browse_action_list = tf.string_split(x_browse_action, "#")
                    x_browse_action_list_indices = tf.SparseTensor(
                        x_browse_action_list.indices,
                        tf.string_to_hash_bucket_strong(
                            x_browse_action_list.values,
                            num_buckets=browse_w_embed.get_shape()[0].value,
                            key=[679362, 964545],
                            name="sparse_user_browse_{}".format(browse_names[i])),
                        x_browse_action_list.dense_shape,
                    )
                    x_action_list_embed = tf.nn.embedding_lookup_sparse(
                        browse_w_embed,
                        sp_ids=x_browse_action_list_indices,
                        sp_weights=None,
                        combiner=pooling_method)
                    if is_train and is_action_mba_reg:
                        # Compute the MBA regularization term.
                        indice_action = tf.concat([
                            tf.string_to_hash_bucket_strong(
                                x_browse_action_list.values,
                                num_buckets=browse_w_embed.get_shape()[0].value,
                                key=[679362, 964545]),
                            indice_sku_cate_brand[i]
                        ], 0)
                        self.calculate_mini_batch_aware_reg(browse_w_embed, indice_action, lambda_reg_mba)
                    x_action_list_embeds.append(x_action_list_embed)
            x_deep_in = tf.concat([x_deep_in, tf.concat(x_action_list_embeds, 1)], 1)

    # Build the deep (MLP) module.
    with tf.name_scope("deep_network"):
        deep_layers = self.param_dict["deep_layers"]
        for i in range(len(deep_layers)):
            with tf.variable_scope("dnn_layer_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_deep_in.shape[1].value, deep_layers[i]], regularizer2,
                    self.param_dict["initializer_dnn_w"]([x_deep_in.shape[1].value, deep_layers[i]]))
                biases = tf.get_variable(
                    "biases", [deep_layers[i]],
                    initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases, name="deep_mlp_{}".format(i))
                x_deep_in = layer_i

    # Build the fully connected output module.
    x_fc_in = x_deep_in
    with tf.name_scope("fc_layers"):
        fc_layers = self.param_dict['fc_layers']
        for i in range(len(fc_layers)):
            with tf.variable_scope("fc_layers_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_fc_in.shape[1].value, fc_layers[i]], regularizer4,
                    self.param_dict["initializer_fc_w"]([x_fc_in.shape[1].value, fc_layers[i]]))
                biases = tf.get_variable(
                    "biases", [fc_layers[i]],
                    initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                layer_i = tf.nn.sigmoid(tf.matmul(x_fc_in, weights) + biases)
                x_fc_in = layer_i
    logit = x_fc_in
    return logit
def build_inference(self, x, flag="train"):
    # Set up the regularizers, one for each of the four parts of the network.
    regularizer1 = self.param_dict["regulerizer1"] if flag == "train" else None
    regularizer2 = self.param_dict["regulerizer2"] if flag == "train" else None
    regularizer3 = self.param_dict["regulerizer3"] if flag == "train" else None
    regularizer4 = self.param_dict["regulerizer4"] if flag == "train" else None
    is_train = True if flag == "train" else False
    # Fetch the required hyper-parameters first.
    hash_size = self.param_dict['hash_size']
    no_hash = self.param_dict["no_hash"]
    embed_size = self.param_dict["embed_size"]
    # Pick the activation function according to the config.
    act_fn = self.get_activation_func(is_train)
    # Whether to enable mini-batch aware (MBA) regularization.
    is_mba_reg = self.param_dict["is_mba_reg"]
    lambda_reg_mba = self.param_dict["lambda_reg_mba"]
    is_action_mba_reg = self.param_dict["is_action_mba_reg"]
    # Split the input into plain features and the three action lists.
    x_feature = x[:, :-3]
    x_action_lists = x[:, -3:]
    # First convert the sparse string features into indices.
    x_sparse = []
    for i in range(len(hash_size)):
        if i in no_hash:
            # These features already are indices; no hashing needed.
            x_i = tf.string_to_number(x_feature[:, i], tf.int32)
            x_sparse.append(x_i)
        else:
            # These features are mapped to indices via a keyed hash function.
            x_i = tf.string_to_hash_bucket_strong(
                input=x_feature[:, i],
                num_buckets=hash_size[i],
                key=[679362, 964545],
                name="sparse_feature_{}".format(i))
            x_sparse.append(x_i)
    # Convert the sparse indices into embedding vectors.
    x_embed = []
    w_action_embed = []
    x_action = []
    indice_sku_cate_brand = []
    sku_cate_brand_index = self.param_dict["sku_cate_brand_index"]
    for i in range(len(embed_size)):
        if embed_size[i] != -1:
            with tf.variable_scope("embedding_{}".format(i)):
                # Partition large embedding tables according to their size.
                if hash_size[i] <= 500000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]))
                elif hash_size[i] > 500000 and hash_size[i] <= 5000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(5, 0))
                elif hash_size[i] > 5000000 and hash_size[i] <= 10000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(10, 0))
                elif hash_size[i] > 10000000 and hash_size[i] <= 15000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(15, 0))
                elif hash_size[i] > 15000000 and hash_size[i] <= 20000000:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(20, 0))
                else:
                    weights = self.get_weight_variable(
                        [hash_size[i], embed_size[i]], regularizer1,
                        self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
                        partitioner=tf.fixed_size_partitioner(30, 0))
                x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
                if i in sku_cate_brand_index:
                    # Embedding vectors for skuid, cateid and brandid.
                    w_action_embed.append(weights)
                    x_action.append(x_i)
                    indice_sku_cate_brand.append(x_sparse[i])
                    if is_train and is_mba_reg and not is_action_mba_reg:
                        # Compute the MBA regularization term.
                        self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
                else:
                    if is_train and is_mba_reg:
                        # Compute the MBA regularization term.
                        self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
        else:
            x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
        x_embed.append(x_i)
        # if i in sku_cate_brand_index:
        #     # Embedding vectors for skuid, cateid and brandid.
        #     with tf.variable_scope("embedding_{}".format(i)):
        #         weights = self.get_weight_variable(
        #             [hash_size[i], embed_size[i]], regularizer1,
        #             self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
        #             partitioner=tf.fixed_size_partitioner(20, 0))
        #         w_action_embed.append(weights)
        #         x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
        #         if is_train and is_mba_reg and not is_action_mba_reg:
        #             # Compute the MBA regularization term.
        #             self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
        #
        #         indice_sku_cate_brand.append(x_sparse[i])
        #         x_embed.append(x_i)
        #         x_action.append(x_i)
        # else:
        #     if embed_size[i] != -1:
        #         with tf.variable_scope("embedding_{}".format(i)):
        #             if i == 0:
        #                 weights = self.get_weight_variable(
        #                     [hash_size[i], embed_size[i]], regularizer1,
        #                     self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]),
        #                     partitioner=tf.fixed_size_partitioner(20, 0))
        #             else:
        #                 weights = self.get_weight_variable(
        #                     [hash_size[i], embed_size[i]], regularizer1,
        #                     self.param_dict["initializer_embedding_w"]([hash_size[i], embed_size[i]]))
        #             x_i = tf.nn.embedding_lookup(weights, x_sparse[i])
        #             if is_train and is_mba_reg:
        #                 # Compute the MBA regularization term.
        #                 self.calculate_mini_batch_aware_reg(weights, x_sparse[i], lambda_reg_mba)
        #
        #             x_embed.append(x_i)
        #     else:
        #         x_i = tf.one_hot(x_sparse[i], depth=hash_size[i])
        #         x_embed.append(x_i)
    x_embed = tf.concat(x_embed, 1)

    # Model the user's browsing behaviours and build the DIN attention part.
    with tf.name_scope("user_behaviours"):
        x_browse_skus_list = tf.reshape(x_action_lists[:, 0], [-1, ])
        x_browse_cates_list = tf.reshape(x_action_lists[:, 1], [-1, ])
        x_browse_brand_list = tf.reshape(x_action_lists[:, 2], [-1, ])
        browse_lists = [x_browse_skus_list, x_browse_cates_list, x_browse_brand_list]
        browse_names = ['skus', 'cates', 'brands']
        browse_nums = self.param_dict["browse_nums"]
        x_action_list_embeds = []
        sum_poolings = []
        x_action_list_masks = []
        for i in range(len(browse_names)):
            # for i in [0]:
            with tf.name_scope("user_browse_{}_embedding".format(browse_names[i])):
                browse_w_embed = w_action_embed[i]
                # x_ad_embedded = x_action[i]
                x_browse_action = browse_lists[i]  # shape of x_browse_action is [?,]
                x_browse_action_list = tf.string_split(x_browse_action, "#")
                x_browse_action_list_indices = tf.sparse_to_dense(
                    x_browse_action_list.indices,
                    # x_browse_action_list.dense_shape,
                    [x_browse_action_list.dense_shape[0], browse_nums[i]],
                    tf.string_to_hash_bucket_strong(
                        x_browse_action_list.values,
                        num_buckets=browse_w_embed.get_shape()[0].value,
                        key=[679362, 964545],
                        name="sparse_user_browse_{}".format(browse_names[i])),
                    -1)
                indice_mask = tf.reshape(
                    tf.not_equal(x_browse_action_list_indices, -1), [-1, browse_nums[i]])
                x_action_list_masks.append(indice_mask)
                x_action_list_embed = tf.reshape(
                    tf.nn.embedding_lookup(browse_w_embed, x_browse_action_list_indices),
                    [-1, browse_nums[i], browse_w_embed.get_shape()[1].value])
                if is_train and is_action_mba_reg:
                    # Compute the MBA regularization term.
                    indice_action = tf.concat([
                        tf.string_to_hash_bucket_strong(
                            x_browse_action_list.values,
                            num_buckets=browse_w_embed.get_shape()[0].value,
                            key=[679362, 964545]),
                        indice_sku_cate_brand[i]
                    ], 0)
                    self.calculate_mini_batch_aware_reg(browse_w_embed, indice_action, lambda_reg_mba)
                x_action_list_embeds.append(x_action_list_embed)

        with tf.name_scope("activation_unit"):
            act_unit_hidden_layers = self.param_dict["act_unit_hidden_layers"]
            action_indexs = self.param_dict["action_indexs"]
            # for i in range(len(x_action_list_embeds)):
            for i in action_indexs:
                x_action_list_embed = x_action_list_embeds[i]
                x_ad_embedded = x_action[i]
                indice_mask = x_action_list_masks[i]
                # Outer product: flatten the Cartesian-product matrix into a vector.
                # out_product_list = tf.map_fn(
                #     lambda action_emb: tf.reshape(
                #         tf.matmul(tf.expand_dims(action_emb, 2), tf.expand_dims(x_ad_embedded, 1)),
                #         [-1, x_ad_embedded.shape[1].value ** 2]),
                #     tf.transpose(x_action_list_embed, [1, 0, 2]))
                # Approximate outer product: concat the difference and the
                # element-wise product of the behaviour and ad vectors.
                x_action_list_embed_new = tf.transpose(x_action_list_embed, [1, 0, 2])
                concat_list = [
                    tf.concat([
                        x_action_list_embed_new[ii],
                        x_action_list_embed_new[ii] - x_ad_embedded,
                        x_action_list_embed_new[ii] * x_ad_embedded,
                        x_ad_embedded
                    ], 1)
                    for ii in range(x_action_list_embed_new.shape[0].value)
                ]
                act_unit_in = concat_list[0].shape[1].value
                act_in = concat_list
                with tf.variable_scope("activation_unit_{}_list".format(browse_names[i])):
                    for ii in range(len(act_unit_hidden_layers)):
                        weights_act_unit = self.get_weight_variable(
                            [act_unit_in, act_unit_hidden_layers[ii]], regularizer3,
                            self.param_dict["initializer_act_unit_w"]([act_unit_in, act_unit_hidden_layers[ii]]),
                            name='_act_unit_w_{}'.format(ii))
                        biases_act_unit = tf.get_variable(
                            "biases_{}_act_unit".format(ii), [act_unit_hidden_layers[ii]],
                            initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                        act_out = list(
                            map(
                                lambda act_in_i: act_fn(
                                    tf.matmul(act_in_i[0], weights_act_unit) + biases_act_unit,
                                    name="act_func_{}_{}".format(ii, act_in_i[1])),
                                zip(act_in, range(len(act_in)))))
                        # act_out = [tf.expand_dims(act_fn(tf.matmul(act_in[ii], weights_act_unit) + biases_act_unit,
                        #                                  name="act_func_{}_{}".format(i, ii)), 0)
                        #            for ii in range(act_in.shape[0].value)]
                        act_in = act_out
                        act_unit_in = act_in[0].shape[1].value
                    act_output_in = act_in
                    act_output_unit = act_unit_in
                    weights_act_unit_output = self.get_weight_variable(
                        [act_output_unit, 1], regularizer3,
                        self.param_dict["initializer_act_unit_w"]([act_output_unit, 1]),
                        name='_act_unit_output_w')
                    biases_act_unit_output = tf.get_variable(
                        "biases_act_unit_output", [1],
                        initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                    act_output_out = tf.concat(
                        list(
                            map(
                                lambda act_output_i: tf.expand_dims(
                                    tf.matmul(act_output_i, weights_act_unit_output) + biases_act_unit_output, 0),
                                act_output_in)), 0)
                    # act_output_out = tf.concat(
                    #     [tf.expand_dims(tf.matmul(act_output_in[iii], weights_act_unit_output)
                    #                     + biases_act_unit_output, 0)
                    #      for iii in range(act_output_in.shape[0].value)], 0)
                active_weight_score = tf.transpose(act_output_out, [1, 0, 2])
                # Set the attention weights of missing (padded) behaviours to 0.0.
                padding = tf.zeros_like(active_weight_score)
                active_weight_score_t = tf.where(
                    tf.expand_dims(indice_mask, 2), active_weight_score, padding)
                with tf.name_scope("weight_sum_pooling"):
                    sum_pooling = tf.reduce_sum(x_action_list_embed * active_weight_score_t, 1)
                sum_poolings.append(sum_pooling)
    x_deep_in = tf.concat([x_embed, tf.concat(sum_poolings, 1)], 1)

    # Build the deep (MLP) module.
    with tf.name_scope("deep_network"):
        deep_layers = self.param_dict["deep_layers"]
        for i in range(len(deep_layers)):
            with tf.variable_scope("dnn_layer_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_deep_in.shape[1].value, deep_layers[i]], regularizer2,
                    self.param_dict["initializer_dnn_w"]([x_deep_in.shape[1].value, deep_layers[i]]))
                biases = tf.get_variable(
                    "biases", [deep_layers[i]],
                    initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                layer_i = act_fn(tf.matmul(x_deep_in, weights) + biases, name="deep_mlp_{}".format(i))
                x_deep_in = layer_i

    # Build the fully connected output module.
    x_fc_in = x_deep_in
    with tf.name_scope("fc_layers"):
        fc_layers = self.param_dict['fc_layers']
        for i in range(len(fc_layers)):
            with tf.variable_scope("fc_layers_{}".format(i)):
                weights = self.get_weight_variable(
                    [x_fc_in.shape[1].value, fc_layers[i]], regularizer4,
                    self.param_dict["initializer_fc_w"]([x_fc_in.shape[1].value, fc_layers[i]]))
                biases = tf.get_variable(
                    "biases", [fc_layers[i]],
                    initializer=tf.constant_initializer(0.0), dtype=tf.float32)
                layer_i = tf.nn.sigmoid(tf.matmul(x_fc_in, weights) + biases)
                x_fc_in = layer_i
    logit = x_fc_in
    return logit
# Section 9: String operations
import tensorflow as tf

a = tf.constant('Hello,world!')
b = tf.constant('I love tensorflow.')
sess = tf.Session()

# Compute hash values.
c = tf.string_to_hash_bucket_fast(a, 10000000)
d = tf.string_to_hash_bucket_strong(a, 10000000, key=[1, 3])
e = tf.string_to_hash_bucket(a, 10000000)
result = sess.run([c, d, e])
print('Hashing:\nfast:%s\nstrong:%s\nnormal:%s\n\n' % (result[0], result[1], result[2]))

# Join an array of strings into a single string.
c = tf.reduce_join([a, b], axis=0)
d = tf.string_join([a, b], '__')
result = sess.run([c, d])
print('Joining:\nc=%s\nd=%s\n\n' % (result[0], result[1]))

# Split and slice strings.
c = tf.string_split([a], ',')
d = tf.substr(a, 0, 5)
result = sess.run([c, d])
print('Splitting:\nc=%s\nd=%s\n\n' % (result[0], result[1]))
def testStringToHashBucketsStrongInvalidKey(self):
    with self.test_session():
        input_string = tf.constant(['a', 'b', 'c'])
        with self.assertRaisesOpError('Key must have 2 elements'):
            tf.string_to_hash_bucket_strong(input_string, 10, key=[98765]).eval()
import numpy as np
import tensorflow as tf

num_hashes = 400
num_buckets = 2**32
cpu = "/cpu:0"
window_length = 1000
kmer_length = 8
prob_error = 0.2
num_trials = 100

# One argmin op per hash function. Use a list rather than `range(num_hashes)`
# so the elements can be assigned (a Python 3 `range` is not mutable).
minhash = [None] * num_hashes
with tf.device(cpu):
    # Random two-element keys, one per strong (keyed) hash function.
    r = np.random.randint(num_buckets, 2 * num_buckets, [num_hashes, 2])
    kmers = tf.placeholder(tf.string, shape=[None], name="kmers")
    for i in range(num_hashes):
        minhash[i] = tf.argmin(
            tf.string_to_hash_bucket_strong(kmers, num_buckets, [r[i, 0], r[i, 1]]))


def MinHash(A):
    # Return, for each hash function, the k-mer that attains the minimum hash value.
    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        my_minhash, A_strings = sess.run([minhash, kmers], feed_dict={kmers: list(A)})
    return A_strings[my_minhash]


def Jaccard(A, B):
    # Estimate the Jaccard similarity of two k-mer sets from the fraction of
    # hash functions on which their minima agree.
    hA = MinHash(A.keys())
    hB = MinHash(B.keys())
    return sum(hA == hB) * 1.0 / num_hashes
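# Hedged usage sketch: the k-mer helper and example sequences below are
# illustrative additions, not part of the original script. A sequence is
# represented as a dict keyed by its k-mers so that `A.keys()` in Jaccard()
# yields the k-mer set.
def kmer_set(sequence, k=kmer_length):
    return {sequence[i:i + k]: True for i in range(len(sequence) - k + 1)}


if __name__ == "__main__":
    seq_a = "ACGTACGTACGTACGTAACCGGTT"
    seq_b = "ACGTACGTACGTACGTAACCGGAA"
    print("Estimated Jaccard similarity:", Jaccard(kmer_set(seq_a), kmer_set(seq_b)))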
import numpy as np
import tensorflow as tf

a = tf.constant([["1", "1"], ["2", "2"]], dtype=tf.string)
b = tf.constant([["2", "2"], ["3", "3"]], dtype=tf.string)
a_b = tf.string_join([a, b], separator="_")
a_b = tf.string_to_hash_bucket_strong(a_b, num_buckets=2**63 - 1, key=[0, 0])

c = tf.constant([1, 2], dtype=tf.int64)
c_ = tf.strings.as_string(c, precision=3)

if __name__ == '__main__':
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run([a_b, c_]))
#! /usr/bin/env python
import tensorflow as tf

with tf.Session() as sess:
    mapping_string = tf.constant(["foo", "bar", "baz"])
    p = tf.string_to_hash_bucket_strong(mapping_string, 3, key=[123, 456])
    q = tf.string_to_hash_bucket_strong(tf.constant(["zoo", "bar", "baz"]), 3, key=[123, 456])
    i = sess.run(p)
    j = sess.run(q)
    print(i)
    print(j)
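# Hedged follow-up (not in the original script): with the same key, the strong
# hash is a pure function of the input string, so the shared strings "bar" and
# "baz" must land in the same buckets in both runs; whether "foo" and "zoo"
# collide is up to chance, since there are only 3 buckets.
assert (i[1:] == j[1:]).all()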