Code Example #1
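# NOTE: this input_fn relies on module-level constants (NUM_EPOCH, BATCH_SIZE,
# NUM_USER_ID, NUM_ITEM_ID), on user_hash_size_list / item_hash_size_list, and on
# decode_node_list_attr (see Code Example #6), all assumed to be defined elsewhere.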
def input_fn_1021(table,
                  selected_cols="user_id,item_id,ui_fea,uu_fea,label",
                  shuffle=True):
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table],
                                                num_epochs=NUM_EPOCH,
                                                shuffle=shuffle)

    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [user_id, item_id, ui_fea, uu_fea,
     label] = tf.decode_csv(values, default_val)

    u_id_hash = tf.string_to_hash_bucket(user_id, NUM_USER_ID)
    i_id_hash = tf.string_to_hash_bucket(item_id, NUM_ITEM_ID)

    uu_info_hash = decode_node_list_attr(
        uu_fea,
        5,  # uu neigh
        user_hash_size_list,
        is_hash=True)
    ui_info_hash = decode_node_list_attr(ui_fea,
                                         5,
                                         item_hash_size_list,
                                         is_hash=True)
    return user_id, item_id, u_id_hash, i_id_hash, label, uu_info_hash, ui_info_hash
Code Example #2
 def model_fn(features, labels, mode, params):
     features["text"] = tf.sparse_tensor_to_dense(features["text"],
                                                  default_value=" ")
     if FLAGS.use_ngrams:
         features["ngrams"] = tf.sparse_tensor_to_dense(features["ngrams"],
                                                        default_value=" ")
     text_lookup_table = tf.contrib.lookup.index_table_from_file(
         FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size)
     text_ids = text_lookup_table.lookup(features["text"])
     text_embedding_w = tf.Variable(
         tf.random_uniform([
             FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets,
             FLAGS.embedding_dimension
         ], -0.1, 0.1))
     text_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
         text_embedding_w, text_ids),
                                     axis=-2)
     input_layer = text_embedding
     if FLAGS.use_ngrams:
         ngram_hash = tf.string_to_hash_bucket(features["ngrams"],
                                               FLAGS.num_ngram_buckets)
         ngram_embedding_w = tf.Variable(
             tf.random_uniform(
                 [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension],
                 -0.1, 0.1))
         ngram_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
             ngram_embedding_w, ngram_hash),
                                          axis=-2)
         ngram_embedding = tf.expand_dims(ngram_embedding, -2)
         input_layer = tf.concat([text_embedding, ngram_embedding], -1)
     num_classes = FLAGS.num_labels
     logits = tf.contrib.layers.fully_connected(inputs=input_layer,
                                                num_outputs=num_classes,
                                                activation_fn=None)
     predictions = tf.argmax(logits, axis=-1)
     probs = tf.nn.softmax(logits)
     loss, train_op = None, None
     metrics = {}
     if mode != tf.estimator.ModeKeys.PREDICT:
         label_lookup_table = tf.contrib.lookup.index_table_from_file(
             FLAGS.label_file, vocab_size=FLAGS.num_labels)
         labels = label_lookup_table.lookup(labels)
         loss = tf.reduce_mean(
             tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                            logits=logits))
         opt = tf.train.AdamOptimizer(params["learning_rate"])
         if FLAGS.horovod:
             opt = hvd.DistributedOptimizer(opt)
         train_op = opt.minimize(loss,
                                 global_step=tf.train.get_global_step())
         metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
     exports = {}
     if FLAGS.export_dir:
         exports = Exports(probs, text_embedding)
     return tf.estimator.EstimatorSpec(mode,
                                       predictions=predictions,
                                       loss=loss,
                                       train_op=train_op,
                                       eval_metric_ops=metrics,
                                       export_outputs=exports)
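
A minimal, hypothetical sketch of how a model_fn like the one above is typically plugged into an Estimator; the flag names, input functions and step count here are assumptions rather than part of the original example:

# Hypothetical wiring (assumed flags and input functions):
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=FLAGS.model_dir,                      # assumed flag
    params={"learning_rate": FLAGS.learning_rate})  # read inside model_fn
estimator.train(input_fn=train_input_fn, steps=1000)  # train_input_fn is assumed
estimator.evaluate(input_fn=eval_input_fn)             # eval_input_fn is assumed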
Code Example #3
    def testStringToOneHashBucketLegacyHash(self):
        with self.test_session():
            input_string = tf.placeholder(tf.string)
            output = tf.string_to_hash_bucket(input_string, 1)
            result = output.eval(feed_dict={input_string: ['a', 'b', 'c']})

            self.assertAllEqual([0, 0, 0], result)
Code Example #4
def decode_node_id(info, hash_size, is_hash=False):
    id_val = tf.decode_csv(info, [[" "]])
    if is_hash:
        id_hash_val = tf.string_to_hash_bucket(id_val, hash_size)
        return id_hash_val
    return id_val
Code Example #5
    def testStringToOneHashBucket(self):
        with self.test_session():
            input_string = tf.placeholder(tf.string)
            output = tf.string_to_hash_bucket(input_string, 1)
            result = output.eval(feed_dict={input_string: ["a", "b", "c"]})

            self.assertAllEqual([0, 0, 0], result)
Code Example #6
def decode_node_list_attr(infos, node_num, hash_size_list, is_hash=False):
    """
    decode an arbitrary-length node_fea list, e.g., user_friend_list or user_buy_list
    node_num: number of nodes in the list, e.g., number of user friends
    """
    infos_list = tf.decode_csv(infos, [[" "]] * node_num, chr(3))
    infos_fea_list = [
        tf.decode_csv(i, [[' ']] * len(hash_size_list), '#')
        for i in infos_list
    ]

    infos_fea_val_list = [
        decode_node_attr(node, hash_size_list, is_hash=False)
        for node in infos_fea_list
    ]
    # print('infos_fea_val_list' , infos_fea_val_list)
    return_list = [[] for i in range(len(hash_size_list))]

    # print(len(return_list), len(infos_fea_val_list), len(infos_fea_val_list[0]))
    for x in infos_fea_val_list:
        for idx, val in enumerate(hash_size_list):
            return_list[idx].append(x[idx])
    # print(return_list, len(return_list))

    if is_hash:
        return_hash_list = [
            tf.string_to_hash_bucket(node, hash_size)
            for node, hash_size in zip(return_list, hash_size_list)
        ]
        return return_hash_list
    return return_list
Code Example #7
 def model_fn(features, labels, mode, params):
     text_lookup_table = tf.contrib.lookup.index_table_from_file(
         FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size)
     text_ids = text_lookup_table.lookup(features["text"])
     text_embedding_w = tf.Variable(
         tf.random_uniform([
             FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets,
             FLAGS.embedding_dimension
         ], -0.1, 0.1))
     text_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
         text_embedding_w, text_ids),
                                     axis=-2)
     text_embedding = tf.expand_dims(text_embedding, -2)
     input_layer = text_embedding
     if FLAGS.use_ngrams:
         ngram_hash = tf.string_to_hash_bucket(features["ngrams"],
                                               FLAGS.num_ngram_buckets)
         ngram_embedding_w = tf.Variable(
             tf.random_uniform(
                 [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension],
                 -0.1, 0.1))
         ngram_embedding = tf.reduce_mean(tf.nn.embedding_lookup(
             ngram_embedding_w, ngram_hash),
                                          axis=-2)
         ngram_embedding = tf.expand_dims(ngram_embedding, -2)
         input_layer = tf.concat([text_embedding, ngram_embedding], -1)
     num_classes = len(open(FLAGS.label_file).readlines())
     logits = tf.contrib.layers.fully_connected(inputs=input_layer,
                                                num_outputs=num_classes,
                                                activation_fn=None)
     predictions = tf.argmax(logits, axis=-1)
     loss, train_op = None, None
     metrics = {}
     if mode != tf.estimator.ModeKeys.PREDICT:
         loss = tf.reduce_mean(
             tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                            logits=logits))
         # Drop the trailing dimension from labels for the accuracy metric below
         labels = tf.squeeze(labels, -1)
         opt = tf.train.AdamOptimizer(params["learning_rate"])
         if FLAGS.horovod:
             opt = hvd.DistributedOptimizer(opt)
         train_op = opt.minimize(loss,
                                 global_step=tf.train.get_global_step())
         metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
     exports = {}
     if FLAGS.export_dir:
         probs = tf.nn.softmax(logits)
         exports["proba"] = tf.estimator.export.ClassificationOutput(
             scores=probs)
         exports["embedding"] = tf.estimator.export.RegressionOutput(
             value=text_embedding)
         exports[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
                 tf.estimator.export.ClassificationOutput(scores=probs)
     return tf.estimator.EstimatorSpec(mode,
                                       predictions=predictions,
                                       loss=loss,
                                       train_op=train_op,
                                       eval_metric_ops=metrics,
                                       export_outputs=exports)
Code Example #8
def cross_feature(row, feature_name_1, feature_name_2, hash_size):
    row["cross_" + feature_name_1 + "_" + feature_name_2] = tf.one_hot(
        tf.string_to_hash_bucket(
            tf.string_join([
                tf.cast(row[feature_name_1], tf.string),
                tf.cast(row[feature_name_2], tf.string)
            ]), hash_size), 2 * hash_size)
    return row
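
A hedged usage sketch for cross_feature: row-mapping helpers like this are normally applied element-wise through tf.data.Dataset.map. The feature names and values below are assumptions chosen only for illustration.

# Hypothetical usage of cross_feature in a tf.data pipeline (assumed data):
dataset = tf.data.Dataset.from_tensor_slices({
    "city": ["beijing", "hangzhou", "shanghai"],
    "gender": ["m", "f", "m"],
})
dataset = dataset.map(
    lambda row: cross_feature(row, "city", "gender", hash_size=100))
# Each element now also carries a "cross_city_gender" one-hot tensor of depth 200 (2 * hash_size).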
Code Example #9
    def testStringToHashBuckets(self):
        with self.test_session():
            input_string = tf.placeholder(tf.string)
            output = tf.string_to_hash_bucket(input_string, 10)
            result = output.eval(feed_dict={input_string: ["a", "b", "c"]})

            # Hash64('a') -> 2996632905371535868 -> mod 10 -> 8
            # Hash64('b') -> 5795986006276551370 -> mod 10 -> 0
            # Hash64('c') -> 14899841994519054197 -> mod 10 -> 7
            self.assertAllEqual([8, 0, 7], result)
Code Example #10
    def testStringToHashBucketsLegacyHash(self):
        with self.test_session():
            input_string = tf.placeholder(tf.string)
            output = tf.string_to_hash_bucket(input_string, 10)
            result = output.eval(feed_dict={input_string: ['a', 'b', 'c']})

            # Hash64('a') -> 2996632905371535868 -> mod 10 -> 8
            # Hash64('b') -> 5795986006276551370 -> mod 10 -> 0
            # Hash64('c') -> 14899841994519054197 -> mod 10 -> 7
            self.assertAllEqual([8, 0, 7], result)
Code Example #11
def decode_node_attr(infos, hash_size_list, is_hash=False):
    fea_val_list = [
        tf.decode_csv(info, [[" "], [" "]], ":")[1] for info in infos
    ]
    if is_hash:
        fea_hash_list = [
            tf.string_to_hash_bucket(i, j)
            for (i, j) in zip(fea_val_list, hash_size_list)
        ]
        return fea_hash_list
    return fea_val_list
Code Example #12
 def testWithHash(self):
     parser_op = fm_ops.fm_parser(tf.constant(self.EXAMPLES, tf.string),
                                  self.VOCAB_SIZE, True)
     string_ids = [str(x) for x in self.TARGET_FEATURE_IDS]
     hashed_feature_ids = tf.string_to_hash_bucket(string_ids,
                                                   self.VOCAB_SIZE)
     with self.test_session() as sess:
         hashed_ids = sess.run(hashed_feature_ids)
         labels, sizes, feature_ids, feature_vals = sess.run(parser_op)
         self.assertAllClose(labels, self.TARGET_LABELS)
         self.assertAllEqual(sizes, self.TARGET_SIZES)
         self.assertAllEqual(feature_ids, hashed_ids)
         self.assertAllClose(feature_vals, self.TARGET_FEATURE_VALS)
Code Example #13
def decode_node_attr(infos, hash_size_list, is_hash=False):
    # decode arbitrary num of node attr, len(infos) can be arbitrary number
    # work for both user and item
    fea_val_list = [
        tf.decode_csv(info, [[" "], [" "]], ":")[1] for info in infos
    ]
    if is_hash:
        fea_hash_list = [
            tf.string_to_hash_bucket(i, j)
            for (i, j) in zip(fea_val_list, hash_size_list)
        ]
        return fea_hash_list
    return fea_val_list
Code Example #14
def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label":
            tf.FixedLenFeature([], tf.float32),
            "categorical_features":
            tf.FixedLenFeature([CATEGORICAL_FEATURES_SIZE], tf.string),
            "continuous_features":
            tf.FixedLenFeature([CONTINUOUS_FEATURES_SIZE], tf.float32),
        })
    label = features["label"]
    continuous_features = features["continuous_features"]
    categorical_features = tf.cast(
        tf.string_to_hash_bucket(features["categorical_features"],
                                 BUCKET_SIZE), tf.float32)
    return label, tf.concat([continuous_features, categorical_features], 0)
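
As a hedged illustration, a read_and_decode function of this shape is usually driven by a filename queue and then batched; the file name, batch size and queue capacities below are assumptions:

# Hypothetical queue-based pipeline around read_and_decode (assumed values):
filename_queue = tf.train.string_input_producer(["train.tfrecords"], num_epochs=1)
label, dense_features = read_and_decode(filename_queue)
label_batch, feature_batch = tf.train.shuffle_batch(
    [label, dense_features],
    batch_size=128, capacity=2048, min_after_dequeue=512)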
Code Example #15
def decode_node_list_attr(infos, node_num, hash_size_list, is_hash=False):
    infos_list = tf.decode_csv(infos, [[" "]] * node_num, chr(3))
    infos_fea_list = [
        tf.decode_csv(i, [[' ']] * len(hash_size_list), '#')
        for i in infos_list
    ]

    infos_fea_val_list = [
        decode_node_attr(node, hash_size_list, is_hash=False)
        for node in infos_fea_list
    ]
    return_list = [[] for i in range(len(hash_size_list))]

    for x in infos_fea_val_list:
        for idx, val in enumerate(hash_size_list):
            return_list[idx].append(x[idx])

    if is_hash:
        return_hash_list = [
            tf.string_to_hash_bucket(node, hash_size)
            for node, hash_size in zip(return_list, hash_size_list)
        ]
        return return_hash_list
    return return_list
Code Example #16
# -*- coding:utf-8 -*-
# @version: 1.0
# @author: wuxikun
# @date: '2020/11/28 9:08 PM'

import tensorflow as tf

features = {
    'sex': ['male', 'male', 'female', 'female', 'mid', 'man'],
}

sex_tensor = tf.constant(['male', 'male', 'female', 'female', 'mid', 'man'],
                         dtype=tf.string)

# sex_column = tf.feature_column.categorical_column_with_vocabulary_list('sex', ['male', 'female'])
sex_column = tf.string_to_hash_bucket(sex_tensor, 10)
# sex_column = tf.feature_column.indicator_column(sex_column)
# columns = [sex_column]
#
#
# inputs = tf.feature_column.input_layer(features, columns)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(init)
    v = sess.run(sex_column)
    print(v)
Code Example #17
    print("string expand_dim 0: ",
          sess.run(tf.expand_dims(string_tensor, dim=0)))  # 相当于插入一维
    print("string expand_dim -1: ",
          sess.run(tf.expand_dims(string_tensor, dim=-1)))  # 相当于插入一维
    print("sparse tensor: ", sess.run(chars))
    print("sp values: ", sess.run(chars.values))
"""
string :  [b'hello world' b'a b c' b'hello hkx c']
string newaxis:  [[b'hello world' b'a b c' b'hello hkx c']]
string expand_dim 0:  [[b'hello world' b'a b c' b'hello hkx c']]
string expand_dim -1:  [[b'hello world']
 [b'a b c']
 [b'hello hkx c']]
sparse tensor:  SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 2],
       [2, 0],
       [2, 1],
       [2, 2]], dtype=int64), values=array([b'hello', b'world', b'a', b'b', b'c', b'hello', b'hkx', b'c'], dtype=object), dense_shape=array([3, 3], dtype=int64))
sp values:  [b'hello' b'world' b'a' b'b' b'c' b'hello' b'hkx' b'c']
"""

sess = tf.Session()
string = ["hello world", "a b c", "hello hkx c"]
string_bucket = tf.string_to_hash_bucket(string, num_buckets=1000)
print("string_bucket: ", sess.run(string_bucket))

sess.close()
Code Example #18
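# Assumed context for this snippet (inferred from the expected outputs below):
#   a = tf.constant([["a", "b"], ["c", "d"]])
#   sess = tf.Session()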
tf.reduce_join(a, -2)  # ==> ["ac", "bd"]
tf.reduce_join(a, -1)  # ==> ["ab", "cd"]
tf.reduce_join(a, 0, keep_dims=True)  #==> [["ac", "bd"]]
tf.reduce_join(a, 1, keep_dims=True)  #==> [["ab"], ["cd"]]
tf.reduce_join(a, 0, separator=".")  #==> ["a.c", "b.d"]
tf.reduce_join(a, [0, 1])  #==> ["acbd"]
tf.reduce_join(a, [1, 0])  #==> ["abcd"]
tf.reduce_join(a, [])  #==> ["abcd"]

b = tf.convert_to_tensor(["ac"])
c = tf.convert_to_tensor(["bd"])
d = tf.string_join([b, c], separator=" ", name=None)
print(sess.run(d))

e = tf.reduce_join(a, 0)
print(tf.string_to_hash_bucket(e, 2))
print(sess.run(tf.string_to_hash_bucket(e, 5)))

f = tf.string_to_hash_bucket(e, 2)
hw = tf.convert_to_tensor(["hello worls"])
print(sess.run(tf.string_split(hw, delimiter=' ')))

### Exercise module_1_4

# Create new string tensors with:
#   a)  transform str_1 in a way to get [["name: ", "surname: "], ["Jan", "Idziak"]]
#   a') str_1 with argument ["name: Jan", "surname: Idziak"]
#   b)  str_2 with argument [["helo ", "world"], ["tensor", "flow"]]
#   b') str_2 with argument ["helo world", "tensorflow"]
#   c)  Create simple string tensors with arguments:
#   c') str_3 - ["My name is:"]
Code Example #19
def hash_one_hot(row, feature_name, hash_size):
    row[feature_name] = tf.one_hot(
        tf.string_to_hash_bucket(tf.cast(row[feature_name], tf.string),
                                 hash_size), hash_size)
    return row
Code Example #20
# Section 9: String operations

import tensorflow as tf

a = tf.constant('Hello,world!')
b = tf.constant('I love tensorflow.')

sess = tf.Session()

# Compute hash bucket values
c = tf.string_to_hash_bucket_fast(a, 10000000)
d = tf.string_to_hash_bucket_strong(a, 10000000, key=[1, 3])
e = tf.string_to_hash_bucket(a, 10000000)
result = sess.run([c, d, e])

print('Hashing:\nfast:%s\nstrong:%s\nnormal:%s\n\n' %
      (result[0], result[1], result[2]))

# Join the tensors into a string
c = tf.reduce_join([a, b], axis=0)
d = tf.string_join([a, b], '__')

result = sess.run([c, d])

print('Joining:\nc=%s\nd=%s\n\n' % (result[0], result[1]))

# Split the string
c = tf.string_split([a], ',')
d = tf.substr(a, 0, 5)

result = sess.run([c, d])