Example #1
def load_embedding(file_name,
                   vocab_size,
                   embedding_size,
                   vectors,
                   shift=0,
                   name='waou'):
    with tf.device("/cpu:0"):
        with tf.name_scope('load_w2v_embed_' + name):
            reader = tf.TableRecordReader(selected_cols='word,vector',
                                          csv_delimiter=',',
                                          name=name)
            file_queue = tf.train.string_input_producer([file_name],
                                                        name='w2v_queue_' +
                                                        name)
            _, values = reader.read_up_to(file_queue,
                                          vocab_size,
                                          name='w2v_read_' + name)
            embed_raw = tf.decode_csv(
                values,
                record_defaults=[[''] for _ in range(1 + embedding_size)],
                field_delim=',')
            embed_raw = tf.transpose(embed_raw)
            ids = tf.string_to_number(embed_raw[:, 0],
                                      tf.int32,
                                      name='word_ids_' + name)
            ids = tf.reshape(ids, [-1])
            embeddings = tf.string_to_number(
                embed_raw[:, 1:1 + embedding_size], tf.float32)
            init = tf.scatter_update(vectors,
                                     ids + shift,
                                     embeddings,
                                     name='word_ids_scatter_update_' + name).op
    return init
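A minimal usage sketch (not part of the original source): wiring load_embedding to a variable and running the returned init op once inside a queue-runner session. The table path, vocabulary size, and embedding width below are hypothetical.

vocab_size, embedding_size = 100000, 128  # hypothetical sizes
vectors = tf.get_variable('word_embeddings',
                          shape=[vocab_size + 1, embedding_size],
                          initializer=tf.zeros_initializer())
# shift=1 leaves row 0 free, e.g. for a padding token.
init_embed = load_embedding('odps://some_project/tables/w2v_table',  # hypothetical
                            vocab_size, embedding_size, vectors, shift=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    sess.run(init_embed)  # one run scatters the pretrained rows into `vectors`
    coord.request_stop()
    coord.join(threads)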
Example #2
def input_fn_1021(table,
                  selected_cols="user_id,item_id,ui_fea,uu_fea,label",
                  shuffle=True):
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table],
                                                num_epochs=NUM_EPOCH,
                                                shuffle=shuffle)

    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [user_id, item_id, ui_fea, uu_fea,
     label] = tf.decode_csv(values, default_val)

    u_id_hash = tf.string_to_hash_bucket(user_id, NUM_USER_ID)
    i_id_hash = tf.string_to_hash_bucket(item_id, NUM_ITEM_ID)

    uu_info_hash = decode_node_list_attr(
        uu_fea,
        5,  # uu neigh
        user_hash_size_list,
        is_hash=True)
    ui_info_hash = decode_node_list_attr(ui_fea,
                                         5,
                                         item_hash_size_list,
                                         is_hash=True)
    return user_id, item_id, u_id_hash, i_id_hash, label, uu_info_hash, ui_info_hash
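A side note on the defaults trick above: tf.decode_csv infers each output column's dtype from record_defaults, which is why the last default is overridden with [-1.0] so the label parses as float32 while everything else stays a string. A self-contained illustration with made-up rows:

records = tf.constant(["u1,i9,0.0", "u2,i3,1.0"])
# dtypes follow the defaults: string, string, float32
user, item, label = tf.decode_csv(records, [[' '], [' '], [-1.0]])
# label evaluates to [0.0, 1.0] once run in a session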
Example #3
    def read_test_data(self, file_queue):
        if self.local:
            reader = tf.TextLineReader(skip_header_lines=1)
        else:
            reader = tf.TableRecordReader()
        key, value = reader.read(file_queue)
        defaults = [['0']] * 8
        user_id, gender, age, consume_level, client_type, brand_id, province, city = tf.decode_csv(value, defaults)

        return user_id, gender, age, consume_level, client_type, brand_id, province, city
Example #4
    def read_train_data(self, file_queue):
        if self.local:
            reader = tf.TextLineReader(skip_header_lines=1)
        else:
            reader = tf.TableRecordReader()
        key, value = reader.read(file_queue)
        defaults = [[0]] + [['0']] + [[0]] * 5
        user_id, item_id_list, item_id_list_len, gender, age, consume_level, label = tf.decode_csv(
            value, defaults)
        item_id_list = tf.string_to_number(
            tf.string_split([item_id_list], ';').values, tf.int32)
        return user_id, item_id_list, item_id_list_len, gender, age, consume_level, label
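The string_split / string_to_number pair above is the standard TF1 idiom for unpacking a delimiter-packed CSV cell into an integer tensor; a standalone sketch with made-up data:

cell = tf.constant("12;7;42")  # one CSV cell holding a packed id list
ids = tf.string_to_number(tf.string_split([cell], ';').values, tf.int32)
# ids evaluates to [12, 7, 42]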
Example #5
    def read_user_test_data(self, file_queue):
        if self.local:
            reader = tf.TextLineReader(skip_header_lines=1)
        else:
            reader = tf.TableRecordReader()
        key, value = reader.read(file_queue)
        defaults = [[0]] + [['0']] + [[0]] + [['0']] + [[0]] + [['0']] + [[0]] * 2
        # "cat_id_list_len" in the original was presumably a typo for cate_id_list_len
        user_id, item_id_list, item_id_list_len, cate_id_list, cate_id_list_len, tag_id_list, tag_id_list_len, gender = tf.decode_csv(
            value, defaults)
        item_id_list = tf.string_to_number(tf.string_split([item_id_list], ';').values, tf.int32)
        cate_id_list = tf.string_to_number(tf.string_split([cate_id_list], ';').values, tf.int32)
        tag_id_list = tf.string_to_number(tf.string_split([tag_id_list], ';').values, tf.int32)
        return user_id, item_id_list, item_id_list_len, cate_id_list, cate_id_list_len, tag_id_list, tag_id_list_len, gender
Example #6
    def input_LP(self,
                 FLAGS,
                 batch_size,
                 slice_id=None,
                 slice_count=None,
                 is_dict=False):
        #  for training
        cols_name, cols_defval = build_col_defval_LP()
        # NOTE: the .format() call below is a no-op ("train.txt" contains no
        # placeholder); a "{}" for FLAGS.sv_suffix was presumably dropped.
        input_file = FLAGS.input_path + "train.txt".format(FLAGS.sv_suffix)
        num_epochs = FLAGS.num_epochs
        shuffle = FLAGS.shuffle
        min_after_dequeue = 32 * batch_size
        capacity = 64 * batch_size

        filename_queue = tf.train.string_input_producer([input_file],
                                                        num_epochs=num_epochs,
                                                        shuffle=shuffle)

        if FLAGS.file_reader == "textline":
            reader = tf.TextLineReader()
        else:
            reader = tf.TableRecordReader(csv_delimiter=FLAGS.col_delim1,
                                          slice_count=slice_count,
                                          slice_id=slice_id)
        _, value = reader.read_up_to(filename_queue, batch_size)
        value = tf.train.shuffle_batch([value],
                                       batch_size=batch_size,
                                       num_threads=24,
                                       capacity=capacity,
                                       enqueue_many=True,
                                       min_after_dequeue=min_after_dequeue)

        features = tf.decode_csv(value,
                                 record_defaults=cols_defval,
                                 field_delim=FLAGS.col_delim1,
                                 use_quote_delim=False)

        pids, labels = features
        if is_dict:
            return {
                "pid": pids,
                "label": labels,
            }
        else:
            return pids, labels
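Worth spelling out the batching pattern used above (and again in input_EP below): read_up_to may return fewer than batch_size rows, and shuffle_batch with enqueue_many=True treats each row of its input as a separate example, repacking the variable-sized chunks into fixed-size shuffled batches. A toy sketch of just that mechanism, with made-up data:

queue = tf.train.input_producer(tf.constant(["a", "b", "c", "d", "e", "f"]),
                                shuffle=False)
chunk = queue.dequeue_many(3)  # stand-in for reader.read_up_to
batch = tf.train.shuffle_batch([chunk],
                               batch_size=4,
                               capacity=64,
                               min_after_dequeue=8,
                               enqueue_many=True)  # rows become individual examples
# (evaluate `batch` with tf.train.start_queue_runners, as in the examples above)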
Example #7
    def read_train_data(self, file_queue):
        if self.local:
            reader = tf.TextLineReader(skip_header_lines=1)
        else:
            reader = tf.TableRecordReader()
        key, value = reader.read(file_queue)
        defaults = [['0']] * 17
        click_label, user_id, item_id, class_id, tag_ids, dev_type, dev_brand, dev_brand_type, \
        dev_os, dev_net, client_type, dev_carrier, click_seq_50size, gender, age, consume_level, unclick_seq_50size \
            = tf.decode_csv(value, defaults)

        # user_id = tf.cast(user_id, dtype=tf.int64)
        # gender = tf.cast(gender, dtype=tf.int64)
        # client_type = tf.cast(client_type, dtype=tf.int64)
        # tag_ids = tf.string_to_number(tf.string_split([tag_ids], ',').values, tf.int64)
        # click_seq_50size = tf.string_to_number(tf.string_split([click_seq_50size], ',').values, tf.int64)
        # unclick_seq_50size = tf.string_to_number(tf.string_split([unclick_seq_50size], ',').values, tf.int64)
        return click_label, user_id, item_id, class_id, tag_ids, dev_type, dev_brand, dev_brand_type, dev_os, dev_net, client_type, dev_carrier, click_seq_50size, gender, age, consume_level, unclick_seq_50size
Example #8
def read_table(filename_queue):
    batch_size = 128
    reader = tf.TableRecordReader(csv_delimiter=';',
                                  num_threads=8,
                                  capacity=8 * batch_size)
    key, value = reader.read_up_to(filename_queue, batch_size)
    values = tf.train.batch([value],
                            batch_size=batch_size,
                            capacity=8 * batch_size,  # was "8 * capacity", an undefined name
                            enqueue_many=True,
                            num_threads=8)
    record_defaults = [[1.0], [""], [""], [""], [""], [""]]
    feature_size = [1322, 30185604, 43239874, 5758226, 41900998]
    col1, col2, col3, col4, col5, col6 = tf.decode_csv(
        values, record_defaults=record_defaults, field_delim=';')
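    # The trans_csv_* ops below are PAI-TF extensions, described here only as
    # they are used (not standard TensorFlow): trans_csv_to_dense is an inline
    # demo whose result is unused, trans_csv_kv2dense densifies "key:value"
    # strings, and trans_csv_id2sparse builds SparseTensors of the given size.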
    outmatrix = tf.trans_csv_to_dense(['2,3,5', '2,6,7,7', '0,9,3'], 6)
    col2 = tf.trans_csv_kv2dense(col2, feature_size[0])
    col3 = tf.trans_csv_id2sparse(col3, feature_size[1])
    col4 = tf.trans_csv_id2sparse(col4, feature_size[2])
    col5 = tf.trans_csv_id2sparse(col5, feature_size[3])
    col6 = tf.trans_csv_id2sparse(col6, feature_size[4])
    return [col1, col2, col3, col4, col5, col6]
Example #9
    def input_EP(self,
                 FLAGS,
                 batch_size,
                 slice_id=None,
                 slice_count=None,
                 is_dict=False):
        cols, cols_defval = build_col_defval_EP(FLAGS.n_node_type)
        input_file = FLAGS.input_path + "graph.txt"
        num_epochs = FLAGS.num_epochs
        n_node_type = FLAGS.n_node_type

        shuffle = FLAGS.shuffle
        min_after_dequeue = 32 * batch_size
        capacity = 64 * batch_size

        filename_queue = tf.train.string_input_producer([input_file],
                                                        num_epochs=num_epochs,
                                                        shuffle=shuffle)

        if FLAGS.file_reader == "textline":
            reader = tf.TextLineReader()
        else:
            reader = tf.TableRecordReader(csv_delimiter=FLAGS.col_delim1,
                                          slice_count=slice_count,
                                          slice_id=slice_id)
        _, value = reader.read_up_to(filename_queue, batch_size)
        value = tf.train.shuffle_batch([value],
                                       batch_size=batch_size,
                                       num_threads=24,
                                       capacity=capacity,
                                       enqueue_many=True,
                                       min_after_dequeue=min_after_dequeue)

        features = tf.decode_csv(value,
                                 record_defaults=cols_defval,
                                 field_delim=FLAGS.col_delim1,
                                 use_quote_delim=False)

        node_ids = features[0]
        node_types = features[1]
        nodes_nbrs_array = []
        nbr_segment_array = []

        edge_features_array = []
        edge_type_array = []

        base_l = 3

        negs = extract_negative_nodes(features[base_l - 1], FLAGS.col_delim2)

        for i in range(n_node_type):
            nbr_segment, nbr_nodes = extract_neighbor_nodes(
                features[base_l + i], FLAGS.col_delim2)
            edge_features = extract_features(
                features[base_l + n_node_type + i], FLAGS.col_delim2)
            edge_type = features[base_l + 2 * n_node_type + i]

            nodes_nbrs_array.append(nbr_nodes)
            nbr_segment_array.append(nbr_segment)
            edge_features_array.append(edge_features)
            edge_type_array.append(edge_type)

        if is_dict:
            return {
                "node_ids": node_ids,
                "node_types": node_types,
                "nodes_nbrs_array": nodes_nbrs_array,
                "nbr_segment_array": nbr_segment_array,
                "edge_type_array": edge_type_array,
                "edge_features_array": edge_features_array,
                "negs": negs,
            }
        else:
            return node_ids, node_types, negs, nodes_nbrs_array, nbr_segment_array, edge_type_array, edge_features_array
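The indexing with base_l = 3 encodes the column layout input_EP expects; a comment sketch for n_node_type = 2 (a hypothetical value):

# features[0]   -> node_ids
# features[1]   -> node_types
# features[2]   -> negatives string (parsed by extract_negative_nodes)
# features[3:5] -> neighbor-list columns, one per node type
# features[5:7] -> edge-feature columns, one per node type
# features[7:9] -> edge-type columns, one per node type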
Example #10
def input_fn_1021(
        table,
        selected_cols="u_fea,v_fea,i_fea,u_friend,v_friend,u_share,v_share,u_pay,v_pay,i_buy,label",
        shuffle=True):
    """
    selected_cols: label must be the last one
    for u, i, v
    shuffle=True for train/val
    shuffle=False for test
    """
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table],
                                                num_epochs=NUM_EPOCH,
                                                shuffle=shuffle)

    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)
    # (a to_ndarray=False kwarg was commented out here; read_up_to returns at most num_records rows)

    # src_user_fea, des_user_fea, src_user_items, des_user_items, _, _
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [
        u_fea, v_fea, i_fea, u_tao_friend, v_tao_friend, u_share, v_share,
        u_pay, v_pay, i_buy, label
    ] = tf.decode_csv(values, default_val)
    # u_fea type: id_age:15#id_gender:2#
    u_fea = tf.decode_csv(u_fea, [[' ']] * 6, "#")
    v_fea = tf.decode_csv(v_fea, [[' ']] * 6, "#")
    i_fea = tf.decode_csv(i_fea, [[' ']] * 3, "#")

    u_info_hash = decode_node_attr(u_fea, user_hash_size_list, is_hash=True)
    v_info_hash = decode_node_attr(v_fea, user_hash_size_list, is_hash=True)
    i_info_hash = decode_node_attr(i_fea, item_hash_size_list, is_hash=True)

    uf_info_hash = decode_node_list_attr(u_tao_friend,
                                         5,
                                         user_hash_size_list,
                                         is_hash=True)
    vf_info_hash = decode_node_list_attr(v_tao_friend,
                                         5,
                                         user_hash_size_list,
                                         is_hash=True)
    us_info_hash = decode_node_list_attr(u_share,
                                         10,
                                         user_hash_size_list,
                                         is_hash=True)
    vs_info_hash = decode_node_list_attr(v_share,
                                         10,
                                         user_hash_size_list,
                                         is_hash=True)
    up_info_hash = decode_node_list_attr(u_pay,
                                         2,
                                         user_hash_size_list,
                                         is_hash=True)
    vp_info_hash = decode_node_list_attr(v_pay,
                                         2,
                                         user_hash_size_list,
                                         is_hash=True)
    ib_info_hash = decode_node_list_attr(i_buy,
                                         50,
                                         user_hash_size_list,
                                         is_hash=True)

    return u_info_hash, v_info_hash, i_info_hash, \
           uf_info_hash, vf_info_hash, \
           us_info_hash, vs_info_hash, \
           up_info_hash, vp_info_hash, \
           ib_info_hash, \
           label
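The second argument to decode_node_list_attr evidently caps the list length per relation; a comment sketch of the resulting inputs, inferred only from the calls above (not from the helper's source):

# uf/vf_info_hash -> up to 5 friend profiles, hashed with user_hash_size_list
# us/vs_info_hash -> up to 10 share partners
# up/vp_info_hash -> up to 2 payment partners
# ib_info_hash    -> up to 50 users who bought item i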
Example #11
def input_table_batch_fn(table_name,
                         batch_size,
                         schema_config,
                         allow_smaller_final_batch=True,
                         num_epoches=None,
                         slice_count=None,
                         slice_id=None):
    selected_col = ','.join([e['column_name'] for e in schema_config])
    file_queue = tf.train.string_input_producer([table_name],
                                                num_epochs=num_epoches)
    print(selected_col)
    reader = tf.TableRecordReader(slice_count=slice_count,
                                  slice_id=slice_id,
                                  csv_delimiter=',',
                                  selected_cols=selected_col,
                                  num_threads=32,
                                  capacity=batch_size * 20)
    key, value = reader.read_up_to(file_queue, batch_size)
    batch_res = tf.train.shuffle_batch(
        [value],
        batch_size=batch_size,
        capacity=batch_size * 20,
        enqueue_many=True,
        num_threads=16,
        min_after_dequeue=batch_size,
        allow_smaller_final_batch=allow_smaller_final_batch)
    record_defaults = [
        [''] for _ in range(np.sum([e['length'] for e in schema_config]))
    ]
    feature = tf.decode_csv(batch_res,
                            record_defaults=record_defaults,
                            field_delim=',')

    res = {}
    start = 0
    for e in schema_config:
        datatype = e['dtype']
        length = e['length']
        name = e['feat_name']
        if datatype == tf.string:
            b = feature[start:start + length]
        else:
            b = tf.string_to_number(feature[start:start + length], datatype)
        # From [Batch_sz,Feat_sz] to [Feat_sz,Batch_sz]
        val = tf.transpose(b)
        if 'string_to_patched_seq' in e:
            param = e['string_to_patched_seq']
            seq_len = param.get('seq_len', 5)
            patch_value = param.get('patch_value', '</s>')
            delimiter = param.get('delimiter', ';')
            dtype = param.get('dtype', tf.string)
            print(val)
            val = tf.map_fn(
                lambda x: tf.string_split_and_pad(x,
                                                  max_length=seq_len,
                                                  delimiter=delimiter,
                                                  default_value=patch_value),
                val,
                parallel_iterations=512,
                back_prop=False)
            if dtype == tf.string:
                pass
            else:
                val = tf.string_to_number(val, out_type=dtype)

        res[name] = val

        start = start + length

    return res
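The schema_config contract is implicit in the loop above: each entry needs 'column_name', 'feat_name', 'dtype', and 'length', plus an optional 'string_to_patched_seq' block. A hypothetical config showing the expected shape (all names and sizes made up):

schema_config = [
    {'column_name': 'user_id', 'feat_name': 'user_id',
     'dtype': tf.string, 'length': 1},
    {'column_name': 'age', 'feat_name': 'age',
     'dtype': tf.int32, 'length': 1},
    {'column_name': 'click_seq', 'feat_name': 'click_seq',
     'dtype': tf.string, 'length': 1,
     # pad or cut the ';'-separated cell to exactly 5 tokens
     'string_to_patched_seq': {'seq_len': 5, 'delimiter': ';',
                               'patch_value': '</s>', 'dtype': tf.string}},
]
features = input_table_batch_fn('odps://some_project/tables/foo',  # hypothetical
                                batch_size=256,
                                schema_config=schema_config)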
Example #12
import tensorflow as tf
import time

tf.app.flags.DEFINE_string("tables", "", "tables info")

FLAGS = tf.app.flags.FLAGS

print("tables:", FLAGS.tables)

tables = [FLAGS.tables]

filename_queue = tf.train.string_input_producer(tables, num_epochs=1)

print(filename_queue)

reader = tf.TableRecordReader()
key, value = reader.read(filename_queue)
record_defaults = [[1.0], [1.0], [1.0], [1.0], ["Iris-virginica"]]
col1, col2, col3, col4, col5 = tf.decode_csv(value,
                                             record_defaults=record_defaults)
features = tf.stack([col1, col2, col3, col4])  # tf.pack was renamed tf.stack in TF 1.0
init = tf.global_variables_initializer()  # replaces the deprecated tf.initialize_all_variables()

with tf.Session() as sess:
    with tf.device("/cpu:0"):
        sess.run(init)
        sess.run(tf.local_variables_initializer())  # replaces the deprecated tf.initialize_local_variables()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            step = 0