def load_embedding(file_name, vocab_size, embedding_size, vectors, shift=0, name='waou'):
    with tf.device("/cpu:0"):
        with tf.name_scope('load_w2v_embed_' + name):
            reader = tf.TableRecordReader(selected_cols='word,vector',
                                          csv_delimiter=',', name=name)
            file_queue = tf.train.string_input_producer([file_name],
                                                        name='w2v_queue_' + name)
            _, values = reader.read_up_to(file_queue, vocab_size,
                                          name='w2v_read_' + name)
            # Each row is "word_id,dim_1,...,dim_n"; decode everything as
            # strings first, then convert ids and vector entries separately.
            embed_raw = tf.decode_csv(
                values,
                record_defaults=[[''] for _ in range(1 + embedding_size)],
                field_delim=',')
            embed_raw = tf.transpose(embed_raw)
            ids = tf.string_to_number(embed_raw[:, 0], tf.int32,
                                      name='word_ids_' + name)
            ids = tf.reshape(ids, [-1])
            embeddings = tf.string_to_number(embed_raw[:, 1:1 + embedding_size],
                                             tf.float32)
            # `shift` offsets the row ids, e.g. to reserve leading rows for
            # padding/OOV tokens.
            init = tf.scatter_update(vectors, ids + shift, embeddings,
                                     name='word_ids_scatter_update_' + name).op
            return init
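# Usage sketch for load_embedding (an assumption, not from the source): the
# table path, sizes, and variable name below are hypothetical. Queue runners
# must be running before the scatter-update op can read the table.
def init_pretrained_vectors(sess, vocab_size=100000, embedding_size=128):
    vectors = tf.get_variable('w2v_vectors', [vocab_size, embedding_size],
                              initializer=tf.zeros_initializer())
    init_op = load_embedding('odps://my_project/tables/w2v_table',
                             vocab_size, embedding_size, vectors)
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    sess.run(init_op)  # one-shot copy of the pretrained rows into `vectors`
    coord.request_stop()
    coord.join(threads)
    return vectors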
def input_fn_1021(table, selected_cols="user_id,item_id,ui_fea,uu_fea,label",
                  shuffle=True):
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table], num_epochs=NUM_EPOCH,
                                                shuffle=shuffle)
    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)
    # All columns decode as strings except the trailing label.
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [user_id, item_id, ui_fea, uu_fea, label] = tf.decode_csv(values, default_val)
    u_id_hash = tf.string_to_hash_bucket(user_id, NUM_USER_ID)
    i_id_hash = tf.string_to_hash_bucket(item_id, NUM_ITEM_ID)
    uu_info_hash = decode_node_list_attr(uu_fea,
                                         5,  # uu neigh
                                         user_hash_size_list, is_hash=True)
    ui_info_hash = decode_node_list_attr(ui_fea, 5, item_hash_size_list,
                                         is_hash=True)
    return (user_id, item_id, u_id_hash, i_id_hash, label, uu_info_hash,
            ui_info_hash)
def read_test_data(self, file_queue):
    # Local runs read plain CSV files; on PAI the data comes from an ODPS table.
    if self.local:
        reader = tf.TextLineReader(skip_header_lines=1)
    else:
        reader = tf.TableRecordReader()
    key, value = reader.read(file_queue)
    defaults = [['0']] * 8
    user_id, gender, age, consume_level, client_type, brand_id, province, city = \
        tf.decode_csv(value, defaults)
    return user_id, gender, age, consume_level, client_type, brand_id, province, city
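# Assumed wiring for read_test_data (a sketch; `self.test_file` is
# hypothetical): all returned columns are scalar strings, so they batch
# cleanly with tf.train.batch.
def test_batch(self, batch_size=512):
    file_queue = tf.train.string_input_producer([self.test_file],
                                                num_epochs=1, shuffle=False)
    example = self.read_test_data(file_queue)
    return tf.train.batch(list(example), batch_size=batch_size,
                          capacity=batch_size * 8,
                          allow_smaller_final_batch=True)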
def read_train_data(self, file_queue):
    if self.local:
        reader = tf.TextLineReader(skip_header_lines=1)
    else:
        reader = tf.TableRecordReader()
    key, value = reader.read(file_queue)
    # int, string, then five ints: user_id, item_id_list, item_id_list_len,
    # gender, age, consume_level, label.
    defaults = [[0]] + [['0']] + [[0]] * 5
    user_id, item_id_list, item_id_list_len, gender, age, consume_level, label = \
        tf.decode_csv(value, defaults)
    # The item sequence is stored as a ';'-joined string of ids.
    item_id_list = tf.string_to_number(
        tf.string_split([item_id_list], ';').values, tf.int32)
    return user_id, item_id_list, item_id_list_len, gender, age, consume_level, label
def read_user_test_data(self, file_queue):
    if self.local:
        reader = tf.TextLineReader(skip_header_lines=1)
    else:
        reader = tf.TableRecordReader()
    key, value = reader.read(file_queue)
    defaults = [[0]] + [['0']] + [[0]] + [['0']] + [[0]] + [['0']] + [[0]] * 2
    (user_id, item_id_list, item_id_list_len, cate_id_list, cat_id_list_len,
     tag_id_list, tag_id_list_len, gender) = tf.decode_csv(value, defaults)
    item_id_list = tf.string_to_number(tf.string_split([item_id_list], ';').values, tf.int32)
    cate_id_list = tf.string_to_number(tf.string_split([cate_id_list], ';').values, tf.int32)
    tag_id_list = tf.string_to_number(tf.string_split([tag_id_list], ';').values, tf.int32)
    return (user_id, item_id_list, item_id_list_len, cate_id_list, cat_id_list_len,
            tag_id_list, tag_id_list_len, gender)
def input_LP(self, FLAGS, batch_size, slice_id=None, slice_count=None,
             is_dict=False):
    # for training
    cols_name, cols_defval = build_col_defval_LP()
    # sv_suffix selects the train split (the '{}' placeholder is assumed; the
    # original string had none, so the suffix was silently dropped).
    input_file = FLAGS.input_path + "train{}.txt".format(FLAGS.sv_suffix)
    num_epochs = FLAGS.num_epochs
    shuffle = FLAGS.shuffle
    min_after_dequeue = 32 * batch_size
    capacity = 64 * batch_size
    filename_queue = tf.train.string_input_producer([input_file],
                                                    num_epochs=num_epochs,
                                                    shuffle=shuffle)
    if FLAGS.file_reader == "textline":
        reader = tf.TextLineReader()
    else:
        reader = tf.TableRecordReader(csv_delimiter=FLAGS.col_delim1,
                                      slice_count=slice_count,
                                      slice_id=slice_id)
    _, value = reader.read_up_to(filename_queue, batch_size)
    value = tf.train.shuffle_batch([value], batch_size=batch_size,
                                   num_threads=24, capacity=capacity,
                                   enqueue_many=True,
                                   min_after_dequeue=min_after_dequeue)
    features = tf.decode_csv(value, record_defaults=cols_defval,
                             field_delim=FLAGS.col_delim1,
                             use_quote_delim=False)
    pids, labels = features
    if is_dict:
        return {
            "pid": pids,
            "label": labels,
        }
    else:
        return pids, labels
def read_train_data(self, file_queue):
    if self.local:
        reader = tf.TextLineReader(skip_header_lines=1)
    else:
        reader = tf.TableRecordReader()
    key, value = reader.read(file_queue)
    defaults = [['0']] * 17
    click_label, user_id, item_id, class_id, tag_ids, dev_type, dev_brand, dev_brand_type, \
        dev_os, dev_net, client_type, dev_carrier, click_seq_50size, gender, age, \
        consume_level, unclick_seq_50size = tf.decode_csv(value, defaults)
    # user_id = tf.cast(user_id, dtype=tf.int64)
    # gender = tf.cast(gender, dtype=tf.int64)
    # client_type = tf.cast(client_type, dtype=tf.int64)
    # tag_ids = tf.string_to_number(tf.string_split([tag_ids], ',').values, tf.int64)
    # click_seq_50size = tf.string_to_number(tf.string_split([click_seq_50size], ',').values, tf.int64)
    # unclick_seq_50size = tf.string_to_number(tf.string_split([unclick_seq_50size], ',').values, tf.int64)
    return (click_label, user_id, item_id, class_id, tag_ids, dev_type, dev_brand,
            dev_brand_type, dev_os, dev_net, client_type, dev_carrier,
            click_seq_50size, gender, age, consume_level, unclick_seq_50size)
def read_table(filename_queue):
    batch_size = 128
    reader = tf.TableRecordReader(csv_delimiter=';', num_threads=8,
                                  capacity=8 * batch_size)
    key, value = reader.read_up_to(filename_queue, batch_size)
    # The batch queue shares the reader's capacity budget of 8 * batch_size
    # (the original referenced an undefined `capacity` variable here).
    values = tf.train.batch([value], batch_size=batch_size,
                            capacity=8 * batch_size, enqueue_many=True,
                            num_threads=8)
    record_defaults = [[1.0], [""], [""], [""], [""], [""]]
    feature_size = [1322, 30185604, 43239874, 5758226, 41900998]
    col1, col2, col3, col4, col5, col6 = tf.decode_csv(
        values, record_defaults=record_defaults, field_delim=';')
    # Standalone demo of trans_csv_to_dense; its result is not used below.
    outmatrix = tf.trans_csv_to_dense(['2,3,5', '2,6,7,7', '0,9,3'], 6)
    # 'k:v' pair strings become a dense vector; id-list strings become sparse.
    col2 = tf.trans_csv_kv2dense(col2, feature_size[0])
    col3 = tf.trans_csv_id2sparse(col3, feature_size[1])
    col4 = tf.trans_csv_id2sparse(col4, feature_size[2])
    col5 = tf.trans_csv_id2sparse(col5, feature_size[3])
    col6 = tf.trans_csv_id2sparse(col6, feature_size[4])
    return [col1, col2, col3, col4, col5, col6]
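# Sketch of consuming the sparse columns above (assumption: the
# tf.trans_csv_id2sparse outputs behave as tf.SparseTensor id lists, which is
# what the vocabulary sizes in `feature_size` suggest):
def embed_sparse_col(sp_ids, vocab_size, dim=16, name='col_emb'):
    table = tf.get_variable(name, [vocab_size, dim],
                            initializer=tf.truncated_normal_initializer())
    # Mean-pool the embeddings of each row's id list.
    return tf.nn.embedding_lookup_sparse(table, sp_ids, None, combiner='mean')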
def input_EP(self, FLAGS, batch_size, slice_id=None, slice_count=None,
             is_dict=False):
    cols, cols_defval = build_col_defval_EP(FLAGS.n_node_type)
    input_file = FLAGS.input_path + "graph.txt"
    num_epochs = FLAGS.num_epochs
    n_node_type = FLAGS.n_node_type
    shuffle = FLAGS.shuffle
    min_after_dequeue = 32 * batch_size
    capacity = 64 * batch_size
    filename_queue = tf.train.string_input_producer([input_file],
                                                    num_epochs=num_epochs,
                                                    shuffle=shuffle)
    if FLAGS.file_reader == "textline":
        reader = tf.TextLineReader()
    else:
        reader = tf.TableRecordReader(csv_delimiter=FLAGS.col_delim1,
                                      slice_count=slice_count,
                                      slice_id=slice_id)
    _, value = reader.read_up_to(filename_queue, batch_size)
    value = tf.train.shuffle_batch([value], batch_size=batch_size,
                                   num_threads=24, capacity=capacity,
                                   enqueue_many=True,
                                   min_after_dequeue=min_after_dequeue)
    features = tf.decode_csv(value, record_defaults=cols_defval,
                             field_delim=FLAGS.col_delim1,
                             use_quote_delim=False)
    # Column layout: [node_id, node_type, negatives,
    #                 n_node_type neighbor columns,
    #                 n_node_type edge-feature columns,
    #                 n_node_type edge-type columns].
    node_ids = features[0]
    node_types = features[1]
    nodes_nbrs_array = []
    nbr_segment_array = []
    edge_features_array = []
    edge_type_array = []
    base_l = 3
    negs = extract_negative_nodes(features[base_l - 1], FLAGS.col_delim2)
    for i in range(n_node_type):
        nbr_segment, nbr_nodes = extract_neighbor_nodes(
            features[base_l + i], FLAGS.col_delim2)
        edge_features = extract_features(
            features[base_l + n_node_type + i], FLAGS.col_delim2)
        edge_type = features[base_l + 2 * n_node_type + i]
        nodes_nbrs_array.append(nbr_nodes)
        nbr_segment_array.append(nbr_segment)
        edge_features_array.append(edge_features)
        edge_type_array.append(edge_type)
    if is_dict:
        return {
            "node_ids": node_ids,
            "node_types": node_types,
            "nodes_nbrs_array": nodes_nbrs_array,
            "nbr_segment_array": nbr_segment_array,
            "edge_type_array": edge_type_array,
            "edge_features_array": edge_features_array,
            "negs": negs,
        }
    else:
        return (node_ids, node_types, negs, nodes_nbrs_array, nbr_segment_array,
                edge_type_array, edge_features_array)
def input_fn_1021(
        table,
        selected_cols="u_fea,v_fea,i_fea,u_friend,v_friend,u_share,v_share,u_pay,v_pay,i_buy,label",
        shuffle=True):
    """Read u/v/i features and neighbor lists from `table`.

    selected_cols: the label column must be the last one.
    shuffle=True for train/val, shuffle=False for test.
    """
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table], num_epochs=NUM_EPOCH,
                                                shuffle=shuffle)
    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)  # , to_ndarray=False)
    # len(values) = num_records
    # src_user_fea, des_user_fea, src_user_items, des_user_items, _, _
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [u_fea, v_fea, i_fea, u_tao_friend, v_tao_friend, u_share, v_share,
     u_pay, v_pay, i_buy, label] = tf.decode_csv(values, default_val)
    # u_fea looks like: id_age:15#id_gender:2#...
    u_fea = tf.decode_csv(u_fea, [[' ']] * 6, "#")
    v_fea = tf.decode_csv(v_fea, [[' ']] * 6, "#")
    i_fea = tf.decode_csv(i_fea, [[' ']] * 3, "#")
    u_info_hash = decode_node_attr(u_fea, user_hash_size_list, is_hash=True)
    v_info_hash = decode_node_attr(v_fea, user_hash_size_list, is_hash=True)
    i_info_hash = decode_node_attr(i_fea, item_hash_size_list, is_hash=True)
    uf_info_hash = decode_node_list_attr(u_tao_friend, 5, user_hash_size_list, is_hash=True)
    vf_info_hash = decode_node_list_attr(v_tao_friend, 5, user_hash_size_list, is_hash=True)
    us_info_hash = decode_node_list_attr(u_share, 10, user_hash_size_list, is_hash=True)
    vs_info_hash = decode_node_list_attr(v_share, 10, user_hash_size_list, is_hash=True)
    up_info_hash = decode_node_list_attr(u_pay, 2, user_hash_size_list, is_hash=True)
    vp_info_hash = decode_node_list_attr(v_pay, 2, user_hash_size_list, is_hash=True)
    ib_info_hash = decode_node_list_attr(i_buy, 50, user_hash_size_list, is_hash=True)
    return (u_info_hash, v_info_hash, i_info_hash,
            uf_info_hash, vf_info_hash,
            us_info_hash, vs_info_hash,
            up_info_hash, vp_info_hash,
            ib_info_hash,
            label)
def input_table_batch_fn(table_name, batch_size, schema_config,
                         allow_smaller_final_batch=True, num_epoches=None,
                         slice_count=None, slice_id=None):
    selected_col = ','.join([e['column_name'] for e in schema_config])
    file_queue = tf.train.string_input_producer([table_name],
                                                num_epochs=num_epoches)
    print(selected_col)
    reader = tf.TableRecordReader(slice_count=slice_count, slice_id=slice_id,
                                  csv_delimiter=',',
                                  selected_cols=selected_col, num_threads=32,
                                  capacity=batch_size * 20)
    key, value = reader.read_up_to(file_queue, batch_size)
    batch_res = tf.train.shuffle_batch(
        [value], batch_size=batch_size, capacity=batch_size * 20,
        enqueue_many=True, num_threads=16, min_after_dequeue=batch_size,
        allow_smaller_final_batch=allow_smaller_final_batch)
    record_defaults = [
        [''] for _ in range(np.sum([e['length'] for e in schema_config]))
    ]
    feature = tf.decode_csv(batch_res, record_defaults=record_defaults,
                            field_delim=',')
    res = {}
    start = 0
    for e in schema_config:
        datatype = e['dtype']
        length = e['length']
        name = e['feat_name']
        if datatype == tf.string:
            b = feature[start:start + length]
        else:
            b = tf.string_to_number(feature[start:start + length], datatype)
        # From [Feat_sz, Batch_sz] (a list of per-column batch tensors)
        # to [Batch_sz, Feat_sz].
        val = tf.transpose(b)
        if 'string_to_patched_seq' in e:
            # Split a delimiter-joined sequence column and pad it to seq_len.
            param = e['string_to_patched_seq']
            seq_len = param.get('seq_len', 5)
            patch_value = param.get('patch_value', '</s>')
            delimiter = param.get('delimiter', ';')
            dtype = param.get('dtype', tf.string)
            val = tf.map_fn(
                lambda x: tf.string_split_and_pad(x, max_length=seq_len,
                                                  delimiter=delimiter,
                                                  default_value=patch_value),
                val, parallel_iterations=512, back_prop=False)
            if dtype != tf.string:
                val = tf.string_to_number(val, out_type=dtype)
        res[name] = val
        start = start + length
    return res
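# Example schema_config for input_table_batch_fn (column names and table path
# are hypothetical; the key layout matches what the function reads):
schema_config_example = [
    {'column_name': 'user_id', 'feat_name': 'user_id', 'length': 1,
     'dtype': tf.int64},
    {'column_name': 'age', 'feat_name': 'age', 'length': 1,
     'dtype': tf.float32},
    {'column_name': 'click_seq', 'feat_name': 'click_seq', 'length': 1,
     'dtype': tf.string,
     'string_to_patched_seq': {'seq_len': 50, 'patch_value': '</s>',
                               'delimiter': ';', 'dtype': tf.string}},
]
# features = input_table_batch_fn('odps://my_project/tables/train', 512,
#                                 schema_config_example)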
import tensorflow as tf
import time

tf.app.flags.DEFINE_string("tables", "", "tables info")
FLAGS = tf.app.flags.FLAGS
print("tables:", FLAGS.tables)

tables = [FLAGS.tables]
filename_queue = tf.train.string_input_producer(tables, num_epochs=1)
print(filename_queue)
reader = tf.TableRecordReader()
key, value = reader.read(filename_queue)
record_defaults = [[1.0], [1.0], [1.0], [1.0], ["Iris-virginica"]]
col1, col2, col3, col4, col5 = tf.decode_csv(value, record_defaults=record_defaults)
features = tf.pack([col1, col2, col3, col4])  # tf.stack in newer TF versions
init = tf.initialize_all_variables()

with tf.Session() as sess:
    with tf.device("/cpu:0"):
        sess.run(init)
        sess.run(tf.initialize_local_variables())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            step = 0
            # Completion sketch: a standard Coordinator read loop (assumed;
            # the source snippet ends at `step = 0`).
            while not coord.should_stop():
                step += 1
                start = time.time()
                feats, label = sess.run([features, col5])
                print("step {}: {:.4f}s".format(step, time.time() - start))
        except tf.errors.OutOfRangeError:
            print("epoch limit reached after {} steps".format(step))
        finally:
            coord.request_stop()
        coord.join(threads)