def main():
    # Define argument parsing
    parser = argparse.ArgumentParser(description="Tell RaNNdy what to do.")
    parser.add_argument('--mode', help='Whether to train or infer.', default='train', choices=['train', 'infer'])
    parser.add_argument('--sentence_tokens', help='Tokenized sentences to train on.', default='../data/sentence_tokens.csv')
    parser.add_argument('--vocab', help='Vocabulary to use for training.', default='../data/vocabulary.csv')
    args = parser.parse_args()

    if args.mode == 'train':
        # Step 1: Load dataset
        data_iterator = DataIterator(args.sentence_tokens, args.vocab)
        # Step 2: Create auto-encoder in training mode
        ranndy = SentenceAutoEncoder(data_iterator, tf.estimator.ModeKeys.TRAIN)
        # Step 3: Train
        ranndy.train()
    else:
        # Step 1: Load dataset with a small batch size and no shuffling
        data_iterator = DataIterator(args.sentence_tokens, args.vocab, batch_size=2, shuffle=False)
        # Step 2: Create auto-encoder in inference mode
        ranndy = SentenceAutoEncoder(data_iterator, tf.estimator.ModeKeys.PREDICT)
        # Step 3: Infer
        ranndy.infer(num_batch_infer=1)
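# Example invocations (the file name ranndy.py is an assumption; the paths are the
# defaults declared above):
#   python ranndy.py --mode train
#   python ranndy.py --mode infer --sentence_tokens ../data/sentence_tokens.csv --vocab ../data/vocabulary.csv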
def train(estimator):
    tf.logging.set_verbosity(tf.logging.INFO)
    train_file_pattern = "./data/part-00000"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern, 'offline')
    estimator.train(input_fn=train_input_fn, steps=None)
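# The DataIterator.input_fn used by these estimator snippets is not shown here. For
# tf.estimator, an input_fn must return a tf.data.Dataset of (features, labels) pairs
# (or those tensors directly). A hypothetical sketch of the 'offline' branch, assuming
# tab-separated lines with the label in the first column:
import tensorflow as tf

def input_fn(file_pattern, mode):
    dataset = tf.data.TextLineDataset(tf.gfile.Glob(file_pattern))

    def parse_line(line):
        # Column layout is an assumption: label \t feature_1 \t ... \t feature_n
        fields = tf.string_split([line], '\t').values
        label = tf.string_to_number(fields[0], tf.float32)
        features = tf.string_to_number(fields[1:], tf.float32)
        return {'x': features}, label

    return dataset.map(parse_line).batch(128)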
def evaluate(sess, model, data_set):
    # Run evals on the development set and report loss/perplexity.
    sess.run(model.dropout10_op)  # Disable dropout (keep_prob = 1.0) during evaluation.
    loss = 0.0
    n_steps = 0
    n_valids = 0
    batch_size = FLAGS.batch_size
    dite = DataIterator(model, data_set, len(FLAGS._buckets), batch_size, None)
    ite = dite.next_sequence(stop=True)
    for inputs, outputs, weights, bucket_id in ite:
        L = model.step(sess, inputs, outputs, weights, bucket_id, forward_only=True)
        loss += L
        n_steps += 1
        n_valids += np.sum(weights)
    loss = loss / n_valids
    ppx = math.exp(loss) if loss < 300 else float("inf")
    sess.run(model.dropoutAssign_op)  # Restore dropout to its original setting after evaluation.
    return loss, ppx
def predict(data_params):
    meta_path = "./model/dssm.ckpt.meta"
    ckpt_path = "./model/dssm.ckpt"
    data_file = "./data/train.txt.10"
    dssm = DSSM()
    data_iterator = DataIterator(data_params)
    iterator = data_iterator.input_fn(data_file)
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(meta_path)
        # Do NOT run global_variables_initializer after restoring;
        # it would overwrite the checkpoint weights.
        saver.restore(sess, ckpt_path)
        sess.run(iterator.initializer)
        # Build get_next() once, outside the loop; calling it per iteration
        # would keep adding new ops to the graph.
        query_features, creative_ids, labels = iterator.get_next()
        s = time.time()
        while True:
            try:
                batch_query, batch_creative_ids, batch_labels = sess.run(
                    [query_features, creative_ids, labels])
                prediction = sess.run(dssm.score,
                                      feed_dict={dssm.query: batch_query,
                                                 dssm.doc: batch_creative_ids})
                print(prediction)
            except tf.errors.OutOfRangeError:
                break
        e = time.time()
        print(e - s)  # averages roughly 0.0001 s per record
def __init__(self,
             train_file="local_train_splitByUser",
             test_file="local_test_splitByUser",
             uid_voc="uid_voc.pkl",
             mid_voc="mid_voc.pkl",
             cat_voc="cat_voc.pkl",
             item_info='item-info',
             reviews_info='reviews-info',
             batch_size=128,
             maxlen=100,
             embedding_dim=18,
             light_embedding_dim=4,
             return_neg=True):
    self.maxlen = maxlen
    self.return_neg = return_neg
    self.train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                   item_info, reviews_info, batch_size, maxlen,
                                   shuffle_each_epoch=False)
    self.test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc,
                                  item_info, reviews_info, batch_size, maxlen)
    self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()
    self.embedding_dim = embedding_dim
    self.light_embedding_dim = light_embedding_dim
def test(depth, p, dataset, num_epochs=200, seed=None):
    if seed is None:
        seed = 0
    np.random.seed(seed)
    data = None
    if dataset == "mnist":
        data = mnist.load().astype(np.float32)
    elif dataset == "cifar10":
        data = cifar10.load().astype(np.float32)
    num_observations, input_dim = data.shape
    data_split_index = int(num_observations * 0.9)
    # batch_size is assumed to be defined at module level.
    # Autoencoder setup: inputs and targets are the same data.
    training_data_iterator = DataIterator(batch_size, data[:data_split_index],
                                          data[:data_split_index])
    validation_data_iterator = DataIterator(batch_size, data[data_split_index:],
                                            data[data_split_index:])
    # make net
    net = Network(input_dim, input_dim, hidden_layers=([1000] * depth), p=p)
    losses = net.train(training_data_iterator, validation_data_iterator,
                       num_epochs=num_epochs)
    net.close()
    return losses
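# The DataIterator class itself never appears in these snippets, and its constructor
# varies between projects. A minimal illustrative sketch of the
# (batch_size, inputs, targets) variant used directly above -- an assumption, not the
# original implementation:
class DataIterator(object):
    def __init__(self, batch_size, inputs, targets):
        self.batch_size = batch_size
        self.inputs = inputs
        self.targets = targets

    def __iter__(self):
        # Yield successive (inputs, targets) mini-batches; the last batch may be smaller.
        for start in range(0, len(self.inputs), self.batch_size):
            end = start + self.batch_size
            yield self.inputs[start:end], self.targets[start:end]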
def evaluate(sess, model, data_set, item_sampled_id2idx=None):
    # Run evals on the development set and report loss/perplexity.
    sess.run(model.dropout10_op)  # Disable dropout during evaluation.
    loss = 0.0
    n_steps = 0
    n_valids = 0
    batch_size = FLAGS.batch_size
    dite = DataIterator(model, data_set, len(_buckets), batch_size, None)
    ite = dite.next_sequence(stop=True)
    for users, inputs, outputs, weights, bucket_id in ite:
        L = model.step(sess, users, inputs, outputs, weights, bucket_id,
                       forward_only=True)
        loss += L
        n_steps += 1
        n_valids += np.sum(np.sign(weights[0]))
    loss = loss / n_valids
    ppx = math.exp(loss) if loss < 300 else float("inf")
    sess.run(model.dropoutAssign_op)  # Restore the original dropout setting.
    return loss, ppx
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path))
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=20,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        # All model classes share the same constructor signature, so dispatch via a table.
        model_classes = {
            'DNN': Model_DNN,
            'PNN': Model_PNN,
            'Wide': Model_WideDeep,
            'DIN': Model_DIN,
            'DIN-V2-gru-att-gru': Model_DIN_V2_Gru_att_Gru,
            'DIN-V2-gru-gru-att': Model_DIN_V2_Gru_Gru_att,
            'DIN-V2-gru-qa-attGru': Model_DIN_V2_Gru_QA_attGru,
            'DIN-V2-gru-vec-attGru': Model_DIN_V2_Gru_Vec_attGru,
            'DIEN': Model_DIN_V2_Gru_Vec_attGru_Neg,
            'DMIN': Model_DNN_Multi_Head,
        }
        if model_type not in model_classes:
            print("Invalid model_type: %s" % model_type)
            return
        model = model_classes[model_type](n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                          HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path, maxlen=maxlen))
def test(files, batch_size=1024, max_len=100, seed=2, shuffle_each_epoch=False):
    train_file, test_file, uid_voc, mid_voc, cat_voc = files[:5]
    if shuffle_each_epoch:
        model_path = "best_model_SGD/ckpt_shuffle" + str(seed)
    else:
        model_path = "best_model_SGD/ckpt_noshuffle" + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, max_len)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, max_len)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f'
              % eval(sess, test_data, model, model_path))
def get_batch_data(self):
    r"""Yield one batch of preprocessed samples."""
    if self.shuffle:
        random.shuffle(self.data)
    it = DataIterator(self.data, self.batch_size)
    for batch_data in it.get_batch_data():
        texts, emotions = [], []
        for item in batch_data:
            texts.append(item['text'].strip().split())
            emotions.append(item['emotion'])
        id_texts, len_texts = [], []
        for text in texts:
            id_text, len_text = self.sp.word2index(text)
            id_texts.append(id_text)
            len_texts.append(len_text)
        len_texts = [l + 2 for l in len_texts]  # lengths after adding the start and end tokens
        maxlen_text = max(len_texts)
        pad_id_texts = [self.sp.pad_sentence(t, maxlen_text) for t in id_texts]  # pad to a uniform length
        new_batch_data = {
            'str_texts': texts,      # [batch, len]
            'texts': pad_id_texts,   # [batch, len]
            'len_texts': len_texts,  # [batch]
            'emotions': emotions     # [batch]
        }
        yield new_batch_data
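# self.sp.pad_sentence is not shown in these snippets. Based on how it is used above
# (lengths grow by 2 for start/end tokens, then everything is padded to the batch max),
# it presumably looks something like this hypothetical sketch; the start/end/pad ids
# are assumptions:
def pad_sentence(self, ids, max_len, start_id=1, end_id=2, pad_id=0):
    # Wrap the token ids with start/end markers, then right-pad to max_len.
    padded = [start_id] + list(ids) + [end_id]
    return padded + [pad_id] * (max_len - len(padded))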
def main(params, train_file, test_file):
    label_dict, word_dict, word_freq_dict, train_data = init_required_data(train_file)
    print(len(word_dict))
    train_iterator = DataIterator(params, label_dict, word_dict, train_data)
    params["n_classes"] = len(label_dict)
    deep_match = DeepMatch(params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    i = 1
    for (x, y, word_num) in train_iterator:
        _, loss = sess.run([deep_match.train_op, deep_match.loss],
                           feed_dict={deep_match.user_input: x,
                                      deep_match.label: y,
                                      deep_match.word_num: word_num})
        if i % 100 == 0:
            print("step %d, loss %.6f" % (i, loss[0]))
        i += 1

    # test accuracy
    label_dict, word_dict, word_freq_dict, test_data = init_required_data(test_file)
    test_iterator = DataIterator(params, label_dict, word_dict, test_data)
    acc = 0
    batch_num = len(test_data) // params["batch_size"]
    for (x, y, word_num) in test_iterator:
        batch_acc = sess.run(deep_match.accuracy,
                             feed_dict={deep_match.user_input: x,
                                        deep_match.label: y,
                                        deep_match.word_num: word_num})
        acc += batch_acc
    print(acc / batch_num)
def test(buckets,
         train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         model_folder="dnn_best_model/ckpt_noshuff",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         seed=2):
    model_path = model_folder + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, buckets, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, buckets, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIEN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DHAN':
            model = Model_DHAN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type: %s" % model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path))
def beam_search():
    mylog("Reading Data...")
    task = Task(FLAGS.dataset)
    _, _, test_set, embAttr, START_ID, _, _, evaluation, uids = read_data(task, test=True)
    test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog(_buckets)
    mylog("Test:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, embAttr, START_ID, run_options, run_metadata)
        show_all_variables()
        model.init_beam_decoder()
        sess.run(model.dropoutRate.assign(1.0))  # no dropout at decode time

        batch_size = FLAGS.batch_size
        dite = DataIterator(model, test_set, len(_buckets), batch_size, None)
        ite = dite.next_sequence(stop=True, recommend=True)

        n_total_user = len(uids)
        uid2rank = {uid: r for r, uid in enumerate(uids)}
        rec = np.zeros((n_total_user, FLAGS.topk), dtype=int)
        rec_value = np.zeros((n_total_user, FLAGS.topk), dtype=float)
        start = time.time()
        for users, inputs, positions, valids, bucket_id in ite:
            print(inputs)
            print(positions)
            results = model.beam_step(sess, index=0, user_input=users,
                                      item_inputs=inputs,
                                      sequence_length=positions,
                                      bucket_id=bucket_id)
            break  # NOTE: stops after the first batch (debug run)
def test(train_file="local_train",
         test_file="local_test",
         uid_voc="uid_voc_large.pkl",
         mid_voc="mid_voc_large.pkl",
         cat_voc="cat_voc_large.pkl",
         batch_size=128,
         maxlen=100,
         model_type='ASVD',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        # All model classes share the same constructor signature, so dispatch via a table.
        model_classes = {
            # Baselines
            'ASVD': Model_ASVD,
            'DIN': Model_DIN,
            'LSTM': Model_LSTM,
            'LSTMPP': Model_LSTMPP,
            'NARM': Model_NARM,
            'CARNN': Model_CARNN,
            'Time1LSTM': Model_Time1LSTM,
            'Time2LSTM': Model_Time2LSTM,
            'Time3LSTM': Model_Time3LSTM,
            'DIEN': Model_DIEN,
            # Our models
            'A2SVD': Model_A2SVD,
            'T_SeqRec': Model_T_SeqRec,
            'TC_SeqRec_I': Model_TC_SeqRec_I,
            'TC_SeqRec_G': Model_TC_SeqRec_G,
            'TC_SeqRec': Model_TC_SeqRec,
            'SLi_Rec_Fixed': Model_SLi_Rec_Fixed,
            'SLi_Rec_Adaptive': Model_SLi_Rec_Adaptive,
        }
        if model_type not in model_classes:
            print("Invalid model_type: %s" % model_type)
            return
        model = model_classes[model_type](n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                          HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path))
def eval(estimator):
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline')
    eval_results = estimator.evaluate(input_fn=eval_input_fn)
    auc_score = eval_results["auc"]  # numpy.float32
    print("\nTest auc: %.6f" % auc_score)
def get_batch_data(self):
    if self.shuffle:
        random.shuffle(self.data)
    it = DataIterator(self.data, self.batch_size)
    for batch_data in it.get_batch_data():
        str_posts, str_responses = [], []
        for item in batch_data:
            str_posts.append(item['post'])
            str_responses.append(item['response'])
        id_posts, id_responses = [], []
        len_posts, len_responses = [], []
        for post in str_posts:
            id_post, len_post = self.sp.word2index(post)
            id_posts.append(id_post)
            len_posts.append(len_post)
        for response in str_responses:
            id_response, len_response = self.sp.word2index(response)
            id_responses.append(id_response)
            len_responses.append(len_response)
        len_posts = [l + 2 for l in len_posts]  # lengths after adding the start and end tokens
        len_responses = [l + 2 for l in len_responses]
        maxlen_post = max(len_posts)
        maxlen_response = max(len_responses)
        pad_id_posts = [self.sp.pad_sentence(p, maxlen_post) for p in id_posts]
        pad_id_responses = [self.sp.pad_sentence(r, maxlen_response) for r in id_responses]
        new_batch_data = {
            'str_posts': str_posts,
            'str_responses': str_responses,
            'posts': pad_id_posts,
            'responses': pad_id_responses,
            'len_posts': len_posts,
            'len_responses': len_responses
        }
        yield new_batch_data
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         batch_size=BATCH_SIZE,
         maxlen=MAXLEN,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, FEATURE_COUNT, QUERY_COUNT, voc_list, batch_size, maxlen)
        test_data = DataIterator(test_file, FEATURE_COUNT, QUERY_COUNT, voc_list, batch_size, maxlen)
        n_query, n = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type: %s" % model_type)
            return
        with tf.summary.FileWriter('./test_log') as writer:
            writer.add_graph(sess.graph)
            model.restore(sess, model_path)
            print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
                  % eval(sess, test_data, model, model_path))
            writer.flush()
def train(train_file="data/local_train_splitByUser",
          test_file="data/local_test_splitByUser",
          uid_voc="data/uid_voc.pkl",
          mid_voc="data/mid_voc.pkl",
          cat_voc="data/cat_voc.pkl",
          batch_size=128,
          maxlen=100,
          test_iter=100,
          save_iter=100,
          model_type='DNN',
          seed=2):
    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
    best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                HIDDEN_SIZE, ATTENTION_SIZE)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        iter = 0
        lr = 0.001
        for itr in range(3):
            loss_sum = 0.0
            accuracy_sum = 0.0
            aux_loss_sum = 0.0
            for src, tgt in train_data:
                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = \
                    prepare_data(src, tgt, maxlen, return_neg=True)
                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his,
                                                         mid_mask, target, sl, lr,
                                                         noclk_mids, noclk_cats])
                loss_sum += loss
                accuracy_sum += acc
                aux_loss_sum += aux_loss
                iter += 1
                if (iter % test_iter) == 0:
                    print('iter: %d ----> train_loss: %.8f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' %
                          (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                    print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
                          % eval(sess, test_data, model, best_model_path))
                    loss_sum = 0.0
                    accuracy_sum = 0.0
                    aux_loss_sum = 0.0
                if (iter % save_iter) == 0:
                    print('save model iter: %d' % iter)
                    model.save(sess, model_path + "--" + str(iter))
            lr *= 0.5  # halve the learning rate after each epoch
def run(self, is_test=False, is_log=False):
    def make_iterator(file_key):
        # All three splits share identical iterator settings except the data file.
        return DataIterator(
            data_path=os.path.abspath(
                os.path.join(self._base_path, self._conf[file_key])),
            digit_vocab=self._digit_vocab,
            data_type_vocab=self._data_type_vocab,
            operation_vocab=self._operation_vocab,
            lambda_vocab=self._lambda_vocab,
            batch_size=self._batch_size,
            max_argument_num=self._max_argument_num,
            max_memory_size=self._max_memory_size,
            max_value_length=self._max_value_size,
            case_num=self._case_num)

    if is_test:
        self._test_data_iterator = make_iterator("test_file")
        test_accuracy, test_opt_accuracy, test_arg_accuracy = self.test(
            self._test_data_iterator, is_log=is_log)
        tqdm.write("Test, accuracy: %f, opt_accuracy: %f, arg_accuracy: %f"
                   % (test_accuracy, test_opt_accuracy, test_arg_accuracy))
    else:
        self._train_data_iterator = make_iterator("train_file")
        self._dev_data_iterator = make_iterator("dev_file")
        self.train()
def predict(estimator):
    predict_file = "./data/part-predict"
    data_iterator = DataIterator(params)
    with open(predict_file, 'r') as infile:
        for line in infile:
            line = line.strip('\n')
            items = line.split('\t')
            dmp_id = items[0]
            ins = "\t".join(items[1:])
            # Bind ins via a default argument so the lambda does not pick up a
            # later value of the loop variable.
            predict_input_fn = lambda ins=ins: data_iterator.input_fn(ins, 'online')
            predictions = estimator.predict(input_fn=predict_input_fn)
            predictions = itertools.islice(predictions, 1)  # keep only the first prediction
            for i, p in enumerate(predictions):
                print("dmp_id %s: logits:%.6f probability:%.6f"
                      % (dmp_id, p["logits"], p["probabilities"]))
def kfold_validate(model, k, kwargs):
    """
    This function does something similar to k-fold validation. We train and
    test our model k times, by randomly splitting our entire data set into
    three parts (train, dev and test), and return the average of the k runs.

    Args:
        model (str): What kind of model to use. It can be either lstm or cnn
        k (int): Number of iterations over which to average
        kwargs (dict): The parameters that define the model

    Returns:
        dict: A dictionary of results, containing the keys precision, recall
            and fscore.
    """
    p_1 = 0.0
    r_1 = 0.0
    f_1 = 0.0
    train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
    test_data = ATEDataProcessor(kwargs["test_file"],
                                 pos_id=get_count(train_data.annotated_sentences),
                                 **kwargs)
    sentences = train_data.annotated_sentences + test_data.annotated_sentences
    for i in range(k):
        print("Run number: {}".format(i))
        # NOTE: random_state is fixed, so every run uses the same split;
        # vary it per run if genuinely different splits are wanted.
        train_set, test_set = split(sentences, test_size=0.2, random_state=42)
        train_set, dev_set = split(train_set, test_size=kwargs["test_size"], random_state=42)
        train = DataIterator(train_set, **kwargs)
        dev = DataIterator(dev_set, **kwargs)
        test = DataIterator(test_set, **kwargs)
        # Keep the model-type string and the network object separate; reassigning
        # `model` would make the string comparisons fail on the second iteration.
        if model == "lstm":
            net = LSTMNetwork(**kwargs)
        elif model == "cnn":
            net = CNNNetwork(max_sentence_length=train_data.max_sentence_len, **kwargs)
        net.build()
        net.train(train, dev)
        results = net.evaluate(test)
        p_1 += float(results["p_1"])
        r_1 += float(results["r_1"])
        f_1 += float(results["f_1"])
        net.close_session()
    print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1 / k, r_1 / k, f_1 / k))
    return {
        "precision": p_1 / k,
        "recall": r_1 / k,
        "fscore": f_1 / k
    }
def get_generator():
    train_generator = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                   batch_size, maxlen, shuffle_each_epoch=False)
    for src, tgt in train_generator:
        uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = \
            prepare_data(src, tgt, maxlen, return_neg=True)
        # `features` is assumed to be a dict defined in the enclosing scope.
        features['uids'] = uids
        features['mids'] = mids
        features['cats'] = cats
        features['mid_his'] = mid_his
        features['cat_his'] = cat_his
        features['mid_mask'] = mid_mask
        features['sl'] = sl
        features['noclk_mids'] = noclk_mids
        features['noclk_cats'] = noclk_cats
        yield features, target
def main(network_type):
    if network_type == "cnn":
        print("Testing CNN network")
        from cnn_params import params
    elif network_type == "lstm":
        print("Testing LSTM network")
        from params import params
    train_data = ATEDataProcessor(params["train_file"], **params)
    test_data = ATEDataProcessor(params["test_file"],
                                 pos_id=get_count(train_data.annotated_sentences),
                                 **params)
    test_set = test_data.annotated_sentences
    test = DataIterator(test_set,
                        word_file=params["word_file"],
                        char_file=params["char_file"])
    if network_type == "cnn":
        model = CNNNetwork(max_sentence_length=train_data.max_sentence_len, **params)
    elif network_type == "lstm":
        model = LSTMNetwork(**params)
    model.build()
    model.restore_session(model.model_directory)
    model.evaluate(test)
def test(test_file, cate_file, item_count, dataset="book",
         batch_size=128, maxlen=100, model_type='DNN', lr=0.001):
    exp_name = get_exp_name(dataset, model_type, batch_size, lr, maxlen, save=False)
    best_model_path = "best_model/" + exp_name + '/'
    gpu_options = tf.GPUOptions(allow_growth=True)
    model = get_model(dataset, model_type, item_count, batch_size, maxlen)
    item_cate_map = load_item_cate(cate_file)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.restore(sess, best_model_path)
        test_data = DataIterator(test_file, batch_size, maxlen, train_flag=2)
        metrics = evaluate_full(sess, test_data, model, best_model_path,
                                batch_size, item_cate_map, save=False,
                                coef=args.coef)  # args is assumed to be parsed at module level
        print(', '.join(['test ' + key + ': %.6f' % value
                         for key, value in metrics.items()]))
def main(params):
    text_cnn = TextCNN(params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(params["epoch"]):
            train_iterator = DataIterator(params["train_file"],
                                          params["train_data_size"],
                                          params["batch_size"])
            sum_loss = 0.0
            for x, y in train_iterator:
                _, loss, score, label, global_step = sess.run(
                    [text_cnn.train_op, text_cnn.loss, text_cnn.score,
                     text_cnn.max_score_label, text_cnn.global_step],
                    feed_dict={text_cnn.input: x, text_cnn.label: y})
                sum_loss += loss
                if global_step % 100 == 0:
                    logging.info("global_step:%d, loss:%.6f" % (global_step, sum_loss / 100))
                    eval(sess, text_cnn)
                    sum_loss = 0.0
def train_and_eval(estimator):
    tf.logging.set_verbosity(tf.logging.INFO)
    # train_file_pattern and eval_file_pattern could be parameters of FLAGS
    train_file_pattern = "./data/part-00000"
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern, 'offline')
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline')
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=None)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=100,
                                      start_delay_secs=60, throttle_secs=30)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def eval(sess, model):
    # A single batch covering the whole test set (batch size == test_data_size).
    iterator = DataIterator(params["test_file"], params["test_data_size"],
                            params["test_data_size"])
    for x, y in iterator:
        accuracy, loss = sess.run([model.accuracy, model.loss],
                                  feed_dict={model.input: x, model.label: y})
        logging.info("accuracy:%.6f, test_loss:%.6f" % (accuracy, loss))
def main():
    train_file_pattern = "./data/part-00000"
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern, 'offline')
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline')
    predict_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline')
    # define estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir="./model")
    # train(estimator)
    # eval(estimator)
    train_and_eval(estimator)
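# model_fn is referenced above but not defined in these snippets. Any tf.estimator
# model_fn follows a fixed contract: it receives (features, labels, mode, params) and
# returns an EstimatorSpec. A minimal hypothetical sketch for a binary classifier
# (the 'x' feature key and the network body are assumptions, not the original model):
import tensorflow as tf

def model_fn(features, labels, mode, params):
    logits = tf.layers.dense(features['x'], 1)  # placeholder network body
    probabilities = tf.sigmoid(logits)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions={'logits': logits, 'probabilities': probabilities})
    loss = tf.losses.sigmoid_cross_entropy(labels, tf.squeeze(logits, -1))
    if mode == tf.estimator.ModeKeys.EVAL:
        metrics = {'auc': tf.metrics.auc(labels, tf.squeeze(probabilities, -1))}
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    train_op = tf.train.AdamOptimizer().minimize(loss, tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)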
def __init__(self,
             train_file="local_train_splitByUser",
             test_file="local_test_splitByUser",
             uid_voc="uid_voc.pkl",
             mid_voc="mid_voc.pkl",
             cat_voc="cat_voc.pkl",
             item_info='item-info',
             reviews_info='reviews-info',
             batch_size=128,
             maxlen=100,
             embedding_dim=None,
             return_neg=True):
    self.maxlen = maxlen
    self.embedding_dim = embedding_dim
    self.return_neg = return_neg
    self.train_data = DataIterator(
        train_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info,
        batch_size, maxlen, shuffle_each_epoch=False)
    self.test_data = DataIterator(
        test_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info,
        batch_size, maxlen)
    self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type: %s" % model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path))
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, model_path))
def test(train_file="local_train_sample_sorted_by_time",
         test_file="local_test_sample_sorted_by_time",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         user_maxlen=50,
         maxlen=20,
         model_type='DNN',
         seed=2):
    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed) + "_" + str(user_maxlen)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'SVDPP':
            model = Model_SVDPP(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'GRU4REC':
            model = Model_GRU4REC(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DUMN':
            model = Model_DUMN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = DIEN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type: %s" % model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- Logloss: %.4f'
              % eval(sess, test_data, model, model_path, maxlen, user_maxlen))
class SampleIO(object):
    def __init__(self,
                 train_file="local_train_splitByUser",
                 test_file="local_test_splitByUser",
                 uid_voc="uid_voc.pkl",
                 mid_voc="mid_voc.pkl",
                 cat_voc="cat_voc.pkl",
                 item_info='item-info',
                 reviews_info='reviews-info',
                 batch_size=128,
                 maxlen=100,
                 embedding_dim=None,
                 return_neg=True):
        self.maxlen = maxlen
        self.embedding_dim = embedding_dim
        self.return_neg = return_neg
        self.train_data = DataIterator(
            train_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info,
            batch_size, maxlen, shuffle_each_epoch=False)
        self.test_data = DataIterator(
            test_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info,
            batch_size, maxlen)
        self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()

    def get_n(self):
        return self.n_uid, self.n_mid, self.n_cat

    def next_train(self):
        if self.return_neg:
            return self._py_func(self._next_train)
        else:
            return self._py_func(self._next_train, sparse_cnt=5)

    def next_test(self):
        if self.return_neg:
            return self._py_func(self._next_test)
        else:
            return self._py_func(self._next_test, sparse_cnt=5)

    def _next_train(self):
        try:
            src, tgt = self.train_data.next()
        except StopIteration:
            self.src = self.tgt = None
            raise OutOfRange("train end")
        return self.prepare_data(src, tgt, self.maxlen, return_neg=self.return_neg)

    def _next_test(self):
        try:
            src, tgt = self.test_data.next()
        except StopIteration:
            self.src = self.tgt = None
            raise OutOfRange("test end")
        return self.prepare_data(src, tgt, self.maxlen, return_neg=self.return_neg)

    def _py_func(self, fn, sparse_cnt=7):
        # Each sparse tensor is represented by an (ids, values, segments) triple.
        types = []
        for _ in range(sparse_cnt):
            types.extend([np.int64, np.float32, np.int32])
        types.extend([np.float32, np.float32, np.int32])
        types.extend([np.int32 for _ in range(5)])
        datas = xdl.py_func(fn, [], output_type=types)
        sparse_tensors = []
        for i in range(sparse_cnt):
            sparse_tensors.append(xdl.SparseTensor(
                datas[3 * i], datas[3 * i + 1], datas[3 * i + 2]))
        return sparse_tensors + datas[sparse_cnt * 3:]

    def prepare_data(self, input, target, maxlen=None, return_neg=False):
        # input: a list of samples, each of the form
        # [uid, mid, cat, mid_history, cat_history, noclk_mid_history, noclk_cat_history]
        lengths_x = [len(s[4]) for s in input]
        seqs_mid = [inp[3] for inp in input]
        seqs_cat = [inp[4] for inp in input]
        noclk_seqs_mid = [inp[5] for inp in input]
        noclk_seqs_cat = [inp[6] for inp in input]
        if maxlen is not None:
            # Keep only the most recent maxlen items of each history.
            new_seqs_mid = []
            new_seqs_cat = []
            new_noclk_seqs_mid = []
            new_noclk_seqs_cat = []
            new_lengths_x = []
            for l_x, inp in zip(lengths_x, input):
                if l_x > maxlen:
                    new_seqs_mid.append(inp[3][l_x - maxlen:])
                    new_seqs_cat.append(inp[4][l_x - maxlen:])
                    new_noclk_seqs_mid.append(inp[5][l_x - maxlen:])
                    new_noclk_seqs_cat.append(inp[6][l_x - maxlen:])
                    new_lengths_x.append(maxlen)
                else:
                    new_seqs_mid.append(inp[3])
                    new_seqs_cat.append(inp[4])
                    new_noclk_seqs_mid.append(inp[5])
                    new_noclk_seqs_cat.append(inp[6])
                    new_lengths_x.append(l_x)
            lengths_x = new_lengths_x
            seqs_mid = new_seqs_mid
            seqs_cat = new_seqs_cat
            noclk_seqs_mid = new_noclk_seqs_mid
            noclk_seqs_cat = new_noclk_seqs_cat
        if len(lengths_x) < 1:
            return None, None, None, None
        n_samples = len(seqs_mid)
        maxlen_x = np.max(lengths_x) + 1
        neg_samples = len(noclk_seqs_mid[0][0])
        mid_his = np.zeros((n_samples, maxlen_x)).astype('int64')
        cat_his = np.zeros((n_samples, maxlen_x)).astype('int64')
        noclk_mid_his = np.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')
        noclk_cat_his = np.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')
        mid_mask = np.zeros((n_samples, maxlen_x)).astype('float32')
        for idx, [s_x, s_y, no_sx, no_sy] in enumerate(
                zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat)):
            mid_mask[idx, :lengths_x[idx] + 1] = 1.
            mid_his[idx, :lengths_x[idx]] = s_x
            cat_his[idx, :lengths_x[idx]] = s_y
            noclk_mid_his[idx, :lengths_x[idx], :] = no_sx
            noclk_cat_his[idx, :lengths_x[idx], :] = no_sy
        uids = np.array([inp[0] for inp in input], dtype=np.int64)
        mids = np.array([inp[1] for inp in input], dtype=np.int64)
        cats = np.array([inp[2] for inp in input], dtype=np.int64)
        id_values = np.ones([n_samples], np.float32)
        his_values = np.ones([n_samples * maxlen_x], np.float32)
        neg_his_values = np.ones([n_samples * maxlen_x * neg_samples], np.float32)
        id_seg = np.array([i + 1 for i in range(n_samples)], dtype=np.int32)
        his_seg = np.array([i + 1 for i in range(n_samples * maxlen_x)], dtype=np.int32)
        neg_his_seg = np.array(
            [i + 1 for i in range(n_samples * maxlen_x * neg_samples)], dtype=np.int32)
        # Emit one (ids, values, segments) triple per sparse feature.
        results = []
        for e in [uids, mids, cats]:
            results.append(np.reshape(e, (-1)))
            results.append(id_values)
            results.append(id_seg)
        for e in [mid_his, cat_his]:
            results.append(np.reshape(e, (-1)))
            results.append(his_values)
            results.append(his_seg)
        if return_neg:
            for e in [noclk_mid_his, noclk_cat_his]:
                results.append(np.reshape(e, (-1)))
                results.append(neg_his_values)
                results.append(neg_his_seg)
        results.extend([mid_mask,
                        np.array(target, dtype=np.float32),
                        np.array(lengths_x, dtype=np.int32)])
        # for split
        results.append(np.array([n_samples, n_samples], dtype=np.int32))
        # shape
        results.extend([np.array([-1, self.embedding_dim], dtype=np.int32),
                        np.array([-1, maxlen_x, self.embedding_dim], dtype=np.int32),
                        np.array([-1, maxlen_x, neg_samples, self.embedding_dim], dtype=np.int32),
                        np.array([-1, maxlen_x], dtype=np.int32)])
        return results
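# A tiny worked example of the padding/masking scheme in prepare_data (illustrative
# values, not real data): for a batch with history lengths [2, 3],
# maxlen_x = max(lengths) + 1 = 4. mid_mask marks the history positions plus one
# extra slot for the target item:
#   mid_mask = [[1., 1., 1., 0.],
#               [1., 1., 1., 1.]]
# while mid_his / cat_his are zero-padded to length 4:
#   mid_his  = [[m11, m12, 0,   0],
#               [m21, m22, m23, 0]]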
def train(train_file="local_train_splitByUser",
          test_file="local_test_splitByUser",
          uid_voc="uid_voc.pkl",
          mid_voc="mid_voc.pkl",
          cat_voc="cat_voc.pkl",
          batch_size=128,
          maxlen=100,
          test_iter=100,
          save_iter=100,
          model_type='DNN',
          seed=2):
    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
    best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                  batch_size, maxlen, shuffle_each_epoch=False)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type: %s" % model_type)
            return
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sys.stdout.flush()
        print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
              % eval(sess, test_data, model, best_model_path))
        sys.stdout.flush()
        start_time = time.time()
        iter = 0
        lr = 0.001
        for itr in range(3):
            loss_sum = 0.0
            accuracy_sum = 0.
            aux_loss_sum = 0.
            for src, tgt in train_data:
                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = \
                    prepare_data(src, tgt, maxlen, return_neg=True)
                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his,
                                                         mid_mask, target, sl, lr,
                                                         noclk_mids, noclk_cats])
                loss_sum += loss
                accuracy_sum += acc
                aux_loss_sum += aux_loss
                iter += 1
                sys.stdout.flush()
                if (iter % test_iter) == 0:
                    print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' %
                          (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                    print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
                          % eval(sess, test_data, model, best_model_path))
                    loss_sum = 0.0
                    accuracy_sum = 0.0
                    aux_loss_sum = 0.0
                if (iter % save_iter) == 0:
                    print('save model iter: %d' % iter)
                    model.save(sess, model_path + "--" + str(iter))
            lr *= 0.5  # halve the learning rate after each epoch