def accuracy(output, target, topk=(5, 20)):
    # output: [batch_size, item_size]  target: [batch_size]
    """Computes the accuracy over the k top predictions for the specified values of k."""
    global curr_preds_5
    global rec_preds_5
    global ndcg_preds_5
    global curr_preds_20
    global rec_preds_20
    global ndcg_preds_20
    for bi in range(output.shape[0]):
        pred_items_5 = utils.sample_top_k(output[bi], top_k=topk[0])  # top_k=5
        pred_items_20 = utils.sample_top_k(output[bi], top_k=topk[1])
        true_item = target[bi]
        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

        rank_5 = predictmap_5.get(true_item)
        rank_20 = predictmap_20.get(true_item)
        if rank_5 is None:
            curr_preds_5.append(0.0)
            rec_preds_5.append(0.0)
            ndcg_preds_5.append(0.0)
        else:
            MRR_5 = 1.0 / (rank_5 + 1)
            Rec_5 = 1.0
            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
            curr_preds_5.append(MRR_5)
            rec_preds_5.append(Rec_5)
            ndcg_preds_5.append(ndcg_5)
        if rank_20 is None:
            curr_preds_20.append(0.0)
            rec_preds_20.append(0.0)
            ndcg_preds_20.append(0.0)
        else:
            MRR_20 = 1.0 / (rank_20 + 1)
            Rec_20 = 1.0
            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
            curr_preds_20.append(MRR_20)
            rec_preds_20.append(Rec_20)
            ndcg_preds_20.append(ndcg_20)
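# utils.sample_top_k is imported by these scripts but not shown here. Below is
# a minimal sketch of a compatible helper, assuming it simply returns the
# indices of the top_k highest scores (best first); the repo's real utility
# may instead sample stochastically, so treat this as an illustration only.
import numpy as np

def sample_top_k(scores, top_k=5):
    """Return indices of the top_k largest entries of `scores`, best first."""
    idx = np.argpartition(scores, -top_k)[-top_k:]  # unordered top-k candidates
    return idx[np.argsort(-scores[idx])]            # order them by descending score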
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/user-filter-20000items-session5.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=10,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.5,
                        help='e.g., 0.2 means 80% training, 20% testing')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g., 12345 --> 01234, 00123, 00012; '
                             'may be useful for very long sequences')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({
        'model_type': 'generator',
        'dir_name': args.datapath
    })
    all_samples = dl.item
    items = dl.item_dict
    print "len(items)", len(items)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    model_para = {
        # all parameters should be consistent with those in nextitrec.py!
        'item_size': len(items),
        'dilated_channels': 100,
        'dilations': [1, 2, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 32,
        'iterations': 2,  # unused here, can be removed
        'is_negsample': False  # False denotes no negative sampling
    }

    itemrec = generator_recsys.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()
    saver.restore(sess, "Data/Models/generation_model/model_nextitnet.ckpt")

    batch_no_test = 0
    batch_size_test = model_para['batch_size']
    curr_preds_5 = []
    rec_preds_5 = []
    ndcg_preds_5 = []
    curr_preds_20 = []
    rec_preds_20 = []
    ndcg_preds_20 = []
    while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
        text_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
        [probs] = sess.run([itemrec.g_probs],
                           feed_dict={itemrec.input_predict: text_batch})
        for bi in range(probs.shape[0]):
            pred_words_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
            pred_words_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
            true_word = text_batch[bi][-1]
            predictmap_5 = {ch: i for i, ch in enumerate(pred_words_5)}
            predictmap_20 = {ch: i for i, ch in enumerate(pred_words_20)}

            rank_5 = predictmap_5.get(true_word)
            rank_20 = predictmap_20.get(true_word)
            if rank_5 is None:
                curr_preds_5.append(0.0)
                rec_preds_5.append(0.0)
                ndcg_preds_5.append(0.0)
            else:
                MRR_5 = 1.0 / (rank_5 + 1)
                Rec_5 = 1.0
                ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                curr_preds_5.append(MRR_5)
                rec_preds_5.append(Rec_5)
                ndcg_preds_5.append(ndcg_5)
            if rank_20 is None:
                curr_preds_20.append(0.0)
                rec_preds_20.append(0.0)
                ndcg_preds_20.append(0.0)
            else:
                MRR_20 = 1.0 / (rank_20 + 1)
                Rec_20 = 1.0
                ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                curr_preds_20.append(MRR_20)
                rec_preds_20.append(Rec_20)
                ndcg_preds_20.append(ndcg_20)

        batch_no_test += 1
        print "BATCH_NO: {}".format(batch_no_test)
        print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
        print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
        print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
        print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
        print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
        print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
def main(args):
    exps = pd.read_csv('exp.csv')
    cPid = os.getpid()
    train_time = 0
    test_time = 0
    for i, row in exps.iterrows():
        gc.collect()
        args['expname'] = row['name']
        args['sessionid'] = row['sessionid']
        args['itemid'] = row['itemid']
        args['data_folder'] = row['path']
        args['valid_data'] = row['test']
        args['train_data'] = row['train']
        args['freq'] = row['freq']
        args['model_type'] = 'generator'
        print(("\n\n############################################\n"),
              args['train_data'], ' --- ', args['valid_data'])
        with open("LOGGER_" + args['expname'] + ".txt", "a") as myfile:
            myfile.write(row['train'] + ", " + row['test'] + "\n")

        train_data = os.path.join(args['data_folder'], args['train_data'])
        args['dir_name'] = train_data
        dl = data_loader_recsys.Data_Loader(args)
        train_set = dl.item
        items = dl.item_dict
        print("len(train items)", len(items))

        valid_data = os.path.join(args['data_folder'], args['valid_data'])
        args['dir_name'] = valid_data
        vdl = data_loader_recsys.Data_Loader(args, testFlag=True,
                                             itemsIDs=dl.itemsIDs,
                                             max_doc=dl.max_document_length,
                                             vocab_proc=dl.vocab_processor)
        valid_set = vdl.item
        items2 = vdl.item_dict
        print("len(valid items)", len(items2))

        model_para = {
            # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
            'item_size': len(items),
            'dilated_channels': 100,  # larger is better, up to about 512 or 1024
            # if you use nextitnet_residual_block, you can use [1, 4, 1, 4, 1, 4, ];
            # if you use nextitnet_residual_block_one, try [1, 2, 4, ] first
            # when you change it, do not forget to change it in nextitrec_generate.py too
            'dilations': [1, 2, 4, ],  # you should tune this hyper-parameter, refer to the paper
            'kernel_size': 3,
            'learning_rate': args['learning_rate'],  # you should tune this hyper-parameter
            'batch_size': int(args['batch_size']),  # you should tune this hyper-parameter
            'epochs': args['epochs'],  # if your dataset is small, consider adding regularization to prevent overfitting
            'is_negsample': False  # False denotes no negative sampling
        }

        tf.compat.v1.reset_default_graph()
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            itemrec = generator_recsys.NextItNet_Decoder(model_para)
            itemrec.train_graph(model_para['is_negsample'])
            optimizer = tf.compat.v1.train.AdamOptimizer(
                model_para['learning_rate'],
                beta1=args['beta1']).minimize(itemrec.loss)
            itemrec.predict_graph(model_para['is_negsample'], reuse=True)

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        for e in range(model_para['epochs']):
            print("\n############################\nEPOCH #:", e)
            batch_no = 0
            batch_size = model_para['batch_size']
            losses = []
            t1 = time.time()
            while (batch_no + 1) * batch_size < train_set.shape[0]:
                batch_no += 1
                item_batch = train_set[(batch_no - 1) * batch_size:batch_no * batch_size, :]
                _, loss, results = sess.run(
                    [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                    feed_dict={itemrec.itemseq_input: item_batch})
                losses.append(loss)
                if batch_no % 100 == 0:
                    print('Finished Batch:', batch_no)

            print('Train Loss:', np.mean(losses), valid_set.shape[0])
            train_time += (time.time() - t1)

            # Report intermediate result
            nni.report_intermediate_result(np.mean(losses))
            logger.debug('train loss %g', np.mean(losses))
            logger.debug('Pipe send intermediate result done.')

            batch_no_test = 0
            batch_size_test = batch_size * 1
            MRR = [[], [], [], [], []]
            Rec = [[], [], [], [], []]
            cov = [[], [], [], [], []]
            pop = [[], [], [], [], []]
            Ks = [1, 3, 5, 10, 20]
            t1 = time.time()
            while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                batch_no_test += 1
                item_batch = valid_set[(batch_no_test - 1) * batch_size_test:batch_no_test * batch_size_test, :]
                [probs] = sess.run([itemrec.g_probs],
                                   feed_dict={itemrec.input_predict: item_batch})
                for bi in range(probs.shape[0]):
                    true_item = item_batch[bi][-1]
                    if true_item == 1:
                        continue
                    if args['freq'] != 0 and dl.freqs[true_item] > args['freq']:
                        continue
                    for k in range(len(Ks)):
                        pred_items = utils.sample_top_k(probs[bi][-1], top_k=Ks[k])
                        predictmap = {ch: i for i, ch in enumerate(pred_items)}
                        print(pred_items, predictmap)
                        for p in pred_items:
                            if p == 1:
                                continue
                            if p not in cov[k]:
                                cov[k].append(p)
                            pop[k].append(dl.freqs[p])
                        rank = predictmap.get(true_item)
                        if rank is None:
                            mrr = 0.0
                            rec = 0.0
                        else:
                            mrr = 1.0 / (rank + 1)
                            rec = 1.0
                        MRR[k].append(mrr)
                        Rec[k].append(rec)
            test_time += (time.time() - t1) / len(Ks)

            Rec[:] = [np.mean(x) for x in Rec]
            MRR[:] = [np.mean(x) for x in MRR]
            cov[:] = [len(x) / len(items) for x in cov]
            maxi = max(dl.freqs.values())
            pop[:] = [np.mean(x) / maxi for x in pop]
            print("MRR@20:", MRR[-1])
            print("Recall@20:", Rec[-1])
            print("Cov@20:", cov[-1])
            print("Pop@20:", pop[-1])

            # Print to the logger
            print("LOGGER_ " + args['expname'])
            print('EPOCH #:' + str(e))
            print(str(Rec[0]) + ',' + str(Rec[1]) + ',' + str(Rec[2]) + ',' +
                  str(Rec[3]) + ',' + str(Rec[4]) + ',' + str(MRR[0]) + ',' +
                  str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' +
                  str(MRR[4]))
            print("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' + str(cov[2]) +
                  ',' + str(cov[3]) + ',' + str(cov[4]))
            print("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' + str(pop[2]) +
                  ',' + str(pop[3]) + ',' + str(pop[4]))
            print("\nTrainTime:" + str(train_time))
            print("\nTestTime:" + str(test_time))
            print("\n############################################\n")
            with open("LOGGER_" + args['expname'] + ".txt", "a") as myfile:
                myfile.write('EPOCH #:' + str(e))
                myfile.write(str(Rec[0]) + ',' + str(Rec[1]) + ',' + str(Rec[2]) + ',' +
                             str(Rec[3]) + ',' + str(Rec[4]) + ',' + str(MRR[0]) + ',' +
                             str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' +
                             str(MRR[4]))
                myfile.write("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' + str(cov[2]) +
                             ',' + str(cov[3]) + ',' + str(cov[4]))
                myfile.write("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' + str(pop[2]) +
                             ',' + str(pop[3]) + ',' + str(pop[4]))
                myfile.write("\nTrainTime:" + str(train_time))
                myfile.write("\nTestTime:" + str(test_time))
                myfile.write("\n############################################\n")

        # Report final result
        nni.report_final_result(np.mean(losses))
        logger.debug('Final result %g', np.mean(losses))
        logger.debug('Pipe send final result done.')
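# This experiment loop reports its training loss to NNI. A minimal sketch (an
# assumption, not shown in the original) of how the tuned hyper-parameters
# would typically be fetched from NNI and merged into the args dict that
# main() expects; the default values here are placeholders.
import logging
import nni

logger = logging.getLogger('nextitnet_nni')

if __name__ == '__main__':
    params = {'learning_rate': 0.001, 'batch_size': 32, 'epochs': 10, 'beta1': 0.9}
    params.update(nni.get_next_parameter())  # returns {} when run outside an NNI experiment
    main(params)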
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    # history_sequences_20181014_fajie_smalltest.csv
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/user-filter-20000items-session5.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=5000,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10000,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='0.2 means 80% training, 20% testing')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g., 12345 --> 01234, 00123, 00012; '
                             'may be useful for very long sequences')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({
        'model_type': 'generator',
        'dir_name': args.datapath
    })
    all_samples = dl.item
    items = dl.item_dict
    print "len(items)", len(items)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_generatesubsession:
        x_train = generatesubsequence(train_set)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': len(items),
        'dilated_channels': 100,
        # if you use nextitnet_residual_block, you can use [1, 4, ];
        # if you use nextitnet_residual_block_one, try [1, 2, 4, ] first
        # when you change it, do not forget to change it in nextitrec_generate.py too
        'dilations': [1, 2, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 32,
        'iterations': 400,
        'is_negsample': False  # False denotes no negative sampling
    }

    itemrec = generator_recsys.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.clock()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            _, loss, results = sess.run(
                [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                feed_dict={itemrec.itemseq_input: item_batch})
            end = time.clock()
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
                print "TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                if (batch_no + 1) * batch_size < valid_set.shape[0]:
                    item_batch = valid_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                    loss = sess.run([itemrec.loss_test],
                                    feed_dict={itemrec.input_predict: item_batch})
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no, numIters, valid_set.shape[0] / batch_size)
            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5 = []
                rec_preds_5 = []
                ndcg_preds_5 = []
                curr_preds_20 = []
                rec_preds_20 = []
                ndcg_preds_20 = []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    if numIters / args.eval_iter < 10:
                        if batch_no_test > 20:
                            break
                    else:
                        if batch_no_test > 500:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    [probs] = sess.run([itemrec.g_probs],
                                       feed_dict={itemrec.input_predict: item_batch})
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            MRR_5 = 1.0 / (rank_5 + 1)
                            Rec_5 = 1.0
                            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                            curr_preds_5.append(MRR_5)
                            rec_preds_5.append(Rec_5)
                            ndcg_preds_5.append(ndcg_5)
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            MRR_20 = 1.0 / (rank_20 + 1)
                            Rec_20 = 1.0
                            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                            curr_preds_20.append(MRR_20)
                            rec_preds_20.append(Rec_20)
                            ndcg_preds_20.append(ndcg_20)

                    batch_no_test += 1
                    print "BATCH_NO: {}".format(batch_no_test)
                    print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                    print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                    print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                    print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                    print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                    print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                    # print "curr_preds", curr_preds
                    # print "---------------------------Test Accuracy----------------------------"

            numIters += 1
            if numIters % args.save_para_every == 0:
                save_path = saver.save(
                    sess,
                    "Data/Models/generation_model/model_nextitnet.ckpt".format(iter, numIters))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/ratings_seq20_order.txt',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=100,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=100,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='0.2 means 80% training, 20% testing')
    parser.add_argument('--masked_lm_prob', type=float, default=0.5,
                        help='0.2 means 20% of items are masked')
    parser.add_argument('--max_predictions_per_seq', type=int, default=50,
                        help='maximum number of masked tokens')
    parser.add_argument('--max_position', type=int, default=100,
                        help='maximum length for positional embedding; it has to be larger than the sequence length')
    parser.add_argument('--has_positionalembedding', type=bool, default=False,
                        help='whether to add a positional embedding before the CNN')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({'model_type': 'generator',
                                         'dir_name': args.datapath})
    all_samples = dl.item
    items = dl.itemrank
    itemlist = items.values()
    item_size = len(items) + 1  # add one for the last token, which is used for masking
    print "len(items)", item_size

    max_predictions_per_seq = args.max_predictions_per_seq
    masked_lm_prob = args.masked_lm_prob

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': item_size,
        'dilated_channels': 64,
        # if you use nextitnet_residual_block, you can use [1, 4, ];
        # if you use nextitnet_residual_block_one, try [1, 2, 4, ] first
        # when you change it, do not forget to change it in nextitrec_generate.py too
        # if removing the residual network does not obviously hurt performance, your data probably
        # does not have a strong sequential signal -- change the dataset and try again
        'dilations': [1, 4, 1, 4, ],  # 1,4 means dilations 1 2 4 8
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 2,
        'iterations': 400,
        'max_position': args.max_position,  # maximum length for positional embedding; must be larger than the sequence length
        'has_positionalembedding': args.has_positionalembedding,
        'is_negsample': True,  # False denotes no negative sampling
        'neg_num': 64,  # you need to fine-tune this hyper-parameter; it is very sensitive -- usually larger is better
        'top_k': args.top_k,
        'mask_per': args.masked_lm_prob,
        'seq_len': dl.max_document_length
    }

    itemrec = generator_recsys.GRec_Archi(model_para)
    itemrec.train_graph()
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.time()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            # original input        1 2 3 4 5 6 7 8 9
            # item_batch[:, 1:-1]   2 3 4 5 6 7 8
            # output_tokens_batch   2 0 4 5 0 7 8
            # maskedpositions_batch [1 4]
            # maskedlabels_batch    [3 6]
            output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                create_masked_lm_predictions_frombatch(
                    item_batch, masked_lm_prob, max_predictions_per_seq,
                    items=itemlist, rng=None, item_size=item_size)
            _, loss = sess.run(
                [optimizer, itemrec.loss],
                feed_dict={
                    itemrec.itemseq_output: item_batch[:, 1:],        # 2 3 4 5 6 7 8 9
                    itemrec.itemseq_input_en: output_tokens_batch,    # 1 2 0 4 5 0 7 8 9
                    itemrec.itemseq_input_de: item_batch,             # 1 2 3 4 5 6 7 8 9
                    itemrec.masked_position: maskedpositions_batch,   # [1 4]
                    itemrec.masked_items: maskedlabels_batch,         # [3 6]
                    itemrec.label_weights: masked_lm_weights_batch    # [1.0, 1.0, ...]  (unused)
                })
            end = time.time()
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
                # print "TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                batch_no_valid = 0
                batch_size_valid = batch_size
                if (batch_no_valid + 1) * batch_size_valid < valid_set.shape[0]:
                    start = time.time()
                    item_batch = valid_set[batch_no_valid * batch_size_valid:(batch_no_valid + 1) * batch_size_valid, :]
                    output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                        create_masked_lm_predictions_frombatch(
                            item_batch, masked_lm_prob, max_predictions_per_seq,
                            items=itemlist, rng=None, item_size=item_size)
                    loss = sess.run(
                        [itemrec.loss],
                        feed_dict={
                            itemrec.itemseq_output: item_batch[:, 1:],
                            itemrec.itemseq_input_en: output_tokens_batch,
                            itemrec.itemseq_input_de: item_batch,
                            itemrec.masked_position: maskedpositions_batch,
                            itemrec.masked_items: maskedlabels_batch,
                            itemrec.label_weights: masked_lm_weights_batch
                        })
                    end = time.time()
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no_valid, numIters, valid_set.shape[0] / batch_size_valid)
                    print "TIME FOR BATCH", end - start
            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5 = []
                rec_preds_5 = []
                ndcg_preds_5 = []
                curr_preds_20 = []
                rec_preds_20 = []
                ndcg_preds_20 = []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    if numIters / args.eval_iter < 20:
                        if batch_no_test > 50:
                            break
                    else:
                        if batch_no_test > 100:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    # output_tokens_batch, maskedpositions_batch, maskedlabels_batch = create_masked_predictions_frombatch(item_batch)
                    [probs] = sess.run(
                        [itemrec.log_probs],
                        feed_dict={
                            itemrec.itemseq_input_en: item_batch[:, 0:-1],  # 1 2 3 4 5 6 7 8
                            itemrec.itemseq_input_de: item_batch[:, 0:-1],  # 1 2 3 4 5 6 7 8
                            # itemrec.itemseq_input_en: item_batch,
                            # itemrec.itemseq_input_de: item_batch,
                        })
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            MRR_5 = 1.0 / (rank_5 + 1)
                            Rec_5 = 1.0
                            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                            curr_preds_5.append(MRR_5)
                            rec_preds_5.append(Rec_5)
                            ndcg_preds_5.append(ndcg_5)
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            MRR_20 = 1.0 / (rank_20 + 1)
                            Rec_20 = 1.0
                            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                            curr_preds_20.append(MRR_20)
                            rec_preds_20.append(Rec_20)
                            ndcg_preds_20.append(ndcg_20)

                    batch_no_test += 1
                    if numIters / args.eval_iter < 20:
                        if batch_no_test == 50:
                            print "BATCH_NO: {}".format(batch_no_test)
                            print "mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5)), \
                                "mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20)), \
                                "hit_5:", sum(rec_preds_5) / float(len(rec_preds_5)), \
                                "hit_20:", sum(rec_preds_20) / float(len(rec_preds_20)), \
                                "ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5)), \
                                "ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                    else:
                        if batch_no_test == 100:
                            print "BATCH_NO: {}".format(batch_no_test)
                            print "mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5)), \
                                "mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20)), \
                                "hit_5:", sum(rec_preds_5) / float(len(rec_preds_5)), \
                                "hit_20:", sum(rec_preds_20) / float(len(rec_preds_20)), \
                                "ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5)), \
                                "ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))

            numIters += 1
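# create_masked_lm_predictions_frombatch comes from the project's masking
# utilities and is not reproduced here. The sketch below only illustrates the
# idea (randomly replace positions with a mask token and remember their
# positions and original labels); the name, signature and masking rules are
# assumptions, not the repo's actual helper.
import numpy as np

def masked_lm_batch(item_batch, masked_lm_prob, max_predictions_per_seq, mask_token):
    """Simplified cloze masking: returns (masked batch, positions, labels, weights)."""
    batch = np.array(item_batch, copy=True)
    positions, labels, weights = [], [], []
    for row in batch:
        n_mask = min(max_predictions_per_seq, len(row) - 1,
                     max(1, int(round(len(row) * masked_lm_prob))))
        pos = np.sort(np.random.choice(np.arange(1, len(row)), size=n_mask, replace=False))
        labels.append(row[pos].copy())   # remember the original items
        row[pos] = mask_token            # blank them out in the encoder input
        positions.append(pos)
        weights.append(np.ones(n_mask, dtype=np.float32))
    return batch, np.array(positions), np.array(labels), np.array(weights)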
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=10,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    # this is a demo dataset that just lets you run the code; suggested datasets: http://grouplens.org/datasets/
    parser.add_argument('--datapath', type=str, default='Data/Session/',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=100,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=100,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='0.2 means 80% training, 20% testing')
    parser.add_argument('--iterations', type=int, default=2,
                        help='number of training iterations')
    parser.add_argument('--dilated_channels', type=int, default=100,
                        help='number of dilated channels')
    parser.add_argument('--learning_rate', type=float, default=0.008,
                        help='learning rate')
    parser.add_argument('--kernel_size', type=int, default=3,
                        help='kernel size')
    parser.add_argument('--batch_size', type=int, default=300,
                        help='batch size')
    parser.add_argument('--max_seq_size', type=int, default=80,
                        help='max sequence length')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g., 12345 --> 01234, 00123, 00012; '
                             'may be useful for very long sequences')
    args = parser.parse_args()

    training_path = args.datapath + "/" + "training.csv"
    model_path = args.datapath + "/" + "model.ckpt"
    vocab_path = args.datapath + "/" + "vocab.pickle"

    dl = data_loader_recsys.Data_Loader(
        {'model_type': 'generator', 'dir_name': training_path},
        max_seq_size=args.max_seq_size)
    all_samples = dl.item
    items = dl.item_dict
    print("len(items)", len(items))

    with open(vocab_path, 'w') as fp:
        json.dump(items, fp)
    with open(vocab_path + "inverted", 'w') as fp:
        json.dump(dl.vocabulary, fp)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_generatesubsession:
        x_train = generatesubsequence(train_set)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': len(items) + 1,
        'dilated_channels': args.dilated_channels,  # larger is better, up to about 512 or 1024
        # if you use nextitnet_residual_block, you can use [1, 4, 1, 4, 1, 4, ];
        # if you use nextitnet_residual_block_one, try [1, 2, 4, ] first
        # when you change it, do not forget to change it in nextitrec_generate.py too
        'dilations': [1, 2, 1, 2, 1, 2, ],  # you should tune this hyper-parameter, refer to the paper
        'kernel_size': args.kernel_size,
        'learning_rate': args.learning_rate,  # you should tune this hyper-parameter
        'batch_size': args.batch_size,  # you should tune this hyper-parameter
        'iterations': args.iterations,  # if your dataset is small, consider adding regularization; probably bump this to 100
        'is_negsample': False  # False denotes no negative sampling
    }

    itemrec = generator_recsys.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.clock()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            _, loss, results = sess.run(
                [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                feed_dict={itemrec.itemseq_input: item_batch})
            end = time.clock()
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
                print "TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                if (batch_no + 1) * batch_size < valid_set.shape[0]:
                    item_batch = valid_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                    loss = sess.run([itemrec.loss_test],
                                    feed_dict={itemrec.input_predict: item_batch})
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no, numIters, valid_set.shape[0] / batch_size)
            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5 = []
                rec_preds_5 = []
                ndcg_preds_5 = []
                curr_preds_20 = []
                rec_preds_20 = []
                ndcg_preds_20 = []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    if numIters / args.eval_iter < 10:
                        if batch_no_test > 20:
                            break
                    else:
                        if batch_no_test > 500:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    [probs] = sess.run([itemrec.g_probs],
                                       feed_dict={itemrec.input_predict: item_batch})
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            MRR_5 = 1.0 / (rank_5 + 1)
                            Rec_5 = 1.0
                            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                            curr_preds_5.append(MRR_5)
                            rec_preds_5.append(Rec_5)
                            ndcg_preds_5.append(ndcg_5)
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            MRR_20 = 1.0 / (rank_20 + 1)
                            Rec_20 = 1.0
                            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                            curr_preds_20.append(MRR_20)
                            rec_preds_20.append(Rec_20)
                            ndcg_preds_20.append(ndcg_20)

                    batch_no_test += 1
                    print "BATCH_NO: {}".format(batch_no_test)
                    print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                    print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                    print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                    print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                    print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                    print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                    # print "curr_preds", curr_preds
                    # print "---------------------------Test Accuracy----------------------------"

            numIters += 1
            if numIters % args.save_para_every == 0:
                print("saving..")
                save_path = saver.save(sess, model_path)
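# The script above persists the vocabulary with json.dump (despite the
# ".pickle" filename) and the weights with tf.train.Saver. A minimal sketch,
# assuming a separate inference script that has already rebuilt the same
# graph, of how both could be loaded back; paths mirror the ones built above.
import json
import tensorflow as tf

def load_vocab_and_model(datapath, graph_saver, sess):
    """Hypothetical loader: returns the item vocabulary and restores weights."""
    with open(datapath + "/" + "vocab.pickle") as fp:
        vocab = json.load(fp)  # token -> id mapping written by main()
    graph_saver.restore(sess, datapath + "/" + "model.ckpt")
    return vocab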
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    # history_sequences_20181014_fajie_smalltest.csv
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/history_sequences_20181014_fajie_transfer_pretrain_small.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=10,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.5,
                        help='0.2 means 80% training, 20% testing')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g., 12345 --> 01234, 00123, 00012; '
                             'may be useful for very long sequences')
    parser.add_argument('--padtoken', type=str, default='0',
                        help='the padding token at the beginning of the sequence')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({
        'model_type': 'generator',
        'dir_name': args.datapath
    })
    all_samples = dl.item
    items = dl.item_dict
    print("len(items)")
    print(len(items))

    if args.padtoken in items:
        padtoken = items[args.padtoken]  # the padding token at the beginning of the sequence
    else:
        # padtoken = sys.maxint
        padtoken = len(items) + 1

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_generatesubsession:
        train_set = generatesubsequence(train_set, padtoken)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': len(items),
        'dilated_channels': 64,
        # if you use nextitnet_residual_block, you can use [1, 4, ];
        # if you use nextitnet_residual_block_one, try [1, 2, 4, ] first
        # when you change it, do not forget to change it in nextitrec_generate.py too
        # if removing the residual network does not obviously hurt performance, your data probably
        # does not have a strong sequential signal -- change the dataset and try again
        'dilations': [1, 4, 1, 4, 1, 4, 1, 4, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 2,
        'iterations': 400,
        'is_negsample': False  # False denotes using the full softmax
    }

    itemrec = generator_recsys_cau.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)

    tf.add_to_collection("dilate_input", itemrec.dilate_input)
    tf.add_to_collection("context_embedding", itemrec.context_embedding)

    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.time()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            _, loss, results = sess.run(
                [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                feed_dict={itemrec.itemseq_input: item_batch})
            end = time.time()
            if numIters % args.eval_iter == 0:
                print("-------------------------------------------------------train1")
                print("LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}"
                      .format(loss, iter, batch_no, numIters, train_set.shape[0] / batch_size))
                print("TIME FOR BATCH", end - start)
                print("TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0)
            if numIters % args.eval_iter == 0:
                print("-------------------------------------------------------test1")
                # when train_set is much larger than valid_set this 'if' may not hold;
                # that has no impact on the final results.
                if (batch_no + 1) * batch_size < valid_set.shape[0]:
                    item_batch = valid_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                    loss = sess.run([itemrec.loss_test],
                                    feed_dict={itemrec.input_predict: item_batch})
                    print("LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}"
                          .format(loss, iter, batch_no, numIters, valid_set.shape[0] / batch_size))
            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5 = []
                rec_preds_5 = []
                ndcg_preds_5 = []
                curr_preds_20 = []
                rec_preds_20 = []
                ndcg_preds_20 = []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    if numIters / args.eval_iter < 10:
                        if batch_no_test > 20:
                            break
                    else:
                        if batch_no_test > 500:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    [probs] = sess.run([itemrec.g_probs],
                                       feed_dict={itemrec.input_predict: item_batch})
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            MRR_5 = 1.0 / (rank_5 + 1)
                            Rec_5 = 1.0
                            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                            curr_preds_5.append(MRR_5)
                            rec_preds_5.append(Rec_5)
                            ndcg_preds_5.append(ndcg_5)
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            MRR_20 = 1.0 / (rank_20 + 1)
                            Rec_20 = 1.0
                            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                            curr_preds_20.append(MRR_20)
                            rec_preds_20.append(Rec_20)
                            ndcg_preds_20.append(ndcg_20)

                    batch_no_test += 1
                    print("BATCH_NO: {}".format(batch_no_test))
                    print("Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5)))
                    print("Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20)))
                    print("Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5)))
                    print("Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20)))
                    print("Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5)))
                    print("Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20)))

            numIters += 1
            if numIters % args.save_para_every == 0:
                save_path = saver.save(
                    sess,
                    "Data/Models/generation_model/model_nextitnet_transfer_pretrain.ckpt"
                    .format(iter, numIters))
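# The pretraining script above registers dilate_input and context_embedding in
# TF collections before checkpointing. A minimal sketch, assuming a downstream
# fine-tuning script, of how those tensors could be recovered after restoring
# the saved graph; the checkpoint path matches the one used above.
import tensorflow as tf

def load_pretrained_collections(ckpt="Data/Models/generation_model/model_nextitnet_transfer_pretrain.ckpt"):
    sess = tf.Session()
    restorer = tf.train.import_meta_graph(ckpt + ".meta")  # rebuild the saved graph
    restorer.restore(sess, ckpt)                            # load the pretrained weights
    dilate_input = tf.get_collection("dilate_input")[0]
    context_embedding = tf.get_collection("context_embedding")[0]
    return sess, dilate_input, context_embedding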
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='Batch size')
    parser.add_argument('--sample_every', type=int, default=2000,
                        help='Sample generator output every x steps')
    parser.add_argument('--summary_every', type=int, default=50,
                        help='Write summaries every x steps')
    parser.add_argument('--save_model_every', type=int, default=1500,
                        help='Save model every x steps')
    parser.add_argument('--sample_size', type=int, default=300,
                        help='Sampled output size')
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--max_epochs', type=int, default=50,
                        help='Max epochs')
    parser.add_argument('--beta1', type=float, default=0.5,
                        help='Momentum for Adam update')
    parser.add_argument('--resume_model', type=str, default=None,
                        help='Pre-trained model path to resume from')
    # parser.add_argument('--text_dir', type=str, default='Data/generator_training_data',
    #                     help='Directory containing text files')
    parser.add_argument('--text_dir', type=str,
                        default='Data/Session/history_sequences_20181014_fajie_small.csv',
                        help='Directory containing text files')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--seed', type=str,
                        default='f78c95a8-9256-4757-9a9f-213df5c6854e,1151b040-8022-4965-96d2-8a4605ce456c',
                        help='Seed for text generation')
    parser.add_argument('--sample_percentage', type=float, default=0.2,
                        help='sample percentage of the whole data, e.g., 0.2 = 80% training, 20% testing')
    parser.add_argument('--filter_sizes', nargs='?', default='[2,3,4]',
                        help='Specify the filter sizes')
    parser.add_argument('--num_filters', type=int, default=100,
                        help='Number of filters per filter size (default: 128)')
    parser.add_argument('--loss_type', nargs='?', default='square_loss',
                        help='Specify a loss type (square_loss or log_loss)')
    parser.add_argument('--l2_reg_lambda', type=float, default=0,
                        help='L2 regularization lambda (default: 0.0)')
    parser.add_argument('--allow_soft_placement', default=True,
                        help='Allow soft device placement')
    parser.add_argument('--log_device_placement', default=False,
                        help='Log placement of ops on devices')
    parser.add_argument('--dropout_keep_prob', type=float, default=0.5,
                        help='Dropout keep probability (default: 0.5)')
    args = parser.parse_args()

    dl = data_loader.Data_Loader({
        'model_type': 'generator',
        'dir_name': args.text_dir
    })  # text_samples=16390600 vocab=947255 session100
    all_samples = dl.item
    items = dl.item_dict

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    text_samples = all_samples[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(args.sample_percentage * float(len(text_samples)))
    x_train, x_dev = text_samples[:dev_sample_index], text_samples[dev_sample_index:]

    # create subsessions, only for training
    subseqtrain = []
    for i in range(len(x_train)):
        # print x_train[i]
        seq = x_train[i]
        lenseq = len(seq)
        # session len = 100, shortest subsession = 5: real values plus 95 leading zeros
        for j in range(lenseq - 4):
            subseqend = seq[:len(seq) - j]
            subseqbeg = [0] * j
            subseq = np.append(subseqbeg, subseqend)
            # newsubseq = pad + subseq
            subseqtrain.append(subseq)
    x_train = np.array(subseqtrain)  # list to ndarray
    del subseqtrain

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_train = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffle_train]
    print "generating subsessions is done!"
    print "shape", x_train.shape[0]
    print "dataset", args.text_dir

    model_options = {
        'vocab_size': len(items),
        'residual_channels': 100,
    }

    cnn = TextCNN_hv(sequence_length=x_train.shape[1],
                     num_classes=len(items),
                     vocab_size=len(items),
                     embedding_size=model_options['residual_channels'],
                     filter_sizes=eval(args.filter_sizes),
                     num_filters=args.num_filters,
                     loss_type=args.loss_type,
                     l2_reg_lambda=args.l2_reg_lambda)
    print "embedding_size", model_options['residual_channels']

    session_conf = tf.ConfigProto(
        # fall back to another device automatically if the assigned device is not found
        allow_soft_placement=args.allow_soft_placement,
        # whether to log device placement
        log_device_placement=args.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Define training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        sess.run(tf.global_variables_initializer())

        step = 1
        for epoch in range(args.max_epochs):
            batch_no = 0
            batch_size = args.batch_size
            while (batch_no + 1) * batch_size < x_train.shape[0]:
                start = time.clock()
                text_batch = x_train[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                _, loss, prediction = sess.run(
                    [train_op, cnn.loss, cnn.arg_max_prediction],
                    feed_dict={
                        cnn.wholesession: text_batch,
                        cnn.dropout_keep_prob: args.dropout_keep_prob
                    })
                end = time.clock()
                if step % args.sample_every == 0:
                    print "-------------------------------------------------------train1"
                    print "LOSS: {}\tEPOCH: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, epoch, batch_no, step, x_train.shape[0] / args.batch_size)
                    print "TIME FOR BATCH", end - start
                    print "TIME FOR EPOCH (mins)", (end - start) * (x_train.shape[0] / args.batch_size) / 60.0
                if step % args.sample_every == 0:
                    print "-------------------------------------------------------test1"
                    if (batch_no + 1) * batch_size < x_dev.shape[0]:
                        text_batch = x_dev[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                        loss = sess.run([cnn.loss],
                                        feed_dict={
                                            cnn.wholesession: text_batch,
                                            cnn.dropout_keep_prob: 1.0
                                        })
                        print "LOSS: {}\tEPOCH: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                            loss, epoch, batch_no, step, x_dev.shape[0] / args.batch_size)
                batch_no += 1

                if step % args.sample_every == 0:
                    print "********************************************************accuracy"
                    batch_no_test = 0
                    batch_size_test = batch_size * 2
                    curr_preds_5 = []
                    rec_preds_5 = []
                    ndcg_preds_5 = []
                    curr_preds_20 = []
                    rec_preds_20 = []
                    ndcg_preds_20 = []
                    while (batch_no_test + 1) * batch_size_test < x_dev.shape[0]:
                        # no need to evaluate everything; only after several sample_every
                        # intervals do we run the longer evaluation and report final results
                        if step / args.sample_every < 10:
                            if batch_no_test > 2:
                                break
                        else:
                            if batch_no_test > 500:
                                break
                        text_batch = x_dev[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                        [probs] = sess.run([cnn.probs_flat],
                                           feed_dict={
                                               cnn.wholesession: text_batch,
                                               cnn.dropout_keep_prob: 1.0
                                           })
                        for bi in range(probs.shape[0]):
                            pred_words_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
                            pred_words_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
                            true_word = text_batch[bi][-1]
                            predictmap_5 = {ch: i for i, ch in enumerate(pred_words_5)}
                            predictmap_20 = {ch: i for i, ch in enumerate(pred_words_20)}

                            rank_5 = predictmap_5.get(true_word)
                            rank_20 = predictmap_20.get(true_word)
                            if rank_5 is None:
                                curr_preds_5.append(0.0)
                                rec_preds_5.append(0.0)
                                ndcg_preds_5.append(0.0)
                            else:
                                MRR_5 = 1.0 / (rank_5 + 1)
                                Rec_5 = 1.0
                                ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                                curr_preds_5.append(MRR_5)
                                rec_preds_5.append(Rec_5)
                                ndcg_preds_5.append(ndcg_5)
                            if rank_20 is None:
                                curr_preds_20.append(0.0)
                                rec_preds_20.append(0.0)
                                ndcg_preds_20.append(0.0)
                            else:
                                MRR_20 = 1.0 / (rank_20 + 1)
                                Rec_20 = 1.0
                                ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                                curr_preds_20.append(MRR_20)
                                rec_preds_20.append(Rec_20)
                                ndcg_preds_20.append(ndcg_20)

                        batch_no_test += 1
                        print "BATCH_NO: {}".format(batch_no_test)
                        print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                        print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                        print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                        print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                        print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                        print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                        # print "curr_preds", curr_preds

                step += 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9,
                        help='hyperparameter beta1 for Adam')
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/history_sequences_20181014_fajie_transfer_pretrain_small.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=2,
                        help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=2,
                        help='save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.1,
                        help='0.2 means 80% training, 20% testing')
    parser.add_argument('--masked_lm_prob', type=float, default=0.3,
                        help='0.2 means 20% of items are masked')
    parser.add_argument('--max_predictions_per_seq', type=int, default=30,
                        help='maximum number of masked tokens')
    parser.add_argument('--max_position', type=int, default=100,
                        help='maximum length for positional embedding; it has to be larger than the sequence length')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g., 12345 --> 01234, 00123, 00012; '
                             'may be useful for very long sequences')
    parser.add_argument('--has_positionalembedding', type=bool, default=False,
                        help='whether to add a positional embedding before the CNN')
    parser.add_argument('--padtoken', type=str, default='-1',
                        help='the padding token at the beginning of the sequence')
    parser.add_argument('--is_shuffle', type=bool, default=False,
                        help='whether to shuffle the training and testing sequences, e.g., 012345 --> 051324')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({
        'model_type': 'generator',
        'dir_name': args.datapath
    })
    all_samples = dl.item
    items = dl.item_dict  # key is the original token, value is the mapped id, i.e., 0, 1, 2, 3, ...
    itemlist = items.values()
    item_size = len(items)  # the first token is 'unk'
    print "len(items)", item_size

    if args.padtoken in items:
        padtoken = items[args.padtoken]  # the padding token at the beginning of the sequence
    else:
        padtoken = item_size + 1

    max_predictions_per_seq = args.max_predictions_per_seq
    masked_lm_prob = args.masked_lm_prob

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_generatesubsession:
        train_set = generatesubsequence(train_set)
    if args.is_shuffle:
        train_set = shuffleseq(train_set, padtoken)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': item_size,
        'dilated_channels': 64,
        'dilations': [1, 4, 1, 4, 1, 4, 1, 4, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 2,
        'iterations': 400,
        'max_position': args.max_position,  # maximum length for positional embedding; must be larger than the sequence length
        'has_positionalembedding': args.has_positionalembedding,
        'is_negsample': True  # False denotes no negative sampling
    }

    itemrec = generator_recsys.NextItNet_Decoder(model_para)
    itemrec.train_graph()
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(reuse=True)

    tf.add_to_collection("dilate_input", itemrec.dilate_input)
    tf.add_to_collection("context_embedding", itemrec.context_embedding)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.time()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                create_masked_lm_predictions_frombatch(
                    item_batch, masked_lm_prob, max_predictions_per_seq,
                    items=itemlist, rng=None, item_size=item_size)
            _, loss = sess.run(
                [optimizer, itemrec.loss],
                feed_dict={
                    itemrec.itemseq_input: output_tokens_batch,
                    itemrec.masked_position: maskedpositions_batch,
                    itemrec.masked_items: maskedlabels_batch,
                    itemrec.label_weights: masked_lm_weights_batch
                })
            end = time.time()
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                batch_no_valid = 0
                batch_size_valid = batch_size
                if (batch_no_valid + 1) * batch_size_valid < valid_set.shape[0]:
                    start = time.time()
                    item_batch = valid_set[batch_no_valid * batch_size_valid:(batch_no_valid + 1) * batch_size_valid, :]
                    output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                        create_masked_lm_predictions_frombatch(
                            item_batch, masked_lm_prob, max_predictions_per_seq,
                            items=itemlist, rng=None, item_size=item_size)
                    loss = sess.run(
                        [itemrec.loss],
                        feed_dict={
                            itemrec.itemseq_input: output_tokens_batch,
                            itemrec.masked_position: maskedpositions_batch,
                            itemrec.masked_items: maskedlabels_batch,
                            itemrec.label_weights: masked_lm_weights_batch
                        })
                    end = time.time()
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no_valid, numIters, valid_set.shape[0] / batch_size_valid)
                    print "TIME FOR BATCH", end - start
            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5 = []
                rec_preds_5 = []
                ndcg_preds_5 = []
                curr_preds_20 = []
                rec_preds_20 = []
                ndcg_preds_20 = []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    if numIters / args.eval_iter < 10:
                        if batch_no_test > 20:
                            break
                    else:
                        if batch_no_test > 1000:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    output_tokens_batch, maskedpositions_batch, maskedlabels_batch = \
                        create_masked_predictions_frombatch(item_batch)
                    [probs] = sess.run(
                        [itemrec.log_probs],
                        feed_dict={
                            itemrec.itemseq_input: output_tokens_batch,
                            itemrec.masked_position: maskedpositions_batch
                        })
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            MRR_5 = 1.0 / (rank_5 + 1)
                            Rec_5 = 1.0
                            ndcg_5 = 1.0 / math.log(rank_5 + 2, 2)
                            curr_preds_5.append(MRR_5)
                            rec_preds_5.append(Rec_5)
                            ndcg_preds_5.append(ndcg_5)
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            MRR_20 = 1.0 / (rank_20 + 1)
                            Rec_20 = 1.0
                            ndcg_20 = 1.0 / math.log(rank_20 + 2, 2)
                            curr_preds_20.append(MRR_20)
                            rec_preds_20.append(Rec_20)
                            ndcg_preds_20.append(ndcg_20)

                    batch_no_test += 1
                    print "BATCH_NO: {}".format(batch_no_test)
                    print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                    print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                    print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                    print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                    print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                    print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))

            numIters += 1
            if numIters % args.save_para_every == 0:
                save_path = saver.save(
                    sess,
                    "Data/Models/generation_model/model_nextitnet_cloze"
                    .format(iter, numIters))
    ndcg_preds_5 = []
    curr_preds_20 = []
    rec_preds_20 = []
    ndcg_preds_20 = []
    test_start = time.time()
    while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
        item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
        [probs] = sess.run(  # , loss_test
            [itemrec.g_probs],  # , itemrec.loss_test
            feed_dict={itemrec.input_predict: item_batch})
        for bi in range(probs.shape[0]):
            pred_items_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
            pred_items_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
            true_item = item_batch[bi][-1]
            predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
            predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

            rank_5 = predictmap_5.get(true_item)
            rank_20 = predictmap_20.get(true_item)
            if rank_5 is None:
                curr_preds_5.append(0.0)
                rec_preds_5.append(0.0)
                ndcg_preds_5.append(0.0)
            else:
                MRR_5 = 1.0 / (rank_5 + 1)
                Rec_5 = 1.0