def train(raw_data=FLAGS.raw_data):
    """Train the recommendation model on `raw_data`.

    Reads bucketed train/dev/test sets via get_data(), creates the model in a
    TF session, then runs the step loop: every `steps_per_report` steps it
    logs throughput, every `steps_per_checkpoint` steps it logs train
    perplexity, evaluates on dev, saves the best model so far, and applies
    early stopping once FLAGS.patience consecutive checkpoints fail to
    improve dev perplexity.
    """
    # Read Data
    mylog("Reading Data...")
    train_set, dev_set, test_set, embAttr, START_ID, item_population, p_item, _, _, _, _, _ = get_data(
        raw_data, data_dir=FLAGS.data_dir)
    # total number of target items across all training buckets (for speed report)
    n_targets_train = np.sum(
        [np.sum([len(items) for uid, items in x]) for x in train_set])
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    # cumulative bucket fractions; used by DataIterator to sample a bucket
    # proportionally to its size
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in xrange(len(train_bucket_sizes))
    ]
    dev_bucket_sizes = [len(dev_set[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_dev = int(dev_total_size / batch_size)
    # checkpoint (train/dev eval + possible save) twice per epoch
    steps_per_checkpoint = int(steps_per_epoch / 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog(_buckets)
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("")
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement = False, device_count={'CPU':8, 'GPU':1})) as sess:
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:

        # runtime profile: with FLAGS.profile a Chrome-trace timeline is
        # captured and written once, then the process exits
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, embAttr, START_ID, run_options, run_metadata)
        show_all_variables()

        # Data Iterators
        dite = DataIterator(model, train_set, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0
        if iteType == 0:
            mylog("withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0  # reset at every checkpoint
        current_step = 0
        previous_losses = []  # NOTE(review): accumulated nowhere below; appears unused
        his = []  # [step, train_ppx, dev_ppx] history per checkpoint
        low_ppx = float("inf")  # best dev perplexity seen so far
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        patience = FLAGS.patience
        item_sampled, item_sampled_id2idx = None, None

        while current_step < total_steps:

            # start
            start_time = time.time()

            # re-sample every once a while (only for sampled losses); on
            # non-resample steps item_sampled is cleared but the id2idx map
            # from the last resample is kept and passed to model.step
            if FLAGS.loss in ['mw', 'mce'] and current_step % FLAGS.n_resample == 0:
                item_sampled, item_sampled_id2idx = sample_items(
                    item_population, FLAGS.n_sampled, p_item)
            else:
                item_sampled = None

            # data and train
            users, inputs, outputs, weights, bucket_id = ite.next()

            L = model.step(sess, users, inputs, outputs, weights, bucket_id,
                           item_sampled=item_sampled,
                           item_sampled_id2idx=item_sampled_id2idx)

            # loss and time (step_time is pre-averaged over the checkpoint window)
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            # assumes weights[0] has one entry per sentence in the batch,
            # nonzero for valid sentences — TODO confirm against DataIterator
            n_valid_sents += np.sum(np.sign(weights[0]))

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(weights)

            if current_step % steps_per_report == 0:
                mylog("--------------------" + "Report" + str(current_step) + "-------------------")
                mylog("StepTime: {} Speed: {} targets / sec in total {} targets".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, n_targets_train))

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 0:
                mylog("--------------------" + "TRAIN" + str(current_step) + "-------------------")
                # Print statistics for the previous epoch.
                # per-sentence average loss; capped exp to avoid overflow
                loss = loss / n_valid_sents
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                mylog("global step %d learning rate %.4f step-time %.2f perplexity %.2f" %
                      (model.global_step.eval(), model.learning_rate.eval(),
                       step_time, perplexity))
                train_ppx = perplexity

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents = 0.0, 0.0, 0

                # dev data
                mylog("--------------------" + "DEV" + str(current_step) + "-------------------")
                eval_loss, eval_ppx = evaluate(
                    sess, model, dev_set, item_sampled_id2idx=item_sampled_id2idx)
                mylog("dev: ppx: {}".format(eval_ppx))

                his.append([current_step, train_ppx, eval_ppx])

                if eval_ppx < low_ppx:
                    # dev improved: reset patience and overwrite the best model
                    # (global_step=0 keeps a single "best.ckpt-0" file)
                    patience = FLAGS.patience
                    low_ppx = eval_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")
                    mylog("Saving best model....")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path,
                                     global_step=0, write_meta_graph=False)
                    mylog("Best model saved using {} sec".format(time.time() - s))
                else:
                    patience -= 1

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                sys.stdout.flush()
def recommend(raw_data=FLAGS.raw_data):
    """Generate top-k recommendations for every test user and evaluate them.

    Loads the test set with recommend=True, restores the model, runs
    model.step_recommend over sequential batches, collects each user's top-k
    item indexes/values, maps internal indexes back to external ids, scores
    the result with the provided `evaluation` object, and saves the two
    (n_user x topk) matrices under FLAGS.train_dir.
    """
    # Read Data
    mylog("recommend")
    mylog("Reading Data...")
    _, _, test_set, embAttr, START_ID, _, _, evaluation, uinds, user_index, item_index, logit_ind2item_ind = get_data(
        raw_data, data_dir=FLAGS.data_dir, recommend=True)
    test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog(_buckets)
    mylog("Test:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, embAttr, START_ID, run_options, run_metadata)
        show_all_variables()

        # inference mode: disable dropout
        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        # sequential iteration (no bucket scaling) over the whole test set
        dite = DataIterator(model, test_set, len(_buckets), batch_size, None)
        ite = dite.next_sequence(stop=True, recommend=True)

        n_total_user = len(uinds)
        n_recommended = 0
        # map user internal index -> row in the output matrices
        uind2rank = {}
        for r, uind in enumerate(uinds):
            uind2rank[uind] = r
        rec = np.zeros((n_total_user, FLAGS.topk), dtype=int)
        rec_value = np.zeros((n_total_user, FLAGS.topk), dtype=float)
        start = time.time()

        for users, inputs, positions, valids, bucket_id in ite:
            results = model.step_recommend(sess, users, inputs, positions, bucket_id)
            # valids masks padding entries within the batch
            for i, valid in enumerate(valids):
                if valid == 1:
                    n_recommended += 1
                    if n_recommended % 1000 == 0:
                        mylog("Evaluating n {} bucket_id {}".format(
                            n_recommended, bucket_id))
                    uind, topk_values, topk_indexes = results[i]
                    rank = uind2rank[uind]
                    rec[rank, :] = topk_indexes
                    rec_value[rank, :] = topk_values
            n_steps += 1
        end = time.time()
        mylog("Time used {} sec for {} steps {} users ".format(
            end - start, n_steps, n_recommended))

        # invert item_index (id -> internal index); the assert enforces that
        # the mapping is one-to-one
        ind2id = {}
        for iid in item_index:
            iind = item_index[iid]
            assert (iind not in ind2id)
            ind2id[iind] = iid

        # invert user_index (id -> internal index), same one-to-one check
        uind2id = {}
        for uid in user_index:
            uind = user_index[uid]
            assert (uind not in uind2id)
            uind2id[uind] = uid

        # R: external user id -> list of external item ids, translating
        # model logit indexes through logit_ind2item_ind then ind2id
        R = {}
        for i in xrange(n_total_user):
            uid = uind2id[uinds[i]]
            R[uid] = [ind2id[logit_ind2item_ind[v]] for v in list(rec[i, :])]

        evaluation.eval_on(R)

        scores_self, scores_ex = evaluation.get_scores()
        mylog(
            "====evaluation scores (NDCG, RECALL, PRECISION, MAP) @ 2,5,10,20,30===="
        )
        mylog("METRIC_FORMAT (self): {}".format(scores_self))
        mylog("METRIC_FORMAT (ex ): {}".format(scores_ex))

        # save the two matrix
        np.save(
            os.path.join(FLAGS.train_dir,
                         "top{}_index.npy".format(FLAGS.topk)), rec)
        np.save(
            os.path.join(FLAGS.train_dir,
                         "top{}_value.npy".format(FLAGS.topk)), rec_value)
def train():
    """Train the seq2seq model configured by FLAGS.

    Prepares source/target data with data_utils, builds the model in a TF
    session, then runs the step loop: throughput reports every
    `steps_per_report` steps; at each checkpoint it computes train/dev
    perplexity, optionally records summaries, optionally saves a periodic
    checkpoint, saves the best-dev model, and early-stops after
    FLAGS.patience non-improving checkpoints.
    """
    # Read Data
    mylog_section("READ DATA")

    from_train = None
    to_train = None
    from_dev = None
    to_dev = None

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir, FLAGS.train_path_from, FLAGS.train_path_to,
        FLAGS.dev_path_from, FLAGS.dev_path_to, FLAGS.from_vocab_size,
        FLAGS.to_vocab_size)

    train_data_bucket = read_data(from_train, to_train)
    dev_data_bucket = read_data(from_dev, to_dev)
    _, _, real_vocab_size_from, real_vocab_size_to = data_utils.get_vocab_info(
        FLAGS.data_cache_dir)

    # stash the effective sizes/buckets on FLAGS so create_model sees them
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    # train_n_tokens = total training target size
    train_n_tokens = np.sum(
        [np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_bucket_sizes = [
        len(train_data_bucket[b]) for b in xrange(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))
    # cumulative bucket fractions for size-proportional bucket sampling
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in xrange(len(train_bucket_sizes))
    ]
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_dev = int(dev_total_size / batch_size)
    # checkpoint twice per epoch
    steps_per_checkpoint = int(steps_per_epoch / 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.from_vocab_size))
    mylog("to_vocab_size: {}".format(FLAGS.to_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile: with FLAGS.profile a Chrome-trace timeline is
        # captured, written once, and the process exits
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(
                os.path.join(FLAGS.summary_dir, "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0  # reset at every checkpoint
        current_step = 0
        previous_losses = []  # NOTE(review): never appended to; appears unused
        low_ppx = float("inf")  # best dev perplexity so far
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:

            # start
            start_time = time.time()

            # data and train
            source_inputs, target_inputs, target_outputs, target_weights, bucket_id = ite.next(
            )

            L = model.step(sess, source_inputs, target_inputs, target_outputs,
                           target_weights, bucket_id)

            # loss and time (step_time pre-averaged over the checkpoint window)
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            # assumes target_weights[0] has one entry per sentence, nonzero
            # for valid sentences — TODO confirm against DataIterator
            n_valid_sents += np.sum(np.sign(target_weights[0]))
            n_valid_words += np.sum(target_weights)

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(target_weights)

            if current_step % steps_per_report == 0:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, train_n_tokens)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 0:

                i_checkpoint = int(current_step / steps_per_checkpoint)

                # train_ppx: per-token average loss; capped exp avoids overflow
                loss = loss / n_valid_words
                train_ppx = math.exp(float(loss)) if loss < 300 else float("inf")
                learning_rate = model.learning_rate.eval()

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(
                    i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                    learning_rate, dev_ppx, train_ppx)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(
                        sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path,
                                     global_step=i_checkpoint,
                                     write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    # dev improved: reset patience, overwrite "best" snapshot
                    # (global_step=0 keeps a single best-0 file)
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path,
                                          global_step=0,
                                          write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
def beam_decode():
    """Beam-search decode the test set, one sentence per batch.

    Loads test data via read_test, restores the model, initializes the beam
    decoder, then for each test sentence runs FLAGS.beam_step expansion
    steps of model.beam_step, keeping the FLAGS.beam_size highest-scoring
    (log-probability) partial sentences; with FLAGS.no_repeat a candidate
    word already present in its beam's sentence is skipped.

    Bug fixed: the per-sentence progress print referenced `n_total_user`,
    a name that only exists in recommend() — it raised NameError here.
    It now uses `test_total_size`, the number of test sentences.
    """
    # not yet tested:
    # known issues:
    # should use next_original

    mylog("Reading Data...")
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    # stash effective sizes/buckets on FLAGS so create_model sees them
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in xrange(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        mylog("before init_beam_decoder()")
        show_all_variables()
        model.init_beam_decoder(beam_size=FLAGS.beam_size,
                                max_steps=FLAGS.beam_step)
        model.init_beam_variables(sess)
        mylog("after init_beam_decoder()")
        show_all_variables()

        # inference mode: disable dropout
        sess.run(model.dropoutRate.assign(1.0))

        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None)
        ite = dite.next_sequence(stop=True, test=True)

        i_sent = 0
        for inputs, positions, valids, bucket_id in ite:
            # user : [0]
            # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
            # positions: [4]
            print("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            # do the following convert:
            # inputs: [[pad_id],[1],[2],[pad_id],[pad_id],[pad_id]]
            # positions:[2]
            PAD_ID = 0
            # replicate the last history token across the beam, blank out the
            # last two input positions, and step positions back accordingly
            last_history = inputs[positions[0]]
            inputs_beam = [last_history * FLAGS.beam_size]
            inputs[positions[0]] = list([PAD_ID] * FLAGS.beam_size)
            inputs[positions[0] - 1] = list([PAD_ID] * FLAGS.beam_size)
            positions[0] = positions[0] - 2 if positions[0] >= 2 else 0

            # per-beam running log-prob, decoded words, and parent pointers
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for x in xrange(FLAGS.beam_size)]
            beam_parent = range(FLAGS.beam_size)

            for i in xrange(FLAGS.beam_step):
                if i == 0:
                    # first step consumes the history; later steps only feed
                    # the previous beam words and their parent indexes
                    top_value, top_index = model.beam_step(
                        sess, index=i, word_inputs_history=inputs,
                        sequence_length=positions,
                        word_inputs_beam=inputs_beam)
                else:
                    top_value, top_index = model.beam_step(
                        sess, index=i, word_inputs_beam=inputs_beam,
                        beam_parent=beam_parent)

                # expand: collect (score, parent_beam, word) candidates
                global_queue = []
                if i == 0:
                    nrow = 1  # all beams share the single history row
                else:
                    nrow = top_index[0].shape[0]

                for row in xrange(nrow):
                    for col in xrange(top_index[0].shape[1]):
                        score = scores[row] + np.log(top_value[0][row, col])
                        word_index = top_index[0][row, col]
                        beam_index = row
                        if FLAGS.no_repeat:
                            # skip words already emitted on this beam
                            if not word_index in sentences[beam_index]:
                                global_queue.append(
                                    (score, beam_index, word_index))
                        else:
                            global_queue.append(
                                (score, beam_index, word_index))

                # keep the beam_size best candidates (highest score first)
                global_queue = sorted(global_queue, key=lambda x: -x[0])

                inputs_beam = []
                beam_parent = []
                scores = []
                temp_sentences = []

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                for j, (score, beam_index, word_index) in enumerate(
                        global_queue[:FLAGS.beam_size]):
                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)
                    inputs_beam.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                inputs_beam = [inputs_beam]
                sentences = temp_sentences

            if FLAGS.print_beam:
                print(sentences)