Example #1
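The snippets below are excerpts from a Python 2 / TensorFlow 1.x project. Helpers such as mylog, get_data, create_model, DataIterator, evaluate and sample_items, plus the module-level FLAGS and _buckets, come from the surrounding code base and are not reproduced here. A plausible shared import header for the excerpts (an assumption, not part of the original files):

import math
import os
import sys
import time

import numpy as np
import tensorflow as tf
from tensorflow.python.client import timeline
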
def train(raw_data=FLAGS.raw_data):

    # Read Data
    mylog("Reading Data...")
    train_set, dev_set, test_set, embAttr, START_ID, item_population, p_item, _, _, _, _, _ = get_data(
        raw_data, data_dir=FLAGS.data_dir)
    # total number of target items across all training buckets
    n_targets_train = np.sum(
        [np.sum([len(items) for uid, items in x]) for x in train_set])
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in xrange(len(train_bucket_sizes))
    ]
    dev_bucket_sizes = [len(dev_set[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_dev = int(dev_total_size / batch_size)

    # evaluate and checkpoint twice per epoch
    steps_per_checkpoint = int(steps_per_epoch / 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog(_buckets)
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("")
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement = False, device_count={'CPU':8, 'GPU':1})) as sess:
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, embAttr, START_ID, run_options,
                             run_metadata)
        show_all_variables()

        # Data Iterators
        dite = DataIterator(model, train_set, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0  # 0: draw next batch from a random bucket; 1: sequential
        if iteType == 0:
            mylog("withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        his = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        patience = FLAGS.patience
        item_sampled, item_sampled_id2idx = None, None

        while current_step < total_steps:

            # start
            start_time = time.time()

            # re-sample items every FLAGS.n_resample steps (sampled-loss modes)
            if (FLAGS.loss in ['mw', 'mce']
                    and current_step % FLAGS.n_resample == 0):
                item_sampled, item_sampled_id2idx = sample_items(
                    item_population, FLAGS.n_sampled, p_item)
            else:
                item_sampled = None

            # data and train
            users, inputs, outputs, weights, bucket_id = ite.next()

            L = model.step(sess,
                           users,
                           inputs,
                           outputs,
                           weights,
                           bucket_id,
                           item_sampled=item_sampled,
                           item_sampled_id2idx=item_sampled_id2idx)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
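            # np.sign() turns positive first-step weights into 1s, so this
            # counts the valid (non-padding) sentences in the batch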
            n_valid_sents += np.sum(np.sign(weights[0]))

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(weights)

            if current_step % steps_per_report == 0:
                mylog("--------------------" + "Report" + str(current_step) +
                      "-------------------")
                mylog(
                    "StepTime: {} Speed: {} targets / sec in total {} targets".
                    format(report_time / steps_per_report,
                           n_targets_report * 1.0 / report_time,
                           n_targets_train))

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 0:
                mylog("--------------------" + "TRAIN" + str(current_step) +
                      "-------------------")
                # Print statistics for the previous epoch.

                loss = loss / n_valid_sents
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                mylog(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))

                train_ppx = perplexity

                # zero the timer and loss accumulators for the next window
                step_time, loss, n_valid_sents = 0.0, 0.0, 0

                # dev data
                mylog("--------------------" + "DEV" + str(current_step) +
                      "-------------------")
                eval_loss, eval_ppx = evaluate(
                    sess,
                    model,
                    dev_set,
                    item_sampled_id2idx=item_sampled_id2idx)
                mylog("dev: ppx: {}".format(eval_ppx))

                his.append([current_step, train_ppx, eval_ppx])

                if eval_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = eval_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   "best.ckpt")
                    mylog("Saving best model....")
                    s = time.time()
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=0,
                                     write_meta_graph=False)
                    mylog("Best model saved using {} sec".format(time.time() -
                                                                 s))
                else:
                    patience -= 1

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                sys.stdout.flush()
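
The cumulative train_buckets_scale list is what lets training draw buckets in proportion to their size. A minimal sketch of such a draw (an assumption about what DataIterator.next_random does internally, which is not shown here):

import random

def sample_bucket(train_buckets_scale):
    # train_buckets_scale is cumulative, e.g. [0.2, 0.7, 1.0]: bucket 0
    # holds 20% of the data, bucket 1 another 50%, bucket 2 the rest
    r = random.random()
    return min(i for i, s in enumerate(train_buckets_scale) if s > r)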
Example #2
def recommend(raw_data=FLAGS.raw_data):

    # Read Data
    mylog("recommend")
    mylog("Reading Data...")
    _, _, test_set, embAttr, START_ID, _, _, evaluation, uinds, user_index, item_index, logit_ind2item_ind = get_data(
        raw_data, data_dir=FLAGS.data_dir, recommend=True)
    test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog(_buckets)
    mylog("Test:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, embAttr, START_ID, run_options,
                             run_metadata)
        show_all_variables()

        # disable dropout for inference (assumes dropoutRate is the keep rate)
        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_set, len(_buckets), batch_size, None)
        ite = dite.next_sequence(stop=True, recommend=True)

        n_total_user = len(uinds)
        n_recommended = 0
        # map each user index to its row, and pre-allocate top-k results
        uind2rank = {uind: r for r, uind in enumerate(uinds)}
        rec = np.zeros((n_total_user, FLAGS.topk), dtype=int)
        rec_value = np.zeros((n_total_user, FLAGS.topk), dtype=float)
        start = time.time()

        for users, inputs, positions, valids, bucket_id in ite:
            results = model.step_recommend(sess, users, inputs, positions,
                                           bucket_id)
            for i, valid in enumerate(valids):
                if valid == 1:
                    n_recommended += 1
                    if n_recommended % 1000 == 0:
                        mylog("Evaluating n {} bucket_id {}".format(
                            n_recommended, bucket_id))
                    uind, topk_values, topk_indexes = results[i]
                    rank = uind2rank[uind]
                    rec[rank, :] = topk_indexes
                    rec_value[rank, :] = topk_values
            n_steps += 1
        end = time.time()
        mylog("Time used {} sec for {} steps {} users ".format(
            end - start, n_steps, n_recommended))

        # invert item_index (item id -> index) into ind2id (index -> item id)
        ind2id = {}
        for iid in item_index:
            iind = item_index[iid]
            assert iind not in ind2id
            ind2id[iind] = iid

        # invert user_index the same way
        uind2id = {}
        for uid in user_index:
            uind = user_index[uid]
            assert uind not in uind2id
            uind2id[uind] = uid
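
        # two-hop remapping used below: a top-k logit index v maps to an
        # internal item index via logit_ind2item_ind, then to an external
        # item id via ind2id; e.g. with logit_ind2item_ind = {0: 7} and
        # ind2id = {7: "itemA"}, logit index 0 resolves to "itemA"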

        R = {}
        for i in xrange(n_total_user):
            uid = uind2id[uinds[i]]
            R[uid] = [ind2id[logit_ind2item_ind[v]] for v in list(rec[i, :])]

        evaluation.eval_on(R)

        scores_self, scores_ex = evaluation.get_scores()
        mylog(
            "====evaluation scores (NDCG, RECALL, PRECISION, MAP) @ 2,5,10,20,30===="
        )
        mylog("METRIC_FORMAT (self): {}".format(scores_self))
        mylog("METRIC_FORMAT (ex  ): {}".format(scores_ex))

        # save the two matrices: top-k item indices and their scores
        np.save(
            os.path.join(FLAGS.train_dir,
                         "top{}_index.npy".format(FLAGS.topk)), rec)
        np.save(
            os.path.join(FLAGS.train_dir,
                         "top{}_value.npy".format(FLAGS.topk)), rec_value)
Example #3
def train():

    # Read Data
    mylog_section("READ DATA")

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir, FLAGS.train_path_from, FLAGS.train_path_to,
        FLAGS.dev_path_from, FLAGS.dev_path_to, FLAGS.from_vocab_size,
        FLAGS.to_vocab_size)

    train_data_bucket = read_data(from_train, to_train)
    dev_data_bucket = read_data(from_dev, to_dev)
    _, _, real_vocab_size_from, real_vocab_size_to = data_utils.get_vocab_info(
        FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    # train_n_tokens: total number of target tokens in the training set
    train_n_tokens = np.sum(
        [np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_bucket_sizes = [
        len(train_data_bucket[b]) for b in xrange(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in xrange(len(train_bucket_sizes))
    ]
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_dev = int(dev_total_size / batch_size)
    steps_per_checkpoint = int(steps_per_epoch / 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.from_vocab_size))
    mylog("to_vocab_size: {}".format(FLAGS.to_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(
                os.path.join(FLAGS.summary_dir, "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0  # 0: random bucket order; 1: sequential
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:

            # start
            start_time = time.time()

            # data and train
            (source_inputs, target_inputs, target_outputs, target_weights,
             bucket_id) = ite.next()

            L = model.step(sess, source_inputs, target_inputs, target_outputs,
                           target_weights, bucket_id)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            n_valid_sents += np.sum(np.sign(target_weights[0]))
            n_valid_words += np.sum(target_weights)

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(target_weights)

            if current_step % steps_per_report == 0:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, train_n_tokens)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 0:

                i_checkpoint = int(current_step / steps_per_checkpoint)

                # train_ppx
                loss = loss / n_valid_words
                train_ppx = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                learning_rate = model.learning_rate.eval()

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(
                    i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                    learning_rate, dev_ppx, train_ppx)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(
                        sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "model")
                    s = time.time()
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=i_checkpoint,
                                     write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "best")
                    s = time.time()
                    model.best_saver.save(sess,
                                          checkpoint_path,
                                          global_step=0,
                                          write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # zero the timer and loss accumulators for the next window
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
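
Both train() variants share the same patience-based early stopping: reset the counter whenever dev perplexity improves, decrement it otherwise, and stop at zero. Distilled into a standalone helper (a sketch, not part of the original code):

def update_patience(dev_ppx, best_ppx, patience, max_patience):
    # returns (new_best_ppx, new_patience, stop_flag)
    if dev_ppx < best_ppx:
        return dev_ppx, max_patience, False
    patience -= 1
    return best_ppx, patience, patience <= 0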
Example #4
def beam_decode():
    # not yet tested:
    # known issues:
    #   should use next_original
    mylog("Reading Data...")
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path, vocab_path, FLAGS.L,
        FLAGS.n_bucket)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in xrange(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets:{}".format(FLAGS._buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        mylog("before init_beam_decoder()")
        show_all_variables()
        model.init_beam_decoder(beam_size=FLAGS.beam_size,
                                max_steps=FLAGS.beam_step)
        model.init_beam_variables(sess)
        mylog("after init_beam_decoder()")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_data_bucket, len(_buckets), batch_size,
                            None)
        ite = dite.next_sequence(stop=True, test=True)

        i_sent = 0
        for inputs, positions, valids, bucket_id in ite:
            # user : [0]
            # inputs: [[_GO],[1],[2],[3],[_EOS],[pad_id],[pad_id]]
            # positions: [4]

            print("--- decoding {}/{} sent ---".format(i_sent, n_total_user))
            i_sent += 1

            # do the following convert:
            # inputs: [[pad_id],[1],[2],[pad_id],[pad_id],[pad_id]]
            # positions:[2]
            PAD_ID = 0
            last_history = inputs[positions[0]]
            inputs_beam = [last_history * FLAGS.beam_size]
            inputs[positions[0]] = list([PAD_ID] * FLAGS.beam_size)
            inputs[positions[0] - 1] = list([PAD_ID] * FLAGS.beam_size)
            positions[0] = positions[0] - 2 if positions[0] >= 2 else 0
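
            # beam search state: accumulated log-prob per beam, decoded word
            # ids per beam, and a backpointer to each beam's parent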
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for _ in xrange(FLAGS.beam_size)]
            beam_parent = range(FLAGS.beam_size)

            for i in xrange(FLAGS.beam_step):
                if i == 0:
                    top_value, top_index = model.beam_step(
                        sess,
                        index=i,
                        word_inputs_history=inputs,
                        sequence_length=positions,
                        word_inputs_beam=inputs_beam)
                else:
                    top_value, top_index = model.beam_step(
                        sess,
                        index=i,
                        word_inputs_beam=inputs_beam,
                        beam_parent=beam_parent)

                # expand
                global_queue = []

                if i == 0:
                    nrow = 1
                else:
                    nrow = top_index[0].shape[0]

                for row in xrange(nrow):
                    for col in xrange(top_index[0].shape[1]):
                        score = scores[row] + np.log(top_value[0][row, col])
                        word_index = top_index[0][row, col]
                        beam_index = row

                        # optionally drop words already emitted on this beam
                        if (not FLAGS.no_repeat
                                or word_index not in sentences[beam_index]):
                            global_queue.append(
                                (score, beam_index, word_index))

                global_queue = sorted(global_queue, key=lambda x: -x[0])

                inputs_beam = []
                beam_parent = []
                scores = []
                temp_sentences = []

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                for j, (score, beam_index, word_index) in enumerate(
                        global_queue[:FLAGS.beam_size]):
                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)
                    inputs_beam.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                inputs_beam = [inputs_beam]
                sentences = temp_sentences

            if FLAGS.print_beam:
                print(sentences)
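
Each beam step above scores every (beam, word) continuation, sorts by accumulated log-probability, and keeps the top beam_size candidates. The same expansion as a self-contained function (a sketch using plain 2-D arrays in place of the model's outputs; no_repeat filtering omitted):

import numpy as np

def expand_beams(scores, sentences, top_value, top_index, beam_size):
    # top_value / top_index: (n_beams, k) arrays of probabilities / word ids
    queue = []
    for row in range(top_index.shape[0]):
        for col in range(top_index.shape[1]):
            queue.append((scores[row] + np.log(top_value[row, col]),
                          row, top_index[row, col]))
    queue.sort(key=lambda q: -q[0])
    new_scores, parents, words, new_sents = [], [], [], []
    for score, parent, word in queue[:beam_size]:
        new_scores.append(score)
        parents.append(parent)
        words.append(word)
        new_sents.append(sentences[parent] + [word])
    return new_scores, parents, words, new_sents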