Example #1
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()  # saver used to checkpoint the variables

    save_dir = 'checkpoints/'  # checkpoint directory

    save_path = os.path.join(save_dir, 'model2')

    # if model.feature_projection is not None:
    #     # initialize item embedding with feature projection
    #     sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    # sample some users on which to compute validation recall
    valid_users = numpy.random.choice(list(set(valid.nonzero()[0])),
                                      size=1000,
                                      replace=False)

    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute recall on validate set
        valid_recalls = []

        # compute recall in chunks to exploit the speedup provided by TensorFlow
        for user_chunk in toolz.partition_all(100, valid_users):
            valid_recalls.append(validation_recall.eval(sess, user_chunk))
        print("\nRecall on (sampled) validation set: {}".format(
            numpy.mean(valid_recalls)))
        # TODO: early stopping based on validation recall

        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss = sess.run(
                (model.optimize, model.loss), {
                    model.user_positive_items_pairs: user_pos,
                    model.negative_samples: neg
                })

            losses.append(loss)

        print("\nTraining loss {}".format(numpy.mean(losses)))

        saver.save(sess=sess, save_path=save_path)
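The TODO in the docstring above is left open. A minimal patience-based sketch of that early stopping (the should_stop helper, the patience value, and the recall_history list are illustrative, not part of the original project):

def should_stop(recall_history, patience=5):
    """Return True once the newest `patience` recall values contain
    no improvement over the best value seen before them."""
    if len(recall_history) <= patience:
        return False
    return max(recall_history[-patience:]) <= max(recall_history[:-patience])

# usage inside the while-loop above (sketch):
#     recall_history.append(numpy.mean(valid_recalls))
#     if should_stop(recall_history):
#         break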
Example #2
File: MAML.py Project: xxyy1/MAML
def optimize(model, sampler, train, test, args):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    #sess.graph.finalize()
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))
    # all test users, used to compute test-set recall
    test_users = numpy.asarray(list(set(test.nonzero()[0])), dtype=numpy.int32)
    testresult = RecallEvaluator(model, train, test)
    epoch = 0
    tempbest = 0
    while True:
        print('\nepochs:{}'.format(epoch))
        epoch += 1
        # TODO: early stopping based on validation recall
        # train model
        losses = []
        feature_losses = []
        # run n mini-batches
        for _ in tqdm(range(args.eva_batches), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss, feature_loss = sess.run(
                (model.optimize, model.loss, model.feature_loss), {
                    model.user_positive_items_pairs: user_pos,
                    model.negative_samples: neg
                })
            losses.append(loss)
            feature_losses.append(feature_loss)
        #tf.train.Saver.save(sess,'my-model')
        # compute recall, ndcg, hr, pr on the test set
        test_recalls, test_ndcg, test_hr, test_pr = [], [], [], []
        for user_chunk in toolz.partition_all(100, test_users):
            recalls, ndcgs, hit_ratios, precisions = testresult.eval(
                sess, user_chunk)
            test_recalls.extend(recalls)
            test_ndcg.extend(ndcgs)
            test_hr.extend(hit_ratios)
            test_pr.extend(precisions)
        print("\nresult on test set: ndcg:{}, recall:{}, hr:{}, pr:{}".format(
            numpy.mean(test_ndcg), numpy.mean(test_recalls),
            numpy.mean(test_hr), numpy.mean(test_pr)))
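Examples #1 and #2 both rely on toolz.partition_all to push users through the TensorFlow graph in fixed-size chunks; a tiny self-contained illustration of what it yields:

import toolz

chunks = list(toolz.partition_all(3, range(7)))
print(chunks)  # [(0, 1, 2), (3, 4, 5), (6,)] -- the last chunk may be short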
Example #3
def xnmt_evaluate(args):
    """"Returns the eval score (e.g. BLEU) of the hyp sents using reference trg sents
  """
    cols = args.evaluator.split("|")
    eval_type = cols[0]
    eval_param = {} if len(cols) == 1 else {
        key: value
        for key, value in [param.split("=") for param in cols[1].split()]
    }

    hyp_postprocess = lambda line: line.split()
    ref_postprocess = lambda line: line.split()
    if eval_type == "bleu":
        ngram = int(eval_param.get("ngram", 4))
        evaluator = BLEUEvaluator(ngram=int(ngram))
    elif eval_type == "wer":
        evaluator = WEREvaluator()
    elif eval_type == "cer":
        evaluator = CEREvaluator()
    elif eval_type == "recall":
        nbest = int(eval_param.get("nbest", 5))
        hyp_postprocess = lambda x: ast.literal_eval(x)
        ref_postprocess = lambda x: int(x)
        evaluator = RecallEvaluator(nbest=int(nbest))
    else:
        raise RuntimeError("Unknown evaluation metric {}".format(eval_type))

    ref_corpus = read_data(args.ref_file, post_process=ref_postprocess)
    hyp_corpus = read_data(args.hyp_file, post_process=hyp_postprocess)
    len_before = len(hyp_corpus)
    ref_corpus, hyp_corpus = zip(
        *filter(lambda x: NO_DECODING_ATTEMPTED not in x[1],
                zip(ref_corpus, hyp_corpus)))
    if len(ref_corpus) < len_before:
        print("> ignoring %s out of %s test sentences." %
              (len_before - len(ref_corpus), len_before))

    eval_score = evaluator.evaluate(ref_corpus, hyp_corpus)
    return eval_score
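The args.evaluator string packs the metric name and its parameters into a single value; a standalone sketch of the same parsing (the spec value is a made-up input):

spec = "bleu|ngram=4"
cols = spec.split("|")
eval_type = cols[0]
eval_param = {} if len(cols) == 1 else {
    key: value
    for key, value in (param.split("=") for param in cols[1].split())
}
print(eval_type, eval_param)  # bleu {'ngram': '4'}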
Example #4
def optimize(model, rating_sampler, social_sampler, train, valid, test):
    """
    Optimize the model.
    :param model: model to optimize
    :param rating_sampler: mini-batch sampler for rating part
    :param social_sampler: mini-batch sampler for social part
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :param test: test user-item matrix
    :return: None
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # collect all users that have validation/test interactions
    valid_users = list(set(valid.nonzero()[0]))
    test_users = list(set(test.nonzero()[0]))

    best_val_hr_list = None
    best_val_ndcg_list = None
    test_hrs = None
    test_ndcgs = None
    best_val_ndcg_20 = -1
    epoch_count = 0
    endure_count = 0
    if manifold_name == 'Euclidean':
        num_user_chunk = 1000
    else:
        num_user_chunk = 100
    while True:
        epoch_count += 1
        endure_count += 1
        start = time.time()
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute hr and ndcg on validate set
        valid_hrs = [[], [], [], [], [], []]
        valid_ndcgs = [[], [], [], [], [], []]

        for user_chunk in toolz.partition_all(num_user_chunk, valid_users):
            hrs_1, hrs_5, hrs_10, hrs_15, hrs_20, hrs_50, ndcgs_1, ndcgs_5, ndcgs_10, ndcgs_15, ndcgs_20, ndcgs_50 = validation_recall.eval(
                sess, user_chunk)
            valid_hrs[0].extend(hrs_1)
            valid_hrs[1].extend(hrs_5)
            valid_hrs[2].extend(hrs_10)
            valid_hrs[3].extend(hrs_15)
            valid_hrs[4].extend(hrs_20)
            valid_hrs[5].extend(hrs_50)
            valid_ndcgs[0].extend(ndcgs_1)
            valid_ndcgs[1].extend(ndcgs_5)
            valid_ndcgs[2].extend(ndcgs_10)
            valid_ndcgs[3].extend(ndcgs_15)
            valid_ndcgs[4].extend(ndcgs_20)
            valid_ndcgs[5].extend(ndcgs_50)
        valid_hrs[0] = numpy.mean(valid_hrs[0])
        valid_hrs[1] = numpy.mean(valid_hrs[1])
        valid_hrs[2] = numpy.mean(valid_hrs[2])
        valid_hrs[3] = numpy.mean(valid_hrs[3])
        valid_hrs[4] = numpy.mean(valid_hrs[4])
        valid_hrs[5] = numpy.mean(valid_hrs[5])
        valid_ndcgs[0] = numpy.mean(valid_ndcgs[0])
        valid_ndcgs[1] = numpy.mean(valid_ndcgs[1])
        valid_ndcgs[2] = numpy.mean(valid_ndcgs[2])
        valid_ndcgs[3] = numpy.mean(valid_ndcgs[3])
        valid_ndcgs[4] = numpy.mean(valid_ndcgs[4])
        valid_ndcgs[5] = numpy.mean(valid_ndcgs[5])

        val_ndcg_20 = valid_ndcgs[-2]
        if val_ndcg_20 > best_val_ndcg_20:
            endure_count = 0
            best_val_ndcg_20 = val_ndcg_20
            best_val_hr_list = valid_hrs
            best_val_ndcg_list = valid_ndcgs
            test_hrs = [[], [], [], [], [], []]
            test_ndcgs = [[], [], [], [], [], []]
            test_recall = RecallEvaluator(model, train, test)

            for user_chunk in toolz.partition_all(num_user_chunk, test_users):
                hrs_1, hrs_5, hrs_10, hrs_15, hrs_20, hrs_50, ndcgs_1, ndcgs_5, ndcgs_10, ndcgs_15, ndcgs_20, ndcgs_50 = test_recall.eval(
                    sess, user_chunk)
                test_hrs[0].extend(hrs_1)
                test_hrs[1].extend(hrs_5)
                test_hrs[2].extend(hrs_10)
                test_hrs[3].extend(hrs_15)
                test_hrs[4].extend(hrs_20)
                test_hrs[5].extend(hrs_50)
                test_ndcgs[0].extend(ndcgs_1)
                test_ndcgs[1].extend(ndcgs_5)
                test_ndcgs[2].extend(ndcgs_10)
                test_ndcgs[3].extend(ndcgs_15)
                test_ndcgs[4].extend(ndcgs_20)
                test_ndcgs[5].extend(ndcgs_50)
            test_hrs[0] = numpy.mean(test_hrs[0])
            test_hrs[1] = numpy.mean(test_hrs[1])
            test_hrs[2] = numpy.mean(test_hrs[2])
            test_hrs[3] = numpy.mean(test_hrs[3])
            test_hrs[4] = numpy.mean(test_hrs[4])
            test_hrs[5] = numpy.mean(test_hrs[5])
            test_ndcgs[0] = numpy.mean(test_ndcgs[0])
            test_ndcgs[1] = numpy.mean(test_ndcgs[1])
            test_ndcgs[2] = numpy.mean(test_ndcgs[2])
            test_ndcgs[3] = numpy.mean(test_ndcgs[3])
            test_ndcgs[4] = numpy.mean(test_ndcgs[4])
            test_ndcgs[5] = numpy.mean(test_ndcgs[5])
        else:
            if endure_count >= 10:
                break

        print(
            "\n[Epoch %d] val HR: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], val NDCG: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], best val HR: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f]"
            ", best val NDCG: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], test HR: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], test NDCG: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f]"
            %
            (epoch_count, valid_hrs[0], valid_hrs[1], valid_hrs[2],
             valid_hrs[3], valid_hrs[4], valid_hrs[5], valid_ndcgs[0],
             valid_ndcgs[1], valid_ndcgs[2], valid_ndcgs[3], valid_ndcgs[4],
             valid_ndcgs[5], best_val_hr_list[0], best_val_hr_list[1],
             best_val_hr_list[2], best_val_hr_list[3], best_val_hr_list[4],
             best_val_hr_list[5], best_val_ndcg_list[0], best_val_ndcg_list[1],
             best_val_ndcg_list[2], best_val_ndcg_list[3],
             best_val_ndcg_list[4], best_val_ndcg_list[5], test_hrs[0],
             test_hrs[1], test_hrs[2], test_hrs[3], test_hrs[4], test_hrs[5],
             test_ndcgs[0], test_ndcgs[1], test_ndcgs[2], test_ndcgs[3],
             test_ndcgs[4], test_ndcgs[5]))

        # train model
        losses = []
        # run n mini-batches
        time1 = time.time()
        for _ in range(EVALUATION_EVERY_N_BATCHES):
            user_pos, neg = rating_sampler.next_batch()
            social_pos, social_neg = social_sampler.next_batch()
            _, loss = sess.run(
                (model.optimize, model.loss), {
                    model.user_positive_items_pairs: user_pos,
                    model.negative_samples: neg,
                    model.positive_social_pairs: social_pos,
                    model.negative_social_samples: social_neg
                })
            losses.append(loss)

        end = time.time()
        print('time1:', time1 - start, ' time2:', end - time1)
        print("\nTraining loss {} finished in {}s".format(
            numpy.mean(losses), end - start))
    print(
        "\nFinished. Best val HR: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f]"
        ", best val NDCG: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], test HR: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f], test NDCG: [%.4f,%.4f,%.4f,%.4f,%.4f,%.4f]"
        % (best_val_hr_list[0], best_val_hr_list[1], best_val_hr_list[2],
           best_val_hr_list[3], best_val_hr_list[4], best_val_hr_list[5],
           best_val_ndcg_list[0], best_val_ndcg_list[1], best_val_ndcg_list[2],
           best_val_ndcg_list[3], best_val_ndcg_list[4], best_val_ndcg_list[5],
           test_hrs[0], test_hrs[1], test_hrs[2], test_hrs[3], test_hrs[4],
           test_hrs[5], test_ndcgs[0], test_ndcgs[1], test_ndcgs[2],
           test_ndcgs[3], test_ndcgs[4], test_ndcgs[5]))
    # hyp_user_embeddings, hyp_item_embeddings = sess.run(model.save_embeddings)
    # pkl.dump(hyp_user_embeddings, open(), 'wb'))
    # pkl.dump(hyp_item_embeddings, open(), 'wb'))
    # print('Embeddings Saved.')
    rating_sampler.close()
    social_sampler.close()
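The per-cutoff bookkeeping in this example repeats the same six-way pattern for HR and NDCG. A compact equivalent, as a sketch only: it assumes eval returns the six HR lists followed by the six NDCG lists, in the order shown above.

import numpy
import toolz

K_LIST = [1, 5, 10, 15, 20, 50]

def mean_hr_ndcg(evaluator, sess, users, chunk_size):
    """Accumulate per-user HR/NDCG for each cutoff, then average."""
    hrs = [[] for _ in K_LIST]
    ndcgs = [[] for _ in K_LIST]
    for chunk in toolz.partition_all(chunk_size, users):
        results = evaluator.eval(sess, chunk)  # 6 HR lists, then 6 NDCG lists
        for idx in range(len(K_LIST)):
            hrs[idx].extend(results[idx])
            ndcgs[idx].extend(results[idx + len(K_LIST)])
    return [numpy.mean(v) for v in hrs], [numpy.mean(v) for v in ndcgs]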
Example #5
def optimize(model, sampler, train, valid, test):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    valid_users = list(set(valid.nonzero()[0]))

    stop_num = 10
    t_stop_num = 0
    stop_threshold = 0.005
    pre_recall = 0
    k_Mat = [5, 10, 20, 30, 40, 50]
    r_recalls = np.zeros([1, 6])
    r_precisions = np.zeros([1, 6])
    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        valid_recalls, valid_precisions = validation_recall.eval(
            sess, valid_users, 50)

        # early stopping based on validation recall (plateau test below)
        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()

            _, loss = sess.run(
                (model.optimize, model.loss), {
                    model.user_positive_items_pairs: user_pos,
                    model.negative_samples: neg
                })

            losses.append(loss)

        if abs(np.mean(valid_recalls) - pre_recall) < stop_threshold:
            t_stop_num = t_stop_num + 1
        else:
            t_stop_num = 0
            pre_recall = np.mean(valid_recalls)
        if t_stop_num > stop_num:
            # performance evaluation based on test set
            test_recall = RecallEvaluator(model, train, test)
            test_users = list(set(test.nonzero()[0]))

            r_aupr = test_recall.eval_val(sess, test_users, test)

            for idx, k in enumerate(k_Mat):
                print("k = ", k, "\n")
                test_recall_r, test_precision_r = test_recall.eval(
                    sess, test_users, k)
                print("\nRecall on (sampled) test set: {}".format(
                    np.mean(test_recall_r)))
                print("\nPrecision on (sampled) test set: {}".format(
                    np.mean(test_precision_r)))
                r_recalls[:, idx] = np.mean(test_recall_r)
                r_precisions[:, idx] = np.mean(test_precision_r)
            break

    sess.close()
    return r_recalls, r_precisions, r_aupr
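Example #5 stops once validation recall moves by less than stop_threshold for more than stop_num consecutive evaluations. The same plateau test isolated into a reusable helper (a sketch; the class name and defaults mirror the constants above but are illustrative):

class PlateauStopper:
    """Signal a stop after more than `patience` consecutive evaluations
    whose metric moved by less than `threshold`."""

    def __init__(self, threshold=0.005, patience=10):
        self.threshold = threshold
        self.patience = patience
        self.previous = 0.0
        self.stall_count = 0

    def update(self, metric):
        if abs(metric - self.previous) < self.threshold:
            self.stall_count += 1
        else:
            self.stall_count = 0
            self.previous = metric
        return self.stall_count > self.patience

# stopper = PlateauStopper()
# if stopper.update(np.mean(valid_recalls)):
#     ...  # switch to the test-set evaluation, as above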
Example #6
def optimize(model, sampler, train, valid, total_batch):
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    saver = tf.train.Saver()

    saver.save(sess, os.path.join(model_saving_path, model_name))
    # collect all users with validation interactions (no sampling here)
    validation_users = set(valid.nonzero()[0])
    #validation_users = np.random.choice(list(set(valid.nonzero()[0])),size = len(valid_users),replace = False)
    # evaluation cutoffs (note: no early stopping is implemented below)
    k1, k2 = 30, 50
    epoch = 0
    Recall = RecallEvaluator(model=model,
                             train_user_item_matrix=train,
                             test_user_item_matrix=valid)

    while True:

        # Train model
        Loss = []
        for i in tqdm(range(total_batch), desc="Training..."):
            user_item_pairs, neg_item = sampler.next_batch()
            _, loss = sess.run(
                (model.optimize, model.loss), {
                    model.user_item_pos_pairs: user_item_pairs,
                    model.neg_items: neg_item
                })
            Loss.append(loss)
        print("epoch: {} loss: {}".format(epoch, np.mean(Loss)))
        epoch += 1
        recalls_k1, precisions_k1, ndcgs_k1 = [], [], []
        recalls_k2, precisions_k2, ndcgs_k2 = [], [], []
        _maps, _mrrs, _aucs = [], [], []
        #validation_users = np.random.choice(list(set(valid.nonzero()[0])),size = VALID_USERS_NUMBERS,
        #                                    replace = False)
        for users_chunk in toolz.partition_all(100, validation_users):
            precision_k1, recall_k1, ndcg_k1 = Recall.precision_recall_ndcg_k(
                sess=sess, users=users_chunk, k=k1)
            precisions_k1.extend(precision_k1)
            recalls_k1.extend(recall_k1)
            ndcgs_k1.extend(ndcg_k1)

            precision_k2, recall_k2, ndcg_k2 = Recall.precision_recall_ndcg_k(
                sess=sess, users=users_chunk, k=k2)
            precisions_k2.extend(precision_k2)
            recalls_k2.extend(recall_k2)
            ndcgs_k2.extend(ndcg_k2)

            _map, _mrr, _auc, _ = Recall.map_mrr_auc_ndcg(sess=sess,
                                                          users=users_chunk)
            _maps.extend(_map)
            _mrrs.extend(_mrr)
            _aucs.extend(_auc)
        print("+" * 20)
        print("P@" + str(k1) + ": {}".format(np.mean(precisions_k1)))
        print("R@" + str(k1) + ": {}".format(np.mean(recalls_k1)))
        print("NDCG@" + str(k1) + ": {}".format(np.mean(ndcgs_k1)))
        print("-" * 20)
        print("P@" + str(k2) + ": {}".format(np.mean(precisions_k2)))
        print("R@" + str(k2) + ": {}".format(np.mean(recalls_k2)))
        print("NDCG@" + str(k2) + ": {}".format(np.mean(ndcgs_k2)))
        print("-" * 20)
        print("MAP: {}".format(np.mean(_maps)))
        print("MRR: {}".format(np.mean(_mrrs)))
        print("AUC: {}".format(np.mean(_aucs)))
        print("+" * 20)
        saver.save(sess, os.path.join(model_saving_path, model_name))
    sess.close()
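Note that the while True: loop in this example never breaks, so the trailing sess.close() is unreachable and a checkpoint is written unconditionally every epoch. A sketch of a guarded save plus a stop signal that would fix both (the update_best helper and the patience value are illustrative):

def update_best(best, current, stall, patience=5):
    """Track a maximised metric; return (best, stall, should_stop)."""
    if current > best:
        return current, 0, False
    return best, stall + 1, stall + 1 >= patience

# inside the epoch loop, after computing the metrics (sketch):
#     best, stall, stop = update_best(best, np.mean(recalls_k2), stall)
#     if stall == 0:  # new best: checkpoint it
#         saver.save(sess, os.path.join(model_saving_path, model_name))
#     if stop:
#         break  # makes sess.close() reachable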
Example #7
def optimize(model,
             sampler,
             train,
             valid,
             test,
             args,
             item_neighbors,
             user_neighbors,
             early_stopping_n=5):
    """
    Optimize the model. DONETODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))
    # all validation/test users, used for recall evaluation
    valid_users = numpy.asarray(list(set(valid.nonzero()[0])),
                                dtype=numpy.int32)
    test_users = numpy.asarray(list(set(test.nonzero()[0])), dtype=numpy.int32)
    validresult = RecallEvaluator(model, train, valid)
    testresult = RecallEvaluator(model, train, test)

    # A note on this setup: every block of mini-batches is followed by one
    # evaluation, and that unit is counted as one training epoch.
    # This continues the historical design of CML and MAML:
    # to speed things up, CML draws each mini-batch from the training set
    # in parallel worker processes,
    # so it is hard to tell when the whole training set has been traversed,
    # and the parallel sampler drops the last incomplete batch,
    # so an epoch cannot be defined in the traditional way.
    epoch = 0
    # counter used for early stopping
    fail_cnt = 0

    # best_ndcg=-100.0
    best_recall = -100.0
    # best_hr=-100.0
    # best_pr=-100.0
    saver = tf.train.Saver()

    while True:
        print('\nepochs:{}'.format(epoch), file=outputfile)
        epoch += 1
        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(args.eva_batches), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            # print("get next batch",datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])

            # (N+N*W, 1)
            all_item_ids = numpy.concatenate(
                (user_pos[:, 1], numpy.reshape(neg, (-1))), axis=0)
            # print("concat all item",datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])

            # first draw random integers, used below to index the samples;
            # in the datasets used in the experiments, the numbers of users
            # and items stay below 1,000,000, enough to cover every neighbor;
            # raise this bound for larger datasets
            all_item_neis_id_ranindex = numpy.random.randint(
                1000000, size=all_item_ids.shape[0])

            # take the modulo, then index into the neighbor list;
            # of the several sampling methods tried, this implementation was
            # the fastest, finishing in a few (or a few tens of) milliseconds,
            # while the alternatives were slower
            # (converting the user-neighbor lists to a numpy array for fancy
            # indexing was tried too, but was slower than per-element lookup);
            # the result is finally fed into TensorFlow
            # (N+N*W, 1)
            all_item_neis_id_sample = [
                item_neighbors[i][ranindex % len(item_neighbors[i])]
                for i, ranindex in zip(all_item_ids, all_item_neis_id_ranindex)
            ]
            # assert len(all_item_neis_id_sample)==all_item_ids.shape[0]
            # print("sample all item nei done",datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])
            # assert len(all_item_neis_id_sample[0])==1

            # (N+N*W, 1)
            user_ids = numpy.concatenate(
                (user_pos[:, 0],
                 numpy.tile(numpy.expand_dims(user_pos[:, 0], 1),
                            (1, args.num_neg)).flatten()),
                axis=0)
            user_neis_id_ranindex = numpy.random.randint(
                1000000, size=user_ids.shape[0])
            # (N+N*W, 1)
            user_neis_id_sample = [
                user_neighbors[i][ranindex % len(user_neighbors[i])]
                for i, ranindex in zip(user_ids, user_neis_id_ranindex)
            ]
            # print("sample user nei done",datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])
            # assert len(user_neis_id_sample[0])==1
            # raise NotImplementedError

            _, loss = sess.run(
                (model.optimize, model.loss), {
                    model.user_positive_items_pairs: user_pos,
                    model.negative_samples: neg,
                    model.all_item_neis_id_sample: all_item_neis_id_sample,
                    model.user_neis_id_sample: user_neis_id_sample,
                })
            losses.append(loss)
        # training logs go both to stdout and to the log file
        print("\nTrain loss: {}".format(numpy.mean(losses)), file=outputfile)
        print("\nTrain loss: {}".format(numpy.mean(losses)))

        # evaluate on the validation set using recall only; the other metrics
        # are MAML leftovers that are simply zeroed out during evaluation,
        # no longer computed, to save time
        valid_recalls, valid_ndcg, valid_hr, valid_pr = [], [], [], []
        for user_chunk in toolz.partition_all(100, valid_users):
            recalls, ndcgs, hit_ratios, precisions = validresult.eval(
                sess, user_chunk, item_neighbors, user_neighbors)
            valid_recalls.extend(recalls)
            valid_ndcg.extend(ndcgs)
            valid_hr.extend(hit_ratios)
            valid_pr.extend(precisions)
        ndcg_mean = numpy.mean(valid_ndcg)
        recall_mean = numpy.mean(valid_recalls)
        hr_mean = numpy.mean(valid_hr)
        pr_mean = numpy.mean(valid_pr)
        print("\nresult on valid set: recall:{}".format(recall_mean),
              file=outputfile)
        print("\nresult on valid set: recall:{}".format(recall_mean))

        # check for overfitting on the validation set: if recall has not
        # improved after the given number of epochs (10 in the experiments),
        # trigger early stopping;
        # the model is saved each time a new best is reached, so that it can
        # be restored later for test-set evaluation
        if recall_mean <= best_recall:
            fail_cnt += 1
        else:
            # best_ndcg=ndcg_mean
            best_recall = recall_mean
            # best_hr=hr_mean
            # best_pr=pr_mean
            fail_cnt = 0
            saver.save(
                sess,
                os.path.join(os.getcwd(),
                             "models_{:%Y%m%d_%H%M%S}/".format(nowdate),
                             Filename + "_model.ckpt"))
            print("Best result!", file=outputfile)
            print("Best result!")
            # print(saver.last_checkpoints[-1])
        outputfile.flush()
        if fail_cnt >= early_stopping_n:
            break

    # restore the best model and evaluate it on the test set
    # saver.restore(sess, saver.last_checkpoints[-1])
    ckpt_state = tf.train.get_checkpoint_state(
        "models_{:%Y%m%d_%H%M%S}".format(nowdate))
    with open("test_pred.txt", 'w') as ftest:
        saver.restore(sess, ckpt_state.model_checkpoint_path)
        test_recalls, test_ndcg, test_hr, test_pr = [], [], [], []
        for user_chunk in toolz.partition_all(100, test_users):
            recalls, ndcgs, hit_ratios, precisions = testresult.eval(
                sess, user_chunk, item_neighbors, user_neighbors, ftest=ftest)
            test_recalls.extend(recalls)
            test_ndcg.extend(ndcgs)
            test_hr.extend(hit_ratios)
            test_pr.extend(precisions)
        ndcg_mean = numpy.mean(test_ndcg)
        recall_mean = numpy.mean(test_recalls)
        hr_mean = numpy.mean(test_hr)
        pr_mean = numpy.mean(test_pr)
        print("\nresult on test set: recall:{}".format(recall_mean),
              file=outputfile)
        print("\nresult on test set: recall:{}".format(recall_mean))
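The neighbor sampling in this example pre-draws large random integers once per batch and reduces each one modulo the corresponding neighbor list's length, which the comments report as the fastest of the approaches tried. The trick in isolation, on toy data (all names and values here are illustrative):

import numpy

neighbors = {0: [7, 8], 1: [3], 2: [5, 6, 9]}  # toy lists of uneven length
ids = numpy.array([2, 0, 2, 1])

rand = numpy.random.randint(1000000, size=ids.shape[0])  # one vectorized draw
sampled = [neighbors[i][r % len(neighbors[i])] for i, r in zip(ids, rand)]
print(sampled)  # e.g. [9, 7, 5, 3]: one (near-)uniform neighbor per id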
Example #8
def optimize(model,
             sampler,
             train,
             valid,
             test,
             train_exp_neg,
             valid_exp_neg,
             test_exp_neg,
             epochs=10):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :param epochs: amount of epochs to run
    :return: None
    """
    merged_summary_op = tf.summary.merge_all()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #if model.feature_projection is not None:
        # initialize item embedding with feature projection
        #    sess.run(tf.assign(model.item_embeddings, model.feature_projection))

        # sample some users (from the test split) to track recall during training
        test_users = numpy.random.choice(list(set(test.nonzero()[0])),
                                         size=1000,
                                         replace=False)

        # Various sections that would be needed for TensorBoard are commented
        # out because they do not work on the cluster

        # Initiate summary writer and give unique log dir to all
        #logs=str(os.getcwd())+'/train'
        users_name = "_not_named"
        #users_name=raw_input("Enter a name for this runs log:")

        #log_dir=logs+"/iters_"+str(epochs*EVALUATION_EVERY_N_BATCHES)+"_time_"+str(datetime.datetime.now()).replace(" ","_")+"__"+str(users_name)
        #if not os.path.exists(log_dir):
        #    os.makedirs(log_dir)
        # copy the metadata
        #copyfile(logs+"/projector_config.pbtxt", log_dir+"/projector_config.pbtxt")
        #create statistics of the run
        #stat_file=open(log_dir+'/stat_file.dat', 'w+')

        #train_writer = tf.summary.FileWriter(log_dir,
        #                                     graph=tf.get_default_graph())
        #saver = tf.train.Saver()
        # init history for plotting with matplotlib
        history = dict()
        history["Recall"] = []
        history["Prec"] = []
        history["NIT5"] = []
        history["NIT10"] = []

        for x in tqdm(range(epochs), desc='Epochs running...'):
            # create evaluator on validation set
            validation_recall = RecallEvaluator(model, train, test,
                                                train_exp_neg, test_exp_neg)
            # compute recall on validate set
            valid_recalls = []
            valid_precision_at_len_test = []
            exp_neg_items_in_top_k = []
            exp_neg_items_in_top_5 = []
            exp_neg_items_in_top_10 = []

            # compute recall in chunks to exploit the speedup provided by TensorFlow
            for user_chunk in toolz.partition_all(100, test_users):
                val_recall, val_precision_at_len_test, exp_neg_in_top_5, exp_neg_in_top_10 = validation_recall.eval(
                    sess, user_chunk, k=50)
                valid_recalls.extend([val_recall])
                exp_neg_items_in_top_5.extend([exp_neg_in_top_5])
                exp_neg_items_in_top_10.extend([exp_neg_in_top_10])
                valid_precision_at_len_test.extend([val_precision_at_len_test])
            flatten = lambda l: [item for sublist in l for item in sublist]
            his_rec, hist_prec, his_nit5, hist_nit10 = log_eval_stats_binned(
                flatten(valid_recalls), flatten(valid_precision_at_len_test),
                flatten(exp_neg_items_in_top_k),
                flatten(exp_neg_items_in_top_5),
                flatten(exp_neg_items_in_top_10))

            history["Recall"] += [his_rec]
            history["Prec"] += [hist_prec]
            history["NIT5"] += [his_nit5]
            history["NIT10"] += [hist_nit10]

            #NITK_summary=tf.summary.scalar("NITK", numpy.mean(exp_neg_items_in_top_k))
            # TODO: early stopping based on validation recall
            # train model
            losses = []
            model.merged_summary_op = tf.summary.merge_all()
            #train_writer.add_summary(model.merged_summary_op, x)
            # run n mini-batches
            for i in tqdm(range(EVALUATION_EVERY_N_BATCHES),
                          desc="Optimizing..."):
                user_pos, neg, user_exp_neg, pos_neg_pairs = sampler.next_batch()
                _, loss, summary = sess.run(
                    (model.optimize, model.loss, model.merged_summary_op), {
                        model.user_positive_items_pairs: user_pos,
                        model.negative_samples: neg,
                        model.user_exp_neg_items_pairs: user_exp_neg,
                        model.pos_neg_pairs: pos_neg_pairs
                    })
                #train_writer.add_summary(summary, i + (x * EVALUATION_EVERY_N_BATCHES))
                #saver.save(sess, os.path.join(log_dir, "model.ckpt"), i + (x * EVALUATION_EVERY_N_BATCHES))
                losses.append(loss)
            print "\nEpoch:" + str(x),
            print("\nTraining loss {}".format(numpy.mean(losses)))
        print(10 * "\n")
        print("Training has ended!")
        print(10 * "\n")

        # final evaluation on the held-out validation split

        validation_recall = RecallEvaluator(model, train, valid, train_exp_neg,
                                            valid_exp_neg)

        valid_users = numpy.random.choice(
            list(set(valid.nonzero()[0])),
            size=int(len(list(set(valid.nonzero()[0])))),
            replace=False)
        printValForAll(valid_users,
                       validation_recall,
                       sess,
                       k=10,
                       nI1=1,
                       nI2=5)
        printValForAll(valid_users,
                       validation_recall,
                       sess,
                       k=20,
                       nI1=10,
                       nI2=20)
        printValForAll(valid_users,
                       validation_recall,
                       sess,
                       k=50,
                       nI1=30,
                       nI2=40)
        printValForAll(valid_users,
                       validation_recall,
                       sess,
                       k=75,
                       nI1=50,
                       nI2=60)
        printValForAll(valid_users,
                       validation_recall,
                       sess,
                       k=100,
                       nI1=70,
                       nI2=80)

        #print 5*"\n"
        #stat_file.writelines("Val_recall at 50 on test set:{}".format(numpy.mean(val_recall))+"\n")
        #stat_file.writelines("NIT5 on test set:{}".format(numpy.mean(exp_neg_in_top_5))+"\n")
        #stat_file.writelines("NIT10 test set:{}".format(numpy.mean(exp_neg_in_top_10))+"\n")
        #stat_file.writelines("Precision at 50 test set: {}".format(numpy.mean(val_precision_at_len_test))+"\n")

        # print "Starting tsne"
        # print model.item_embeddings
        # tsne.tsne(model.item_embeddings, no_dims=2, initial_dims=100, perplexity=30.0)
    print "Starting summary:"
    # plot the current run
    #and ma a file
    pp = PdfPages('LastRunCMLEN_' + "" + '.pdf')
    plt.figure(1)
    plt.xlabel('Epochs, each epoch is iterations: ' +
               str(EVALUATION_EVERY_N_BATCHES))
    plt.title('Recall')
    plt.plot(history["Recall"])
    pp.savefig()

    plt.figure(2)
    plt.xlabel('Epochs, each epoch is iterations: ' +
               str(EVALUATION_EVERY_N_BATCHES))

    plt.title('Precision')
    plt.plot(history["Prec"])
    pp.savefig()

    plt.figure(3)
    plt.xlabel('Epochs, each epoch is iterations: ' +
               str(EVALUATION_EVERY_N_BATCHES))
    plt.title('NIT5')
    plt.plot(history["NIT5"])
    pp.savefig()

    plt.figure(4)
    plt.xlabel('Epochs, each epoch is iterations: ' +
               str(EVALUATION_EVERY_N_BATCHES))
    plt.title('NIT10')
    plt.plot(history["NIT10"])
    pp.savefig()

    # Close the PDF, the session, and the sampler
    pp.close()
    sess.close()
    sampler.close()

    #try:
    #    os.system("tensorboard --logdir="+logs)
    #    "Started tensorboard"
    #except:
    #    print "Sth with your log dir is wrong"

    return
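The four nearly identical plotting blocks above can be collapsed into a loop. A sketch that writes one figure per metric to the same kind of PDF (it assumes the history dict and the batches-per-epoch constant from above; the output path is illustrative):

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

def plot_history(history, batches_per_epoch, path='LastRunCMLEN_.pdf'):
    """One PDF page per metric curve, mirroring the figures above."""
    with PdfPages(path) as pp:
        for name in ("Recall", "Prec", "NIT5", "NIT10"):
            fig = plt.figure()
            plt.xlabel('Epochs, each epoch is iterations: ' +
                       str(batches_per_epoch))
            plt.title(name)
            plt.plot(history[name])
            pp.savefig(fig)
            plt.close(fig)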