Ejemplo n.º 1
0
def main():
    """Score every unseen (user, shop) pair and dump the kept results.

    Loads the factorisation matrices and the user/item score tables into
    module-level globals, builds the word-vector LR predictor, then for each
    user that appears in the training data scores every shop the user has not
    already rated with ``vector_score_function``. Candidates are collected in
    a ``MinSizeHeap(10)`` (presumably the 10 best scores -- confirm against
    ``MinSizeHeap``) and written to ``./predict_res.all`` as
    ``user_id<TAB>shop_id<TAB>score`` lines.
    """
    global user_matrix, user_bias, item_matrix, item_bias
    global user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # Word-vector inputs and the LR model trained on them.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector,
                                     vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # `with` guarantees the output file is closed even if scoring raises.
    with open('./predict_res.all', 'w') as fout:
        for user_count, user_id in enumerate(user_ids):
            if user_count % 1000 == 0:
                logging.info('user count:%d' % user_count)
            # Users without training data are skipped entirely.
            if user_id not in user_trained_shops:
                continue
            # Look the user's training shops up once, not once per shop.
            trained_shops = user_trained_shops[user_id]
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # Shops already present in the training data are not
                # recommended again.
                if shop_id in trained_shops:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))
Ejemplo n.º 2
0
def main():
    """Generate per-user shop predictions and write them to disk.

    The function populates the module-level model state (user/item matrices,
    biases, score tables and ``vec_predictor``), then iterates over all user
    ids found in the user vector file. For every user present in the training
    data, each shop not already in that user's training set is scored with
    ``vector_score_function`` and pushed into a ``MinSizeHeap(10)``
    (presumably retaining the 10 best entries -- verify in ``MinSizeHeap``).
    The heap contents are written to ``./predict_res.all``, one
    ``user_id<TAB>shop_id<TAB>score`` record per line.
    """
    global user_matrix, user_bias, item_matrix, item_bias
    global user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(base_dir, 'out/comment.keyword.train.user_item_star.item')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # Paths of the word vectors and of the LR model built on top of them.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector, vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # The context manager closes the file on every exit path, including errors.
    with open('./predict_res.all', 'w') as fout:
        for user_count, user_id in enumerate(user_ids):
            if user_count % 1000 == 0:
                logging.info('user count:%d' % user_count)
            if user_id not in user_trained_shops:
                # No training data for this user: nothing to exclude or score.
                continue
            # Fetched once per user so the inner loop only does membership tests.
            seen_shops = user_trained_shops[user_id]
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # Skip shops that already appear in the user's training data.
                if shop_id in seen_shops:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))
Ejemplo n.º 3
0
def main():
    """Compute and report the RMSE of the word-vector LR predictor.

    Loads the factorisation matrices and the user/item score tables into
    module-level globals, builds ``vec_predictor`` from the word-vector files,
    and evaluates ``vector_score_function`` on the test file via ``cal_rmse``.
    The RMSE is printed to stdout; the miss counters (``user_miss``,
    ``item_miss``, ``all_miss``, read from module scope) and the predictor's
    hit/miss counts are logged.

    NOTE: the TF-IDF predictor is not built here; ``tfidf_predictor`` is
    declared global but never assigned by this function.
    """
    global user_matrix, user_bias, item_matrix, item_bias
    global user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')
    test_file = os.path.join(base_dir, '../comment.keyword.test')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # Word-vector inputs and the LR model trained on them.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector")
    vector_model_file = os.path.join(model_directory, "w2v_500")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector,
                                     vector_model_file)

    logging.info('calculating rmse...')
    rmse = cal_rmse(test_file, vector_score_function)
    # Single-argument parenthesised form: prints the same text under the
    # Python 2 print statement and parses as a call on Python 3.
    print('rmse:%lf' % rmse)
    logging.info('user_miss:%d, item_miss:%d, all_miss: %d' %
                 (user_miss, item_miss, all_miss))
    logging.info('vec_predictor.hit:%d, miss:%d' %
                 (vec_predictor.hit, vec_predictor.miss))
Ejemplo n.º 4
0
def main():
    """Evaluate ``vector_score_function`` on the test set and report its RMSE.

    Module-level model state (user/item matrices, biases, score tables and
    ``vec_predictor``) is populated first. ``cal_rmse`` is then run over the
    test file; the resulting RMSE is printed to stdout, and the module-level
    miss counters (``user_miss``, ``item_miss``, ``all_miss``) together with
    ``vec_predictor.hit`` / ``vec_predictor.miss`` are logged.

    NOTE: the TF-IDF predictor is not constructed here; ``tfidf_predictor``
    appears in the ``global`` declaration but is never assigned in this
    function.
    """
    global user_matrix, user_bias, item_matrix, item_bias
    global user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = "../../paper/data/dianping/mf/"
    user_item_score_file = os.path.join(base_dir, "train/comment.keyword.train.user_item_star")
    user_matrix_file = os.path.join(base_dir, "out/comment.keyword.train.user_item_star.user")
    item_matrix_file = os.path.join(base_dir, "out/comment.keyword.train.user_item_star.item")
    test_file = os.path.join(base_dir, "../comment.keyword.test")

    logging.info("loading user matrix...")
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info("loading item matrix...")
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info("loading item score...")
    user_score, item_score, global_bias = load_user_item_score(user_item_score_file, print_log=True)
    logging.info("global_bias:%f" % global_bias)

    # Paths of the word vectors and of the LR model built on top of them.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector")
    vector_model_file = os.path.join(model_directory, "w2v_500")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector, vector_model_file)

    logging.info("calculating rmse...")
    rmse = cal_rmse(test_file, vector_score_function)
    # Parenthesised single argument: same output with the Python 2 print
    # statement, and valid syntax for the Python 3 print function.
    print("rmse:%lf" % rmse)
    logging.info("user_miss:%d, item_miss:%d, all_miss: %d" % (user_miss, item_miss, all_miss))
    logging.info("vec_predictor.hit:%d, miss:%d" % (vec_predictor.hit, vec_predictor.miss))