def main():
    """Load the trained models and write per-user shop predictions to disk.

    Populates the module-level globals (factor matrices, biases, score
    tables and the vector-based predictor), then scores every shop that a
    user has not already seen in the training data and writes the entries
    retained by a ``MinSizeHeap(10)`` to ``./predict_res.all``.

    Each output line has the form ``user_id<TAB>shop_id<TAB>score``.
    Users that do not appear in the training data are skipped.
    """
    global user_matrix, user_bias, item_matrix, item_bias, user_score, item_score, global_bias, tfidf_predictor, vec_predictor
    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(base_dir, 'out/comment.keyword.train.user_item_star.item')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(user_item_score_file, print_log=True)
    logging.info('global_bias:%f', global_bias)

    # Load the user/shop vectors and the model used by the vector predictor.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory, "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory, "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector, vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # `open` inside `with` replaces the Python-2-only `file()` builtin and
    # guarantees the handle is closed even if scoring raises midway.
    with open('./predict_res.all', 'w') as fout:
        for user_count, user_id in enumerate(user_ids):
            if user_count % 1000 == 0:
                logging.info('user count:%d', user_count)
            if user_id not in user_trained_shops:
                continue
            # Look up the user's training shops once rather than per shop.
            trained_shops = user_trained_shops[user_id]
            # NOTE(review): MinSizeHeap is defined elsewhere; presumably it
            # retains only the 10 best-scoring entries -- confirm.
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # Shops already present in the training data are not candidates.
                if shop_id in trained_shops:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))
# Example no. 2
# 0
def main():
    """Generate shop predictions for every known user and save them.

    The function first fills the module-level globals with the loaded
    factor matrices, biases, score tables and the vector-based predictor.
    It then iterates over all user ids, scores each shop the user does not
    already have in the training data, and writes the entries kept by a
    ``MinSizeHeap(10)`` to ``./predict_res.all`` as tab-separated
    ``user_id``, ``shop_id``, ``score`` lines.

    Users missing from the training data produce no output.
    """
    global user_matrix, user_bias, item_matrix, item_bias, user_score, item_score, global_bias, tfidf_predictor, vec_predictor
    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f', global_bias)

    # Paths of the user/shop vector files and of the vector predictor model.
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector,
                                     vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # The context manager closes the output file on every exit path; the
    # original `file()` builtin exists only in Python 2 and leaked the
    # handle whenever scoring raised.
    with open('./predict_res.all', 'w') as fout:
        for user_count, user_id in enumerate(user_ids):
            if user_count % 1000 == 0:
                logging.info('user count:%d', user_count)
            if user_id not in user_trained_shops:
                continue
            # Hoisted: the set of training shops is invariant per user.
            trained_shops = user_trained_shops[user_id]
            # NOTE(review): presumably MinSizeHeap(10) keeps the 10
            # highest-scoring (score, shop_id) pairs -- verify its definition.
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # Skip shops the user already has in the training data.
                if shop_id in trained_shops:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))