def main():
    """Rank candidate shops for every user with the word2vec LR predictor.

    Loads the NMF user/item factor matrices and biases, builds the
    vector-based LR predictor from the 200-dim user/shop vectors, then for
    each user scores every shop not already seen in training and writes the
    top-10 (user_id, shop_id, score) tab-separated lines to
    './predict_res.all'.
    """
    global user_matrix, user_bias, item_matrix, item_bias, \
        user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')
    test_file = os.path.join(base_dir, '../comment.keyword.test')
    #test_file = os.path.join(base_dir, 'train/comment.mongo.train')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # load vectors and build the word2vec + logistic-regression predictor
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector,
                                     vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # FIX: the original used the Python 2 builtin file() and relied on an
    # explicit close that was skipped on any exception; open() in a `with`
    # block closes the output file on every exit path.
    with open('./predict_res.all', 'w') as fout:
        user_count = 0
        for user_id in user_ids:
            if user_count % 1000 == 0:
                logging.info('user count:%d' % user_count)
            user_count += 1
            # users with no training history cannot be filtered — skip them
            if user_id not in user_trained_shops:
                continue
            # bounded min-heap: keeps only the 10 highest-scored shops
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # if shop in training data, ignore it
                if shop_id in user_trained_shops[user_id]:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))
def main():
    """Rank candidate shops for every user with the word2vec LR predictor.

    NOTE(review): this is a byte-duplicate (modulo wrapping) of the earlier
    main() in this file; at import time this second definition shadows the
    first. Consider deleting one of the two.

    Loads the NMF user/item factor matrices and biases, builds the
    vector-based LR predictor from the 200-dim user/shop vectors, then for
    each user scores every shop not already seen in training and writes the
    top-10 (user_id, shop_id, score) tab-separated lines to
    './predict_res.all'.
    """
    global user_matrix, user_bias, item_matrix, item_bias, \
        user_score, item_score, global_bias, tfidf_predictor, vec_predictor

    base_dir = '../../paper/data/dianping/mf/'
    user_item_score_file = os.path.join(
        base_dir, 'train/comment.keyword.train.user_item_star')
    user_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.user')
    item_matrix_file = os.path.join(
        base_dir, 'out/comment.keyword.train.user_item_star.item')
    test_file = os.path.join(base_dir, '../comment.keyword.test')
    #test_file = os.path.join(base_dir, 'train/comment.mongo.train')

    logging.info('loading user matrix...')
    user_matrix, user_bias = load_nmf_matrix(user_matrix_file, print_log=True)
    logging.info('loading item matrix...')
    item_matrix, item_bias = load_nmf_matrix(item_matrix_file, print_log=True)
    logging.info('loading item score...')
    user_score, item_score, global_bias = load_user_item_score(
        user_item_score_file, print_log=True)
    logging.info('global_bias:%f' % global_bias)

    # load vectors and build the word2vec + logistic-regression predictor
    vector_directory = "../../paper/data/dianping/w2v/vector"
    model_directory = "../../paper/data/dianping/lr_model/"
    user_vector = os.path.join(vector_directory,
                               "comment.keyword.train.user.vector.200")
    shop_vector = os.path.join(vector_directory,
                               "comment.keyword.train.shop.vector.200")
    vector_model_file = os.path.join(model_directory, "w2v_200_lr")
    vec_predictor = vec_lr_predictor(user_vector, shop_vector,
                                     vector_model_file)

    user_trained_shops = load_user_trained_shops(user_item_score_file)
    shop_ids = load_ids(shop_vector)
    user_ids = load_ids(user_vector)

    # FIX: the original used the Python 2 builtin file() and relied on an
    # explicit close that was skipped on any exception; open() in a `with`
    # block closes the output file on every exit path.
    with open('./predict_res.all', 'w') as fout:
        for user_count, user_id in enumerate(user_ids):
            if user_count % 1000 == 0:
                logging.info('user count:%d' % user_count)
            # users with no training history cannot be filtered — skip them
            if user_id not in user_trained_shops:
                continue
            # bounded min-heap: keeps only the 10 highest-scored shops
            heap = MinSizeHeap(10)
            for shop_id in shop_ids:
                # if shop in training data, ignore it
                if shop_id in user_trained_shops[user_id]:
                    continue
                heap.push((vector_score_function(user_id, shop_id), shop_id))
            heap.sort()
            for score, shop_id in heap.arr:
                fout.write('%s\t%s\t%s\n' % (user_id, shop_id, score))