def load_data_from_raw(raw_data):
    """Load a raw dataset file and parse it with ``read_dataset``.

    :param raw_data: path of the raw data file; it is opened as UTF-8 text.
    :return: the four values produced by ``read_dataset``, in order
        ``(X, y, qids, comments)``.
    """
    with open(raw_data, mode='r', encoding='utf-8') as raw_file:
        # Unpack while the file is still open, in case the parser reads lazily.
        features, labels, query_ids, comments = read_dataset(raw_file)
        return features, labels, query_ids, comments
def test_data_ndcg(model_path, test_path):
    """Evaluate a saved LightGBM model on a test file and print its NDCG.

    The test file is parsed through ``load_data_from_raw`` (the same loader
    the rest of this script uses) instead of repeating the open/parse logic
    here. The model is loaded from ``model_path``, predictions are computed
    for every test row, and ``validate`` turns them into an NDCG figure
    averaged over all query ids.

    :param model_path: path of the saved LightGBM model file.
    :param test_path: path of the test data file (opened as UTF-8 text).
    :return: None; the result is printed to stdout.
    """
    # The per-row comments returned by the loader are not needed here.
    test_X, test_y, test_qids, _ = load_data_from_raw(test_path)

    booster = lgb.Booster(model_file=model_path)
    test_predict = booster.predict(test_X)

    # NOTE(review): the literal 60 is presumably the NDCG cutoff (NDCG@60);
    # confirm against validate's signature.
    average_ndcg, _ = validate(test_qids, test_y, test_predict, 60)

    # Average NDCG over all query ids.
    print("all qid average ndcg: ", average_ndcg)
    print("job done!")
train_start = datetime.now() x_train, y_train, q_train = load_data(data_feats, data_group) train(x_train, y_train, q_train, model_path) train_end = datetime.now() consume_time = (train_end - train_start).seconds print("consume time : {}".format(consume_time)) elif sys.argv[1] == '-predict': train_start = datetime.now() predict_data_path = base_path + '/data/test/test.txt' #格式如ranklib中的数据格式 test_X, test_y, test_qids, comments = load_data_from_raw(raw_data_path) t_results = predict(test_X, comments, model_path) print(t_results) train_end = datetime.now() consume_time = (train_end - train_start).seconds print("consume time : {}".format(consume_time)) elif sys.argv[1] == '-ndcg': # ndcg test_path = base_path + '/data/test/test.txt' #评估测试数据的平均ndcg test_data_ndcg(model_path, test_path) elif sys.argv[1] == '-feature': plot_print_feature_importance(model_path) elif sys.argv[1] == '-leaf': #利用模型得到样本叶结点的one-hot表示 raw_data = base_path + '/data/test/leaf.txt' with open(raw_data, 'r', encoding='utf-8') as testfile: test_X, test_y, test_qids, comments = read_dataset(testfile) get_leaf_index(test_X, model_path)