Example #1
def test_data_ndcg(model_path, test_path):
    '''
    Evaluate NDCG on the test data.
    '''
    with open(test_path, 'r', encoding='utf-8') as testfile:
        test_X, test_y, test_qids, comments = letor.read_dataset(testfile)

    gbm = lgb.Booster(model_file=model_path)
    test_predict = gbm.predict(test_X)

    average_ndcg, _ = ndcg.validate(test_qids, test_y, test_predict, 60)
    # average NDCG over all qids
    print("all qid average ndcg: ", average_ndcg)
    print("job done!")
Example #2
def model(train_data, test_data, train_x, test_x, train_y, test_y, objective,
          metric):
    fp = open("train_model.txt", "a+")
    param = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'num_leaves': 16,
        'num_trees': 100,
        'objective': 'multiclassova',
        'metric': 'multi_error',
        'max_bin': 255,
        'learning_rate': 0.05,
        'early_stopping': 10
    }
    param.update({'objective': objective})
    param.update({'metric': metric})
    print(param)
    print("multierror")

    num_round = 10

    bst = lgb.train(param, train_data, num_round, valid_sets=[test_data])
    bst.save_model('model.txt')
    mybst = lgb.Booster(model_file='model.txt')  # reload the saved model (unused below)
    ypred_train = bst.predict(train_x)
    ypred_test = bst.predict(test_x)
    print("ypred_train", ypred_train)
    print("ypred_test", ypred_test)
    train_pred = [1 if p > 0.5 else 0 for p in ypred_train]
    test_pred = [1 if p > 0.5 else 0 for p in ypred_test]

    print('The train accuracy of prediction is:',
          accuracy_score(train_y, train_pred))
    print('The test accuracy of prediction is:',
          accuracy_score(test_y, test_pred))

    cv_results = lgb.cv(param, train_data, num_round, nfold=5)
    name = metric + "-mean"
    print('best n_estimators:', (len(cv_results[name])))
    print('best cv score:', (pd.Series(cv_results[name]).min()), '\n')
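A minimal driver for the `model` function above, assuming `lightgbm`, `pandas` and sklearn's `accuracy_score` are imported as the snippet expects, an older LightGBM whose `lgb.cv` returns plain `'<metric>-mean'` keys, and illustrative toy arrays (all assumptions, not part of the original):

import lightgbm as lgb
import numpy as np

# Hypothetical toy data standing in for the real features and labels.
X_train, y_train = np.random.random((200, 5)), np.random.randint(0, 2, 200)
X_test, y_test = np.random.random((50, 5)), np.random.randint(0, 2, 50)

# Build the lgb.Dataset pair the function expects alongside the raw arrays.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

model(train_data, test_data, X_train, X_test, y_train, y_test,
      objective='binary', metric='binary_error')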
Example #3
def test_5model():
    # model_goss_noall_0.889: 0.658, model_gbdt_0.947: 0.612, model_goss_0.93: 0.642, model_dart_0.921: 0.637
    # model_dart_5league0.953: 0.676, model_gbdt_5league0.953: 0.651
    model_file = root + 'usemodel\\model_gbdt_5league0.953.txt'
    gbm = lgb.Booster(model_file=model_file)

    print('Starting prediction...')
    test_data = pd.read_table(root + 'predict_0531_5league.txt')
    testlabel = np.array(test_data.label)
    testdata = np.array(test_data.drop("label", axis=1))
    y_predt = gbm.predict(testdata, num_iteration=gbm.best_iteration)
    y_preds = [list(x).index(max(x)) for x in y_predt]  # 3 value

    get_accuracy(y_preds, testlabel)
Example #4
def predict():
    model_file = root + 'usemodel\\model_dart_10000.675.txt'  # model_dart_10000.675
    gbm = lgb.Booster(model_file=model_file)

    print('Starting prediction...')

    predict_data = pd.read_table(root +
                                 'predict_zcw2020-08-01.txt')  # predict0619
    predictdata = np.array(predict_data)
    y_predt = gbm.predict(predictdata, num_iteration=gbm.best_iteration)
    # print(y_predt)
    # y_preds = [ 1 if i >=0.5 else 0 for i in y_predt]  # 2 value
    # y_preds = [list(x).index(max(x)) for x in y_predt]  # 3 value
    pred_limit(y_predt, 0.6)
Example #5
def predict():
    model_file = root + 'usemodel\\model_gbdt_score_1.0.txt'  #   model_goss_0.889
    gbm = lgb.Booster(model_file=model_file)

    print('Starting prediction...')
    y_predt = gbm.predict(predictdata, num_iteration=gbm.best_iteration)
    #print(y_predt)
    # y_preds = [ 1 if i >=0.5 else 0 for i in y_predt]  # 2 value
    y_preds = [list(x).index(max(x)) for x in y_predt]  # 3 value

    print(y_preds)

    dt = pd.DataFrame(y_preds)
    dt.to_csv(root + "result10624.csv", encoding='utf_8_sig')
Example #6
def lgbPredict(predictInput, modelFile):
    """
    Desc:借助已经跑出的模型,来预测A榜的正确性
    """
    model = lgb.Booster(model_file='./model/' + modelFile)  #init model

    dfData = pd.read_csv(predictInput)
    # drop the USRID column; the uid is not a feature
    data = dfData.drop(labels='USRID', axis=1)
    preds = model.predict(data)
    preds = pd.DataFrame(preds)
    preds.columns = ['RST']
    df = pd.concat([dfData[['USRID']], preds], axis=1)
    df.to_csv('./model/test_result.csv', sep='\t', index=False)
Example #7
    def load_models(self, directory):
        if not os.path.exists(directory):
            raise IOError(directory + ' does not exist')

        print('loading models from ', directory, '  ...................')
        # For now this simply loads every model file in the directory; it may change later.
        for files in os.listdir(directory):
            model_file_path = os.path.join(directory, files)
            if os.path.isfile(model_file_path):
                best_model = lgb.Booster(model_file=model_file_path)
                self.best_model.append(best_model)
                self.best_round_list.append(best_model.best_iteration)

        print('completed loading models from ', directory, '  ..........')
Example #8
    def _preprocess(self, data):
        preprocessed_data = {}
        filesDatas = []
        for k, v in data.items():
            for file_name, file_content in v.items():
                test_data = pd.read_csv(file_content)
                test_set = feat(test_data)
                if test_data['Frequency Band'][0] == 2585.0:
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2585.0')
                    clf = lgb.Booster(model_file=lgb_model_path)
                    pb_data = clf.predict(test_set, num_iteration=clf.best_iteration)
                elif test_data['Frequency Band'][0] == 2604.8:
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2604.8')
                    clf = lgb.Booster(model_file=lgb_model_path)
                    pb_data = clf.predict(test_set, num_iteration=clf.best_iteration)
                elif test_data['Frequency Band'][0] == 2624.6:
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2624.6')
                    clf = lgb.Booster(model_file=lgb_model_path)
                    pb_data = clf.predict(test_set, num_iteration=clf.best_iteration)
                else:
                    # average the three band-specific models
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2585.0')
                    clf1 = lgb.Booster(model_file=lgb_model_path)
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2604.8')
                    clf2 = lgb.Booster(model_file=lgb_model_path)
                    lgb_model_path = os.path.join(self.model_path, 'lgb_model_2624.6')
                    clf3 = lgb.Booster(model_file=lgb_model_path)
                    
                    pb_data1 = clf1.predict(test_set, num_iteration=clf1.best_iteration).reshape(-1)
                    pb_data2 = clf2.predict(test_set, num_iteration=clf2.best_iteration).reshape(-1)
                    pb_data3 = clf3.predict(test_set, num_iteration=clf3.best_iteration).reshape(-1)
                    pb_data = (pb_data1 + pb_data2 + pb_data3) / 3

                input_data = np.array(pb_data.reshape(-1, 1))
                print(file_name, input_data.shape)
                filesDatas.append(input_data)

        filesDatas = np.array(filesDatas, dtype=np.float32).reshape(-1, 1)
        preprocessed_data['inputs'] = filesDatas
        print("preprocessed_data['inputs'].shape = ", preprocessed_data['inputs'].shape)

        return preprocessed_data
Example #9
    def test(self):
        X_train, X_test, y_train, y_test = train_test_split(
            *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 1,
            "num_leaves": 15,
            "verbose": -1
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())
        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            np.savetxt(f, X_test, delimiter=',')
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)
        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            self.assertEqual(*preds)
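The manual `Booster`/`update()` loop in this test is the low-level path; as a point of reference, a sketch of the equivalent high-level call with the same `params` (names reused from the test, 30 rounds as above):

# Equivalent high-level training: lgb.train drives Booster.update internally.
bst = lgb.train(params, train_data, num_boost_round=30,
                valid_sets=[valid_data], valid_names=["valid_1"])
pred_from_matr = bst.predict(X_test)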
Example #10
def predict_with_lgbm_building(test_df, row_ids, model_filepath):
    """"
    Takes a given directory which contains n folders (one for each
    building) and then predicts the rows with the respective models
    :param test_df: DataFrame containing the test data
    :param row_ids: A vector with the matching row ids for the predicted labels
    :param model_filepath: Directory that contains the trained models
    :return: Vector containing the predicted labels for the test data
    """
    buildings_in_dir = sorted(os.listdir(model_filepath), key=int)
    test_df["row_id"] = row_ids
    test_df = test_df.drop(columns=["site_id"])
    test_df = test_df.groupby("building_id")

    predictions_by_building = []
    row_id_by_building = []
    for b in buildings_in_dir:
        test_by_building = test_df.get_group(int(b))
        test_by_building = test_by_building.reset_index(drop=True)
        rows_grouped = list(test_by_building["row_id"])
        test_by_building = test_by_building.drop(columns=["building_id"])

        models_in_dir = os.listdir(model_filepath + "/" + b)
        num_models = len(models_in_dir)
        predictions_group = np.zeros(len(rows_grouped))
        for i, model in enumerate(models_in_dir, start=1):
            click.echo("Predicting Building " + b + " [" + str(i) + "/" +
                       str(num_models) + "]")
            lgbm_model = lgb.Booster(model_file=model_filepath + "/" + b +
                                     "/" + model)
            predictions_current = lgbm_model.predict(test_by_building)
            predictions_group += np.expm1(predictions_current)

        predictions_group = predictions_group / num_models
        predictions_by_building.extend(list(predictions_group))
        row_id_by_building.extend(rows_grouped)

    # Order the predictions by merging them to the original row ids
    pred_df = pd.DataFrame({
        "row_id": row_id_by_building,
        "pred": predictions_by_building
    })
    pred_df = pred_df.sort_values("row_id")
    predictions = pred_df["pred"].copy(deep=True)
    predictions[predictions < 0] = 0
    return predictions
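The `np.expm1` applied to each model's output implies the per-building models were trained on `log1p`-transformed targets; a sketch of the matching training-side transform, with illustrative names (an assumption, not shown in the original):

# Hypothetical training counterpart: fit on log1p(y) so that
# np.expm1(model.predict(...)) is back on the original scale.
train_set = lgb.Dataset(X_train, label=np.log1p(y_train))
booster = lgb.train(params, train_set)
preds = np.expm1(booster.predict(X_test))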
Example #11
def predict(model_path,
            X_test,
            is_lgbm=False,
            is_catboost=False,
            is_cnn=False,
            maxlen=400,
            lgbm_threshold=0.5):
    """
    Load the model and predict on unseen data.
    """

    print('\n === predict === \n')

    if is_lgbm:
        # lightgbm
        model = lgb.Booster(model_file=model_path)
    elif is_catboost:
        model = CatBoostClassifier()
        model = model.load_model(model_path)
    elif is_cnn:
        model = load_model(model_path)
    else:
        # sklearn
        # xgboost
        model = joblib.load(model_path)

    # y_pred = model.predict_prob(X_test)
    y_pred = model.predict(X_test)

    if is_lgbm:
        y_output = [1 if y > lgbm_threshold else 0 for y in y_pred]
        return np.array(y_output)
        # return np.array([np.argmax(y) for y in y_pred])
    elif is_cnn:
        # X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        y_pred = model.predict(X_test)
        y_pred = [np.argmax(y) for y in y_pred]
        return np.array(y_pred)
    else:
        return y_pred
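A hedged usage sketch for the dispatcher above; the model path and the prepared feature matrix `X_test` are assumptions:

# Hypothetical call: load a saved LightGBM text model and binarize at 0.5.
labels = predict('model.txt', X_test, is_lgbm=True, lgbm_threshold=0.5)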
Example #12
    def evaluate_params(
        trial: optuna.trial.Trial,
        train_data: lgb.Dataset,
        validation_data: lgb.Dataset,
    ) -> float:
        """Compute out-of-sample performance for a parameter set."""
        params = {}
        params["num_iterations"] = trial.suggest_int(
            "num_iterations", 8, 128)
        params["learning_rate"] = trial.suggest_uniform(
            "learning_rate", 2**-5, 0.5)
        params["num_leaves"] = trial.suggest_int("num_leaves", 8, 256)
        params["max_depth"] = trial.suggest_int("max_depth", 4, 32)
        params["min_data_in_leaf"] = trial.suggest_int(
            "min_data_in_leaf", 4, 512)
        params["min_sum_hessian_in_leaf"] = trial.suggest_uniform(
            "min_sum_hessian_in_leaf", 2**-5, 0.25)
        params["bagging_freq"] = trial.suggest_int("bagging_freq", 0, 1)
        params["bagging_fraction"] = trial.suggest_uniform(
            "bagging_fraction", 0.5, 1)
        params["feature_fraction"] = trial.suggest_uniform(
            "feature_fraction", 0.5, 1)
        params["lambda_l1"] = trial.suggest_uniform("lambda_l1", 0, 64)
        params["lambda_l2"] = trial.suggest_uniform("lambda_l2", 0, 64)
        params["min_gain_to_split"] = trial.suggest_uniform(
            "min_gain_to_split", 0, 0.25)
        params["min_data_per_group"] = trial.suggest_int(
            "min_data_per_group", 1, 512)
        params["max_cat_threshold"] = trial.suggest_int(
            "max_cat_threshold", 1, 512)
        params["cat_l2"] = trial.suggest_uniform("cat_l2", 0, 64)
        params["cat_smooth"] = trial.suggest_uniform("cat_smooth", 0, 2048)
        params["max_cat_to_onehot"] = trial.suggest_int(
            "max_cat_to_onehot", 1, 64)
        params["max_bin"] = trial.suggest_int("max_bin", 32, 1024)
        params["min_data_in_bin"] = trial.suggest_int(
            "min_data_in_bin", 1, 64)
        params["objective"] = self.objective
        params["num_class"] = self.num_class
        params["verbosity"] = -1
        booster = lgb.Booster(params=params, train_set=train_data)
        booster.add_valid(validation_data, "validation_set")
        for step in range(params["num_iterations"]):
            booster.update()
            validation_loss = booster.eval_valid()[0][2]
            trial.report(validation_loss, step)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        return validation_loss
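A sketch of how `evaluate_params` would typically be driven, assuming `train_data` and `validation_data` are constructed `lgb.Dataset` objects and that the function is callable in a scope where `self.objective` and `self.num_class` resolve; the study settings are assumptions:

import optuna

# Minimize validation loss; the median pruner consumes the
# trial.report / trial.should_prune calls made inside evaluate_params.
study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=8))
study.optimize(
    lambda trial: evaluate_params(trial, train_data, validation_data),
    n_trials=100)
print(study.best_params)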
Example #13
def get_leaf_index(data, model_path):
    '''
    Get the leaf indices and one-hot encode them.
    :param data:
    :param model_path:
    :return:
    '''

    gbm = lgb.Booster(model_file=model_path)
    ypred = gbm.predict(data, pred_leaf=True)

    one_hot_encoder = OneHotEncoder()
    x_one_hot = one_hot_encoder.fit_transform(ypred)
    print(x_one_hot.shape)
    print(x_one_hot.toarray())
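The one-hot leaf matrix is the classic GBDT-leaf-feature trick; a minimal sketch of the usual follow-up step, assuming `get_leaf_index` is adapted to return `x_one_hot` and that labels `y` exist for the same rows (both assumptions):

from sklearn.linear_model import LogisticRegression

# Hypothetical downstream model: logistic regression on one-hot leaf indices.
lr = LogisticRegression(max_iter=1000)
lr.fit(x_one_hot, y)
proba = lr.predict_proba(x_one_hot)[:, 1]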
Example #14
def lgb_predict(logger, model_file, test_file, output_file, predictors):
    logger.print_log('loading test data...')
    test_df = pd.read_csv(test_file, dtype=default_config.dtypes)
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    logger.print_log("Predicting...")
    model = lgb.Booster(model_file=model_file)
    sub['is_attributed'] = model.predict(test_df[predictors])
    logger.print_log("writing to file <" + output_file + ">...")
    if output_file[-2:] == "gz":
        sub.to_csv(output_file, index=False, compression='gzip')
    else:
        sub.to_csv(output_file, index=False)
    logger.print_log("done.")
Example #15
def select_model(feats):
    model_id = 0
    model_folder_path = './models/'
    models = ['clf_pre.txt', 'clf_A.txt', 'clf_B.txt', 'clf_C.txt']
    fundings = [
        'PreSeries_post_money_valuation_usd_augmented',
        'RoundA_post_money_valuation_usd_augmented',
        'RoundB_post_money_valuation_usd_augmented',
        'RoundC_post_money_valuation_usd_augmented',
    ]
    for i, rnd in enumerate(fundings):
        if feats[rnd] and float(feats[rnd]) > 0:
            model_id = i
    return model_id, lgb.Booster(model_file=model_folder_path +
                                 models[model_id])
Example #16
def get_fold_mae(save_path_orig, split, fold, num_folds, train_val_split,
                 features, targets):
    fold_description = get_fold_description(fold, num_folds)
    save_path = '{}-{}-{}.txt'.format(save_path_orig, split, fold_description)
    loaded_model = lgb.Booster(model_file=save_path)
    valid_ids = train_val_split[split][fold][1] if fold < num_folds else (
        np.arange(features.shape[0]))
    x_valid = features.iloc[valid_ids]
    y_valid = targets[valid_ids]
    valid_preds = loaded_model.predict(x_valid)
    oof_mae = np.abs(valid_preds - y_valid).mean()
    error_description = "OOF MAE" if fold < num_folds else "Train error"
    print('{}: {}'.format(error_description, np.round(oof_mae, 3)))

    return oof_mae, valid_ids.size
Example #17
def mypredict(testInputs, predictDate, ROUND):
    average = None
    for i in range(ROUND):
        model_save_path = os.path.join(
            LOCALDATAPATH, 'lightgbmModel2',
            str(predictDate) + '_' + str(i + 1) + '.txt')
        gbm = lgb.Booster(model_file=model_save_path)
        predict = gbm.predict(testInputs)
        if i == 0:
            average = predict
        else:
            average = average + predict
    # divide once, after all ROUND model predictions have been accumulated
    average = average / ROUND
    return average
Example #18
    def test_add_features_same_booster_behaviour(self):
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        names = ['col_%d' % (i, ) for i in range(5)]
        for j in range(1, 5):
            d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            d = lgb.Dataset(X, feature_name=names).construct()
            y = np.random.random(1000)
            d1.set_label(y)
            d.set_label(y)
            b1 = lgb.Booster(train_set=d1)
            b = lgb.Booster(train_set=d)
            for k in range(10):
                b.update()
                b1.update()
            dname = self.tempFileName()
            d1name = self.tempFileName()
            b1.save_model(d1name)
            b.save_model(dname)
            self.assertFilesEqual(d1name, dname)
            os.remove(d1name)
            os.remove(dname)
Example #19
def lgb_predict(model_filename, test):

    print(model_filename)
    lgb_model = lgb.Booster(model_file=path + '/models/' + model_filename)
    print("Loading Model ...")
    print("Predicting ... ")
    submit = lgb_model.predict(test)
    submit = np.argmax(submit, axis=1)
    print("Predict over ...")
    for i, v in enumerate(submit):
        submit[i] = service_label[v]
    data = pd.read_csv('./data/submit_sample.csv')
    data['current_service'] = submit
    data['current_service'] = data['current_service'].astype(int)
    data.to_csv('./temp/lgb_prediction-new.csv', index=False)
Example #20
def import_models():
    stations = []
    models = []
    model_files = glob.glob(BASE + "/lgb_model/*.txt")

    for file in model_files:
        model_station, model_id = file.replace(BASE + "/lgb_model/", "").replace(".txt", "").split("_")
        stations.append([model_station, model_id])
        model_tmp = lgb.Booster(model_file=file)
        models.append([model_tmp])

    stations = np.array(stations)
    return stations, models
Example #21
def lgb_cv_predict(logger, model_files, test_file, output_file, predictors):
    logger.print_log('loading test data...')
    test_df = pd.read_csv(test_file, dtype=default_config.dtypes)
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    logger.print_log("Predicting...")
    sub['is_attributed'] = 0
    for model_file in model_files:
        model = lgb.Booster(model_file=model_file)
        sub['is_attributed'] += model.predict(test_df[predictors])
    sub['is_attributed'] = sub['is_attributed'] / len(model_files)
    logger.print_log("writing to file <" + output_file + ">...")
    sub.to_csv(output_file, index=False)
    logger.print_log("done.")
Example #22
def build_gbm():

    home = str(Path.home())

    if not os.path.isfile(home + '/.deepface/weights/face-recognition-ensemble-model.txt'):
        print("face-recognition-ensemble-model.txt will be downloaded...")
        url = 'https://raw.githubusercontent.com/serengil/deepface/master/deepface/models/face-recognition-ensemble-model.txt'
        output = home + '/.deepface/weights/face-recognition-ensemble-model.txt'
        gdown.download(url, output, quiet=False)

    ensemble_model_path = home + '/.deepface/weights/face-recognition-ensemble-model.txt'

    deepface_ensemble = lgb.Booster(model_file=ensemble_model_path)

    return deepface_ensemble
Example #23
    def post_init(self):
        super().post_init()
        if self.model_path and os.path.exists(self.model_path):
            import lightgbm
            self.booster = lightgbm.Booster(model_file=self.model_path)
            model_num_features = self.booster.num_feature()
            expected_num_features = len(self.query_feature_names +
                                        self.match_feature_names)
            if model_num_features != expected_num_features:
                raise ValueError(
                    f'The number of features expected by the LightGBM model {model_num_features} is different '
                    f'from the ones provided in input {expected_num_features}')
        else:
            raise PretrainedModelFileDoesNotExist(
                f'model {self.model_path} does not exist')
Example #24
    def _load(self, models_dir: pathlib.Path):
        import lightgbm as lgb

        seeds = [123] if self.seeds is None else self.seeds
        self.gbms_ = np.array(
            [
                [
                    lgb.Booster(
                        model_file=str(models_dir / f"model.fold{fold}.seed{seed}.txt")
                    )
                    for seed in seeds
                ]
                for fold in range(self.nfold)
            ]
        )
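A sketch of the prediction side for the fold-by-seed grid loaded above, averaging over every booster; the method name and its placement on the same class are assumptions:

import numpy as np

def _predict_mean(self, X):
    # Average predictions across all nfold x len(seeds) boosters in self.gbms_.
    preds = [gbm.predict(X) for row in self.gbms_ for gbm in row]
    return np.mean(preds, axis=0)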
Example #25
def pred_lgbm(X_test,
              categorical_features,
              feature_name,
              fold_id,
              lgb_params,
              fit_params,
              model_name,
              score_func,
              calc_importances=True):
    model = lgb.Booster(
        model_file=('{}_fold{}.txt'.format(model_name, fold_id)))
    y_pred_test = model.predict(X_test)
    y_pred_test[y_pred_test < 0] = 0

    return y_pred_test
Example #26
def get_predictions_per_era(df=None,
                            num_models=1,
                            prefix=None,
                            folder_name=None,
                            era_idx=[],
                            model_type='xgb',
                            rank_average=False):
    """

    :param df: dataframe with the features used to train and predict
    :param num_models: number of models in the folder
    :param prefix: prefix to choose specific models from the folder - use it only if you had run a CV scheme
                   for many different targets or something
    :param folder_name: name of the folder
    :param era_idx: indices of dataframe
    :param model_type: xgb or lgb
    :param rank_average: True to rank the predictions per era, False to rank over the whole dataframe
    :return: final predictions with proper dimensions for further use
    """
    model_lst = bf.get_model_lst(num_models=num_models,
                                 prefix=prefix,
                                 folder_name=folder_name)
    predictions_total = []

    X_test = df

    for cv_num in range(num_models):
        if model_type == 'lgb':
            model = lgb.Booster(model_file=model_lst[cv_num])
        if model_type == 'xgb':
            model = bf.create_model(model_type='xgb')
            model.load_model(model_lst[cv_num])

        predictions = predict_in_era_batch(model=model,
                                           df=X_test,
                                           era_idx=era_idx,
                                           rank_per_era=rank_average)

        predictions_total.append(predictions)

    if rank_average:
        scaler = MinMaxScaler(feature_range=(0, 1))
        predictions_final = scaler.fit_transform(
            X=np.mean(predictions_total, axis=0).reshape(-1, 1))
    else:
        predictions_final = np.mean(predictions_total, axis=0)

    return predictions_final.squeeze()
Example #27
    def test_pandas_categorical(self):
        params = {      # needs more thorough tests
            "objective": "binary", "metric": "logloss", 'early_stop': 5, 'num_boost_round': 50,
            "verbosity": 1,
        }
        import pandas as pd
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": np.random.permutation([True, False] * 150)})  # bool
        y = np.random.permutation([0, 1] * 150)
        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": np.random.permutation([True, False] * 30)})
        X, X_test = Mort_Preprocess.OrdinalEncode_(X, X_test)
        for col in ["A", "B", "C", "D"]:
            X[col] = X[col].astype('category')
            X_test[col] = X_test[col].astype('category')
        #trn_data = lgb.Dataset(X, label=y)

        if isMORT:
            mort0 = LiteMORT(params).fit(X, y)
            pred0 = list(mort0.predict(X_test))
            mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0])
            pred1 = list(mort1.predict(X_test))
            mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A'])
            pred2 = list(mort2.predict(X_test))
            mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
            pred3 = list(mort3.predict(X_test))
        else:
            clf=lgb.sklearn.LGBMClassifier()
            gbm_ = clf.fit(X, y)
            gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
            pred0 = list(gbm0.predict(X_test))
            gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
            pred1 = list(gbm1.predict(X_test))
            gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
            pred2 = list(gbm2.predict(X_test))
            gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
            pred3 = list(gbm3.predict(X_test))
            gbm3.booster_.save_model('categorical.model')
            gbm4 = lgb.Booster(model_file='categorical.model')
            pred4 = list(gbm4.predict(X_test))
            pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
            np.testing.assert_almost_equal(pred_prob, pred4)
            input("...")
Example #28
File: ball.py Project: ljt0000mf/ML
def predict():
    model_file = root + 'usemodel\\goss_73_model0621.txt'
    gbm = lgb.Booster(model_file=model_file)

    print('Starting prediction...')
    predict_data = pd.read_table(root + 'predict.txt')  # predict0619
    predictdata = np.array(predict_data)
    y_predt = gbm.predict(predictdata, num_iteration=gbm.best_iteration)
    # print(y_predt)
    # y_preds = [ 1 if i >=0.5 else 0 for i in y_predt]  # 2 value
    y_preds = [list(x).index(max(x)) for x in y_predt]  # 3 value

    print(y_preds)

    dt = pd.DataFrame(y_preds)
    dt.to_csv(root + "result1.csv", encoding='utf_8_sig')
Example #29
    def load_parameters(self, params):
        # Load model parameters
        h5_model_base64 = params.get('h5_model_base64', None)
        data_config_base64 = params.get('data_config_base64', None)

        data_config_bytes = base64.b64decode(
            data_config_base64.encode('utf-8'))
        self._features, self._target = pickle.loads(data_config_bytes)

        with tempfile.NamedTemporaryFile() as tmp:
            # Convert back to bytes & write to temp file
            h5_model_bytes = base64.b64decode(h5_model_base64.encode('utf-8'))
            with open(tmp.name, 'wb') as f:
                f.write(h5_model_bytes)
            # Load model from temp file
            self._model = lgb.Booster(model_file=tmp.name)
Example #30
    def load_pickle(self):
        self.le_color = joblib.load("le_color.pkl")
        self.le_fuel = joblib.load("le_fuel.pkl")
        self.le_trans = joblib.load("le_trans.pkl")
        self.st_price = joblib.load("st_price.pkl")
        self.st_weight = joblib.load("st_weight.pkl")
        self.st_age = joblib.load("st_age.pkl")
        self.st_km = joblib.load("st_km.pkl")
        self.st_cc = joblib.load("st_cc.pkl")
        self.st_hp = joblib.load("st_hp.pkl")
        for i in range(4):
            m_path = "model_{i}.txt".format(i=i + 1)
            print(m_path)
            m = lgb.Booster(model_file=m_path)
            print(type(m))
            self.models.append(m)