Example #1
 def test_classification(self):
     data, target = load_breast_cancer(return_X_y=True)
     x_train, x_test, y_train, y_test = train_test_split(data,
                                                         target,
                                                         test_size=0.2,
                                                         random_state=42)
     ngb = NGBoost(Base=default_tree_learner,
                   Dist=Bernoulli,
                   Score=MLE,
                   verbose=False)
     ngb.fit(x_train, y_train)
     preds = ngb.pred_dist(x_test)
     score = roc_auc_score(y_test, preds.prob)
     assert score >= 0.95
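This test method relies on imports from its enclosing test module. A minimal sketch of that scaffolding, mirroring the imports of the standalone script in Example #5 (the TestCase class name is made up):

import unittest

from ngboost.ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


class TestNGBoost(unittest.TestCase):
    # paste test_classification (above) into this class body
    pass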
Example #2
 def test_regression(self):
     data, target = load_boston(return_X_y=True)
     x_train, x_test, y_train, y_test = train_test_split(data,
                                                         target,
                                                         test_size=0.2,
                                                         random_state=42)
     ngb = NGBoost(Base=default_tree_learner,
                   Dist=Normal,
                   Score=MLE,
                   natural_gradient=True,
                   verbose=False)
     ngb.fit(x_train, y_train)
     preds = ngb.predict(x_test)
     score = mean_squared_error(y_test, preds)
     assert score <= 8.0
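Note: these snippets span two revisions of the ngboost API. Some pass the scoring rule as a class (Score=MLE) and others as an instance (Score=MLE()); use whichever form your installed version expects.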
Example #3
def ngb_impute(estimator, X, Y):
    base_name_to_learner = {
        "tree": default_tree_learner,
        "linear": default_linear_learner,
    }

    ngb = NGBoost(Dist=estimator,
                  n_estimators=200,
                  learning_rate=.05,
                  natural_gradient=True,
                  verbose=False,
                  minibatch_frac=1.0,
                  Base=base_name_to_learner[LEARNER],
                  Score=MLE)

    train = ngb.fit(X, Y)
    Y_imputed = np.copy(Y)

    cens_mask = (Y['Event'] == 0)
    min_vals = Y['Time'][cens_mask]
    pred_dists = train.pred_dist(X[cens_mask])

    try:
        outputs = pred_dists.loc[:, 0]
    except IndexError:
        outputs = pred_dists.loc

    # mus = pred_dists.loc
    # sigmas = pred_dists.scale
    # preds = cond_expectation(estimator, mus, sigmas, min_vals)

    Y_imputed['Time'][cens_mask] = np.exp(outputs)
    return Y_imputed
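ngb_impute indexes Y by the field names 'Event' and 'Time', so it expects a structured array of survival outcomes, and it reads a module-level LEARNER constant. A sketch of a call, with LEARNER and the choice of LogNormal both assumed rather than taken from the snippet:

import numpy as np
from ngboost.distns import LogNormal

LEARNER = "tree"  # assumed module-level constant read inside ngb_impute

# toy survival outcomes: Event == 0 marks a censored observation
Y = np.empty(200, dtype=[("Event", int), ("Time", float)])
Y["Event"] = np.random.binomial(1, 0.7, 200)
Y["Time"] = np.random.exponential(1.0, 200)
X = np.random.randn(200, 5)

Y_imputed = ngb_impute(LogNormal, X, Y)  # censored times replaced by exp(predicted loc)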
Example #4
            folds.append( (train_index, test_index) )
        #breakpoint()

    for itr, (train_index, test_index) in enumerate(folds):
        X_trainall, X_test = X[train_index], X[test_index]
        y_trainall, y_test = y[train_index], y[test_index]


        X_train, X_val, y_train, y_val = train_test_split(X_trainall, y_trainall, test_size=0.2)
        
        y_true += list(y_test.flatten())

        ngb = NGBoost(Base=base_name_to_learner[args.base],
                      Dist=eval(args.distn),
                      Score=score_name_to_score[args.score](64),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      minibatch_frac=args.minibatch_frac,
                      verbose=args.verbose)

        train_loss, val_loss = ngb.fit(X_train, y_train) #, X_val, y_val)

        y_preds = ngb.staged_predict(X_val)
        y_forecasts = ngb.staged_pred_dist(X_val)
        val_rmse = [mean_squared_error(y_pred, y_val) for y_pred in y_preds]
        val_nll = [-y_forecast.logpdf(y_val.flatten()).mean() for y_forecast in y_forecasts]
        best_itr = np.argmin(val_rmse) + 1
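        # the NLL-based selection on the next line overwrites this RMSE-based choice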
        best_itr = np.argmin(val_nll) + 1

        full_retrain = True
        if full_retrain:
Example #5

from ngboost.ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":

    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBoost(Base=default_tree_learner,
                  Dist=Bernoulli,
                  Score=MLE(),
                  verbose=True)
    ngb.fit(X_train, Y_train)

    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.prob))
Example #6
    m, n = 1000, 5
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
    T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
    C = (T < Y).astype(int)
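    # C == 1 where the censoring time T precedes the outcome Y, i.e. the observation is censored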

    print(X.shape, Y.shape, C.shape)
    print(f"Censorship: {np.mean(C):.2f}")

    X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
        X, Y, T, C, test_size=0.2)

    ngb = NGBoost(Dist=LogNormal,
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=False,
                  Base=default_linear_learner,
                  Score=MLE())
    train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))

    preds = ngb.pred_dist(X_te)
    print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")

    plt.hist(preds.mean(), range=(-5, 5), bins=30, alpha=0.5, label="Pred")
    plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
    plt.legend()
    plt.show()

    # since we simulated the data we fully observe all outcomes
    pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
    plot_calibration_curve(pctles, observed)
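Y_join is a helper from the surrounding experiment code, not part of the ngboost package. Judging from its use above (packing observed times together with censoring indicators), a plausible sketch, with the field names assumed to match the survival examples:

import numpy as np

def Y_join(T, E):
    # pack times T and event indicators E into one structured outcome array
    Y = np.empty(len(T), dtype=[("Event", np.bool_), ("Time", np.float64)])
    Y["Event"] = E.squeeze()
    Y["Time"] = T.squeeze()
    return Y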
Example #7
    argparser.add_argument("--distn", type=str, default="Normal")
    argparser.add_argument("--natural", action="store_true")
    argparser.add_argument("--score", type=str, default="CRPS")
    args = argparser.parse_args()

    np.random.seed(123)

    m, n = 1200, 50
    noise = np.random.randn(*(m, 1))
    beta1 = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ beta1 + args.noise_lvl * noise
    print(X.shape, Y.shape)

    X_train, X_test = X[:1000, :], X[1000:, :]
    Y_train, Y_test = Y[:1000], Y[1000:]

    ngb = NGBoost(n_estimators=400,
                  learning_rate=args.lr,
                  Dist=Normal,
                  Base=default_linear_learner,
                  natural_gradient=args.natural,
                  minibatch_frac=1.0,
                  Score=eval(args.score)(),
                  verbose=True,
                  verbose_eval=10)

    losses = ngb.fit(X_train, Y_train)
    forecast = ngb.pred_dist(X_test)
    print("R2:", r2_score(Y_test, forecast.loc))
    m, n = 1000, 10
    noise = sp.stats.laplace.rvs(size=(m, 1))
    beta1 = np.random.randn(n, 1)
    beta2 = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    # Y = X @ beta + 0.5 * noise
    Y = X @ beta1 + 0.5 * np.sqrt(np.exp(X @ beta2)) * noise
    print(X.shape, Y.shape)

    axis = np.linspace(0.0, 2, 200)
    plt.figure(figsize=(8, 3))

    ngb = NGBoost(n_estimators=100,
                  learning_rate=1.0,
                  Dist=Normal,
                  Base=default_linear_learner,
                  natural_gradient=True,
                  minibatch_frac=1.0,
                  Score=CRPS())
    ngb.fit(X, Y)
    preds = ngb.pred_dist(X)
    print(preds.scale.mean())
    print(preds.scale.std())
    pctles, observed, slope, intercept = calibration_regression(preds, Y)

    plt.subplot(1, 2, 1)
    plot_pit_histogram(pctles, observed, label="CRPS", linestyle="--")
    plt.subplot(1, 2, 2)
    plt.plot(axis,
             gaussian_kde(preds.scale)(axis),
             linestyle="--",
Example #9
    m, n = 1000, 5
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
    T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
    C = (T < Y).astype(int)

    print(X.shape, Y.shape, C.shape)
    print(f"Censorship: {np.mean(C):.2f}")

    X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
        X, Y, T, C, test_size=0.2)

    ngb = NGBoost(Dist=Laplace,
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=False,
                  Base=default_linear_learner,
                  Score=MLE_SURV())
    train_losses = ngb.fit(X_tr, np.c_[np.minimum(Y_tr, T_tr), C_tr])

    preds = ngb.pred_dist(X_te)
    print(f"R2: {r2_score(Y_te, preds.loc)}")

    plt.hist(preds.loc, range=(-5, 5), bins=30, alpha=0.5, label="Pred")
    plt.hist(Y_te, range=(-5, 5), bins=30, alpha=0.5, label="True")
    plt.legend()
    plt.show()

    # since we simulated the data we fully observe all outcomes
    pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
    plot_calibration_curve(pctles, observed)
Example #10
    m, n = 1000, 5
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1))
    T = X @ np.ones((n, 1)) + 0.5 * np.random.randn(*(m, 1)) + args.eps
    C = (T < Y).astype(int)

    print(X.shape, Y.shape, C.shape)
    print(f"Censorship: {np.mean(C):.2f}")

    X_tr, X_te, Y_tr, Y_te, T_tr, T_te, C_tr, C_te = train_test_split(
        X, Y, T, C, test_size=0.2)

    ngb = NGBoost(Dist=Exponential,
                  n_estimators=args.n_estimators,
                  learning_rate=args.lr,
                  natural_gradient=True,
                  Base=default_linear_learner,
                  Score=MLE,
                  verbose=True,
                  verbose_eval=1)
    train_losses = ngb.fit(X_tr, Y_join(np.exp(np.minimum(Y_tr, T_tr)), C_tr))

    preds = ngb.pred_dist(X_te)
    print(f"R2: {r2_score(Y_te, np.log(preds.mean()))}")

    plt.hist(preds.mean(), range=(0, 10), bins=30, alpha=0.5, label="Pred")
    plt.hist(np.exp(Y_te), range=(0, 10), bins=30, alpha=0.5, label="True")
    plt.legend()
    plt.show()

    # since we simulated the data we fully observe all outcomes
    pctles, observed, slope, intercept = calibration_regression(preds, Y_te)
Example #11
    def fold_run(self,
                 src_dir: str,
                 X_train: pd.DataFrame,
                 y_train,
                 X_test: pd.DataFrame,
                 n_folds: int,
                 col: str,
                 parameters=None,
                 categorical_features=None):

            """
            # Arguments: 
            : src_dir - str - main dir for saving model,history and fold runs 
            
            : X_train,y_train - training dataset with labels 

            : X_test,y_test - test dataset with labels

            : n_folds - number of folds to split dataset 

            : col - if self.stratify is True, you need to specified a col that consist of 

            binary of multilabel classes because StratifiedKFold does not support continous values

            : parameters - run_parameters that are necessary to run the Tree-Based models,

            for better understading of these parameters, you should go and read LightGBM,

            XGBoost,CatBoost API 

            : categorical_features - list - list of categorical features in dataset, 

            necessary for LightGBM and CatBoost

            # Returns: 
            valid_predictions,test_predictions - predictions made by model
            """

            if isinstance(y_train, pd.DataFrame):
                y_train = y_train.values
            if src_dir:
                if os.path.isdir(src_dir):
                    pass
                else:
                    print(f"Making dir:{src_dir}")
                    os.makedirs(src_dir)
            try: 
                print("X_train_shape",X_train.shape)
                if X_test is not None:
                    print("X_test_shape",X_test.shape)
            except ValueError: 
                print("Shape does not fit")

            if self.stratify:
                print(f"Make {n_folds} stratified folds")
                kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
            elif self.time_series:
                # TimeSeriesSplit keeps temporal order and takes no random_state
                kf = TimeSeriesSplit(n_splits=n_folds)
            else:
                kf = KFold(n_splits=n_folds, shuffle=True, random_state=self.seed)

            valid_predictions = np.zeros((X_train.shape[0],n_folds))
            print("Vaild_predict",valid_predictions.shape[0])
            if X_test is not None: 
                test_predictions = np.zeros((X_test.shape[0],n_folds))
                print("Test_predict",test_predictions.shape[0])
            i = 0
            split_iter = kf.split(X_train, X_train[col]) if self.stratify else kf.split(X_train, y_train)
            for train_index, val_index in split_iter:
                if self.train_gbm:
                    print("Train LightGBM")
                    train_X = X_train.iloc[train_index]
                    val_X = X_train.iloc[val_index]
                    if isinstance(y_train,pd.DataFrame): 
                        train_y = y_train.iloc[train_index]
                        val_y = y_train.iloc[val_index]
                    else: 
                        train_y = y_train[train_index]
                        val_y  = y_train[val_index]
                    
                    lgb_train = lgb.Dataset(train_X,train_y,categorical_feature = categorical_features)
                    lgb_val = lgb.Dataset(val_X,val_y,categorical_feature = categorical_features,reference = lgb_train)

                    gbm = lgb.train(params = parameters,
                                    train_set=lgb_train,
                                    num_boost_round=parameters['num_boost_round'],
                                    valid_sets=[lgb_train,lgb_val],
                                    early_stopping_rounds=parameters['early_stopping_rounds'],
                                    evals_result = self.history,
                                    verbose_eval = parameters['verbose_eval'],
                                    feval = self.eval_metric)
                    valid_predictions[val_index,i] = gbm.predict(val_X,num_iteration=gbm.best_iteration)
                    valid_predictions[val_index,i] = np.clip(valid_predictions[val_index,i],a_min=0,a_max=None)

                    r2 = r2_score(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i]))
                    log_error = np.sqrt(mean_squared_log_error(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i])))
                    print(f"R2 Score for current validation set: {r2}")
                    print(f"RMSLE for current val set: {log_error}")
                
                    if self.save_model:
                        print("Saving model")
                        gbm.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')
                    
                    if self.save_history: 
                        print("Saving Hisotry")
                        pd.to_pickle(self.history,f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
                    
                    if self.test_predict: 
                        test_predictions[:,i] = self.predict_test(X_test,i,src_dir)
                        test_predictions[:,i] = np.clip(test_predictions[:,i],a_min=0,a_max=None)

                    if self.importance: 
                        self.visualize_importance(i,src_dir)
                    
                    if self.show_metric_results:
                        self.show_results(i)
                        
                elif self.train_xg:
                    print("Train XGBoost")
                    train_X = np.nan_to_num(X_train.iloc[train_index])
                    val_X = np.nan_to_num(X_train.iloc[val_index])
                    if isinstance(y_train,pd.DataFrame): 
                        train_y = np.nan_to_num(y_train.iloc[train_index])
                        val_y = np.nan_to_num(y_train.iloc[val_index])
                    else: 
                        train_y = np.nan_to_num(y_train[train_index])
                        val_y =  np.nan_to_num(y_train[val_index])
                    
                    xg_train = xgb.DMatrix(train_X, label=train_y, feature_names=list(X_train.columns))
                    xg_val = xgb.DMatrix(val_X, label=val_y, feature_names=list(X_train.columns))
                    eval_list = [(xg_train,'train'),(xg_val,'val')]

                    xgboost_train = xgb.train(parameters,xg_train,evals=eval_list,
                                              evals_result=self.history, 
                                              num_boost_round=parameters['boost_round'],
                                              early_stopping_rounds=parameters['early_stopping'],
                                              verbose_eval = parameters['verbose_eval'])

                    valid_predictions[val_index,i] = xgboost_train.predict(xg_val,
                                                                           ntree_limit = xgboost_train.best_ntree_limit)
                    valid_predictions[val_index,i] = np.clip(valid_predictions[val_index,i],a_min=0,a_max=None)

                    r2 = r2_score(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i]))
                    log_error = np.sqrt(mean_squared_log_error(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i])))
                    print(f"R2 Score for current validation set: {r2}")
                    print(f"RMSLE for current val set: {log_error}")
                
                    if self.save_model:
                        print("Saving model")
                        xgboost_train.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history.txt')

                    if self.save_history: 
                        print("Saving Hisotry")
                        pd.to_pickle(self.history,f'{src_dir}/fold_{i}_{self.name}_pickle_eval_history.pkl')
           
                    if self.test_predict: 
                        test_predictions[:,i] = self.predict_test(X_test,i,src_dir,xgboost_train)
                        test_predictions[:,i] = np.clip(test_predictions[:,i],a_min=0,a_max=None)

                    if self.importance: 
                        self.visualize_importance(i,src_dir)
                    
                    if self.show_metric_results: 
                        self.show_results(i)
                
                elif self.train_cat: 
                    print("Training CatBoost")
                    train_X = np.array(X_train.iloc[train_index],dtype=np.float32)
                    val_X = np.array(X_train.iloc[val_index],dtype=np.float32)
                    if isinstance(y_train,pd.DataFrame): 
                        train_y = np.array(y_train.iloc[train_index],dtype=np.float32)
                        val_y = np.array(y_train.iloc[val_index],dtype=np.float32)
                    else: 
                        train_y = np.array(y_train[train_index],dtype=np.float32)
                        val_y = np.array(y_train[val_index],dtype=np.float32)

                    cat_train = catboost.Pool(train_X,label=train_y)
                    cat_test = catboost.Pool(val_X,label=val_y)
                    self.cat = catboost.CatBoostRegressor(**parameters).fit(cat_train,use_best_model=True,
                                                                       eval_set=cat_test,verbose_eval=True)
                    self.history = self.cat.get_evals_result()
                    # Index Error after first epoch, need to fix it
                    valid_predictions[val_index, i] = self.cat.predict(cat_test)
                    valid_predictions[val_index, i] = np.clip(valid_predictions[val_index, i], a_min=0, a_max=None)
                    r2 = r2_score(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i]))
                    log_error = np.sqrt(mean_squared_log_error(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i])))
                    print(f"R2 Score for current validation set: {r2}")
                    print(f"RMSLE for current val set: {log_error}")
                    if self.save_model:
                        print("Saving model")
                        self.cat.save_model(f'{src_dir}/fold_{i}_{self.name}_eval_history',format='json')

                    if self.test_predict: 
                        test_predictions[:,i] = self.predict_test(X_test,i,src_dir)
                        test_predictions[:,i] = np.clip(test_predictions[:,i],a_min=0,a_max=None)

                elif self.train_ng:
                    print("Train NGBoost")
                    train_X = np.nan_to_num(X_train.iloc[train_index])
                    val_X = np.nan_to_num(X_train.iloc[val_index])
                    if isinstance(y_train, pd.DataFrame):
                        train_y = np.nan_to_num(y_train.iloc[train_index])
                        val_y = np.nan_to_num(y_train.iloc[val_index])
                    else:
                        train_y = np.nan_to_num(y_train[train_index])
                        val_y = np.nan_to_num(y_train[val_index])

                    ng = NGBoost(Dist=Normal, Score=MLE,
                                 Base=default_tree_learner, natural_gradient=True,
                                 n_estimators=150, learning_rate=0.01, verbose=True,
                                 verbose_eval=50).fit(train_X, train_y)
                    valid_predictions[val_index, i] = ng.predict(val_X)
                    valid_predictions[val_index, i] = np.clip(valid_predictions[val_index, i], a_min=0, a_max=None)
                    rmse = np.sqrt(mean_squared_error(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i])))
                    r2 = r2_score(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i]))
                    log_error = np.sqrt(mean_squared_log_error(np.nan_to_num(val_y), np.nan_to_num(valid_predictions[val_index, i])))
                    print(f"RMSE for current fold: {rmse}")
                    print(f"R2 Score for current fold: {r2}")
                    print(f"RMSLE for current val set: {log_error}")
                    test_predictions[:, i] = np.clip(ng.predict(X_test), a_min=0, a_max=None)
                i += 1
            if self.jsonize:
                print("Saving model parameters to json")
                if os.path.isdir('parameters'):
                    pass
                else:
                    print("Making Dir: parameters")
                    os.makedirs('parameters')
                model_dict = {"model":f"{src_dir}_{i}_folds",
                              "parameters":parameters}
                with open(f"./parameters/{src_dir}_{i}_fold.json",'w+') as model_param: 
                    json.dump(model_dict,model_param)
            
            if self.prepare_submission:
                self.output_submission(test_predictions, i)

            return valid_predictions, test_predictions
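A sketch of how fold_run might be driven, assuming a wrapper class (here called FoldRunner, a made-up name) whose constructor sets the flags and state the method reads (seed, stratify, train_gbm, history, and so on):

import numpy as np
import pandas as pd

# hypothetical wrapper exposing fold_run; attribute names taken from the method body
runner = FoldRunner(seed=42, stratify=False, time_series=False,
                    train_gbm=True, train_xg=False, train_cat=False, train_ng=False,
                    save_model=False, save_history=False, test_predict=False,
                    importance=False, show_metric_results=False,
                    jsonize=False, prepare_submission=False)

params = {"objective": "regression", "metric": "rmse",
          "num_boost_round": 500, "early_stopping_rounds": 50, "verbose_eval": 100}

X = pd.DataFrame(np.random.randn(500, 10), columns=[f"f{i}" for i in range(10)])
X_test = pd.DataFrame(np.random.randn(100, 10), columns=X.columns)
y = np.abs(np.random.randn(500))

valid_preds, test_preds = runner.fold_run("runs", X, y, X_test,
                                          n_folds=5, col=None, parameters=params)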
class CatBoostRegressor(iterations=None,
                        learning_rate=None,
                        depth=None,
                        l2_leaf_reg=None,
                        model_size_reg=None,
                        rsm=None,
                        loss_function='RMSE',
                        border_count=None,
                        feature_border_type=None,
                        per_float_feature_quantization=None,
                        input_borders=None,
                        output_borders=None,
                        fold_permutation_block=None,
                        od_pval=None,
                        od_wait=None,
                        od_type=None,
                        nan_mode=None,
                        counter_calc_method=None,
                        leaf_estimation_iterations=None,
                        leaf_estimation_method=None,
                        thread_count=None,
                        random_seed=None,
                        use_best_model=None,
                        best_model_min_trees=None,
                        verbose=None,
                        silent=None,
                        logging_level=None,
                        metric_period=None,
                        ctr_leaf_count_limit=None,
                        store_all_simple_ctr=None,
                        max_ctr_complexity=None,
                        has_time=None,
                        allow_const_label=None,
                        one_hot_max_size=None,
                        random_strength=None,
                        name=None,
                        ignored_features=None,
                        train_dir=None,
                        custom_metric=None,
                        eval_metric=None,
                        bagging_temperature=None,
                        save_snapshot=None,
                        snapshot_file=None,
                        snapshot_interval=None,
                        fold_len_multiplier=None,
                        used_ram_limit=None,
                        gpu_ram_part=None,
                        pinned_memory_size=None,
                        allow_writing_files=None,
                        final_ctr_computation_mode=None,
                        approx_on_full_history=None,
                        boosting_type=None,
                        simple_ctr=None,
                        combinations_ctr=None,
                        per_feature_ctr=None,
                        ctr_target_border_count=None,
                        task_type=None,
                        device_config=None,                        
                        devices=None,
                        bootstrap_type=None,
                        subsample=None,                        
                        sampling_unit=None,
                        dev_score_calc_obj_block_size=None,
                        max_depth=None,
                        n_estimators=None,
                        num_boost_round=None,
                        num_trees=None,
                        colsample_bylevel=None,
                        random_state=None,
                        reg_lambda=None,
                        objective=None,
                        eta=None,
                        max_bin=None,
                        gpu_cat_features_storage=None,
                        data_partition=None,
                        metadata=None,
                        early_stopping_rounds=None,
                        cat_features=None,
                        grow_policy=None,
                        min_data_in_leaf=None,
                        min_child_samples=None,
                        max_leaves=None,
                        num_leaves=None,
                        score_function=None,
                        leaf_estimation_backtracking=None,
                        ctr_history_unit=None,
                        monotone_constraints=None)
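Nearly all of these parameters default to None and can stay unset; a minimal, typical construction might look like:

from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=500,
                          learning_rate=0.05,
                          depth=6,
                          loss_function='RMSE',
                          early_stopping_rounds=50,
                          random_seed=42,
                          verbose=100)
# model.fit(train_pool, eval_set=val_pool, use_best_model=True)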

##------NGBoost----------

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ngboost.ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.distns import Normal
from ngboost.scores import MLE
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# read the dataset
df = pd.read_csv('~/train.csv')
# feature engineering
tr, te = Nanashi_solution(df)
# NGBoost
ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(),
              natural_gradient=True, verbose=False)

ngboost = ngb.fit(np.asarray(tr.drop(['SalePrice'], axis=1)),
                  np.asarray(tr.SalePrice))

y_pred_ngb = pd.DataFrame(ngb.predict(te.drop(['SalePrice'], axis=1)))
# LightGBM
ltr = lgb.Dataset(tr.drop(['SalePrice'], axis=1), label=tr['SalePrice'])

param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.6,
    'bagging_seed': 123,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.3,
    'learning_rate': .01,
    'max_depth': 3,
    'metric': 'rmse',
    'min_data_in_leaf': 128,
    'min_sum_hessian_in_leaf': 8,
    'num_leaves': 128,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'regression',
    'verbosity': -1,
    'random_state': 123,
    'max_bin': 8,
    'early_stopping_round': 100
}


lgbm = lgb.train(param, ltr, num_boost_round=10000, valid_sets=[ltr], verbose_eval=1000)

y_pred_lgb = lgbm.predict(te.drop(['SalePrice'], axis=1))

# XGBoost
params = {
            'max_depth': 4, 'eta': 0.01,
            'objective':'reg:squarederror',
            'eval_metric': ['rmse'],
            'booster':'gbtree',
            'verbosity':0,
            'sample_type':'weighted',
            'max_delta_step':4,
            'subsample':.5,
            'min_child_weight':100,
            'early_stopping_round':50
}

dtr = xgb.DMatrix(tr.drop(['SalePrice'], axis=1), label=tr.SalePrice)
dte = xgb.DMatrix(te.drop(['SalePrice'], axis=1), label=te.SalePrice)

num_round = 5000
xgbst = xgb.train(params, dtr, num_round, verbose_eval=500)

y_pred_xgb = xgbst.predict(dte)

# Check the results
print('RMSE: NGBoost',
      round(sqrt(mean_squared_error(te.SalePrice, y_pred_ngb)), 4))
print('RMSE: LGBM',
      round(sqrt(mean_squared_error(te.SalePrice, y_pred_lgb)), 4))
print('RMSE: XGBoost',
      round(sqrt(mean_squared_error(te.SalePrice, y_pred_xgb)), 4))

# see the probability distributions by visualising
Y_dists = ngb.pred_dist(te.drop(['SalePrice'], axis=1))
y_range = np.linspace(min(te.SalePrice), max(te.SalePrice), 200)
dist_values = Y_dists.pdf(y_range).transpose()

# plot the predicted density for test index 114
idx = 114
plt.plot(y_range,dist_values[idx])
plt.title(f"idx: {idx}")
plt.tight_layout()
plt.show()


Example #13
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# load the high-energy particle collision dataset
train_data = pd.read_csv("jet_simple_data/simple_train_R04_jet.csv")
# select the feature columns
features = train_data[[
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
# randomly split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(features.values,
                                                    train_data.label.values,
                                                    test_size=0.2)
ngb = NGBoost(Base=default_tree_learner,
              Dist=Normal,
              Score=MLE(),
              natural_gradient=True,
              verbose=False)
# fit
ngb.fit(X_train, Y_train)
# predict
Y_preds = ngb.predict(X_test)
test_data = pd.read_csv("jet_simple_data/simple_test_R04_jet.csv")
features = test_data[[
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
Y_test_data = ngb.predict(features)
with open("submmission.csv", "") as f:
    f.write("id,label\n")
    for jet_id, label in zip(test_data['jet_id'], Y_test_data):
Example #14
    print('== Dataset=%s X.shape=%s Censorship=%.4f' %
          (args.dataset, str(X.shape), np.mean(1 - E)))

    for itr in range(args.reps):

        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2)
        X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                          Y_train,
                                                          test_size=0.2)

        ngb = NGBoost(Dist=eval(args.distn),
                      n_estimators=args.n_est,
                      learning_rate=args.lr,
                      natural_gradient=args.natural,
                      verbose=args.verbose,
                      minibatch_frac=1.0,
                      Base=base_name_to_learner[args.base],
                      Score=eval(args.score)())

        train_losses = ngb.fit(X_train, Y_train)  #, X_val, Y_val)
        forecast = ngb.pred_dist(X_test)
        train_forecast = ngb.pred_dist(X_train)
        print('NGB score: %.4f (val), %.4f (train)' %
              (concordance_index_censored(Y_test['Event'], Y_test['Time'],
                                          -forecast.mean())[0],
               concordance_index_censored(Y_train['Event'], Y_train['Time'],
                                          -train_forecast.mean())[0]))
        #logger.tick(forecast, Y_test)

Example #15
    print("Models")

    start = datetime.now().timestamp()
    qreg = MLPQuantile()
    qreg.fit(X_train_std,y_train)
    preds = qreg.predict(X_test_std)
    end = datetime.now().timestamp()
    results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
    results["duration"] = end - start
    save_result([horizon, "MLP", results, 1], f"unit_{horizon}", folder)

    start = datetime.now().timestamp()
    ngb = NGBoost(Base=default_tree_learner, Dist=Normal, Score=MLE(), natural_gradient=True,
                  verbose=True, n_estimators=1500)
    ngb.fit(X_train_std, y_train.values)
    Y_dists = ngb.pred_dist(X_test_std)
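    # collect the 1st..99th percentiles of each predictive distribution into a 99-column matrix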
    a = pd.DataFrame()
    for i in np.arange(1, 100):
        a[i] = Y_dists.ppf(i / 100)
    preds = a.values
    end = datetime.now().timestamp()
    results = evaluate(np.exp(preds) - 1, (np.exp(y_test) - 1).values)
    results["duration"] = end - start
    save_result([horizon, "NGBOOST", results, 1], f"unit_{horizon}", folder)

                    
Example #16
    m, n = 1000, 50
    if args.noise_dist == "Normal":
        noise = np.random.randn(*(m, 1))
    elif args.noise_dist == "Laplace":
        noise = sp.stats.laplace.rvs(size=(m, 1))
    beta = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = np.exp(X @ beta + 0.5 * noise)
    print(X.shape, Y.shape)

    dist = eval("Log" + args.dist)
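    # "Log" + args.dist maps e.g. "Normal" to the LogNormal distribution class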

    ngb = NGBoost(n_estimators=50,
                  learning_rate=0.5,
                  Dist=dist,
                  Base=default_linear_learner,
                  natural_gradient=False,
                  minibatch_frac=1.0,
                  Score=CRPS())
    losses = ngb.fit(X, Y)

    preds = ngb.pred_dist(X)

    print(f"R2: {r2_score(Y, np.exp(preds.loc)):.4f}")
    pctles, observed, slope, intercept = calibration_regression(preds, Y)

    plt.figure(figsize=(8, 3))
    plt.subplot(1, 2, 1)
    plot_pit_histogram(pctles, observed)
    plt.title("Original scale")
Example #17
    argparser.add_argument("--noise-dist", type=str, default="Normal")
    args = argparser.parse_args()

    m, n = 1000, 50
    if args.noise_dist == "Normal":
        noise = np.random.randn(*(m, 1))
    elif args.noise_dist == "Laplace":
        noise = sp.stats.laplace.rvs(size=(m, 1))
    beta = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ beta + 0.5 * noise + 20
    print(X.shape, Y.shape)

    ngb = NGBoost(n_estimators=100, learning_rate=1.,
                  Dist=eval(args.dist),
                  Base=default_linear_learner,
                  natural_gradient=True,
                  minibatch_frac=1.0,
                  Score=MLE())
    ngb.fit(X, Y)

    preds = ngb.pred_dist(X)
    print(f"R2: {r2_score(Y, preds.loc):.4f}")

    pctles, observed, slope, intercept = calibration_regression(preds, Y)
    print(observed)
    plt.figure(figsize=(8, 3))
    plt.subplot(1, 2, 1)
    plot_calibration_curve(pctles, observed)
    plt.subplot(1, 2, 2)
    plot_pit_histogram(pctles, observed)
    plt.tight_layout()
Example #18
    argparser.add_argument("--n-estimators", type=int, default=301)
    argparser.add_argument("--lr", type=float, default=0.03)
    argparser.add_argument("--minibatch-frac", type=float, default=0.1)
    argparser.add_argument("--natural", action="store_true")
    args = argparser.parse_args()

    x_tr, y_tr, _ = gen_data(n=50)

    poly_transform = PolynomialFeatures(1)
    x_tr = poly_transform.fit_transform(x_tr)

    ngb = NGBoost(
        Base=default_tree_learner,
        Dist=Normal,
        Score=MLE,
        n_estimators=args.n_estimators,
        learning_rate=args.lr,
        natural_gradient=args.natural,
        minibatch_frac=args.minibatch_frac,
        verbose=True,
    )

    ngb.fit(x_tr, y_tr)

    x_te, y_te, _ = gen_data(n=1000, bound=1.3)
    x_te = poly_transform.transform(x_te)
    preds = ngb.pred_dist(x_te)

    pctles, obs, _, _ = calibration_regression(preds, y_te)

    all_preds = ngb.staged_pred_dist(x_te)
    preds = all_preds[-1]
Example #19
    argparser.add_argument("--noise-lvl", type=float, default=0.25)
    argparser.add_argument("--distn", type=str, default="Normal")
    argparser.add_argument("--natural", action="store_true")
    argparser.add_argument("--score", type=str, default="CRPS")
    args = argparser.parse_args()

    m, n = 1200, 50
    noise = np.random.randn(*(m, 1))
    beta1 = np.random.randn(n, 1)
    beta2 = np.random.randn(n, 1)
    X = np.random.randn(m, n) / np.sqrt(n)
    Y = X @ beta1 + args.noise_lvl * np.sqrt(np.exp(X @ beta2)) * noise
    print(X.shape, Y.shape)

    X_train, X_test = X[:1000, :], X[1000:, :]
    Y_train, Y_test = Y[:1000], Y[1000:]

    ngb = NGBoost(n_estimators=150,
                  learning_rate=args.lr,
                  Dist=Laplace,
                  Base=default_linear_learner,
                  natural_gradient=args.natural,
                  minibatch_frac=1.0,
                  Score=eval(args.score)())

    losses = ngb.fit(X_train, Y_train)
    forecast = ngb.pred_dist(X_test)
    logger = RegressionLogger(args)
    logger.tick(forecast, Y_test)
    logger.save()