Example #1
    def __init__(self,
                 output_dim,
                 boosting_type='gbdt',
                 num_leaves=31,
                 max_depth=-1,
                 learning_rate=0.1,
                 n_estimators=100,
                 subsample_for_bin=200000,
                 class_weight=None,
                 min_split_gain=0.,
                 min_child_weight=1e-3,
                 min_child_samples=20,
                 subsample=1.,
                 subsample_freq=0,
                 colsample_bytree=1.,
                 reg_alpha=0.,
                 reg_lambda=0.,
                 random_state=None,
                 n_jobs=-1,
                 silent=True,
                 importance_type='split'):
        if output_dim > 2:
            objective = "multiclass"
        else:
            objective = "regression"

        extra_params = {'first_metric_only': True}
        if objective == "multiclass":
            # LightGBM's native parameter name is `num_class`, and it is only
            # valid for multiclass objectives.
            extra_params['num_class'] = output_dim

        self.model = lightgbm.LGBMModel(boosting_type=boosting_type,
                                        num_leaves=num_leaves,
                                        max_depth=max_depth,
                                        learning_rate=learning_rate,
                                        n_estimators=n_estimators,
                                        subsample_for_bin=subsample_for_bin,
                                        objective=objective,
                                        class_weight=class_weight,
                                        min_split_gain=min_split_gain,
                                        min_child_weight=min_child_weight,
                                        min_child_samples=min_child_samples,
                                        subsample=subsample,
                                        subsample_freq=subsample_freq,
                                        colsample_bytree=colsample_bytree,
                                        reg_alpha=reg_alpha,
                                        reg_lambda=reg_lambda,
                                        random_state=random_state,
                                        n_jobs=n_jobs,
                                        silent=silent,
                                        importance_type=importance_type,
                                        **extra_params)

        self.output_dim = output_dim

        self.custom_metrics = {}

        self.result = None

        self.rgetter = RGetter()

        self.stop_training = False

        self.custom_loss_callable = None
    def optimize_hyperparams(self,
                             param_grid,
                             X,
                             Y,
                             cv=4,
                             scoring='neg_mean_squared_error',
                             verbose=1):
        '''Use GridSearchCV to optimize the model's hyperparameters.'''
        params = self.params
        params['learning_rate'] = 0.05
        params['n_estimators'] = 1000
        # Note: the `iid` argument was removed in scikit-learn 0.24.
        gsearch1 = GridSearchCV(estimator=lgb.LGBMModel(**params),
                                param_grid=param_grid,
                                scoring=scoring,
                                n_jobs=1,
                                cv=cv)
        gsearch1.fit(X, Y)
        # `grid_scores_` no longer exists in scikit-learn; use `cv_results_`.
        scores = gsearch1.cv_results_
        best_params = gsearch1.best_params_
        # RMSE recovered from the (negated) mean squared error score.
        best_score = np.sqrt(-gsearch1.best_score_)
        if verbose > 0:
            if verbose > 1:
                print('Scores are: ', scores)
            print('Best params: ', best_params)
            print('Best score: ', best_score)
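For context, here is a minimal self-contained sketch of the same grid-search pattern over an LGBMModel on synthetic data; the grid, estimator settings, and dataset are illustrative assumptions, not part of the original class:

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

# Synthetic regression data stands in for the caller's X, Y.
X, Y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=0)

param_grid = {'num_leaves': [15, 31], 'min_child_samples': [10, 20]}
search = GridSearchCV(estimator=lgb.LGBMModel(objective='regression',
                                              learning_rate=0.05,
                                              n_estimators=200),
                      param_grid=param_grid,
                      scoring='neg_mean_squared_error',
                      cv=4,
                      n_jobs=1)
search.fit(X, Y)
print('Best params:', search.best_params_)
print('Best RMSE:', np.sqrt(-search.best_score_))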
Example #3
    def __init__(self,
                 boosting='dart',
                 learning_rate=0.05,
                 min_data_in_leaf=20,
                 feature_fraction=0.7,
                 num_leaves=41,
                 metric='auc',
                 drop_rate=0.15):
        self.parameters = {
            'boosting': boosting,  # dart (dropout trees) often performs better
            # 'application': 'binary',  # binary classification
            'learning_rate': learning_rate,  # controls the size of a gradient descent step
            'min_data_in_leaf': min_data_in_leaf,  # data set is quite small, so reduce this a bit
            'feature_fraction': feature_fraction,  # proportion of features in each boost; controls overfitting
            'num_leaves': num_leaves,  # controls tree size, since LightGBM uses leaf-wise splits
            'metric': metric,  # area under the ROC curve as the evaluation metric
            'drop_rate': drop_rate
        }
        self.evaluation_results = {}
        self.model = lgb.LGBMModel()
    def __init__(self, **kwargs):
        # TODO: use config file to set default parameters (like in candle)

        # self.model = lgb.LGBMModel(
        #     objective=LGBM_REGRESSOR.ml_objective,
        #     n_estimators=n_estimators,
        #     n_jobs=n_jobs,
        #     random_state=random_state)

        self.model = lgb.LGBMModel(objective=LGBM_CLASSIFIER.ml_objective, **kwargs)
Example #5
    def __init__(self,
                 n_estimators=100,
                 eval_metric=['l2', 'l1'],
                 n_jobs=1,
                 random_state=None,
                 logger=None):
        # TODO: use config file to set default parameters (like in candle)

        self.model = lgb.LGBMModel(objective=LGBM_REGRESSOR.ml_objective,
                                   n_estimators=n_estimators,
                                   n_jobs=n_jobs,
                                   random_state=random_state)
    def run_rfe(self,
                model_params,
                target,
                X_vars,
                threshold=0,
                model_type='indicator'):
        if model_type == 'indicator':
            model = lightgbm.LGBMModel(**model_params, importance_type='gain')
        elif model_type == 'regressor':
            model = lightgbm.LGBMRegressor(**model_params,
                                           importance_type='gain')
        else:
            raise ValueError(f"Unknown model_type: {model_type}")
        eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
        model.fit(X=self.df_train[X_vars],
                  y=self.df_train[self.target],
                  eval_set=eval_set,
                  verbose=False)

        # DataFrame of features and their corresponding level of importance by gain
        importance_df = pd.DataFrame(
            data={
                'features': X_vars,
                'gain_importances': model.feature_importances_
            })

        while sum(model.feature_importances_ <= threshold) > 0:
            print(
                f"{sum(model.feature_importances_ <= threshold)} features below threshold"
            )
            print("The following features will be removed:")
            print(importance_df.loc[model.feature_importances_ <= threshold]
                  ['features'].tolist())

            features = importance_df.loc[
                model.feature_importances_ > threshold]['features'].tolist()
            eval_set = [(self.df_tune[features], self.df_tune[self.target])]
            model.fit(X=self.df_train[features],
                      y=self.df_train[self.target],
                      eval_set=eval_set,
                      verbose=False)
            importance_df = pd.DataFrame(
                data={
                    'features': model.booster_.feature_name(),
                    'gain_importances': model.feature_importances_
                })

        self.feature_importance_df = importance_df.sort_values(
            by=['gain_importances'], ascending=False)
        self.post_rfe_model = model
        return self.post_rfe_model, self.feature_importance_df
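Below is a hedged, self-contained sketch of the same gain-threshold elimination idea on synthetic data; the synthetic frame, feature names, and threshold are illustrative assumptions, whereas the original method operates on the class's df_train/df_tune attributes:

import lightgbm
import pandas as pd
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=400, n_features=12, n_informative=4, random_state=0)
features = [f'f{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=features)

threshold = 0
model = lightgbm.LGBMRegressor(n_estimators=100, importance_type='gain')
model.fit(df[features], y)

# Iteratively drop features whose gain importance is at or below the threshold
while (model.feature_importances_ <= threshold).sum() > 0:
    keep = [f for f, imp in zip(features, model.feature_importances_) if imp > threshold]
    if not keep:  # stop rather than drop every feature
        break
    features = keep
    model.fit(df[features], y)

importance_df = pd.DataFrame({'features': features,
                              'gain_importances': model.feature_importances_})
print(importance_df.sort_values('gain_importances', ascending=False))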
def Model_stack(df_train_x, df_train_y, df_test):
    # kernel can be 'linear'/'poly'/'rbf'/'sigmoid'/'precomputed' or a callable; defaults to 'rbf' if not given.
    # With 'precomputed' (or a callable) the kernel matrix is computed beforehand.
    svr_ = SVR(kernel='linear', degree=3, coef0=0.0, tol=0.001,
               C=1.0, epsilon=0.1, shrinking=True, cache_size=20)
    lgb_ = lgb.LGBMModel(boosting_type='gbdt', num_leaves=35,
                         max_depth=20, max_bin=255, learning_rate=0.03, n_estimators=10, subsample_for_bin=2000,
                         objective='regression', min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                         subsample=1.0, verbose=0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,
                         reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True)
    RF_model = RandomForestRegressor(n_estimators=50, max_depth=25, min_samples_split=20, min_samples_leaf=10,
                                     max_features='sqrt', oob_score=True, random_state=10)
    # Bayesian ridge regression
    BR_model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True, fit_intercept=True,
                             lambda_1=1e-06, lambda_2=1e-06, n_iter=300, normalize=False, tol=0.0000001, verbose=False)
    linear_model = LinearRegression()
    ls = Lasso(alpha=0.00375)
    x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y,
                                                        test_size=0.6)
    rg = RidgeCV(cv=5)
    stack = pd.DataFrame()
    stack_test = pd.DataFrame()

    ls.fit(x_train, y_train)
    lgb_.fit(x_train, y_train)
    RF_model.fit(x_train, y_train)
    svr_.fit(x_train, y_train)
    linear_model.fit(x_train, y_train)
    BR_model.fit(x_train, y_train)

    # First-level model predictions on the held-out split (column names match the models)
    stack['lasso'] = ls.predict(x_test)
    stack['lightgbm'] = lgb_.predict(x_test)
    stack['rf'] = RF_model.predict(x_test)
    stack['svr'] = svr_.predict(x_test)
    stack['linear_model'] = linear_model.predict(x_test)
    stack['BR'] = BR_model.predict(x_test)
    # print('stacking_model: ',Cross_validation(stack, y_test, rg))

    rg.fit(stack, y_test)
    stack_test['lasso'] = ls.predict(df_test)
    stack_test['lightgbm'] = lgb_.predict(df_test)
    stack_test['rf'] = RF_model.predict(df_test)
    stack_test['svr'] = svr_.predict(df_test)
    stack_test['linear_model'] = linear_model.predict(df_test)
    stack_test['BR'] = BR_model.predict(df_test)

    final_ans = rg.predict(stack_test)
    pd.DataFrame(final_ans).to_csv('predict_drop+3.txt', index=False, header=False)
 def lgb_params(self):
     return lgb.LGBMModel(boosting_type=self.boosting_type, 
                          num_leaves=self.num_leaves,
                          max_depth=self.max_depth,
                          learning_rate=self.learning_rate,
                          n_estimators=self.n_estimators,
                          max_bin=self.max_bin,
                          subsample_for_bin=self.subsample_for_bin,
                          objective=self.objective, 
                          min_split_gain=self.min_split_gain,
                          min_child_weight=self.min_child_weight,
                          min_child_samples=self.min_child_samples,
                          subsample=self.subsample,
                          subsample_freq=self.subsample_freq,
                          colsample_bytree=self.colsample_bytree,
                          reg_alpha=self.reg_alpha,
                          reg_lambda=self.reg_lambda,
                           # random_state=self.random_state,
                           # n_jobs=self.n_jobs,
                           # silent=self.silent
                           )
def main():
    args = handleArguments()

    # Read data
    XImgTrain, yImgTrain = loadEdgesDataFromDirs(
        True, target=args.targetTrainDir, nonTarget=args.nonTargetTrainDir)

    XAudioTargetTrain = extractMFCCFromDir(args.targetTrainDir)
    # yAudioTargetTrain = [1 for i in range(len(XAudioTargetTrain))]
    XAudioNonTargetTrain = extractMFCCFromDir(args.nonTargetTrainDir)
    # yAudioNonTargetTrain = [0 for i in range(len(XAudioNonTargetTrain))]

    #train and pickle
    if (args.hmmModelOutput):
        hmmClassifier = HMMBinaryModel()
        hmmClassifier.fit(XAudioTargetTrain, XAudioNonTargetTrain)
        pickle.dump(hmmClassifier, args.hmmModelOutput)

    if (args.lgbmModelOutput):
        gbmClassifier = lgbm.LGBMModel(objective='binary', random_state=42)
        gbmClassifier.fit(XImgTrain, yImgTrain)
        pickle.dump(gbmClassifier, args.lgbmModelOutput)
Example #10
def model_train(train_data, test_data, train_target, test_target):
    # Multiple linear regression
    model = LinearRegression()
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('LinearRegression:', score)

    # K-nearest neighbors regression
    model = KNeighborsRegressor(n_neighbors=8)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('KNeighborsRegressor:', score)

    # Decision tree regression
    model = DecisionTreeRegressor(random_state=0)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('DecisionTreeRegressor:', score)

    # Random forest regression
    model = RandomForestRegressor(n_estimators=200)
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('RandomForestRegressor:', score)

    # LightGBM regression
    model = lgb.LGBMModel(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2020,
        objective='regression',
    )
    model.fit(train_data, train_target)
    score = mean_squared_error(test_target, model.predict(test_data))
    print('lightGbm:', score)
Example #11
 def init(self, param: dict) -> None:
     """Initialize predictor."""
     self._model = lgb.LGBMModel(**param)
Example #12
import lightgbm as lgbm

clf = lgbm.LGBMModel(boosting_type='gbdt',
                     objective='regression',
                     n_estimators=300,
                     num_boost_round=200000,  # alias of n_estimators; conflicts with the value above
                     num_leaves=30,
                     learning_rate=0.05,
                     min_split_gain=0.25,
                     min_child_weight=1,
                     min_child_samples=10,
                     scale_pos_weight=1,
                     seed=42,
                     max_depth=-1,
                     subsample=0.8,
                     bagging_fraction=1,  # alias of subsample; conflicts with the value above
                     max_bin=5000,
                     bagging_freq=20,
                     colsample_bytree=0.6,
                     metric="rmse",
                     n_jobs=10,
                     silent=False)
Example #13
add_name = np.array([n for n in names if n[:3] == 'AD_'])

if len(add_name) != 0:
    first_X, first_y, second_X, second_y, second_index = Ad_split(X,
                                                                  y,
                                                                  names,
                                                                  AD=True)

    kf = KFold(n_splits=5)
    mse = []
    for train_idx, test_idx in tqdm(kf.split(first_X), total=5):
        train_X, test_X = first_X[train_idx], first_X[test_idx]
        train_y, test_y = first_y[train_idx], first_y[test_idx]
        vector = []
        for i in range(train_y.shape[1]):
            first_model = lgb.LGBMModel(objective='regression')
            first_model.fit(train_X, train_y[:, i])
            first_model.booster_.save_model('first_model_' + str(i) + '.txt')
            pred = first_model.predict(test_X)
            vector.append(mean_squared_error(test_y[:, i], pred))
        mse.append(vector)
    print('5-Fold score(mse):', np.mean(mse, axis=0))

    add = np.zeros((len(second_X), first_y.shape[1]), dtype=np.float32)
    for i in range(first_y.shape[1]):
        bst = lgb.Booster(model_file='first_model_' + str(i) + '.txt')
        add[:, i] = bst.predict(second_X)
    second_X = np.concatenate([second_X, add], axis=1)

else:
    second_X, second_y, second_index = Ad_split(X, y, names, AD=False)
Example #14
results.reset_index(inplace=True, drop=True)

# Convert from a string to a dictionary
ast.literal_eval(results.loc[0, 'params'])

# Extract the ideal number of estimators and hyperparameters
best_bayes_estimators = int(results.loc[0, 'estimators'])
best_bayes_params = ast.literal_eval(results.loc[0, 'params']).copy()

del best_bayes_params['metric']

print("Creating Model")
# Re-create the best model and train on the training data
best_bayes_model = lgb.LGBMModel(n_estimators=best_bayes_estimators,
                                 n_jobs=-1,
                                 metric='multi_error',
                                 random_state=50,
                                 **best_bayes_params)

print("Fitting Model")
best_bayes_model.fit(train_features, train_labels)

print("Predicting Model")
# Evaluate on the testing data
Predictions = best_bayes_model.predict(test_features)

correct = 0
#Calculate the number of times the model correctly predicted the test labels given the test features
for i in range(0, Predictions.shape[0]):
    maxProbability = np.max(Predictions[i, :])
    for j in range(0, len(Predictions[i, :])):
Example #15
                                  verbose=0,
                                  warm_start=False)

# myGBR.get_params                 # get all of the model's parameters

##############################--lightgbm--####################################
mylgb = lgb.LGBMModel(boosting_type='gbdt',
                      num_leaves=40,
                      max_depth=7,
                      max_bin=233,
                      learning_rate=0.03,
                      n_estimators=10,
                      subsample_for_bin=300,
                      objective='regression',
                      min_split_gain=0.0,
                      min_child_weight=0.1,
                      min_child_samples=20,
                      subsample=1.0,
                      verbose=0,
                      subsample_freq=1,
                      colsample_bytree=1.0,
                      reg_alpha=0.0,
                      reg_lambda=0.0,
                      random_state=None,
                      n_jobs=-1,
                      silent=True)

###############################--xgboost--######################################

cv_params = {'n_estimators': [2000]}
other_params = {
    'learning_rate': 0.005,
Example #16
def objective(hpp):
    '''Return the competition CV validation score, averaged across folds.'''

    # Collect the per-fold validation losses so the average can be returned
    fold_losses = []

    # CV main loop
    for i, (train_index, test_index) in enumerate(kf.split(x_train_test)):

        # Get current fold
        x_train = x_train_test[train_index]
        y_train = y_train_test[train_index]
        x_test = x_train_test[test_index]
        y_test = y_train_test[test_index]

        # Create lgb booster
        bst = lgb.LGBMModel(
            objective='regression',
            num_leaves=int(hpp['num_leaves']),
            learning_rate=hpp['lr'],
            n_estimators=10000,
            min_child_samples=int(hpp['min_child_samples']),
            subsample=hpp['subsample'],
            reg_lambda=hpp['reg_lambda'],
        )

        bst.fit(
            X=x_train,
            y=y_train,
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
            early_stopping_rounds=15,
            categorical_feature=ccols,
            verbose=False,
        )

        # Calculate competition metric below

        # Predict
        y_pred = bst.predict(x_test)
        y_pred = np.clip(y_pred, 0, np.inf)
        y_pred = np.expm1(np.squeeze(y_pred))

        # Build dataframe with predictions and truth per session
        session_df = train_test_df.iloc[test_index, [0, -1]].copy()
        session_df['y_pred'] = y_pred

        # Aggregate pred and truth per user
        y_true_user = session_df.groupby(
            'fullVisitorId')['totals.transactionRevenue'].sum().values
        y_pred_user = session_df.groupby(
            'fullVisitorId')['y_pred'].sum().values

        # Apply log1p to aggregated predictions
        y_pred_user = np.log1p(y_pred_user)
        y_true_user = np.log1p(y_true_user)

        # Competition metric: RMSE of the per-user log1p revenue
        val_loss = np.sqrt(np.mean(np.power(y_pred_user - y_true_user, 2)))
        fold_losses.append(val_loss)

    # Average the metric across folds
    return {
        'loss': np.mean(fold_losses),
        'params': hpp,
        'status': STATUS_OK,
    }
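For reference, here is a minimal self-contained sketch of the hyperopt driver such an objective is written for; the toy objective, search-space bounds, and synthetic data are illustrative assumptions rather than the competition setup:

import lightgbm as lgb
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

def toy_objective(hpp):
    # Fit an LGBMModel with the sampled hyperparameters and score on the holdout
    bst = lgb.LGBMModel(objective='regression',
                        num_leaves=int(hpp['num_leaves']),
                        learning_rate=hpp['lr'],
                        n_estimators=200,
                        min_child_samples=int(hpp['min_child_samples']),
                        subsample=hpp['subsample'],
                        reg_lambda=hpp['reg_lambda'])
    bst.fit(x_tr, y_tr)
    loss = np.sqrt(mean_squared_error(y_te, bst.predict(x_te)))
    return {'loss': loss, 'params': hpp, 'status': STATUS_OK}

space = {
    'num_leaves': hp.quniform('num_leaves', 16, 128, 1),
    'lr': hp.loguniform('lr', np.log(0.01), np.log(0.3)),
    'min_child_samples': hp.quniform('min_child_samples', 10, 200, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 10.0),
}

trials = Trials()
best = fmin(fn=toy_objective, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print('Best hyperparameters:', best)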
Example #17
    def fit(self,

            clinical,
            genes,
            treatments,
            outcome,

            optimization_n_call=50,
            optimization_n_folds=2,
            optimization_early_stopping_rounds=1,

            clinical_marker_selection_threshold=.05,
            gene_selection_threshold=.05,

            dae_early_stopping_rounds=1000,
            dae_decay_rate=0.1,
            dae_learning_rate=1e-4,
            dae_steps=50000,

            lgb_fixed_parameters=dict(),
            lgb_early_stopping_rounds=10,

            predictor_n_folds=5):
        """
        """

        self.__reset__()

        ############################################################################################
        # Select gene expressions
        ############################################################################################

        self.selected_clinical = self.select_markers(
            clinical, outcome, threshold=clinical_marker_selection_threshold)

        self.selected_genes = self.select_markers(
            genes, outcome, threshold=gene_selection_threshold)

        if self.n_gene_limit is None:
            self.n_gene_limit = self.select_k_top_markers(self.selected_genes[2])

        if self.n_gene_limit is not None:
            if 4 <= self.n_gene_limit < len(self.selected_genes[0]):
                self.selected_genes = (self.selected_genes[0][:self.n_gene_limit],
                                       self.selected_genes[1][:self.n_gene_limit],
                                       self.selected_genes[2][:self.n_gene_limit])

        pd.DataFrame({'clinical_marker': self.selected_clinical[0],
                      'pvalue': self.selected_clinical[1],
                      'entropy': self.selected_clinical[2]}).to_csv(
            os.path.join(
                self.output_path, 'selected_markers',
                'clinical_{0:03}_{1:03}.csv'.format(
                    self.experiment_number, self.number_of_experiments)),
            index=False)

        pd.DataFrame({'gene': self.selected_genes[0],
                      'pvalue': self.selected_genes[1],
                      'entropy': self.selected_genes[2]}).to_csv(
            os.path.join(
                self.output_path, 'selected_markers',
                'genes_{0:03}_{1:03}.csv'.format(
                    self.experiment_number, self.number_of_experiments)),
            index=False)

        clinical = clinical.loc[:, self.selected_clinical[0]].join(treatments)
        genes = genes.loc[:, self.selected_genes[0]]

        ############################################################################################
        # Normalizing Gene Expression Data
        ############################################################################################

        self.genes_min_max_scaler = MinMaxScaler()

        genes = pd.DataFrame(self.genes_min_max_scaler.fit_transform(genes),
                             index=genes.index, columns=genes.columns)

        ############################################################################################
        # Genetic Profiling
        ############################################################################################

        self.fit_genetic_profiling(genes)
        profiling = self.predict_genetic_profiling(genes)

        clinical = pd.concat([clinical, profiling], axis=1)

        ############################################################################################
        # Gene Clustering
        ############################################################################################

        self.fit_gene_clustering(genes)
        gene_clusters = self.predict_gene_clustering(genes)

        clinical = pd.concat([clinical, gene_clusters], axis=1)

        ############################################################################################
        # Normalizing Clinical Data
        ############################################################################################

        self.clinical_min_max_scaler = MinMaxScaler()

        clinical = pd.DataFrame(self.clinical_min_max_scaler.fit_transform(clinical),
                                index=clinical.index, columns=clinical.columns)

        clinical = clinical.fillna(0)

        ############################################################################################
        # Denoising Autoencoder
        ############################################################################################

        self.fit_dae(markers=genes,
                     decay_rate=dae_decay_rate,
                     learning_rate=dae_learning_rate,
                     steps=dae_steps,
                     early_stopping_rounds=dae_early_stopping_rounds)

        dda = self.predict_dae(genes)

        ############################################################################################
        # Joining all features
        ############################################################################################

        join = clinical.join(genes, how='inner').join(dda, how='inner')

        x = join.values
        y = outcome.values

        # smote = SMOTE(sampling_strategy='minority', random_state=self.random_state, n_jobs=-1)
        # x, y = smote.fit_resample(x, y)
        # del smote

        ############################################################################################
        # LightGBM Hyperparameter Optimization
        ############################################################################################

        lgb_params = LightGBMOptimizer(
            n_calls=optimization_n_call,
            n_folds=optimization_n_folds,
            fixed_parameters=lgb_fixed_parameters,
            early_stopping_rounds=optimization_early_stopping_rounds,
            random_state=self.random_state).optimize(x, y)

        self.lgb_optimized_params = lgb_params

        lgb_params = {**lgb_params, **lgb_fixed_parameters}

        ############################################################################################
        # Training
        ############################################################################################

        # shuffle=True is required when passing random_state in current scikit-learn
        kkfold = StratifiedKFold(predictor_n_folds, shuffle=True,
                                 random_state=self.random_state)

        for iii, (t_index, v_index) in enumerate(kkfold.split(x, y)):

            x_train, y_train = x[t_index, :], y[t_index]
            x_valid, y_valid = x[v_index, :], y[v_index]

            ###############################################################################
            # Light GBM
            ###############################################################################

            lgb = lightgbm.LGBMModel(**lgb_params)

            lgb.fit(
                X=x_train, y=y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=lgb_early_stopping_rounds,
                verbose=self.verbose is not None and self.verbose > 0)

            y_train_hat_lgb = lgb.predict(x_train)
            y_valid_hat_lgb = lgb.predict(x_valid)

            self.lgb_models.append(lgb)

            self.lgb_mins.append(min(np.min(y_train_hat_lgb), np.min(y_valid_hat_lgb)))
            self.lgb_maxs.append(max(np.max(y_train_hat_lgb), np.max(y_valid_hat_lgb)))

            ###############################################################################
            # Performance metrics
            ###############################################################################

            # y_train_hat = (y_train_hat_lgb - self.lgb_mins[-1]) / (self.lgb_maxs[-1] - self.lgb_mins[-1])
            # y_valid_hat = (y_valid_hat_lgb - self.lgb_mins[-1]) / (self.lgb_maxs[-1] - self.lgb_mins[-1])
            y_train_hat = y_train_hat_lgb
            y_valid_hat = y_valid_hat_lgb

            self.predictor_train_losses.append(log_loss(y_train, y_train_hat))
            self.predictor_train_aucs.append(roc_auc_score(y_train, y_train_hat))

            self.predictor_valid_losses.append(log_loss(y_valid, y_valid_hat))
            self.predictor_valid_aucs.append(roc_auc_score(y_valid, y_valid_hat))

        print('TRAIN mean log loss: {0:03}'.format(np.mean(self.predictor_train_losses)))
        print('TRAIN mean AUC: {0:03}'.format(np.mean(self.predictor_train_aucs)))

        print('VALID mean log loss: {0:03}'.format(np.mean(self.predictor_valid_losses)))
        print('VALID mean AUC: {0:03}'.format(np.mean(self.predictor_valid_aucs)))
                        metrics='auc',
                        seed=42)

    # results to return
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    hyperparameters['n_estimators'] = estimators

    return [score, hyperparameters, iteration]


score, params, iteration = objective(default_params, 1)

print('The cross-validation ROC AUC was {:.5f}.'.format(score))
# Create a default model
model = lgb.LGBMModel()
model.get_params()
# Hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base=10, num=1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda':
Example #19
    def fit(self,
            genes,
            outcome,
            clinical=None,
            treatments=None,
            optimization_n_call=50,
            optimization_n_folds=2,
            optimization_early_stopping_rounds=1,
            clinical_marker_selection_threshold=.05,
            gene_selection_threshold=.05,
            dae_early_stopping_rounds=1000,
            dae_decay_rate=0.1,
            dae_learning_rate=1e-4,
            dae_steps=50000,
            dae_keep_probability=.75,
            use_predictor=True,
            lgb_fixed_parameters=None,
            lgb_early_stopping_rounds=10,
            predictor_n_folds=5,
            minor_class_augmentation=False):
        """
        """

        self.__reset__()

        self.minor_class_augmentation = minor_class_augmentation

        x = None

        ############################################################################################
        # Select gene expressions
        ############################################################################################

        if clinical is not None:
            self.selected_clinical = self.select_markers(
                clinical,
                outcome,
                threshold=clinical_marker_selection_threshold,
                random_state=self.random_state)

        self.selected_genes = self.select_markers(
            genes,
            outcome,
            threshold=gene_selection_threshold,
            random_state=self.random_state)

        # if self.n_gene_limit is None:
        #    self.n_gene_limit = self.select_k_top_markers(self.selected_genes[2])

        if self.n_gene_limit is not None:
            if 4 <= self.n_gene_limit < len(self.selected_genes[0]):
                self.selected_genes = (
                    self.selected_genes[0][:self.n_gene_limit],
                    self.selected_genes[1][:self.n_gene_limit],
                    self.selected_genes[2][:self.n_gene_limit])

        if self.export_metadata:

            if clinical is not None:
                pd.DataFrame({
                    'clinical_marker': self.selected_clinical[0],
                    'pvalue': self.selected_clinical[1],
                    'entropy': self.selected_clinical[2]
                }).to_csv(os.path.join(
                    self.output_path, 'selected_markers',
                    'clinical_{0:03}_{1:03}.csv'.format(
                        self.experiment_number, self.number_of_experiments)),
                          index=False)

            pd.DataFrame({
                'gene': self.selected_genes[0],
                'pvalue': self.selected_genes[1],
                'entropy': self.selected_genes[2]
            }).to_csv(os.path.join(
                self.output_path, 'selected_markers',
                'genes_{0:03}_{1:03}.csv'.format(self.experiment_number,
                                                 self.number_of_experiments)),
                      index=False)

        if clinical is not None:
            if len(self.selected_clinical[0]) > 0:
                x = clinical.loc[:, self.selected_clinical[0]]

        if treatments is not None:
            if treatments.shape[1] > 0:
                x = treatments if x is None else x.join(treatments)

        assert len(self.selected_genes[0]) >= 4, \
            'At least 4 genes are required for MuLT approach. You can increase the threshold.'

        genes = genes.loc[:, self.selected_genes[0]]

        #
        self.raw_genes_min_max_scaler = MinMaxScaler()

        genes_norm = self.raw_genes_min_max_scaler.fit_transform(genes)

        genes_norm = pd.DataFrame(genes_norm,
                                  columns=genes.columns,
                                  index=genes.index)

        ############################################################################################
        # Normalizing Gene Expression Data
        ############################################################################################

        self.genes_min_max_scaler = MinMaxScaler()

        genes = pd.DataFrame(self.genes_min_max_scaler.fit_transform(
            np.log1p(genes)),
                             index=genes.index,
                             columns=genes.columns)

        ############################################################################################
        # Genetic Profiling
        ############################################################################################

        self.fit_genetic_profiling(genes_norm)
        profiling = self.predict_genetic_profiling(genes_norm)

        x = pd.concat([x, profiling], axis=1) if x is not None else profiling

        ############################################################################################
        # Gene Clustering
        ############################################################################################

        self.fit_gene_clustering(genes_norm)

        gene_clusters = self.predict_gene_clustering(genes_norm)

        x = pd.concat([x, gene_clusters], axis=1)

        ############################################################################################
        # Denoising Autoencoder
        ############################################################################################

        self.fit_dae(markers=genes,
                     keep_probability=dae_keep_probability,
                     decay_rate=dae_decay_rate,
                     learning_rate=dae_learning_rate,
                     steps=dae_steps,
                     early_stopping_rounds=dae_early_stopping_rounds)

        if use_predictor:

            dda = self.predict_dae(genes)

            ############################################################################################
            # Joining all features
            ############################################################################################

            x = x.join(genes_norm).join(dda, how='inner').fillna(0)

            x, y = x.values, outcome.values

            if minor_class_augmentation:
                smote = SMOTE(sampling_strategy='minority',
                              random_state=self.random_state,
                              n_jobs=-1)
                x, y = smote.fit_resample(x, y)
                del smote

            ############################################################################################
            # LightGBM Hyperparameter Optimization
            ############################################################################################

            if lgb_fixed_parameters is None:
                lgb_fixed_parameters = dict()

            self.predictor_optimizer = LightGBMOptimizer(
                n_calls=optimization_n_call,
                n_folds=optimization_n_folds,
                fixed_parameters=lgb_fixed_parameters,
                early_stopping_rounds=optimization_early_stopping_rounds,
                random_state=self.random_state)

            lgb_params = self.predictor_optimizer.optimize(x, y)

            self.lgb_optimized_params = lgb_params

            lgb_params = {**lgb_params, **lgb_fixed_parameters}

            ############################################################################################
            # Training
            ############################################################################################

            if predictor_n_folds > 1:
                # shuffle=True is required when passing random_state in current scikit-learn
                kkfold = StratifiedKFold(predictor_n_folds,
                                         shuffle=True,
                                         random_state=self.random_state)
                splits = kkfold.split(x, y)

            else:
                splits = [(list(range(0, x.shape[0])), None)]

            for iii, (t_index, v_index) in enumerate(splits):

                x_train, y_train = x[t_index, :], y[t_index]

                if v_index is not None:
                    x_valid, y_valid = x[v_index, :], y[v_index]

                ###############################################################################
                # Light GBM
                ###############################################################################

                lgb = lightgbm.LGBMModel(**lgb_params)

                lgb.fit(X=x_train,
                        y=y_train,
                        eval_set=[(x_valid,
                                   y_valid)] if v_index is not None else None,
                        early_stopping_rounds=lgb_early_stopping_rounds
                        if v_index is not None else None,
                        verbose=self.verbose is not None and self.verbose > 0)

                y_train_hat_lgb = lgb.predict(x_train)

                self.lgb_models.append(lgb)

                if v_index is not None:
                    y_valid_hat_lgb = lgb.predict(x_valid)
                    self.lgb_mins.append(
                        min(np.min(y_train_hat_lgb), np.min(y_valid_hat_lgb)))
                    self.lgb_maxs.append(
                        max(np.max(y_train_hat_lgb), np.max(y_valid_hat_lgb)))

                else:
                    self.lgb_mins.append(min(y_train_hat_lgb))
                    self.lgb_maxs.append(max(y_train_hat_lgb))

                ###############################################################################
                # Performance metrics
                ###############################################################################

                y_train_hat = (y_train_hat_lgb - self.lgb_mins[-1]) / (
                    self.lgb_maxs[-1] - self.lgb_mins[-1])

                if v_index is not None:
                    y_valid_hat = (y_valid_hat_lgb - self.lgb_mins[-1]) / (
                        self.lgb_maxs[-1] - self.lgb_mins[-1])

                self.predictor_train_losses.append(
                    log_loss(y_train, y_train_hat))
                self.predictor_train_aucs.append(
                    roc_auc_score(y_train, y_train_hat))

                if v_index is not None:
                    self.predictor_valid_losses.append(
                        log_loss(y_valid, y_valid_hat))
                    self.predictor_valid_aucs.append(
                        roc_auc_score(y_valid, y_valid_hat))

            if self.verbose:

                print('Train mean log loss: {0:03}'.format(
                    np.mean(self.predictor_train_losses)))
                print('Train mean AUC: {0:03}'.format(
                    np.mean(self.predictor_train_aucs)))

                if v_index is not None:
                    print('Valid mean log loss: {0:03}'.format(
                        np.mean(self.predictor_valid_losses)))
                    print('Valid mean AUC: {0:03}'.format(
                        np.mean(self.predictor_valid_aucs)))
    def run_hyperopt(self,
                     param_space,
                     X_vars,
                     model_params,
                     fmin_max_evals,
                     algo='tpe',
                     metric='balanced_accuracy_score',
                     trials_obj=None,
                     model_type='indicator'):
        '''
        Run Bayesian (TPE) or random-search hyperparameter optimization.
        '''

        # Build the model object to conduct hyperparameter tuning on
        if model_type == 'indicator':
            hyperopt_model = lightgbm.LGBMModel(**model_params,
                                                importance_type='gain')
        elif model_type == 'regressor':
            hyperopt_model = lightgbm.LGBMRegressor(**model_params,
                                                    importance_type='gain')
        eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
        hyperopt_model.fit(X=self.df_train[X_vars],
                           y=self.df_train[self.target],
                           eval_set=eval_set,
                           verbose=False)
        data = self.df_tune

        def evaluate_metric(params):

            hyperopt_model.set_params(**params, bagging_freq=1).fit(
                X=self.df_train[X_vars],
                y=self.df_train[self.target],
                eval_set=eval_set,
                verbose=False)

            eval_x = data[X_vars]
            y_true = data[self.target]

            y_score = hyperopt_model.predict(eval_x)

            y_pred = [np.argmax(i) for i in y_score]

            sk_scorer = getattr(metrics, metric, None) if isinstance(metric, str) else metric
            if sk_scorer is None:
                raise ValueError(f"Specified metric {metric} does not exist in sklearn")

            score = sk_scorer(y_true=y_true, y_pred=y_pred)

            return {'loss': -score, 'params': params, 'status': STATUS_OK}

        if trials_obj is None:
            self.trials = Trials()
        else:
            self.trials = trials_obj

        if algo == 'tpe':
            algo = tpe.suggest
        elif algo == 'random':
            algo = rand.suggest

        best_params = fmin(evaluate_metric,
                           space=param_space,
                           algo=algo,
                           max_evals=fmin_max_evals,
                           rstate=np.random.RandomState(self.seed),
                           trials=self.trials)

        return best_params, self.trials
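A hedged sketch of what the param_space and model_params arguments to run_hyperopt might look like; the parameter names and bounds below are illustrative assumptions, not values from the original project:

from hyperopt import hp

# Illustrative fixed parameters for the underlying LightGBM model (hypothetical values)
model_params = {
    'objective': 'multiclass',
    'num_class': 3,
    'n_estimators': 200,
}

# Illustrative search space to pass as run_hyperopt(param_space=...)
param_space = {
    'num_leaves': hp.choice('num_leaves', list(range(16, 128))),
    'learning_rate': hp.loguniform('learning_rate', -4.6, -1.2),  # roughly 0.01 to 0.3
    'min_child_samples': hp.choice('min_child_samples', list(range(10, 200))),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
}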
Example #21
oof_y_train_test_pred = np.zeros(y_train_test.shape[0])
y_val_preds_sess = np.zeros((x_val.shape[0], num_folds))


for i, (train_index, test_index) in enumerate(sess_folds):

    # Get current fold
    x_train, y_train = x_train_test[train_index], y_train_test[train_index]
    x_test, y_test = x_train_test[test_index], y_train_test[test_index]

    # Create lgb booster
    bst = lgb.LGBMModel(
        objective='regression',
        num_leaves=73,
        learning_rate=0.072,
        n_estimators=10000,
        min_child_samples=155,
        subsample=0.99,
        reg_lambda=0.0,
    )

    bst.fit(
        X=x_train,
        y=y_train,
        eval_set=[(x_test, y_test)],
        eval_metric='rmse',
        early_stopping_rounds=20,
        categorical_feature=cat_col_nums,
        verbose=False,
    )