Esempio n. 1
0
def fit_lasso(X_train, y_train, X_test, y_test, nfolds=10, n_jobs=7):
    """
    Tune a Lasso regressor with a randomized search and predict with the best model.

    Parameters
    ----------
    X_train : array-like or pd.DataFrame
        feature matrix used for the hyperparameter search and the final fit
    y_train : array-like
        objective values for training
    X_test : array-like or pd.DataFrame
        feature matrix for testing/evaluating
    y_test : array-like
        objective values for testing/evaluating (accepted for signature
        compatibility; not used by this function)
    nfolds : int
        number of shuffled K-fold splits (fixed seed 42)
    n_jobs : int
        number of parallel jobs used by the hyperparameter search

    Returns
    -------
    array-like
        predictions of the refitted best model on the train set
    array-like
        cross-validation predictions on the train set, made with the best
        hyperparameters (refitted per fold)
    array-like
        predictions of the refitted best model on the test set
    """
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # from Lasso in 1.2; this grid only works with older releases - confirm
    # the pinned scikit-learn version before upgrading.
    params = {
        'alpha':
        [0.005, 0.01, 0.1, 1.0, 5.0, 10.0, 100.0, 500.0, 750.0, 1000.0],
        'copy_X': [True],
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'precompute': [False]
    }

    cv = KFold(n_splits=nfolds, shuffle=True, random_state=42)

    # The grid holds 10 * 2 * 2 = 40 combinations and every entry is a list,
    # so 40 iterations (sampled without replacement) cover the whole grid.
    n_iter_search = 40
    random_search = RandomizedSearchCV(Lasso(),
                                       param_distributions=params,
                                       n_iter=n_iter_search,
                                       verbose=10,
                                       scoring="neg_mean_absolute_error",
                                       n_jobs=n_jobs,
                                       cv=cv)
    random_search.fit(X_train, y_train)

    # best_estimator_ is the Lasso refitted on the full training set
    best_model = random_search.best_estimator_
    test_preds = best_model.predict(X_test)
    train_preds = best_model.predict(X_train)

    # Fresh, unfitted model with the winning hyperparameters so that each
    # fold's predictions come from a model that never saw that fold.
    cv_model = Lasso(**random_search.best_params_)
    train_cross_preds = cross_val_predict(cv_model, X_train, y_train, cv=cv)

    # The search object is not returned, so nothing is attached to it; this
    # also means X_train is no longer required to expose `.columns`.
    return train_preds, train_cross_preds, test_preds
Esempio n. 2
0
def train_xgb(X,
              y,
              mod_number=1,
              cv=None,
              outfile="model.pickle",
              n_iter_search=100,
              nfolds=20,
              random_state=42):
    """
    Train an XGBoost classifier with randomized hyper parameter optimization.

    The fitted search object, with the feature names attached as ``feats``,
    is pickled to `outfile`.

    Parameters
    ----------
    X : pd.DataFrame
        Matrix with all the features, every instance should be coupled to the y-value;
        must expose ``.columns``
    y : vector
        Vector with the class, every value should be coupled to an x-vector with features
    mod_number : int
        Currently unused; kept for backward compatibility
    cv : cross-validation splitter, optional
        Splitting strategy handed to the search; any falsy value (the default
        None included) falls back to a shuffled KFold built from `nfolds` and
        `random_state`
    outfile : str
        Path the fitted search object is pickled to
    n_iter_search : int
        Number of hyper parameter settings that are sampled
    nfolds : int
        Number of folds of the default KFold (only used when `cv` is falsy)
    random_state : int
        Seed for the default KFold and for the hyper parameter sampling

    Returns
    -------
    float
        Best mean cross-validated ROC AUC found by the search (``best_score_``)
    """
    xgb_handle = xgb.XGBClassifier()

    # NOTE(review): `st` is presumably scipy.stats - confirm against the imports.
    one_to_left = st.beta(10, 1)
    from_zero_positive = st.expon(0, 50)

    # Define distributions to sample from for hyper parameter optimization
    param_dist = {
        "n_estimators": st.randint(25, 150),
        "max_depth": st.randint(5, 10),
        "learning_rate": st.uniform(0.05, 0.4),
        "subsample": one_to_left,
        "gamma": st.uniform(0, 10),
        "reg_alpha": from_zero_positive,
        "min_child_weight": from_zero_positive,
    }

    if not cv:
        cv = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)

    # Seed the sampler as well, so that the same `random_state` draws the
    # same hyper parameter candidates on every run.
    random_search = RandomizedSearchCV(xgb_handle,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       verbose=10,
                                       scoring="roc_auc",
                                       n_jobs=1,
                                       refit=True,
                                       cv=cv,
                                       random_state=random_state)

    random_search.fit(X, y)

    # Keep the training column names with the model so they travel in the pickle
    random_search.feats = X.columns

    # Context manager guarantees the file is flushed and closed, even on error
    with open(outfile, "wb") as handle:
        pickle.dump(random_search, handle)

    return random_search.best_score_
Esempio n. 3
0
def fit_xgb(X_train, y_train, X_test, y_test, config_file="config.ini"):
    """
    Tune an XGBoost regressor with a randomized hyperparameter search and
    predict with the best model; all search settings come from the "fitXGB"
    section of a configuration file.

    Parameters
    ----------
    X_train : pd.DataFrame
        feature matrix for training; must expose ``.columns``
    y_train : pd.DataFrame/Series
        objective values for training
    X_test : pd.DataFrame
        feature matrix for testing/evaluating
    y_test : pd.DataFrame/Series
        objective values for testing/evaluating (not used in this function)
    config_file : str
        location of the configuration file that contains the hyperparameter
        spaces and the search settings

    Returns
    -------
    list
        predictions of the refitted best model for the train set
    list
        cross-validation predictions for the train set, made with the best
        hyperparameters (refitted per fold)
    list
        predictions of the refitted best model for the test set
    sklearn.model_selection.RandomizedSearchCV
        fitted search object, with the training column names stored in ``feats``
    """

    cparser = ConfigParser()
    # read() silently skips files it cannot open; a missing config file
    # shows up as a missing-section error on the first get() below.
    cparser.read(config_file)

    # Hyperparameter spaces are stored as Python expressions.
    # NOTE(review): eval() executes whatever the config file contains - only
    # point this at trusted configuration files.
    n_estimators = eval(cparser.get("fitXGB", "n_estimators"))
    max_depth = eval(cparser.get("fitXGB", "max_depth"))
    learning_rate = eval(cparser.get("fitXGB", "learning_rate"))
    gamma = eval(cparser.get("fitXGB", "gamma"))
    reg_alpha = eval(cparser.get("fitXGB", "reg_alpha"))
    reg_lambda = eval(cparser.get("fitXGB", "reg_lambda"))

    # Integer settings, read in the same order as they are unpacked
    random_state, nfolds, n_iter_search, verbose, n_jobs = (
        cparser.getint("fitXGB", option)
        for option in ("random_state", "nfolds", "n_iter_search", "verbose",
                       "n_jobs")
    )
    eval_metric = cparser.get("fitXGB", "eval_metric").strip('"')

    # n_jobs is wrapped in a one-element list: it is a parameter of the
    # regressor itself, fixed to a single value in the search space
    search_space = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'n_jobs': [n_jobs],
    }

    folds = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)

    search = RandomizedSearchCV(
        xgb.XGBRegressor(),
        param_distributions=search_space,
        n_iter=n_iter_search,
        verbose=verbose,
        scoring=eval_metric,
        cv=folds,
        random_state=random_state,
    )
    search = search.fit(X_train, y_train)

    # Model refitted on the full training set with the best hyperparameters
    best_model = search.best_estimator_
    train_preds = best_model.predict(X_train)

    # Unfitted regressor with the winning hyperparameters, refitted per fold
    # to obtain cross-validation predictions
    fold_model = xgb.XGBRegressor(**search.best_params_)

    if verbose > 0:
        logging.debug("Predicting tR with CV now...")
    train_cross_preds = cross_val_predict(fold_model, X_train, y_train,
                                          cv=folds)

    # Attach the training column names to the returned search object
    search.feats = X_train.columns

    test_preds = best_model.predict(X_test)

    if verbose > 0:
        for message in ("=====", search.best_params_, search.best_score_,
                        "====="):
            logging.debug(message)

    return train_preds, train_cross_preds, test_preds, search