Ejemplo n.º 1
0
def test_repeated_kfold_determinstic_split():
    """RepeatedKFold.split must yield identical, deterministic folds on
    every call when a random_state is fixed."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    rkf = RepeatedKFold(
        n_splits=2,
        n_repeats=2,
        random_state=258173307)

    # The four (train, test) folds expected for this seed, in order.
    expected_folds = [
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
        ([0, 1], [2, 3, 4]),
        ([2, 3, 4], [0, 1]),
    ]

    # Repeated calls to split() must reproduce exactly the same folds.
    for _ in range(3):
        splits = rkf.split(X)
        for expected_train, expected_test in expected_folds:
            train, test = next(splits)
            assert_array_equal(train, expected_train)
            assert_array_equal(test, expected_test)

        # 2 splits x 2 repeats == 4 folds; the generator is now exhausted.
        assert_raises(StopIteration, next, splits)
Ejemplo n.º 2
0
def kfold_lightgbm(important_features, train_df, test_df, num_folds, stratified=False, debug=False):
    """Two-pass LightGBM CV pipeline with Bayesian-ridge stacking.

    Stage 1 fits a quick XGBoost ranker on one fold and uses its feature
    importances only to relabel the columns of the design matrices.
    Stage 2 runs two k-fold LightGBM regressions over the same folds,
    stacks their out-of-fold predictions with BayesianRidge, and writes
    the blended test predictions to a hard-coded submission CSV.

    Parameters
    ----------
    important_features : candidate feature names; FEATS_EXCLUDED are dropped
    train_df : DataFrame with 'target' and 'outliers' columns
    test_df : DataFrame with a 'card_id' column
    num_folds : number of CV folds
    stratified : use StratifiedKFold (on 'outliers') instead of KFold
    debug : accepted for API compatibility but never read

    Side effects: prints progress, calls display_importances twice, and
    writes a submission file to a hard-coded Windows path.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Two splitters seeded identically -> identical fold assignments for
    # the two LightGBM runs below.
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
        folds1 = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)
        folds1 = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Out-of-fold / test-prediction buffers for each of the two runs.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in important_features if f not in FEATS_EXCLUDED]
    oof_preds1 = np.zeros(train_df.shape[0])
    sub_preds1 = np.zeros(test_df.shape[0])
    feature_importance_df1 = pd.DataFrame()
    target = train_df['target']

    def _lgb_params(n_fold):
        """Per-fold LightGBM parameters (Optuna-tuned; fold-dependent seeds)."""
        return {
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'subsample': 0.9855232997390695,
            'max_depth': 7,
            'top_rate': 0.9064148448434349,
            'num_leaves': 63,
            'min_child_weight': 41.9612869171337,
            'other_rate': 0.0721768246018207,
            'reg_alpha': 9.677537745007898,
            'colsample_bytree': 0.5665320670155495,
            'min_split_gain': 9.820197773625843,
            'reg_lambda': 8.2532317400459,
            'min_data_in_leaf': 21,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold),
        }

    # This loop only binds train/valid views; after it finishes the *last*
    # fold's split is used to fit the XGBoost importance model (this matches
    # the original behavior, intentional or not).
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

    model = xgb.sklearn.XGBRegressor(
        nthread=20,
        learn_rate=0.01,
        max_depth=15,
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=1,
        objective='rank:pairwise',
        n_estimators=300,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1
    )
    watchlist = [(train_x, train_y), (valid_x, valid_y)]
    a = model.fit(train_x, train_y, eval_set=watchlist, eval_metric='ndcg', early_stopping_rounds=10)

    # Relabel the design-matrix columns with the XGBoost importances.
    # NOTE(review): sort_index returns a new frame that is discarded here,
    # so the two sort calls are no-ops; kept for behavioral parity.
    b = train_df[feats]
    b.columns = list(a.feature_importances_)
    b.sort_index(axis=1, ascending=False)
    c = test_df[feats]
    c.columns = list(a.feature_importances_)
    c.sort_index(axis=1, ascending=False)

    # First k-fold LightGBM run.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(b, train_df['outliers'])):
        train_x, train_y = b.iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = b.iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        reg = lgb.train(
            _lgb_params(n_fold),
            lgb_train,
            valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'],
            num_boost_round=10000,
            early_stopping_rounds=200,
            verbose_eval=100
        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(c, num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance())
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Second k-fold LightGBM run (same folds, separate buffers).
    for n_fold, (train_idx, valid_idx) in enumerate(folds1.split(b, train_df['outliers'])):
        train_x, train_y = b.iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = b.iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        reg = lgb.train(
            _lgb_params(n_fold),
            lgb_train,
            valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'],
            num_boost_round=10000,
            early_stopping_rounds=200,
            verbose_eval=100
        )

        oof_preds1[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds1 += reg.predict(c, num_iteration=reg.best_iteration) / folds1.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance())
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df1 = pd.concat([feature_importance_df1, fold_importance_df], axis=0)

        # Bug fix: the second run's fold RMSE was computed against the first
        # run's OOF buffer (oof_preds); it must use oof_preds1.
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds1[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(feature_importance_df)
    display_importances(feature_importance_df1)

    # Stack the two OOF prediction vectors with a Bayesian ridge model.
    from sklearn.linear_model import BayesianRidge
    from sklearn.model_selection import StratifiedKFold, RepeatedKFold

    train_stack = np.vstack([oof_preds, oof_preds1]).transpose()
    test_stack = np.vstack([sub_preds, sub_preds1]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=1, random_state=5590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions_3 = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf_3.predict(val_data)
        # 5 folds x 1 repeat -> divide by 5 to average the test predictions.
        predictions_3 += clf_3.predict(test_stack) / 5

    print(np.sqrt(mean_squared_error(target.values, oof_stack)))

    # save submission file
    test_df.loc[:, 'target'] = predictions_3
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv("F:\\zxd\\elo-merchant-category-recommendation\\submission201926_9.csv", index=False)
Ejemplo n.º 3
0
                                                  test_size=0.15)

# Standardize features: fit the scaler on the training data only, then
# apply the identical transform to the held-out test data.
scaler = StandardScaler()
trainX = scaler.fit_transform(trainX)
testX = scaler.transform(testX)

# Support-vector regression with a grid of kernels, tolerances and C values.
print("[INFO] Initializing the support vector regression model...")
model = SVR()
kernel = ["linear", "rbf", "sigmoid", "poly"]
tolerance = [1e-3, 1e-4, 1e-5, 1e-6]
C = [1, 1.5, 2, 2.5, 3]
grid = {"kernel": kernel, "tol": tolerance, "C": C}

# Tune hyperparameters via grid search over a 3x-repeated 10-fold CV,
# scored by (negative) mean squared error.
print("[INFO] Grid searching over the hyperparameters...")
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
gridSearch = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cvFold,
    scoring="neg_mean_squared_error",
)
searchResults = gridSearch.fit(trainX, trainY)

# Report the held-out R^2 of the best estimator found.
print("[INFO] Evaluating the model...")
bestModel = searchResults.best_estimator_
print("R2: {:.2f}".format(bestModel.score(testX, testY)))
Ejemplo n.º 4
0
from sklearn.datasets import load_boston
from sklearn.model_selection import RepeatedKFold
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Boston housing data as a DataFrame with the target appended as the
# last column.  NOTE(review): load_boston was removed in sklearn >= 1.2.
boston = load_boston()
data_boston = pd.DataFrame(boston['data'])
data_boston.columns = boston['feature_names']
data_boston['target'] = boston['target']

# Per-fold results collected over 10x-repeated 10-fold CV.
coef = []
intercept = []
mse = []
r2 = []
kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)
for train_index, test_index in kf.split(data_boston.iloc[:, :-1]):
    train_X, train_y = data_boston.iloc[train_index, :-1], data_boston.iloc[
        train_index, -1]
    test_X, test_y = data_boston.iloc[test_index, :-1], data_boston.iloc[
        test_index, -1]
    # NOTE(review): `normalize` was removed from LinearRegression in
    # sklearn >= 1.2; this call fails on modern versions.
    regr = linear_model.LinearRegression(fit_intercept=True,
                                         normalize=True,
                                         copy_X=True,
                                         n_jobs=-1)
    regr.fit(train_X, train_y)
    predict_y = regr.predict(test_X)
    fold_mse = mean_squared_error(test_y, predict_y)
    fold_r2 = r2_score(test_y, predict_y)
    print('Coefficients: \n', regr.coef_)
    print('Mean squared error: %.2f' % fold_mse)
    print('Coefficient of determination: %.2f' % fold_r2)
    coef.append(regr.coef_)
    intercept.append(regr.intercept_)
    # Bug fix: mse and r2 were initialized above but never populated,
    # even though both metrics are computed every fold.
    mse.append(fold_mse)
    r2.append(fold_r2)
Ejemplo n.º 5
0
# Release the concatenated frame before building the model.
del df_all
gc.collect()

# Model features: every training column except the id and target columns.
features = list(train.columns.values)
for dropped in (id_col, target_col):
    features.remove(dropped)


# Build the model
cnt = 0
p_buf = []
n_splits = 4
n_repeats = 1
kf = RepeatedKFold(n_splits=n_splits,
                   n_repeats=n_repeats,
                   random_state=0)
err_buf = []
undersampling = 0
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 8,
    'learning_rate': 0.05, 
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
# Filter the target and the dummy-encoded predictors with the same mask.
y = y[filt]

#- Predictors
X = pd.get_dummies(X, dummy_na=True)
X = X[filt]

# Encode string class labels as integers.
le_y = preprocessing.LabelEncoder()
y = le_y.fit_transform(y)

#%% Initialize model
clf = MNB()

#%% Cross Validation
# NOTE(review): the original heading said "Stratisfied 10-Fold", but
# RepeatedKFold does not stratify; this is 10 repeats of plain 10-fold.

kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)

scores = []
for train_idx, test_idx in kf.split(X, y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    model = clf.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores.append(accuracy_score(y_test, predictions))

print("Model training complete!")
print('Average 10-Fold Accuracy: {}'.format(np.mean(scores)))

#%%
class_probs = []
Ejemplo n.º 7
0
            # Layer('Rectifier', name='hidden1', units=3),
            # Layer('Linear', name='hidden2', units=5),
            Layer('Linear')
        ],
        learning_rate=0.001,
        n_iter=25)
elif method == 'linReg':
    classifier = linear_model.LinearRegression()
else:
    print('dumbass')
    exit()

# NOTE(review): `eval` and `method` are bound in code above this excerpt;
# `eval` shadows the builtin of the same name.  This snippet mixes
# Python 2 print statements with print() calls, so it is Python 2 code.
if eval:
    if method in ['nnreg', 'linReg', 'svr']:
        # 5-fold CV with a single repeat == plain (shuffled) 5-fold.
        rkf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=random_state)
        for train, test in rkf.split(X, y):
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[
                test]
            # Fit on the training fold, predict the held-out fold.
            y_pred = classifier.fit(X_train, y_train).predict(X_test)
            print "\nResults of Linear Regression...."
            print "================================"
            print('explained variance {}'.format(
                explained_variance_score(y_test, y_pred)))
            print('mean_squared_error {}'.format(
                mean_squared_error(y_test, y_pred)))
            print('r2_score {}'.format(r2_score(y_test, y_pred)))
            # Observed (x) vs. predicted (y) scatter for this fold.
            plt.scatter(y_test.T, y_pred.T)
            # plt.matshow(y_pred)
            plt.show()
Ejemplo n.º 8
0
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components  (See Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  The optimal number of components may be best found from PCA. If set to None,
         a search will be done for ncomps that gives the lowest RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # Bug fix: was `raise Value(...)`, which itself raises a
            # NameError; raise the intended ValueError instead.
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        # Defaults per Note 4: folds ~ sqrt(n), repeats ~ sqrt(n) - 1.
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        # Accumulate out-of-fold residuals across all repeated folds,
        # then report their RMS as the cross-validated RMSE.
        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)[:, 0]

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
Ejemplo n.º 9
0
def main():
    """End-to-end demo: load the sonar dataset, label-encode, remove
    outliers, reduce with PCA, then grid-search a Ridge regressor with
    repeated 10-fold cross-validation, printing progress at each step."""

    # Load data
    url_data = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
    data = pd.read_csv(url_data, header=None)
    print("\n\n####### 1. LOAD DATASET #######\n\n")
    print("\n\nPrint the 5 first line of data\n\n")
    print(data.head())
    print("\n\nthe information of data\n\n")
    print(data.info())
    print("\n\n Describe data\n\n")
    print(data.describe())
    print('\n\nShape of the dataset: \n\n')
    print(data.shape)
    print('=' * 80)

    ####### 2. LABEL ENCODER #######
    print("\n\n####### 2. LABEL ENCODER #######\n\n")

    # Use label encoder function
    data = label_encoder(data)
    print("\n\nthe information of data after use Label Encoder function\n\n")
    print(data.info())

    print('=' * 80)

    ####### 3. CLEAN DATA #######
    print("\n\n", "~0~0" * 27, "\n\n")
    # Bug fix: this heading printed "2. CLEAN DATA" for section 3.
    print("\n\n####### 3. CLEAN DATA #######\n\n")
    # Perform outlier-removing
    data = remove_outlier(data)
    print('=' * 80)

    ####### 4. SPLIT DATA #######
    print("\n\n####### 4. SPLIT DATA  #######\n\n")
    # Split into predictors (all but last column) and target (last column).
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    print("\n\nPrint the 5 first line of X\n\n")
    print(X.head())
    print("\n\nPrint the 5 first line of y\n\n")
    print(y.head())
    print('=' * 80)

    ####### 5. USE PRINCIPAL COMPONENT ANALYSIS (PCA) #######
    print("\n\n####### 5. USE PRINCIPAL COMPONENT ANALYSIS (PCA) #######\n\n")
    # Do Principal Component Analysis as pca
    print('Perform Principal Component Analysis')
    X_pca = PCA_method(X)
    print('\n\nShape of dataset before PCA: \n\n')
    print(X.shape)
    print('\n\nShape of dataset after PCA: \n\n')
    print(X_pca.shape)
    print('=' * 100)
    print('=' * 80)

    ####### 6. PERFORM GRIDSEARCHCV #######
    # Bug fix: heading typo "GRIBSEARCHCV" -> "GRIDSEARCHCV".
    print("\n\n####### 6. PERFORM GRIDSEARCHCV #######\n\n")

    # Define model
    model = Ridge()

    # Define evaluation: 3x-repeated 10-fold CV.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # Define search space
    space = dict()
    space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
    space['alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    space['fit_intercept'] = [True, False]
    # NOTE(review): Ridge's `normalize` parameter was removed in
    # scikit-learn >= 1.2; this grid entry fails on modern versions.
    space['normalize'] = [True, False]

    # Define search
    search = GridSearchCV(model,
                          space,
                          scoring='neg_mean_squared_error',
                          n_jobs=-1,
                          cv=cv)
    result = search.fit(X_pca, y)

    # Result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)
    print('=' * 80)
    '''
Ejemplo n.º 10
0
import numpy
import pandas as pd
import training
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier

# Build the HOG feature CSV, load it, and drop outlier rows (any feature
# beyond 5.04 column z-scores).
training.createdata()
dataset = pd.read_csv('dir/hog.csv')
inliers = (numpy.abs(stats.zscore(dataset)) < 5.04).all(axis=1)
dataset = dataset[inliers]

# Take only the *first* train/test split of the repeated 5-fold CV.
random_state = 12883823
rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=random_state)
result = next(rkf.split(dataset), None)
data_train = dataset.iloc[result[0]]
data_test = dataset.iloc[result[1]]

# NOTE(review): [0, 3780] selects exactly two individual columns (the
# first and the 3781st) — possibly intended as the slice 0:3780 covering
# all HOG features; confirm against the CSV layout.
data = data_train.iloc[:, [0, 3780]]
target = data_train.iloc[:, [3781]]

# Small MLP with a single 8-unit hidden layer.
classifier = MLPClassifier(
    random_state=30,
    hidden_layer_sizes=8,
    learning_rate_init=0.1,
    momentum=0.9,
)
classifier.fit(data, target)

# Score on a separately saved test set using the same column selection.
dataset_test = pd.read_csv('dir/test_hog.csv')
predicted = classifier.predict(dataset_test.iloc[:, [0, 3780]])
print(metrics.classification_report(dataset_test.iloc[:, [3781]], predicted))
# We can check the coefficient variability through cross-validation:
# it is a form of data perturbation (related to
# `resampling <https://en.wikipedia.org/wiki/Resampling_(statistics)>`_).
#
# If coefficients vary significantly when changing the input dataset
# their robustness is not guaranteed, and they should probably be interpreted
# with caution.

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold

# 5x5 repeated K-fold -> 25 fitted estimators to inspect coefficient
# variability under data perturbation.
cv_model = cross_validate(
    model,
    X,
    y,
    cv=RepeatedKFold(n_splits=5, n_repeats=5),
    return_estimator=True,
    n_jobs=-1,
)

# One row per CV fit: coefficients rescaled by each feature's standard
# deviation so magnitudes are comparable across features.
rows = []
for est in cv_model["estimator"]:
    regressor = est.named_steps["transformedtargetregressor"].regressor_
    rows.append(regressor.coef_ * X_train_preprocessed.std(axis=0))
coefs = pd.DataFrame(rows, columns=feature_names)

# Strip + box plot of the per-fold coefficient distributions.
plt.figure(figsize=(9, 7))
sns.stripplot(data=coefs, orient="h", color="k", alpha=0.5)
sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5)
plt.axvline(x=0, color=".5")
plt.xlabel("Coefficient importance")
Ejemplo n.º 12
0
# Peek at the data (`df` is loaded in an earlier notebook cell).
df.head()


# In[9]:


#No standardizing nor normalization needed/possible
# Features are every column but the last; the target is the last column.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# Hold out 15% for testing.  NOTE(review): no random_state is set, so the
# split (and the scores below) differ on every run.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.15)


# In[10]:


# RidgeCV selects alpha from [0, 5) in 0.01 steps by R^2 over a
# 7-split x 2-repeat K-fold.  NOTE(review): alpha=0 is included, which
# scikit-learn discourages for Ridge — confirm this is intended.
cv = RepeatedKFold(n_splits=7, n_repeats = 2)
ridge = RidgeCV(alphas= np.arange(0, 5, 0.01), cv = cv, scoring='r2')
ridge.fit(xtrain, ytrain)


# In[11]:


# Held-out R^2 on the test split, plus R^2 on the training split.
ypred = ridge.predict(xtest)
r2 = r2_score(ytest, ypred)
print('R2 Score: ', r2)
score = ridge.score(xtrain, ytrain)
print('R Squared: ', score)


# In[12]:
Ejemplo n.º 13
0
def test_get_n_splits_for_repeated_kfold():
    """get_n_splits() must report n_splits * n_repeats total iterations."""
    n_splits = 3
    n_repeats = 4
    # Bug fix: pass by keyword — RepeatedKFold's constructor parameters
    # are keyword-only in modern scikit-learn, so positional arguments
    # raise a TypeError.
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rkf.get_n_splits())
Ejemplo n.º 14
0
    # NOTE(review): this chunk sits inside a larger (cut-off) loop; fi, nrep,
    # Xs_train, X_all_feats, filtdat, datinds, clf, y, iu and use_features_dmp
    # are all bound above this excerpt.  _N is presumably numpy — confirm.
    if len(use_features) > 0:
        # Copy the selected feature columns into consecutive slots of Xs_train.
        for i_feat_indx in use_features:
            fi += 1
            Xs_train[:, fi] = X_all_feats[:, i_feat_indx]

        # Evaluate with 3-, 4- and 5-fold repeated cross-validation.
        for ns in [3, 4, 5]:
            # `xs` is not used below — possibly jitter intended for a plot.
            xs = 0.22 * _N.random.randn(ns * nrep)
            coefsLR = _N.empty((nrep * ns, len(use_features)))
            #test_sz = ns*(len(filtdat)//ns)-(ns-1)*(len(filtdat)//ns)
            # Largest possible test-fold size: ceil(len(filtdat) / ns).
            test_sz = len(filtdat) // ns + 1 if len(
                filtdat) % ns != 0 else len(filtdat) // ns
            print("test_sz   %d" % test_sz)
            # Per-fold (observed, predicted) pairs, zero-padded to test_sz rows.
            obs_v_preds = _N.zeros((nrep * ns, test_sz, 2))

            scoresLR = _N.empty(nrep * ns)
            rkf = RepeatedKFold(n_splits=ns,
                                n_repeats=nrep)  #, random_state=0)
            iii = -1

            for train, test in rkf.split(datinds):
                iii += 1
                # Refit on this fold; record score, coefficients, and the
                # observed vs. predicted values for the held-out samples.
                clf_f = clf.fit(Xs_train[train], y[train])
                scoresLR[iii] = clf_f.score(Xs_train[test], y[test])
                coefsLR[iii] = clf_f.coef_
                obs_v_preds[iii, 0:len(test), 0] = y[test]
                obs_v_preds[iii, 0:len(test),
                            1] = clf_f.predict(Xs_train[test])

            # Mean coefficient vector across all folds for this (iu, ns) pair.
            use_features_dmp["weights_thresh%(t)d_fld%(f)d" % {
                "t": iu,
                "f": ns
            }] = _N.mean(coefsLR, axis=0)
# (iteration count, CSV path) pairs for the variability input datasets.
ITERATIONS_DATA_FILE_PATHS = [
    (5, 'resources/variabilities_5_iterations.csv'),
    (10, 'resources/variabilities_10_iterations.csv'),
    (20, 'resources/variabilities_20_iterations.csv'),
    (30, 'resources/variabilities_30_iterations.csv'),
]
# Output locations for classification results and selected features.
RESULTS_OUTPUT_CSV_FILE_PATH = 'resources/output/classification_results.csv'
FEATURES_OUTPUT_CSV_FILE_PATH = 'resources/output/classification_features.csv'

# Fixed seed so the CV splits are reproducible across runs.
RANDOM_SEED = 42

# 10-fold CV repeated 30 times -> 300 train/test evaluations per model.
CROSS_VALIDATION_FOLDS = 10
CROSS_VALIDATION_REPETITIONS = 30
CROSS_VALIDATION_GENERATOR = RepeatedKFold(
    n_splits=CROSS_VALIDATION_FOLDS,
    n_repeats=CROSS_VALIDATION_REPETITIONS,
    random_state=RANDOM_SEED,
)
TOTAL_CROSS_VALIDATION_FOLDS = CROSS_VALIDATION_FOLDS * CROSS_VALIDATION_REPETITIONS

# Variability metrics used as dependent variables in the experiments.
DEPENDENT_VARIABLES = [
    'rciw99',
    'rciw99mjhd',
    'rmadhd',
]
BINARY_CLASSIFICATION_THRESHOLDS = [
    1,
    3,
    5,
    10,
        # first 120 days, select view count or watch time as dependent variable
        daily_attention = [dailywatch, dailyview][use_view][:age]
        daily_share = dailyshare[:age]
        if len(daily_attention) == age and len(daily_share) == age:
            attention_data.append(daily_attention)
            share_data.append(daily_share)
            vid_array.append(vid)

    # convert to ndarray
    attention_data = np.array(attention_data)
    share_data = np.array(share_data)
    vid_array = np.array(vid_array)

    # == == == == == == == == Part 4: Forecast future attention == == == == == == == == #
    # 10-repeated 10-fold cross validation
    rkf = RepeatedKFold(n_splits=10, n_repeats=10)

    fold_idx = 0
    for train_cv_idx, test_idx in rkf.split(vid_array):
        fold_idx += 1
        print('>>> Forecast on fold: {0}'.format(fold_idx))

        # == == == == == == == == Part 5: Split cv subset to select best alpha value == == == == == == == == #
        train_idx, cv_idx = train_test_split(train_cv_idx, test_size=0.1)

        # grid search best alpha value over -4 to 4 in log space
        alpha_array = [10 ** t for t in range(-4, 5)]
        cv_mse = []
        for alpha in alpha_array:
            # == == == == == == == == Part 6: Training with Ridge Regression == == == == == == == == #
            cv_predict = forecast_future_attention(train_idx, cv_idx, alpha)
import numpy
import pandas
import matplotlib.pyplot as plot
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedKFold

dataset = pandas.read_csv('salaryData.csv')

# Single feature (years of experience) -> salary, as column vectors.
x = dataset['YearsExperience'].values
y = dataset['Salary'].values
X = x.reshape(len(x), 1)
Y = y.reshape(len(y), 1)

# Repeated 2-fold CV with a single repeat == plain (shuffled) 2-fold.
kf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=200)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    xTrain, xTest = X[train_index], X[test_index]
    yTrain, yTest = Y[train_index], Y[test_index]

    regressor = DecisionTreeRegressor()
    regressor.fit(xTrain, yTrain)

    # Bug fix: AdaBoost was fitted on the full dataset (X, y), leaking
    # the test fold into training before predicting on xTest.  Fit on
    # the training fold only, like the decision tree above.
    regr = AdaBoostRegressor()
    regr.fit(xTrain, yTrain.ravel())

    yPrediction = regressor.predict(xTest)
    yPred = regr.predict(xTest)
Ejemplo n.º 18
0
    cross_val_score,
)
from sklearn.pipeline import Pipeline

from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.transformations.panel.pca import PCATransformer
from sktime.utils._testing.estimator_checks import _make_args

# Argument sets for _make_args: panel data as 3D numpy or nested DataFrame.
DATA_ARGS = [
    {"return_numpy": True, "n_columns": 2},
    {"return_numpy": False, "n_columns": 2},
]
# StratifiedGroupKFold(n_splits=2), , removed, not available in sklearn 0.24
# One instance of every scikit-learn splitter these tests exercise.
CROSS_VALIDATION_METHODS = [
    KFold(n_splits=2),
    RepeatedKFold(n_splits=2, n_repeats=2),
    LeaveOneOut(),
    LeavePOut(p=5),
    ShuffleSplit(n_splits=2, test_size=0.25),
    StratifiedKFold(n_splits=2),
    StratifiedShuffleSplit(n_splits=2, test_size=0.25),
    GroupKFold(n_splits=2),
    LeavePGroupsOut(n_groups=5),
    GroupShuffleSplit(n_splits=2, test_size=0.25),
    TimeSeriesSplit(n_splits=2),
]
PARAMETER_TUNING_METHODS = [
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    HalvingRandomSearchCV,
Ejemplo n.º 19
0
def train(train_path, test_path, output_path):
    """Feature-engineer the data, train LGB + XGB with 5-fold CV, stack
    their out-of-fold predictions with an XGBoost meta-regressor, and write
    the final predictions to ``output_path``.

    Parameters
    ----------
    train_path : str
        Training CSV (gb18030 encoded) with target column u'收率' (yield).
    test_path : str
        Test CSV (gb18030 encoded) with id column u'样本id' (sample id).
    output_path : str
        Destination of the two-column (id, rounded prediction) CSV, written
        without header or index.
    """
    train = pd.read_csv(train_path, encoding='gb18030')
    test = pd.read_csv(test_path, encoding='gb18030')
    test_id = test[u'样本id']

    # Drop columns deemed uninformative for both train and test.
    for df in [train, test]:
        df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

    # Discard near-constant columns: dominant value covers > 90% of rows.
    good_cols = list(train.columns)
    for col in train.columns:
        rate = train[col].value_counts(normalize=True, dropna=False).values[0]
        if rate > 0.9:
            good_cols.remove(col)

    # Remove outlier rows (very low yield, extreme B14 / A6 values).
    train = train[(train[u'收率'] > 0.87) & (train['B14'] > 40) &
                  (train['A6'] < 50)]

    # Keep A1/A3/A4 even if near-constant: they feed the engineered ratio
    # features below and are dropped again afterwards.
    good_cols.append('A1')
    good_cols.append('A3')
    good_cols.append('A4')

    # NOTE(review): removes 'sample_id' while the id column read above is
    # u'样本id' — confirm the CSV really carries both names.
    good_cols.remove('sample_id')
    train = train[good_cols]
    good_cols.remove(u'收率')
    test = test[good_cols]

    target = train[u'收率']
    del train[u'收率']
    # Engineer features on train+test together so encodings stay consistent.
    data = pd.concat([train, test], axis=0, ignore_index=True)
    data = data.fillna(-1)

    # Convert time-of-day columns to seconds; columns that fail to parse are
    # deliberately left unchanged (best-effort).
    for f in ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']:
        try:
            data[f] = data[f].apply(timeTranSecond)
        except:
            continue
    # Convert time-interval columns to durations.
    for f in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:
        data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)

    # NOTE(review): every column lands in categorical_columns, so
    # numerical_columns starts empty by construction; the engineered
    # features below are appended to it explicitly.
    categorical_columns = [f for f in data.columns]
    numerical_columns = [
        f for f in data.columns if f not in categorical_columns
    ]

    # Summary statistics and a squared term for the key feature B14.
    for f in ['B14']:
        data[f + '_median'] = data[f].median()
        data[f + '_std'] = data[f].std()
        data[f + '_max'] = data[f].max()
        data[f + '_min'] = data[f].min()
        data[f + '**2'] = data[f]**2

    # Ratio / sum / product of B14 against the A1+A3+A4+A19+B1+B12 total.
    data['b14/a1_a3_a4_a19_b1_b12'] = data['B14'] / (data['A1'] + data['A3'] +
                                                     data['A4'] + data['A19'] +
                                                     data['B1'] + data['B12'])
    data['b14_a1_a3_a4_a19_b1_b12'] = data['B14'] + data['A1'] + data[
        'A3'] + data['A4'] + data['A19'] + data['B1'] + data['B12']
    data['b14*a1_a3_a4_a19_b1_b12'] = data['B14'] * (data['A1'] + data['A3'] +
                                                     data['A4'] + data['A19'] +
                                                     data['B1'] + data['B12'])

    numerical_columns.append('b14/a1_a3_a4_a19_b1_b12')
    numerical_columns.append('b14_a1_a3_a4_a19_b1_b12')
    numerical_columns.append('b14*a1_a3_a4_a19_b1_b12')

    # Pairwise interaction features; each is registered as numerical.
    data['b14*b12'] = data['B14'] * data['B12']
    numerical_columns.append('b14*b12')

    data['b14/b1'] = data['B14'] / data['B1']
    numerical_columns.append('b14/b1')

    data['b14*a19'] = data['B14'] * data['A19']
    numerical_columns.append('b14*a19')

    data['b14/a4'] = data['B14'] / data['A4']
    numerical_columns.append('b14/a4')

    data['b14+a4'] = data['B14'] + data['A4']
    numerical_columns.append('b14+a4')

    data['B11*B14'] = data['B11'] * data['B14']
    numerical_columns.append('B11*B14')

    data['A7*A8'] = data['A7'] * data['A8']
    numerical_columns.append('A7*A8')

    data['A9*A10'] = data['A10'] * data['A9']
    numerical_columns.append('A9*A10')

    data['A10*A11'] = data['A10'] * data['A11']
    numerical_columns.append('A10*A11')

    data['A16*A17'] = data['A16'] * data['A17']
    numerical_columns.append('A16*A17')

    data['A25*A26'] = data['A25'] * data['A26']
    numerical_columns.append('A25*A26')

    data['B10*B11'] = data['B10'] * data['B11']
    numerical_columns.append('B10*B11')

    data['B12*B14'] = data['B12'] * data['B14']
    numerical_columns.append('B12*B14')

    data['A5*A7'] = data['A5'] * data['A7']
    numerical_columns.append('A5*A7')

    data['A9*A11'] = data['A9'] * data['A11']
    numerical_columns.append('A9*A11')

    data['A19*A21'] = data['A19'] * data['A21']
    numerical_columns.append('A19*A21')

    data['B8*B10'] = data['B8'] * data['B10']
    numerical_columns.append('B8*B10')

    data['B10*B12'] = data['B10'] * data['B12']
    numerical_columns.append('B10*B12')

    data['A11*A14'] = data['A11'] * data['A14']
    numerical_columns.append('A11*A14')

    data['A12*A15'] = data['A12'] * data['A15']
    numerical_columns.append('A12*A15')

    data['A11*A15'] = data['A11'] * data['A15']
    numerical_columns.append('A11*A15')

    data['A16*A19'] = data['A16'] * data['A19']
    numerical_columns.append('A16*A19')

    data['A19*A22'] = data['A19'] * data['A22']
    numerical_columns.append('A19*A22')

    # A1/A3/A4 have served their purpose in the combined features; drop them.
    del data['A1']
    del data['A3']
    del data['A4']
    categorical_columns.remove('A1')
    categorical_columns.remove('A3')
    categorical_columns.remove('A4')

    # Label-encode every categorical column to contiguous integer codes.
    for f in categorical_columns:
        data[f] = data[f].map(
            dict(zip(data[f].unique(), range(0, data[f].nunique()))))
    train = data[:train.shape[0]]
    test = data[train.shape[0]:]

    # Bin the target into 5 quantile-free bins and one-hot the bin id; used
    # for the target-mean-encoding below.
    # NOTE(review): train/test are slices of `data` here — these assignments
    # may trigger pandas SettingWithCopy warnings; verify copies if upgraded.
    train['target'] = target
    train['intTarget'] = pd.cut(train['target'], 5, labels=False)
    train = pd.get_dummies(train, columns=['intTarget'])
    li = [
        'intTarget_0.0', 'intTarget_1.0', 'intTarget_2.0', 'intTarget_3.0',
        'intTarget_4.0'
    ]
    mean_columns = []
    # Mean-encode each (sufficiently varied) categorical column against every
    # target bin; encodings that produce any missing values are discarded.
    for f1 in categorical_columns:
        cate_rate = train[f1].value_counts(normalize=True,
                                           dropna=False).values[0]
        if cate_rate < 0.90:
            for f2 in li:
                col_name = 'B14_to_' + f1 + "_" + f2 + '_mean'
                mean_columns.append(col_name)
                order_label = train.groupby([f1])[f2].mean()
                train[col_name] = train['B14'].map(order_label)
                miss_rate = train[col_name].isnull().sum(
                ) * 100 / train[col_name].shape[0]
                if miss_rate > 0:
                    train = train.drop([col_name], axis=1)
                    mean_columns.remove(col_name)
                else:
                    test[col_name] = test['B14'].map(order_label)

    # The helper bins are no longer needed once encoding is done.
    train.drop(li + ['target'], axis=1, inplace=True)

    X_train = train[mean_columns + numerical_columns].values
    X_test = test[mean_columns + numerical_columns].values
    # one hot encode the categorical columns and append them as sparse blocks.
    enc = OneHotEncoder()
    for f in categorical_columns:
        enc.fit(data[f].values.reshape(-1, 1))
        X_train = sparse.hstack(
            (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
        X_test = sparse.hstack(
            (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')

    y_train = target.values

    # LightGBM regression parameters (gbdt, MSE metric, mild bagging).
    param = {
        'num_leaves': 120,
        'min_data_in_leaf': 30,
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.01,
        "min_child_samples": 30,
        "boosting": "gbdt",
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "bagging_seed": 11,
        "metric": 'mse',
        "lambda_l1": 0.1,
        "verbosity": -1
    }

    # First-level model 1: LightGBM with 5-fold CV; out-of-fold predictions
    # feed the stacking stage, test predictions are averaged over folds.
    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

        num_round = 10000
        clf = lgb.train(param,
                        trn_data,
                        num_round,
                        feval=lgbFeval,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=200,
                        early_stopping_rounds=100)
        oof_lgb[val_idx] = clf.predict(X_train[val_idx],
                                       num_iteration=clf.best_iteration)

        predictions_lgb += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits

    # XGBoost regression parameters (slow eta, deep trees, RMSE metric).
    xgb_params = {
        'eta': 0.005,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': True,
        'nthread': 16
    }

    # First-level model 2: XGBoost with the same 5-fold split scheme.
    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_xgb = np.zeros(len(train))
    predictions_xgb = np.zeros(len(test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data,
                        num_boost_round=20000,
                        feval=xgbFeval,
                        evals=watchlist,
                        early_stopping_rounds=200,
                        verbose_eval=100,
                        params=xgb_params)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                       ntree_limit=clf.best_ntree_limit)
        predictions_xgb += clf.predict(
            xgb.DMatrix(X_test),
            ntree_limit=clf.best_ntree_limit) / folds.n_splits

    # Second level: stack the two models' predictions as a 2-column matrix.
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    # 5 splits x 2 repeats = 10 folds, hence the /10 averaging below.
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack_xgb = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf_3 = xgb.XGBRegressor()
        clf_3.fit(trn_data, trn_y)

        oof_stack_xgb[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10

    # Competition score convention: half of the mean squared error.
    print("LGB score: {}".format((mean_squared_error(oof_lgb, target) * 0.5)))
    print("XGB score: {}".format((mean_squared_error(oof_xgb, target) * 0.5)))
    print("STACK score: {}".format(
        (mean_squared_error(target.values, oof_stack_xgb) * 0.5)))

    # Write the submission: id, prediction rounded to 3 decimals, no header.
    sub_df = pd.DataFrame()
    sub_df[0] = test_id
    sub_df[1] = predictions
    sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))
    sub_df.to_csv(output_path, index=False, header=None)
Ejemplo n.º 20
0
def run(task, n_splits, n_repeats):
    """
    Trains the specified classifier on the node embedding. This method is
    called in parallel on multiple `tasks`.
    The results of the training is saved to disk in the `results` subfolder.

    Parameters
    ----------
    task : tuple
        The first entry is a dict with the keys `node_embeddings`,
        `embedding_name`, `node_labels` and `distinct_node_labels`; the
        second entry is the name of the classifier (one of "adaboost",
        "decision_tree", "neural_network", "random_forest").
    n_splits : int
        Number of folds per cross-validation repetition.
    n_repeats : int
        Number of cross-validation repetitions.

    Raises
    ------
    ValueError
        If the classifier name is not recognised.
    """
    embedding, classifier = task
    print(f"Start of {classifier} classifier")

    # Resolve the classifier name to its training function; fail loudly on
    # unknown names instead of hitting UnboundLocalError on `fn` below.
    if classifier == "adaboost":
        fn = adaboost
    elif classifier == "decision_tree":
        fn = decision_tree
    elif classifier == "neural_network":
        fn = neural_network
    elif classifier == "random_forest":
        fn = random_forest
    else:
        raise ValueError(f"Unknown classifier: {classifier!r}")

    # One list per metric; filled with one entry per CV fold.
    scores = {
        "train_score_accuracy": list(),
        "test_score_accuracy": list(),
        "train_score_f1_micro": list(),
        "test_score_f1_micro": list(),
        "train_score_f1_macro": list(),
        "test_score_f1_macro": list(),
        "test_predictions": list()
    }

    # BUG FIX: the original split an undefined global `node_embeddings`;
    # the embeddings live in the task's dict.
    rkf = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats).split(embedding["node_embeddings"])
    for i, (train_index, test_index) in enumerate(rkf):
        train_pred, test_pred = fn(embedding["node_embeddings"][train_index],
                                   embedding["node_embeddings"][test_index],
                                   embedding["node_labels"][train_index],
                                   embedding["node_labels"][test_index])

        # Accuracy plus micro/macro F1 on both the train and test folds.
        scores["train_score_accuracy"].append(
            accuracy_score(embedding["node_labels"][train_index], train_pred))
        scores["test_score_accuracy"].append(
            accuracy_score(embedding["node_labels"][test_index], test_pred))
        scores["train_score_f1_micro"].append(
            f1_score(embedding["node_labels"][train_index],
                     train_pred,
                     average="micro"))
        scores["test_score_f1_micro"].append(
            f1_score(embedding["node_labels"][test_index],
                     test_pred,
                     average="micro"))
        scores["train_score_f1_macro"].append(
            f1_score(embedding["node_labels"][train_index],
                     train_pred,
                     average="macro"))
        scores["test_score_f1_macro"].append(
            f1_score(embedding["node_labels"][test_index],
                     test_pred,
                     average="macro"))
        # Remember which node got which prediction in this fold.
        scores["test_predictions"].append(
            {index: pred
             for index, pred in zip(test_index, test_pred)})

    # Result file: results/<classifier>/<embedding_name>[_one_hidden].txt
    filename = os.path.dirname(
        os.path.abspath(__file__)
    ) + "/results/" + classifier + "/" + embedding["embedding_name"]
    if classifier == "neural_network":
        filename += "_one_hidden"

    save_classification_results(filename + ".txt", scores,
                                embedding["distinct_node_labels"], n_splits,
                                n_repeats)
Ejemplo n.º 21
0
# First-level LightGBM out-of-fold predictions on the training set.
oof_lgb = np.array(pd.read_csv(tree_data_path + 'lgb_train.csv')['price'])

# Load the cab tree-model predictions as the second stacking input.
predictions_cb = np.array(
    pd.read_csv(tree_data_path + 'cab_test.csv')['price'])
oof_cb = np.array(pd.read_csv(tree_data_path + 'cab_train.csv')['price'])

# Read the true price to evaluate the validation folds against.
Train_data = pd.read_csv(tree_data_path + 'train_tree.csv', sep=' ')
TestA_data = pd.read_csv(tree_data_path + 'text_tree.csv', sep=' ')
Y_data = Train_data['price']

# Stack both models' outputs column-wise: shape (n_samples, 2).
train_stack = np.vstack([oof_lgb, oof_cb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_cb]).transpose()
print(train_stack)
folds_stack = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2018)
tree_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

# Second-level stacking with Bayesian ridge regression.
for fold_, (trn_idx,
            val_idx) in enumerate(folds_stack.split(train_stack, Y_data)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], Y_data[trn_idx]
    val_data, val_y = train_stack[val_idx], Y_data[val_idx]

    Bayes = linear_model.BayesianRidge()
    Bayes.fit(trn_data, trn_y)
    tree_stack[val_idx] = Bayes.predict(val_data)
    # 10 splits x 2 repeats = 20 folds, hence the /20 averaging.
    predictions += Bayes.predict(test_stack) / 20
Ejemplo n.º 22
0
    def best_nclust(self, data, iter_cv=1, strat_vect=None):
        """
        This method takes as input the training dataset and the
        stratification vector (if available) and performs a
        (repeated) CV procedure to select the best number of clusters that minimizes
        normalized stability.

        :param data: training dataset.
        :type data: ndarray, (n_samples, n_features)
        :param iter_cv: number of iteration for repeated CV, default 1.
        :type iter_cv: integer
        :param strat_vect: vector for stratification, defaults to None.
        :type strat_vect: ndarray, (n_samples,)
        :return: CV metrics for training and validation sets, best number of clusters,
            misclassification errors at each CV iteration.
        :rtype: dictionary, int, (list) if n_clusters parameter is not available
        """
        data_array = np.array(data)
        reval = RelativeValidation(self.class_method, self.clust_method, self.nrand)

        # Stratified splits when a stratification vector is given; plain
        # repeated K-fold otherwise.  Fixed seed for reproducibility.
        if strat_vect is not None:
            kfold = RepeatedStratifiedKFold(n_splits=self.nfold, n_repeats=iter_cv, random_state=42)
        else:
            kfold = RepeatedKFold(n_splits=self.nfold, n_repeats=iter_cv, random_state=42)
        fold_gen = kfold.split(data_array, strat_vect)
        # When the clustering algorithm exposes n_clusters, evaluate every
        # (fold, candidate-cluster-count) pair; otherwise folds only.
        if self.nclust_range is not None and 'n_clusters' in self.clust_method.get_params().keys():
            params = list(itertools.product([(data_array, reval)],
                                            fold_gen, self.nclust_range))
        else:
            params = list(itertools.product([(data_array, reval)], fold_gen))
        # Fit all parameter combinations, in a process pool if requested.
        if self.n_jobs > 1:
            p = mp.Pool(processes=self.n_jobs)
            miscl = list(zip(*p.starmap(self._fit, params)))
            p.close()
            p.join()
            out = list(zip(*miscl))
        else:
            miscl = []
            for p in params:
                # 3-tuples carry an explicit cluster count, 2-tuples do not.
                if len(p) > 2:
                    miscl.append(self._fit(data_obj=p[0],
                                           idxs=p[1],
                                           ncl=p[2]))
                else:
                    miscl.append(self._fit(data_obj=p[0],
                                           idxs=p[1]))
            out = miscl

        # return dataframe attribute (cv_results_) with cv scores
        # If no point are labeled (e.g., all points assigned to -1 class by HDBSCAN)
        # the method returns
        cv_results_ = pd.DataFrame(out,
                                   columns=['ncl', 'ms_tr', 'ms_val', 'tr_labels', 'val_labels'])
        ctrl_rows = cv_results_.shape[0]
        cv_results_.dropna(axis=0, inplace=True)
        # Keep results if at least one run succeeded; abort if every run
        # failed to identify clusters.
        if 0 < ctrl_rows - cv_results_.shape[0] < ctrl_rows:
            logging.info("Dropped results where clustering algorithm failed to identify clusters.")
            FindBestClustCV.cv_results_ = cv_results_
        elif ctrl_rows - cv_results_.shape[0] == 0:
            FindBestClustCV.cv_results_ = cv_results_
        else:
            logging.info(f"{self.clust_method} was not able to identify any cluster. Failed run with "
                         f"{self.class_method}.")
            return None

        # Mean misclassification error and confidence interval per cluster
        # count, for both the training and validation folds.
        metrics = {'train': {}, 'val': {}}
        for ncl in cv_results_.ncl.unique():
            norm_stab_tr = cv_results_.loc[cv_results_.ncl == ncl]['ms_tr']
            norm_stab_val = cv_results_.loc[cv_results_.ncl == ncl]['ms_val']
            metrics['train'][ncl] = (np.mean(norm_stab_tr), _confint(norm_stab_tr))
            metrics['val'][ncl] = (np.mean(norm_stab_val), _confint(norm_stab_val))

        val_score = np.array([val[0] for val in metrics['val'].values()])
        bestscore = min(val_score)
        # select the cluster with the minimum misclassification error
        # and the maximum number of clusters
        if self.nclust_range is not None and 'n_clusters' in self.clust_method.get_params().keys():
            bestncl = self.nclust_range[np.flatnonzero(val_score == bestscore)[-1]]
            return metrics, bestncl
        else:
            # No tunable cluster count: also reconstruct the label vector of
            # the single best run, ordered by original sample index.
            bestncl = list(metrics['val'].keys())[np.flatnonzero(val_score == bestscore)[-1]]
            best_idx = cv_results_.loc[cv_results_.ncl == bestncl].ms_val.idxmin()
            idx_vect = np.concatenate((params[best_idx][1][-2], params[best_idx][1][-1]))
            label_vect = np.concatenate((out[best_idx][-2], out[best_idx][-1]))
            tr_lab = [lab for _, lab in sorted(zip(idx_vect, label_vect))]
            return metrics, bestncl, tr_lab
Ejemplo n.º 23
0
 mae_gen = []
 #MDAE for mean
 mdae_ds = []
 mdae_clus = []
 mdae_gen = []
 #EVS for mean
 evs_ds = []
 evs_clus = []
 evs_gen = []
 #R2 for mean
 r2_ds = []
 r2_clus = []
 r2_gen = []
 p = 0
 #Repeated K Fold Cross Validation
 for tr_i, ts_i in rkf.split(ds):
     print(i, c, p)
     p += 1
     train, test = ds.iloc[tr_i], ds.iloc[ts_i]
     l = list(test['Index'])
     train_ds_x = train.drop(columns=['Index', 'District', 'Rainfall'])
     test_ds_x = test.drop(columns=['Index', 'District', 'Rainfall'])
     test_ds_y = test['Rainfall']
     train_ds_y = train['Rainfall']
     clus_tr = clus_ds[~clus_ds['Index'].isin(l)]
     clus_ds_n = clus_ds_n.append(clus_tr)
     clus_ts = clus_ds[clus_ds['Index'].isin(l)]
     gen_tr = gen_ds[~gen_ds['Index'].isin(l)]
     gen_ds_n = gen_ds_n.append(gen_tr)
     gen_ts = gen_ds[gen_ds['Index'].isin(l)]
     print(
Ejemplo n.º 24
0
vv feature transform vv
###############################################################################
"""

# One Gaussian naive-Bayes model per (feature-set, label) combination.
# NOTE(review): presumably H/L denote two market regimes and long/short the
# trade direction — confirm against the feature-construction code above.
gnbHH = GaussianNB()
gnbHL = GaussianNB()
gnbLH = GaussianNB()
gnbLL = GaussianNB()

gnbHH.fit(featH, do_longH)
gnbHL.fit(featH, do_shortH)
gnbLH.fit(featL, do_longL)
gnbLL.fit(featL, do_shortL)

# Random-forest classifiers tuned by grid search over tree depth and split
# size, scored on precision with 5-fold CV repeated 10 times.
rf = RFC(n_estimators=100)
my_cv = RepeatedKFold(n_splits=5, n_repeats=10)
params = {'max_depth' : (3,4,5,6), 'min_samples_split' : (20,30,40,50,60,70)}

clfHH = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1)
clfHL = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1)
clfLH = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1)
clfLL = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1)

clfHH.fit(featH, do_longH)
clfHL.fit(featH, do_shortH)
clfLH.fit(featL, do_longL)
clfLL.fit(featL, do_shortL)


"""
###############################################################################
Ejemplo n.º 25
0
def optimize_hp(X,
                Y,
                E,
                mode='grid',
                n_splits=3,
                n_repeats=5,
                verbose=True,
                **params):
    """
    Optimize the (hyper)parameters of a DeepSurvK model using
    cross-validation.

    Parameters
    ----------
    X: pandas DataFrame
        Data
    Y: pandas DataFrame
        It needs to have column 'T'
    E: pandas DataFrame
        It needs to have column 'E'
    mode: string
        Search strategy. Only 'grid' (default) is implemented; any other
        value raises NotImplementedError.
    n_splits: int (optional)
        Number of folds. Default value is 3, as suggested in [1].
    n_repeats: int (optional)
        Number of CV repetition. Default value is 5.
    verbose: boolean (optional)
        Define if verbose output is desired (True, default) or not (False)
    params: dictionary
        Each key corresponds to a parameter.
        The values correspond to a list of parameters to be explored.

        The number of epochs can be given as the entry `epochs`, whose
        value is a one-element list (only the first element is used).
        If not given, a default of 1000 epochs is used.

    Returns
    -------
    best_params: dictionary
        Best parameters.
        Each key corresponds to a parameter.
        The values correspond to the optimized parameter.

    Raises
    ------
    NotImplementedError
        If `mode` is anything other than 'grid'.
    ValueError
        If no hyperparameters to optimize were given.

    References
    ----------
    [1] Katzman, Jared L., et al. "DeepSurv: personalized treatment recommender system using a Cox proportional hazards deep neural network." BMC medical research methodology 18.1 (2018): 24.
    """
    # BUG FIX: `mode` was previously accepted but silently ignored (a
    # 'random' request quietly ran a grid search).  Fail explicitly.
    if mode != 'grid':
        raise NotImplementedError(
            f"mode={mode!r} is not supported; only 'grid' is implemented.")

    # The number of epochs is trained with, not optimized over: pop it from
    # the search space with a default of 1000 (first list element only).
    epochs = params.pop('epochs', [1000])[0]

    # Guard against an empty search space; the zip(*...) below would
    # otherwise fail with an obscure unpacking error.
    if not params:
        raise ValueError("No hyperparameters to optimize were given.")

    # Generating a list of dictionaries with all possible combinations.
    # Trick from https://stackoverflow.com/a/61335465/948768
    keys, values = zip(*params.items())
    params_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Compute important parameters.
    n_features = X.shape[1]
    n_combinations = len(params_list)
    if verbose:
        print(f"Optimizing {n_combinations} parameter combinations.")

    if verbose:
        started_at = datetime.datetime.now().replace(microsecond=0)
        print("Optimization started at: ", end='', flush=True)
        print(started_at.strftime("%Y-%m-%d %H:%M:%S"))

    # Per-parameter-set mean and std of the validation c-index.
    c_index_mean = []
    c_index_std = []

    # Loop through all possible parameter combinations.
    for ii, params_curr in enumerate(params_list):

        if verbose:
            print(f"Parameter set {ii+1}/{n_combinations}...")
            print(params_curr)

        # Create RepeatedKFold object (fresh folds per parameter set).
        rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

        # c-index of every fold for the current parameter set.
        c_index_param = []

        # Loop through different data partitions.
        for jj, (train_index, val_index) in enumerate(rkf.split(X, Y)):

            if verbose:
                print(f"\tIteration {jj+1}/{n_splits*n_repeats}...",
                      end='',
                      flush=True)

            # Perform data partition.
            X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
            Y_train, Y_val = Y.iloc[train_index, :], Y.iloc[val_index, :]
            E_train, E_val = E.iloc[train_index, :], E.iloc[val_index, :]

            # Create DSK model (with current loop's parameters).
            dsk = deepsurvk.DeepSurvK(n_features=n_features,
                                      E=E_train,
                                      **params_curr)
            loss = deepsurvk.negative_log_likelihood(E_train)
            dsk.compile(loss=loss)
            callbacks = deepsurvk.common_callbacks()

            # Fit model: full-batch training, no shuffling (order matters
            # for the negative log-likelihood loss).
            n_patients_train = X_train.shape[0]
            dsk.fit(X_train,
                    Y_train,
                    batch_size=n_patients_train,
                    epochs=epochs,
                    callbacks=callbacks,
                    shuffle=False)

            # Generate predictions (risk -> survival via exp(-risk)).
            Y_pred_val = np.exp(-dsk.predict(X_val))

            # Compute quality metric (c-index).
            c = deepsurvk.concordance_index(Y_val, Y_pred_val, E_val)
            c_index_param.append(c)

            if verbose:
                print(f"\tc-index = {c}")

        # nan-aware aggregation: folds that failed (NaN c-index) are ignored.
        c_index_mean.append(np.nanmean(c_index_param))
        c_index_std.append(np.nanstd(c_index_param))

    if verbose:
        ended_at = datetime.datetime.now().replace(microsecond=0)
        print("Optimization ended at: ", end='', flush=True)
        print(ended_at.strftime("%Y-%m-%d %H:%M:%S"))
        print(f"Optimization took {ended_at-started_at}")

    # Find parameter combination with highest c-index.
    c_index_mean_max = max(c_index_mean)
    idx = c_index_mean.index(c_index_mean_max)

    best_params = params_list[idx]

    return best_params
Ejemplo n.º 26
0
y = y.values
# Drop rows with missing values before converting to a plain array.
# NOTE(review): x and y are filtered independently here — if dropna()
# removes rows from x, the two arrays may no longer be aligned; verify the
# upstream cleaning.
x = x.dropna()
x = x.values

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True)
## Scale the dataset by removing mean and scaling to unit variance
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Search the regularization strength C on a log-spaced grid with 5-fold CV
# repeated 5 times; the fixed seed makes the folds reproducible.
c_values = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01]
param_grid = dict(C=c_values)
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=200889)
model = linear_model.LogisticRegression(solver="lbfgs",
                                        multi_class="multinomial")

## Based on the chosen model, create a grid to search for the optimal model
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    cv=cv,
                    scoring="accuracy",
                    n_jobs=-1)
## Get the grid results and fit to training set
grid_result = grid.fit(x_train, y_train)
print('Best C:', grid_result.best_estimator_.get_params()['C'])
print('Best model:', grid_result.best_estimator_)
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
def train_model(model, param_grid=None, X=None, y=None, splits=5, repeats=5):
    """Fit *model* with repeated K-fold CV and report its scores.

    If `param_grid` is non-empty, a grid search over it selects the best
    estimator; otherwise the given model is only cross-val-scored.  When no
    data is supplied, the project's outlier-free training data is loaded.

    Parameters
    ----------
    model : estimator
        A scikit-learn compatible regressor.
    param_grid : dict or list of dict, optional
        Grid-search parameter grid; empty/None means "no grid search".
    X, y : array-like, optional
        Training data; loaded via get_trainning_data_omitoutliers() when
        `y` is empty.
    splits : int
        Number of folds per CV repetition.
    repeats : int
        Number of CV repetitions.

    Returns
    -------
    (model, cv_score, grid_results)
        The fitted (best) model, a pandas Series with the mean/std CV MSE,
        and the grid-search results DataFrame ([] when no grid search ran).
    """
    # BUG FIX: the original used mutable default arguments ([]), which are
    # shared across calls.  Normalize None -> empty container so the
    # len()-based checks below keep their original behavior.
    if param_grid is None:
        param_grid = []
    if X is None:
        X = []
    if y is None:
        y = []

    # get unmodified training data, unless data to use already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model,
                               param_grid,
                               cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1,
                               return_train_score=True)

        # search the grid
        gsearch.fit(X, y)

        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_

        # get cv-scores for best model (abs() undoes sklearn's negated MSE)
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']

    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model,
                                     X,
                                     y,
                                     scoring="neg_mean_squared_error",
                                     cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots: predicted-vs-true, residuals, and standardized
    # residual histogram; points with |z| > 3 are flagged as outliers.
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results
from sklearn.model_selection import GridSearchCV

# Grid-search the Cox-PH regularization strength alpha, scored with the
# model's default metric (concordance index).
coxph = CoxPHSurvivalAnalysis()
grid_values = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

grid_c = GridSearchCV(coxph, param_grid=grid_values, scoring=None)
grid_c.fit(data_x, data_y)

print('Grid best parameter (max c-index): ', grid_c.best_params_)
print('Grid best score (c-index): ', grid_c.best_score_)

# Apply Cox-PH model based on 3-fold 10-repeated CV using optimal alpha selected from grid search:

from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=3, n_repeats=10,
                    random_state=0)  # 3-fold 10-repeated CV

c_index_train, c_index_test = [], []

# Refit with the best alpha on each of the 30 train folds and collect the
# c-index on both the train and test folds.
for train_index, test_index in rkf.split(data_x):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]
    coxph = CoxPHSurvivalAnalysis(
        alpha=float(grid_c.best_params_['alpha'])).fit(x_train, y_train)
    c_index_train.append(coxph.score(x_train, y_train))
    c_index_test.append(coxph.score(x_test, y_test))

print("Averaged c-index from 3-fold 10 repeated CV(training): {:.3f}".format(
    np.mean(c_index_train)))
print("Averaged c-index from 3-fold 10 repeated CV(test): {:.3f}".format(
    np.mean(c_index_test)))
from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold

# Load the iris data set; each column is accessible as df['column name'].
df = pd.read_csv(
    'C:\\Users\\MuthaNagaVenkataSaty\\Desktop\\Python DL\\Python Lesson 4\\Python_Lesson6\\iris.csv'
)

# train data
# DataFrame.as_matrix() was removed in pandas 1.0; select the feature
# columns and convert with the supported to_numpy() instead.
X = df[['sepal length', 'sepal width', 'petal length',
        'petal width']].to_numpy()

y = df['Species'].values

# 5-fold CV repeated 10 times: fit a fresh GaussianNB per split and
# record its held-out accuracy (as a percentage).
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predicted = gnb.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, predicted) * 100)

# pd.np was removed in pandas 2.0; a plain mean needs no numpy at all.
print(sum(scores) / len(scores))
Ejemplo n.º 30
0
# @Email   : [email protected]
# @File    : estimation_of_classify_model.py
# @Software: PyCharm

from sklearn.model_selection import RepeatedKFold
from sklearn import metrics
import ml_models as oldmethod_class
from esti_pretreat_method import load_and_normalized
from esti_pretreat_method import esti_NMF

# Load, normalize and NMF-reduce the features for one drug, then collect
# per-fold ROC curves for a random-forest classifier.
data, target_class = load_and_normalized(drug_id=134)
data, target_class = esti_NMF(data=data,
                              target_class=target_class,
                              for_sklearn=True)

# Fixed random_state only so the fold assignment is reproducible.
rkf = RepeatedKFold(n_splits=10, n_repeats=1,
                    random_state=1234567)
sum_fpr = []
sum_tpr = []
sum_AUC = []

for idx_tr, idx_te in rkf.split(data):
    data_train = data[idx_tr]
    data_test = data[idx_te]
    target_train = target_class[idx_tr]
    target_test = target_class[idx_te]
    model = oldmethod_class.RandomForestC(data_train, target_train)
    proba = model.predict_proba(data_test)
    # Keep only the probability of the positive class for the ROC curve.
    y_score = [row[1] for row in proba]
    fpr, tpr, thresholds = metrics.roc_curve(y_true=target_test,
                                             y_score=y_score,
                                             pos_label=1)  # class 1 = positive
    sum_fpr.append(fpr)
    sum_tpr.append(tpr)
Ejemplo n.º 31
0
    # Log transform data, note that +1 is to avoid zero values
    # dataSet.data = np.log(dataSet.data + 1).to_numpy()
    for algorithm in ALGORITHMS:
        precision_score = []
        scores_list = []
        auc_score = []
        recall_score = []
        f1_score = []
        
        ext_precision_score = []
        ext_scores_list = []
        ext_auc_score = []
        ext_recall_score = []
        ext_f1_score = []
        rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124)
        for train_index, test_index in rkf.split(dataSet.components):
            X_train, X_test, y_train, y_test = dataSet.components[train_index], dataSet.components[test_index], dataSet.target[train_index], dataSet.target[test_index]
            result = algorithm(X_train, X_test, y_train, y_test)
            precision_score.append(result.get('precision'))
            scores_list.append(result.get('accuracy'))
            auc_score.append(result.get('auc'))
            recall_score.append(result.get('recall'))
            f1_score.append(result.get('f1_measure'))
            trainedModel = result.get('algorithm')
            
            if checkExternalValidity:
                external_pred = trainedModel.predict(totalMinedData)

                ext_precision_score.append(metrics.precision_score(expectedResult, external_pred))
                try:
Ejemplo n.º 32
0
# Merge the auxiliary frame, carry 'key' over as 'Ratio', bucket it by 5.
df = pd.concat([df, tempDF], axis=1, join='inner')
df['Ratio'] = df['key']
df.drop(labels=['key'], axis=1, inplace=True)
df['Ratio'] = df['Ratio'].apply(lambda v: round(v / 5))
print(df.columns)
df.head()

# In[10]:

# Last column is the target; everything before it is a feature.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15)

# In[11]:

# Grid-search an elastic-net SGD classifier over 6-fold x 3-repeat CV.
cv = RepeatedKFold(n_splits=6, n_repeats=3)
param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': list(np.arange(0.2, 0.6, 0.05)),
}
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1;
# keep 'log' here only if an older scikit-learn is pinned.
sgd = SGDClassifier(penalty='elasticnet',
                    max_iter=100000,
                    epsilon=0.001,
                    learning_rate='optimal',
                    loss='log',
                    n_jobs=-1)
model = GridSearchCV(
    sgd,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
)
Ejemplo n.º 33
0
                    evals=watchlist,
                    early_stopping_rounds=200,
                    verbose_eval=100,
                    params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                   ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(
        xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

# Stack the lgb and xgb out-of-fold predictions as meta-features
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

# Second-level model: Bayesian ridge fit per fold on the stacked features.
for fold_, (trn_idx,
            val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack[val_idx] = clf_3.predict(val_data)
    # Average test predictions over all folds. The original divided by a
    # hard-coded 10 (= 5 splits x 2 repeats), which silently breaks if the
    # splitter parameters change; derive the count from the splitter itself.
    predictions += clf_3.predict(test_stack) / folds_stack.get_n_splits()
Ejemplo n.º 34
0
# Hidden-layer depth candidates (1..8) and empty result accumulators for
# the MLP architecture sweep below (densities, errors, PCCs, indices).
NA = list(range(1, 9))
(ltden, lsden, lterr, lserr, lterrt, lserrt,
 ltpcc, lspcc, ltpcct, lspcct, lind1, lind2) = ([] for _ in range(12))
for m in MA:
    for n in NA:
        rkf = RepeatedKFold(n_splits=4, n_repeats=6, random_state=random_state)
        if n == 1:
            clf = MLPRegressor(solver='lbfgs',
                               alpha=alpha,
                               hidden_layer_sizes=(m),
                               shuffle=False,
                               random_state=random_state)
        elif n == 2:
            clf = MLPRegressor(solver='lbfgs',
                               alpha=alpha,
                               hidden_layer_sizes=(m, m),
                               shuffle=False,
                               random_state=random_state)
        elif n == 3:
            clf = MLPRegressor(solver='lbfgs',
                               alpha=alpha,
Ejemplo n.º 35
0
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (see Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Raises
    ------
      ValueError if any group lacks the attribute named by `varname`

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  alpha is the regularization parameter. if alpha is None it will
         be set using LassoLarsCV
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # was `raise Value(...)`: Value is undefined and would itself
            # raise NameError; ValueError is what was intended here.
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        # Default CV geometry scales with the number of training groups.
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if  cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        # Estimate out-of-sample RMSE by pooling residuals across all folds.
        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    if alpha is None:
        # NOTE(review): reached only when skip_cv=True and alpha is None;
        # plain Lasso/LassoLars does not expose `alpha_` after fit, so this
        # branch would raise AttributeError — confirm whether LassoCV /
        # LassoLarsCV was intended here.
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit on all data, without cross-validation
    model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)