def test_repeated_kfold_determinstic_split():
    """RepeatedKFold.split must yield identical, deterministic folds on every call."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=258173307)

    # The four (train, test) pairs this seed must always produce, in order.
    expected_folds = [
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
        ([0, 1], [2, 3, 4]),
        ([2, 3, 4], [0, 1]),
    ]

    # Calling split() repeatedly must restart the same deterministic sequence.
    for _ in range(3):
        splits = rkf.split(X)
        for expected_train, expected_test in expected_folds:
            train, test = next(splits)
            assert_array_equal(train, expected_train)
            assert_array_equal(test, expected_test)
        # n_splits * n_repeats == 4 folds total; the generator is then exhausted.
        assert_raises(StopIteration, next, splits)
def kfold_lightgbm(important_features, train_df, test_df, num_folds, stratified = False, debug= False):
    """Two-stage pipeline: rank features with XGBoost, train two LightGBM
    k-fold ensembles, stack them with BayesianRidge, and write a submission CSV.

    NOTE(review): `debug` is accepted but never read. Relies on module-level
    names: np, pd, xgb, lgb, gc, FEATS_EXCLUDED, rmse, display_importances,
    KFold/StratifiedKFold, mean_squared_error.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Cross validation model
    # Two independent splitters with the same seed -> folds and folds1
    # generate identical partitions.
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
        folds1 = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)
        folds1 = KFold(n_splits= num_folds, shuffle=True, random_state=326)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])       # out-of-fold preds, ensemble 1
    sub_preds = np.zeros(test_df.shape[0])        # test preds, ensemble 1
    feature_importance_df = pd.DataFrame()
    feats = [f for f in important_features if f not in FEATS_EXCLUDED]
    oof_preds1 = np.zeros(train_df.shape[0])      # out-of-fold preds, ensemble 2
    sub_preds1 = np.zeros(test_df.shape[0])       # test preds, ensemble 2
    feature_importance_df1 = pd.DataFrame()
    target =train_df['target']
    # Stage 1: fit an XGBoost ranker per fold; b/c are copies of the feature
    # frames relabelled with the importances (overwritten each fold, so only
    # the last fold's version survives).
    # NOTE(review): `learn_rate` is not an XGBRegressor parameter (the real
    # name is `learning_rate`), so it is silently ignored — confirm intent.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
        model = xgb.sklearn.XGBRegressor(
            nthread=20, learn_rate=0.01, max_depth=15, min_child_weight=2,
            subsample=0.8, colsample_bytree=1, objective='rank:pairwise',
            n_estimators=300, gamma=0, reg_alpha=0, reg_lambda=1,
            max_delta_step=0, scale_pos_weight=1)
        watchlist = [(train_x, train_y), (valid_x, valid_y)]
        a=model.fit(train_x, train_y, eval_set=watchlist, eval_metric='ndcg', early_stopping_rounds=10)
        # NOTE(review): columns are renamed to the importance VALUES and
        # sort_index results are discarded (not assigned) — verify this is
        # the intended "feature reordering".
        b=train_df[feats]
        b.columns=list(a.feature_importances_)
        b.sort_index(axis=1,ascending=False)
        c=test_df[feats]
        c.columns=list(a.feature_importances_)
        c.sort_index(axis=1,ascending=False)
    # k-fold
    # Stage 2a: first LightGBM ensemble on b/c.
    # NOTE(review): nesting was ambiguous in the collapsed source; formatted
    # here at function level (after the XGBoost loop) — confirm.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(b, train_df['outliers'])):
        train_x, train_y = b.iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = b.iloc[valid_idx], train_df['target'].iloc[valid_idx]
        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)
        # params optimized by optuna
        params ={
            'task': 'train', 'boosting': 'goss', 'objective': 'regression',
            'metric': 'rmse', 'learning_rate': 0.01,
            'subsample': 0.9855232997390695, 'max_depth': 7,
            'top_rate': 0.9064148448434349, 'num_leaves': 63,
            'min_child_weight': 41.9612869171337,
            'other_rate': 0.0721768246018207, 'reg_alpha': 9.677537745007898,
            'colsample_bytree': 0.5665320670155495,
            'min_split_gain': 9.820197773625843, 'reg_lambda': 8.2532317400459,
            'min_data_in_leaf': 21, 'verbose': -1,
            # per-fold seeds so repeated folds are not identical runs
            'seed':int(2**n_fold), 'bagging_seed':int(2**n_fold), 'drop_seed':int(2**n_fold)
        }
        reg = lgb.train(
            params, lgb_train, valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'], num_boost_round=10000,
            early_stopping_rounds= 200, verbose_eval=100)
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        # average test predictions over folds
        sub_preds += reg.predict(c, num_iteration=reg.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance())
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
    # k-fold
    # Stage 2b: second LightGBM ensemble — identical data/params/seeds to 2a.
    for n_fold, (train_idx, valid_idx) in enumerate(folds1.split(b, train_df['outliers'])):
        train_x, train_y = b.iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = b.iloc[valid_idx], train_df['target'].iloc[valid_idx]
        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)
        # params optimized by optuna
        params ={
            'task': 'train', 'boosting': 'goss', 'objective': 'regression',
            'metric': 'rmse', 'learning_rate': 0.01,
            'subsample': 0.9855232997390695, 'max_depth': 7,
            'top_rate': 0.9064148448434349, 'num_leaves': 63,
            'min_child_weight': 41.9612869171337,
            'other_rate': 0.0721768246018207, 'reg_alpha': 9.677537745007898,
            'colsample_bytree': 0.5665320670155495,
            'min_split_gain': 9.820197773625843, 'reg_lambda': 8.2532317400459,
            'min_data_in_leaf': 21, 'verbose': -1,
            'seed':int(2**n_fold), 'bagging_seed':int(2**n_fold), 'drop_seed':int(2**n_fold)
        }
        reg = lgb.train(
            params, lgb_train, valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'], num_boost_round=10000,
            early_stopping_rounds= 200, verbose_eval=100)
        oof_preds1[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds1 += reg.predict(c, num_iteration=reg.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance())
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df1 = pd.concat([feature_importance_df1, fold_importance_df], axis=0)
        # NOTE(review): reports oof_preds (ensemble 1), not oof_preds1 —
        # looks like a copy-paste slip; the stored predictions are unaffected.
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
    # display importances
    display_importances(feature_importance_df)
    display_importances(feature_importance_df1)
    #stack
    # Stage 3: level-2 BayesianRidge on the two ensembles' OOF predictions.
    from sklearn.linear_model import BayesianRidge
    from sklearn.model_selection import StratifiedKFold, RepeatedKFold
    train_stack = np.vstack([oof_preds,oof_preds1]).transpose()
    test_stack = np.vstack([sub_preds, sub_preds1]).transpose()
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=1, random_state=5590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions_3 = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(val_data)
        # /5 matches the 5x1 RepeatedKFold fold count
        predictions_3 += clf_3.predict(test_stack) / 5
    print(np.sqrt(mean_squared_error(target.values, oof_stack)))
    # save submission file
    test_df.loc[:,'target'] = predictions_3
    test_df = test_df.reset_index()
    test_df[['card_id', 'target']].to_csv("F:\\zxd\\elo-merchant-category-recommendation\\submission201926_9.csv", index=False)
test_size=0.15)  # NOTE(review): tail of a train_test_split(...) call truncated before this chunk
# Standardize the feature values by computing the mean, subtracting the mean from the data points and then dividing by
# the standard deviation
scaler = StandardScaler()
trainX = scaler.fit_transform(trainX)
testX = scaler.transform(testX)  # test set reuses the train-set statistics (no leakage)
# Initialize the model and define the space of the hyperparameters to perform the grid-search over
print("[INFO] Initializing the support vector regression model...")
model = SVR()
kernel = ["linear", "rbf", "sigmoid", "poly"]
tolerance = [1e-3, 1e-4, 1e-5, 1e-6]
C = [1, 1.5, 2, 2.5, 3]
grid = dict(kernel=kernel, tol=tolerance, C=C)
# Initialize a cross-validation fold and perform a grid-search to tune the hyperparameters
print("[INFO] Grid searching over the hyperparameters...")
# 10-fold CV repeated 3 times -> 30 fits per parameter combination
cvFold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
gridSearch = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1,
                          cv=cvFold, scoring="neg_mean_squared_error")
searchResults = gridSearch.fit(trainX, trainY)
# Extract the best model and evaluate it
print("[INFO] Evaluating the model...")
bestModel = searchResults.best_estimator_
print("R2: {:.2f}".format(bestModel.score(testX, testY)))
# Repeated 10x10-fold cross-validation of an ordinary least-squares fit on
# the Boston housing data; per-fold metrics are printed and coefficients kept.
from sklearn.datasets import load_boston
from sklearn.model_selection import RepeatedKFold
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Assemble features + target into a single frame (target is the last column).
boston = load_boston()
data_boston = pd.DataFrame(boston['data'])
data_boston.columns = boston['feature_names']
data_boston['target'] = boston['target']

coef = []
intercept = []
mse = []
r2 = []

kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)
for train_index, test_index in kf.split(data_boston.iloc[:, :-1]):
    feature_frame = data_boston.iloc[:, :-1]
    target_series = data_boston.iloc[:, -1]
    train_X, train_y = feature_frame.iloc[train_index], target_series.iloc[train_index]
    test_X, test_y = feature_frame.iloc[test_index], target_series.iloc[test_index]

    regr = linear_model.LinearRegression(fit_intercept=True, normalize=True,
                                         copy_X=True, n_jobs=-1)
    regr.fit(train_X, train_y)
    predict_y = regr.predict(test_X)

    print('Coefficients: \n', regr.coef_)
    print('Mean squared error: %.2f' % mean_squared_error(test_y, predict_y))
    print('Coefficient of determination: %.2f' % r2_score(test_y, predict_y))

    coef.append(regr.coef_)
    intercept.append(regr.intercept_)
# Free the merged frame before building the model matrices.
del df_all
gc.collect()

# Feature list = every train column except the id and the target.
features = list(train.columns.values)
features.remove(id_col)
features.remove(target_col)

# Build the model
cnt = 0
p_buf = []
n_splits = 4
n_repeats = 1  # with 1 repeat this is a plain 4-fold CV
kf = RepeatedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=0)
err_buf = []
undersampling = 0

# NOTE(review): this dict literal is truncated at the chunk boundary —
# the closing brace and any remaining params are outside this view.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 8,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
y = y[filt]
#- Predictors: one-hot encode, keeping NaN as its own indicator column,
#  then apply the same row filter as the target.
X = pd.get_dummies(X, dummy_na=True)
X = X[filt]
le_y = preprocessing.LabelEncoder()
y = le_y.fit_transform(y)

#%% Initialize model
clf = MNB()

#%% Cross Validation using Stratisfied 10-Fold
# 10 folds x 10 repeats = 100 accuracy samples.
kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)
scores = []
for train_idx, test_idx in kf.split(X, y):
    #print("TRAIN:", train_idx, "TEST:", test_idx)
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_test, y_test = X.iloc[test_idx], y[test_idx]
    model = clf.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores.append(accuracy_score(y_test, predictions))
print("Model training complete!")
print('Average 10-Fold Accuracy: {}'.format(np.mean(scores)))

#%%
class_probs = []
# Layer('Rectifier', name='hidden1', units=3), # Layer('Linear', name='hidden2', units=5), Layer('Linear') ], learning_rate=0.001, n_iter=25) elif method == 'linReg': classifier = linear_model.LinearRegression() else: print('dumbass') exit() if eval: if method in ['nnreg', 'linReg', 'svr']: rkf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=random_state) for train, test in rkf.split(X, y): X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[ test] y_pred = classifier.fit(X_train, y_train).predict(X_test) print "\nResults of Linear Regression...." print "================================" print('explained variance {}'.format( explained_variance_score(y_test, y_pred))) print('mean_squared_error {}'.format( mean_squared_error(y_test, y_pred))) print('r2_score {}'.format(r2_score(y_test, y_pred))) plt.scatter(y_test.T, y_pred.T) # plt.matshow(y_pred) plt.show()
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (See Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be
         used (rounded to integer).  if cv_repeats is None,
         sqrt(len(groups))-1 will be used (rounded).
     5.  The optimal number of components may be best found from PCA. If
         set to None, a search will be done for ncomps that gives the lowest
         RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # BUG FIX: was `raise Value(...)`, which is itself a NameError;
            # the intended exception is ValueError.
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        # defaults per Note 4: folds ~ sqrt(N), repeats ~ sqrt(N) - 1
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)
    ypred = model.predict(spectra)[:, 0]
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
def main():
    """Driver: load the sonar dataset, encode, clean, PCA-reduce, then
    grid-search a Ridge regressor with repeated 10-fold CV.

    Relies on module-level helpers: label_encoder, remove_outlier,
    PCA_method, and on pd, Ridge, RepeatedKFold, GridSearchCV imports.
    """
    #Load data
    url_data = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
    data = pd.read_csv(url_data, header=None)
    print("\n\n####### 1. LOAD DATASET #######\n\n")
    print("\n\nPrint the 5 first line of data\n\n")
    print(data.head())
    print("\n\nthe information of data\n\n")
    print(data.info())
    print("\n\n Describe data\n\n")
    print(data.describe())
    print('\n\nShape of the dataset: \n\n')
    print(data.shape)
    print('=' * 80)
    ####### 2. LABEL ENCODER #######
    print("\n\n####### 2. LABEL ENCODER #######\n\n")
    #Use label encoder function
    data = label_encoder(data)
    print("\n\nthe information of data after use Label Encoder function\n\n")
    print(data.info())
    print('=' * 80)
    ####### 3. CLEAN DATA #######
    print("\n\n", "~0~0" * 27, "\n\n")
    print("\n\n####### 2. CLEAN DATA #######\n\n")
    #Perform outlier-removing
    data = remove_outlier(data)
    print('=' * 80)
    ####### 4. SPLIT DATA #######
    print("\n\n####### 4. SPLIT DATA #######\n\n")
    #Split Data: all columns but the last are features, last column is target
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    print("\n\nPrint the 5 first line of X\n\n")
    print(X.head())
    print("\n\nPrint the 5 first line of y\n\n")
    print(y.head())
    print('=' * 80)
    ####### 5. USE PRINCIPAL COMPONENT ANALYSIS (PCA) #######
    print("\n\n####### 5. USE PRINCIPAL COMPONENT ANALYSIS (PCA) #######\n\n")
    #Do Principal Component Analysis as pca
    print('Perform Principal Component Analysis')
    X_pca = PCA_method(X)
    print('\n\nShape of dataset before PCA: \n\n')
    print(X.shape)
    print('\n\nShape of dataset after PCA: \n\n')
    print(X_pca.shape)
    print('=' * 100)
    print('=' * 80)
    ####### 6. PERFORM GRIBSEARCHCV #######
    print("\n\n####### 6. PERFORM GRIBSEARCHCV #######\n\n")
    #Define model
    model = Ridge()
    #Define evaluation: 10-fold CV repeated 3 times
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #Define search space
    space = dict()
    space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
    space['alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    space['fit_intercept'] = [True, False]
    space['normalize'] = [True, False]
    #Define search
    search = GridSearchCV(model, space, scoring='neg_mean_squared_error', n_jobs=-1, cv=cv)
    result = search.fit(X_pca, y)
    #Result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)
    print('=' * 80)
# NOTE(review): unterminated triple-quote below — it opens a string/comment
# block whose closing quotes lie beyond this chunk.
'''
# Train an MLP on HOG features from one RepeatedKFold split and report a
# classification report on a separate test CSV.
from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from scipy import stats
import pandas as pd
import numpy

import training

# Regenerate the HOG feature CSVs, then drop extreme-z-score outlier rows.
training.createdata()
dataset = pd.read_csv('dir/hog.csv')
dataset = dataset[(numpy.abs(stats.zscore(dataset)) < 5.04).all(axis=1)]

# Take only the FIRST of the 5x30 repeated folds.
random_state = 12883823
rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=random_state)
result = next(rkf.split(dataset), None)
data_train = dataset.iloc[result[0]]
data_test = dataset.iloc[result[1]]

# Columns 0 and 3780 are the inputs; column 3781 holds the label.
data = data_train.iloc[:, [0, 3780]]
target = data_train.iloc[:, [3781]]

classifier = MLPClassifier(random_state=30,
                           hidden_layer_sizes=8,
                           learning_rate_init=0.1,
                           momentum=0.9)
classifier.fit(data, target)

# Evaluate against the held-out test CSV (not the CV test fold above).
dataset_test = pd.read_csv('dir/test_hog.csv')
predicted = classifier.predict(dataset_test.iloc[:, [0, 3780]])
print(metrics.classification_report(dataset_test.iloc[:, [3781]], predicted))
# We can check the coefficient variability through cross-validation:
# it is a form of data perturbation (related to
# `resampling <https://en.wikipedia.org/wiki/Resampling_(statistics)>`_).
#
# If coefficients vary significantly when changing the input dataset
# their robustness is not guaranteed, and they should probably be interpreted
# with caution.

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold

# 5x5 repeated K-fold; keep each fitted estimator to inspect its coefficients.
cv_strategy = RepeatedKFold(n_splits=5, n_repeats=5)
cv_model = cross_validate(
    model,
    X,
    y,
    cv=cv_strategy,
    return_estimator=True,
    n_jobs=-1,
)

# Scale each fold's coefficients by the feature std so they are comparable.
feature_std = X_train_preprocessed.std(axis=0)
scaled_rows = []
for est in cv_model["estimator"]:
    regressor = est.named_steps["transformedtargetregressor"].regressor_
    scaled_rows.append(regressor.coef_ * feature_std)
coefs = pd.DataFrame(scaled_rows, columns=feature_names)

# One strip+box plot per feature, centred on zero for easy sign reading.
plt.figure(figsize=(9, 7))
sns.stripplot(data=coefs, orient="h", color="k", alpha=0.5)
sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5)
plt.axvline(x=0, color=".5")
plt.xlabel("Coefficient importance")
df.head()

# In[9]:

# No standardizing nor normalization needed/possible
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.15)

# In[10]:

# 7-fold CV repeated twice for the alpha search.
cv = RepeatedKFold(n_splits=7, n_repeats = 2)
# BUG FIX: np.arange(0, 5, 0.01) included alpha=0.0, which RidgeCV rejects
# (regularization strengths must be strictly positive). Start the grid at
# 0.01 instead; the rest of the search space is unchanged.
ridge = RidgeCV(alphas= np.arange(0.01, 5, 0.01), cv = cv, scoring='r2')
ridge.fit(xtrain, ytrain)

# In[11]:

# Held-out R2 plus the (optimistic) train-set R2 for comparison.
ypred = ridge.predict(xtest)
r2 = r2_score(ytest, ypred)
print('R2 Score: ', r2)
score = ridge.score(xtrain, ytrain)
print('R Squared: ', score)

# In[12]:
def test_get_n_splits_for_repeated_kfold():
    """get_n_splits must report n_splits * n_repeats total iterations."""
    splits, repeats = 3, 4
    rkf = RepeatedKFold(splits, repeats)
    # 3 folds x 4 repeats = 12 iterations in total.
    assert_equal(splits * repeats, rkf.get_n_splits())
# Fragment: fills Xs_train with the selected feature columns, then scores a
# (pre-built) classifier `clf` with repeated K-fold for several fold counts.
# External names: fi, Xs_train, X_all_feats, nrep, _N (numpy alias), filtdat,
# clf, y, datinds, iu, use_features_dmp — all defined outside this view.
if len(use_features) > 0:
    # copy each selected column of X_all_feats into the next slot of Xs_train
    for i_feat_indx in use_features:
        fi += 1
        Xs_train[:, fi] = X_all_feats[:, i_feat_indx]
    # NOTE(review): collapsed source made the nesting ambiguous; the ns-loop
    # is formatted inside the `if` here — confirm against the original.
    for ns in [3, 4, 5]:
        xs = 0.22 * _N.random.randn(ns * nrep)  # jitter values (plotting aid?) — TODO confirm use
        coefsLR = _N.empty((nrep * ns, len(use_features)))
        #test_sz = ns*(len(filtdat)//ns)-(ns-1)*(len(filtdat)//ns)
        # largest test-fold size: ceil(len(filtdat) / ns)
        test_sz = len(filtdat) // ns + 1 if len(
            filtdat) % ns != 0 else len(filtdat) // ns
        print("test_sz %d" % test_sz)
        obs_v_preds = _N.zeros((nrep * ns, test_sz, 2))  # [:, :, 0]=observed, [:, :, 1]=predicted
        scoresLR = _N.empty(nrep * ns)
        rkf = RepeatedKFold(n_splits=ns, n_repeats=nrep)  #, random_state=0)
        iii = -1
        for train, test in rkf.split(datinds):
            iii += 1
            clf_f = clf.fit(Xs_train[train], y[train])
            scoresLR[iii] = clf_f.score(Xs_train[test], y[test])
            coefsLR[iii] = clf_f.coef_
            obs_v_preds[iii, 0:len(test), 0] = y[test]
            obs_v_preds[iii, 0:len(test), 1] = clf_f.predict(Xs_train[test])
        # mean coefficient vector across all folds/repeats, keyed by
        # threshold index (iu) and fold count (ns)
        use_features_dmp["weights_thresh%(t)d_fld%(f)d" % {
            "t": iu,
            "f": ns
        }] = _N.mean(coefsLR, axis=0)
# (iteration count, CSV path) pairs for the variability measurements.
ITERATIONS_DATA_FILE_PATHS = [
    (5, 'resources/variabilities_5_iterations.csv'),
    (10, 'resources/variabilities_10_iterations.csv'),
    (20, 'resources/variabilities_20_iterations.csv'),
    (30, 'resources/variabilities_30_iterations.csv'),
]

RESULTS_OUTPUT_CSV_FILE_PATH = 'resources/output/classification_results.csv'
FEATURES_OUTPUT_CSV_FILE_PATH = 'resources/output/classification_features.csv'

RANDOM_SEED = 42
# 10 folds x 30 repeats = 300 evaluations per model.
CROSS_VALIDATION_FOLDS = 10
CROSS_VALIDATION_REPETITIONS = 30
CROSS_VALIDATION_GENERATOR = RepeatedKFold(
    n_splits=CROSS_VALIDATION_FOLDS,
    n_repeats=CROSS_VALIDATION_REPETITIONS,
    random_state=RANDOM_SEED,
)
TOTAL_CROSS_VALIDATION_FOLDS = CROSS_VALIDATION_FOLDS * CROSS_VALIDATION_REPETITIONS

# Target columns predicted by the classifiers.
DEPENDENT_VARIABLES = [
    'rciw99',
    'rciw99mjhd',
    'rmadhd',
]

# NOTE(review): this list is truncated at the chunk boundary — the closing
# bracket and any further thresholds are outside this view.
BINARY_CLASSIFICATION_THRESHOLDS = [
    1,
    3,
    5,
    10,
# first 120 days, select view count or watch time as dependent variable daily_attention = [dailywatch, dailyview][use_view][:age] daily_share = dailyshare[:age] if len(daily_attention) == age and len(daily_share) == age: attention_data.append(daily_attention) share_data.append(daily_share) vid_array.append(vid) # convert to ndarray attention_data = np.array(attention_data) share_data = np.array(share_data) vid_array = np.array(vid_array) # == == == == == == == == Part 4: Forecast future attention == == == == == == == == # # 10-repeated 10-fold cross validation rkf = RepeatedKFold(n_splits=10, n_repeats=10) fold_idx = 0 for train_cv_idx, test_idx in rkf.split(vid_array): fold_idx += 1 print('>>> Forecast on fold: {0}'.format(fold_idx)) # == == == == == == == == Part 5: Split cv subset to select best alpha value == == == == == == == == # train_idx, cv_idx = train_test_split(train_cv_idx, test_size=0.1) # grid search best alpha value over -4 to 4 in log space alpha_array = [10 ** t for t in range(-4, 5)] cv_mse = [] for alpha in alpha_array: # == == == == == == == == Part 6: Training with Ridge Regression == == == == == == == == # cv_predict = forecast_future_attention(train_idx, cv_idx, alpha)
# Fit a decision tree (per CV fold) and an AdaBoost regressor (on all data)
# to the years-of-experience -> salary dataset.
import numpy
import pandas
import matplotlib.pyplot as plot
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedKFold

dataset = pandas.read_csv('salaryData.csv')
x = dataset['YearsExperience'].values
y = dataset['Salary'].values
# sklearn expects 2-D feature/target arrays: one column each.
X = x.reshape(len(x), 1)
Y = y.reshape(len(y), 1)

# Single repeat of a 2-fold split; the loop leaves the LAST fold's
# train/test arrays bound for use below.
kf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=200)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X):
    xTrain, xTest = X[train_index], X[test_index]
    yTrain, yTest = Y[train_index], Y[test_index]
    regressor = DecisionTreeRegressor()
    regressor.fit(xTrain, yTrain)

# AdaBoost is trained on the full dataset (not per fold).
regr = AdaBoostRegressor()
regr.fit(X, y)

# Predict on the last fold's test split with both models.
yPrediction = regressor.predict(xTest)
yPred = regr.predict(xTest)
# NOTE(review): the first two lines are the tail of a multi-line
# `from sklearn.model_selection import (...)` begun before this chunk.
    cross_val_score,
)
from sklearn.pipeline import Pipeline
from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.transformations.panel.pca import PCATransformer
from sktime.utils._testing.estimator_checks import _make_args

# Panel-data generation configs: same shape, numpy vs. nested-DataFrame.
DATA_ARGS = [
    {"return_numpy": True, "n_columns": 2},
    {"return_numpy": False, "n_columns": 2},
]

# StratifiedGroupKFold(n_splits=2), , removed, not available in sklearn 0.24
# Every sklearn splitter the compatibility tests exercise.
CROSS_VALIDATION_METHODS = [
    KFold(n_splits=2),
    RepeatedKFold(n_splits=2, n_repeats=2),
    LeaveOneOut(),
    LeavePOut(p=5),
    ShuffleSplit(n_splits=2, test_size=0.25),
    StratifiedKFold(n_splits=2),
    StratifiedShuffleSplit(n_splits=2, test_size=0.25),
    GroupKFold(n_splits=2),
    LeavePGroupsOut(n_groups=5),
    GroupShuffleSplit(n_splits=2, test_size=0.25),
    TimeSeriesSplit(n_splits=2),
]

# NOTE(review): list truncated at the chunk boundary — closing bracket is
# outside this view. These are search classes (not instances).
PARAMETER_TUNING_METHODS = [
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    HalvingRandomSearchCV,
def train(train_path, test_path, output_path):
    """End-to-end pipeline for a chemical-yield regression competition:
    clean + feature-engineer, train 5-fold LightGBM and XGBoost ensembles,
    stack them with an XGBRegressor over RepeatedKFold, and write the
    submission CSV to `output_path`.

    Column names are Chinese: u'样本id' = sample id, u'收率' = yield (target).
    Relies on module-level helpers/imports: timeTranSecond, getDuration,
    lgbFeval, xgbFeval, pd, np, lgb, xgb, sparse, OneHotEncoder, KFold,
    RepeatedKFold, mean_squared_error.
    """
    train = pd.read_csv(train_path, encoding='gb18030')
    test = pd.read_csv(test_path, encoding='gb18030')
    test_id = test[u'样本id']
    # drop columns known to be unhelpful
    for df in [train, test]:
        df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)
    # drop near-constant columns (dominant value > 90%)
    good_cols = list(train.columns)
    for col in train.columns:
        rate = train[col].value_counts(normalize=True, dropna=False).values[0]
        if rate > 0.9:
            good_cols.remove(col)
    # filter out outlier rows by yield / B14 / A6 ranges
    train = train[(train[u'收率'] > 0.87) & (train['B14'] > 40) & (train['A6'] < 50)]
    # keep A1/A3/A4 even if near-constant (used for ratio features below)
    good_cols.append('A1')
    good_cols.append('A3')
    good_cols.append('A4')
    good_cols.remove('sample_id')
    train = train[good_cols]
    good_cols.remove(u'收率')
    test = test[good_cols]
    target = train[u'收率']
    del train[u'收率']
    # engineer features on train+test together, then re-split by row count
    data = pd.concat([train, test], axis=0, ignore_index=True)
    data = data.fillna(-1)
    # convert time-of-day strings to seconds; skip columns that fail
    for f in ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']:
        try:
            data[f] = data[f].apply(timeTranSecond)
        except:
            continue
    # convert time-range strings to durations
    for f in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:
        data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)
    categorical_columns = [f for f in data.columns]
    # NOTE(review): since categorical_columns holds every column, this
    # comprehension is always empty; numerical_columns only gains the
    # engineered names appended below — confirm that is intended.
    numerical_columns = [
        f for f in data.columns if f not in categorical_columns
    ]
    # B14 summary statistics broadcast to every row, plus its square
    for f in ['B14']:
        data[f + '_median'] = data[f].median()
        data[f + '_std'] = data[f].std()
        data[f + '_max'] = data[f].max()
        data[f + '_min'] = data[f].min()
        data[f + '**2'] = data[f]**2
    # B14 vs. the A1+A3+A4+A19+B1+B12 aggregate: ratio / sum / product
    data['b14/a1_a3_a4_a19_b1_b12'] = data['B14'] / (data['A1'] + data['A3'] +
                                                     data['A4'] + data['A19'] +
                                                     data['B1'] + data['B12'])
    data['b14_a1_a3_a4_a19_b1_b12'] = data['B14'] + data['A1'] + data[
        'A3'] + data['A4'] + data['A19'] + data['B1'] + data['B12']
    data['b14*a1_a3_a4_a19_b1_b12'] = data['B14'] * (data['A1'] + data['A3'] +
                                                     data['A4'] + data['A19'] +
                                                     data['B1'] + data['B12'])
    numerical_columns.append('b14/a1_a3_a4_a19_b1_b12')
    numerical_columns.append('b14_a1_a3_a4_a19_b1_b12')
    numerical_columns.append('b14*a1_a3_a4_a19_b1_b12')
    # pairwise interaction features
    data['b14*b12'] = data['B14'] * data['B12']
    numerical_columns.append('b14*b12')
    data['b14/b1'] = data['B14'] / data['B1']
    numerical_columns.append('b14/b1')
    data['b14*a19'] = data['B14'] * data['A19']
    numerical_columns.append('b14*a19')
    data['b14/a4'] = data['B14'] / data['A4']
    numerical_columns.append('b14/a4')
    data['b14+a4'] = data['B14'] + data['A4']
    numerical_columns.append('b14+a4')
    data['B11*B14'] = data['B11'] * data['B14']
    numerical_columns.append('B11*B14')
    data['A7*A8'] = data['A7'] * data['A8']
    numerical_columns.append('A7*A8')
    data['A9*A10'] = data['A10'] * data['A9']
    numerical_columns.append('A9*A10')
    data['A10*A11'] = data['A10'] * data['A11']
    numerical_columns.append('A10*A11')
    data['A16*A17'] = data['A16'] * data['A17']
    numerical_columns.append('A16*A17')
    data['A25*A26'] = data['A25'] * data['A26']
    numerical_columns.append('A25*A26')
    data['B10*B11'] = data['B10'] * data['B11']
    numerical_columns.append('B10*B11')
    data['B12*B14'] = data['B12'] * data['B14']
    numerical_columns.append('B12*B14')
    data['A5*A7'] = data['A5'] * data['A7']
    numerical_columns.append('A5*A7')
    data['A9*A11'] = data['A9'] * data['A11']
    numerical_columns.append('A9*A11')
    data['A19*A21'] = data['A19'] * data['A21']
    numerical_columns.append('A19*A21')
    data['B8*B10'] = data['B8'] * data['B10']
    numerical_columns.append('B8*B10')
    data['B10*B12'] = data['B10'] * data['B12']
    numerical_columns.append('B10*B12')
    data['A11*A14'] = data['A11'] * data['A14']
    numerical_columns.append('A11*A14')
    data['A12*A15'] = data['A12'] * data['A15']
    numerical_columns.append('A12*A15')
    data['A11*A15'] = data['A11'] * data['A15']
    numerical_columns.append('A11*A15')
    data['A16*A19'] = data['A16'] * data['A19']
    numerical_columns.append('A16*A19')
    data['A19*A22'] = data['A19'] * data['A22']
    numerical_columns.append('A19*A22')
    # raw A1/A3/A4 were only needed for the aggregates above
    del data['A1']
    del data['A3']
    del data['A4']
    categorical_columns.remove('A1')
    categorical_columns.remove('A3')
    categorical_columns.remove('A4')
    # integer-encode every categorical column
    for f in categorical_columns:
        data[f] = data[f].map(
            dict(zip(data[f].unique(), range(0, data[f].nunique()))))
    train = data[:train.shape[0]]
    test = data[train.shape[0]:]
    # bin the target into 5 quantile-free bands and one-hot them; used only
    # to build B14-keyed mean-encoding features, then dropped
    train['target'] = target
    train['intTarget'] = pd.cut(train['target'], 5, labels=False)
    train = pd.get_dummies(train, columns=['intTarget'])
    li = [
        'intTarget_0.0', 'intTarget_1.0', 'intTarget_2.0', 'intTarget_3.0',
        'intTarget_4.0'
    ]
    mean_columns = []
    for f1 in categorical_columns:
        cate_rate = train[f1].value_counts(normalize=True,
                                           dropna=False).values[0]
        if cate_rate < 0.90:
            for f2 in li:
                col_name = 'B14_to_' + f1 + "_" + f2 + '_mean'
                mean_columns.append(col_name)
                # mean of each target-band indicator per f1 level,
                # looked up via B14 values
                order_label = train.groupby([f1])[f2].mean()
                train[col_name] = train['B14'].map(order_label)
                # drop the feature entirely if the mapping left any NaNs
                miss_rate = train[col_name].isnull().sum(
                ) * 100 / train[col_name].shape[0]
                if miss_rate > 0:
                    train = train.drop([col_name], axis=1)
                    mean_columns.remove(col_name)
                else:
                    test[col_name] = test['B14'].map(order_label)
    train.drop(li + ['target'], axis=1, inplace=True)
    X_train = train[mean_columns + numerical_columns].values
    X_test = test[mean_columns + numerical_columns].values
    # one hot
    for f in categorical_columns:
        enc = OneHotEncoder() if False else enc  # NOTE(review): see next line
        enc.fit(data[f].values.reshape(-1, 1))
        X_train = sparse.hstack(
            (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
        X_test = sparse.hstack(
            (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
    y_train = target.values
    # LightGBM ensemble: 5-fold, OOF predictions + fold-averaged test preds
    param = {
        'num_leaves': 120,
        'min_data_in_leaf': 30,
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.01,
        "min_child_samples": 30,
        "boosting": "gbdt",
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "bagging_seed": 11,
        "metric": 'mse',
        "lambda_l1": 0.1,
        "verbosity": -1
    }
    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round,
                        feval=lgbFeval,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=200,
                        early_stopping_rounds=100)
        oof_lgb[val_idx] = clf.predict(X_train[val_idx],
                                       num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits
    # XGBoost ensemble: same 5 folds, same OOF/test-average scheme
    xgb_params = {
        'eta': 0.005,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': True,
        'nthread': 16
    }
    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_xgb = np.zeros(len(train))
    predictions_xgb = np.zeros(len(test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data,
                        num_boost_round=20000,
                        feval=xgbFeval,
                        evals=watchlist,
                        early_stopping_rounds=200,
                        verbose_eval=100,
                        params=xgb_params)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                       ntree_limit=clf.best_ntree_limit)
        predictions_xgb += clf.predict(
            xgb.DMatrix(X_test),
            ntree_limit=clf.best_ntree_limit) / folds.n_splits
    # Level-2 stacking: XGBRegressor on [lgb, xgb] OOF columns,
    # 5x2 RepeatedKFold => 10 models averaged (hence the /10 below)
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack_xgb = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
        clf_3 = xgb.XGBRegressor()
        clf_3.fit(trn_data, trn_y)
        oof_stack_xgb[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10
    # competition metric is MSE/2
    print("LGB score: {}".format((mean_squared_error(oof_lgb, target) * 0.5)))
    print("XGB score: {}".format((mean_squared_error(oof_xgb, target) * 0.5)))
    print("STACK score: {}".format(
        (mean_squared_error(target.values, oof_stack_xgb) * 0.5)))
    # submission: (id, prediction rounded to 3 decimals), no header
    sub_df = pd.DataFrame()
    sub_df[0] = test_id
    sub_df[1] = predictions
    sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))
    sub_df.to_csv(output_path, index=False, header=None)
def run(task, n_splits, n_repeats):
    """
    Train the specified classifier on the node embedding.

    This function is called in parallel on multiple `tasks`. The results of
    the training are saved to disk in the `results` subfolder.

    Parameters
    ----------
    task : tuple
        The first entry is a dict with the keys `node_embeddings`,
        `embedding_name`, `node_labels` and `distinct_node_labels`;
        the second entry is the name of the classifier.
    n_splits : int
        Number of folds per K-fold repetition.
    n_repeats : int
        Number of K-fold repetitions.

    Raises
    ------
    ValueError
        If the classifier name is not one of the supported ones.
    """
    embedding, classifier = task
    print(f"Start of {classifier} classifier")

    # Dispatch on the classifier name; fail loudly on an unknown name
    # instead of hitting an UnboundLocalError on `fn` below.
    if classifier == "adaboost":
        fn = adaboost
    elif classifier == "decision_tree":
        fn = decision_tree
    elif classifier == "neural_network":
        fn = neural_network
    elif classifier == "random_forest":
        fn = random_forest
    else:
        raise ValueError(f"Unknown classifier: {classifier!r}")

    scores = {
        "train_score_accuracy": list(),
        "test_score_accuracy": list(),
        "train_score_f1_micro": list(),
        "test_score_f1_micro": list(),
        "train_score_f1_macro": list(),
        "test_score_f1_macro": list(),
        "test_predictions": list()
    }

    # BUG FIX: split the embedding passed in via `task`, not a global
    # `node_embeddings` variable.
    rkf = RepeatedKFold(n_splits=n_splits,
                        n_repeats=n_repeats).split(embedding["node_embeddings"])
    for train_index, test_index in rkf:
        train_pred, test_pred = fn(
            embedding["node_embeddings"][train_index],
            embedding["node_embeddings"][test_index],
            embedding["node_labels"][train_index],
            embedding["node_labels"][test_index])
        scores["train_score_accuracy"].append(
            accuracy_score(embedding["node_labels"][train_index], train_pred))
        scores["test_score_accuracy"].append(
            accuracy_score(embedding["node_labels"][test_index], test_pred))
        scores["train_score_f1_micro"].append(
            f1_score(embedding["node_labels"][train_index], train_pred,
                     average="micro"))
        scores["test_score_f1_micro"].append(
            f1_score(embedding["node_labels"][test_index], test_pred,
                     average="micro"))
        scores["train_score_f1_macro"].append(
            f1_score(embedding["node_labels"][train_index], train_pred,
                     average="macro"))
        scores["test_score_f1_macro"].append(
            f1_score(embedding["node_labels"][test_index], test_pred,
                     average="macro"))
        # Keep per-node predictions keyed by the original node index.
        scores["test_predictions"].append(
            {index: pred for index, pred in zip(test_index, test_pred)})

    # Results go to results/<classifier>/<embedding name>[_one_hidden].txt
    # next to this source file.
    filename = os.path.dirname(
        os.path.abspath(__file__)
    ) + "/results/" + classifier + "/" + embedding["embedding_name"]
    if classifier == "neural_network":
        filename += "_one_hidden"
    save_classification_results(filename + ".txt", scores,
                                embedding["distinct_node_labels"],
                                n_splits, n_repeats)
oof_lgb = np.array(pd.read_csv(tree_data_path + 'lgb_train.csv')['price']) # 导入树模型cab预测数据,进行二层stacking输出 predictions_cb = np.array( pd.read_csv(tree_data_path + 'cab_test.csv')['price']) oof_cb = np.array(pd.read_csv(tree_data_path + 'cab_train.csv')['price']) # 读取price,对验证集进行评估 Train_data = pd.read_csv(tree_data_path + 'train_tree.csv', sep=' ') TestA_data = pd.read_csv(tree_data_path + 'text_tree.csv', sep=' ') Y_data = Train_data['price'] train_stack = np.vstack([oof_lgb, oof_cb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_cb]).transpose() print(train_stack) folds_stack = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2018) tree_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) # 二层贝叶斯回归stack for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, Y_data)): print("fold {}".format(fold_)) trn_data, trn_y = train_stack[trn_idx], Y_data[trn_idx] val_data, val_y = train_stack[val_idx], Y_data[val_idx] Bayes = linear_model.BayesianRidge() Bayes.fit(trn_data, trn_y) tree_stack[val_idx] = Bayes.predict(val_data) predictions += Bayes.predict(test_stack) / 20
def best_nclust(self, data, iter_cv=1, strat_vect=None):
    """
    This method takes as input the training dataset and the stratification
    vector (if available) and performs a (repeated) CV procedure to select
    the best number of clusters that minimizes normalized stability.

    :param data: training dataset.
    :type data: ndarray, (n_samples, n_features)
    :param iter_cv: number of iteration for repeated CV, default 1.
    :type iter_cv: integer
    :param strat_vect: vector for stratification, defaults to None.
    :type strat_vect: ndarray, (n_samples,)
    :return: CV metrics for training and validation sets, best number of
        clusters, misclassification errors at each CV iteration.
    :rtype: dictionary, int, (list) if n_clusters parameter is not available
    """
    data_array = np.array(data)
    reval = RelativeValidation(self.class_method, self.clust_method, self.nrand)
    # Stratified repeated CV when a stratification vector is given,
    # plain repeated CV otherwise (fixed seed for reproducibility).
    if strat_vect is not None:
        kfold = RepeatedStratifiedKFold(n_splits=self.nfold,
                                        n_repeats=iter_cv,
                                        random_state=42)
    else:
        kfold = RepeatedKFold(n_splits=self.nfold,
                              n_repeats=iter_cv,
                              random_state=42)
    fold_gen = kfold.split(data_array, strat_vect)
    # Build the argument tuples for _fit: (data, fold-indices[, n_clusters]).
    # The cluster-count dimension is only added when the clustering method
    # actually exposes an `n_clusters` parameter.
    if self.nclust_range is not None and 'n_clusters' in self.clust_method.get_params().keys():
        params = list(itertools.product([(data_array, reval)],
                                        fold_gen, self.nclust_range))
    else:
        params = list(itertools.product([(data_array, reval)], fold_gen))
    # Run the folds either in a process pool or sequentially.
    if self.n_jobs > 1:
        p = mp.Pool(processes=self.n_jobs)
        miscl = list(zip(*p.starmap(self._fit, params)))
        p.close()
        p.join()
        out = list(zip(*miscl))
    else:
        miscl = []
        for p in params:
            if len(p) > 2:
                miscl.append(self._fit(data_obj=p[0], idxs=p[1], ncl=p[2]))
            else:
                miscl.append(self._fit(data_obj=p[0], idxs=p[1]))
        out = miscl
    # return dataframe attribute (cv_results_) with cv scores
    # If no point are labeled (e.g., all points assigned to -1 class by HDBSCAN)
    # the method returns
    cv_results_ = pd.DataFrame(out,
                               columns=['ncl', 'ms_tr', 'ms_val',
                                        'tr_labels', 'val_labels'])
    ctrl_rows = cv_results_.shape[0]
    # Drop folds where clustering failed (NaN rows); bail out entirely when
    # every fold failed.
    cv_results_.dropna(axis=0, inplace=True)
    if 0 < ctrl_rows - cv_results_.shape[0] < ctrl_rows:
        logging.info("Dropped results where clustering algorithm failed to identify clusters.")
        # NOTE(review): this assigns to the CLASS attribute, so the last call
        # wins across all instances — confirm that is intended.
        FindBestClustCV.cv_results_ = cv_results_
    elif ctrl_rows - cv_results_.shape[0] == 0:
        FindBestClustCV.cv_results_ = cv_results_
    else:
        logging.info(f"{self.clust_method} was not able to identify any cluster. Failed run with "
                     f"{self.class_method}.")
        return None
    # Mean misclassification (normalized stability) and confidence interval,
    # per candidate number of clusters, for train and validation.
    metrics = {'train': {}, 'val': {}}
    for ncl in cv_results_.ncl.unique():
        norm_stab_tr = cv_results_.loc[cv_results_.ncl == ncl]['ms_tr']
        norm_stab_val = cv_results_.loc[cv_results_.ncl == ncl]['ms_val']
        metrics['train'][ncl] = (np.mean(norm_stab_tr), _confint(norm_stab_tr))
        metrics['val'][ncl] = (np.mean(norm_stab_val), _confint(norm_stab_val))
    val_score = np.array([val[0] for val in metrics['val'].values()])
    bestscore = min(val_score)
    # select the cluster with the minimum misclassification error
    # and the maximum number of clusters
    if self.nclust_range is not None and 'n_clusters' in self.clust_method.get_params().keys():
        bestncl = self.nclust_range[np.flatnonzero(val_score == bestscore)[-1]]
        return metrics, bestncl
    else:
        bestncl = list(metrics['val'].keys())[np.flatnonzero(val_score == bestscore)[-1]]
        # Recover the train/validation index split and the labels of the best
        # fold, then order the labels by sample index.
        best_idx = cv_results_.loc[cv_results_.ncl == bestncl].ms_val.idxmin()
        idx_vect = np.concatenate((params[best_idx][1][-2],
                                   params[best_idx][1][-1]))
        label_vect = np.concatenate((out[best_idx][-2], out[best_idx][-1]))
        tr_lab = [lab for _, lab in sorted(zip(idx_vect, label_vect))]
        return metrics, bestncl, tr_lab
mae_gen = [] #MDAE for mean mdae_ds = [] mdae_clus = [] mdae_gen = [] #EVS for mean evs_ds = [] evs_clus = [] evs_gen = [] #R2 for mean r2_ds = [] r2_clus = [] r2_gen = [] p = 0 #Repeated K Fold Cross Validation for tr_i, ts_i in rkf.split(ds): print(i, c, p) p += 1 train, test = ds.iloc[tr_i], ds.iloc[ts_i] l = list(test['Index']) train_ds_x = train.drop(columns=['Index', 'District', 'Rainfall']) test_ds_x = test.drop(columns=['Index', 'District', 'Rainfall']) test_ds_y = test['Rainfall'] train_ds_y = train['Rainfall'] clus_tr = clus_ds[~clus_ds['Index'].isin(l)] clus_ds_n = clus_ds_n.append(clus_tr) clus_ts = clus_ds[clus_ds['Index'].isin(l)] gen_tr = gen_ds[~gen_ds['Index'].isin(l)] gen_ds_n = gen_ds_n.append(gen_tr) gen_ts = gen_ds[gen_ds['Index'].isin(l)] print(
vv feature transform vv ############################################################################### """ gnbHH = GaussianNB() gnbHL = GaussianNB() gnbLH = GaussianNB() gnbLL = GaussianNB() gnbHH.fit(featH, do_longH) gnbHL.fit(featH, do_shortH) gnbLH.fit(featL, do_longL) gnbLL.fit(featL, do_shortL) rf = RFC(n_estimators=100) my_cv = RepeatedKFold(n_splits=5, n_repeats=10) params = {'max_depth' : (3,4,5,6), 'min_samples_split' : (20,30,40,50,60,70)} clfHH = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1) clfHL = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1) clfLH = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1) clfLL = GridSearchCV(rf, params, cv=my_cv, scoring='precision', n_jobs=-1) clfHH.fit(featH, do_longH) clfHL.fit(featH, do_shortH) clfLH.fit(featL, do_longL) clfLL.fit(featL, do_shortL) """ ###############################################################################
def optimize_hp(X, Y, E, mode='grid', n_splits=3, n_repeats=5, verbose=True, **params):
    """
    Optimize the (hyper)parameters of a DeepSurvK model using cross-validation.

    Parameters
    ----------
    X: pandas DataFrame
        Data
    Y: pandas DataFrame
        It needs to have column 'T'
    E: pandas DataFrame
        It needs to have column 'E'
    mode: string
        Possible values are:
        'grid' (default)
        'random' TODO
    n_splits: int (optional)
        Number of folds. Default value is 3, as suggested in [1].
    n_repeats: int (optional)
        Number of CV repetition. Default value is 5.
    verbose: boolean (optional)
        Define if verbose output is desired (True, default) or not (False)
    params: dictionary
        Each key corresponds to a parameter. The values correspond to a
        list of parameters to be explored. The number of epochs can be
        defined here. It should also be given as an entry of the dictionary
        with key `epochs` and value a list comprised of only one element.
        If the list has more than one element, only the first one will be
        considered. If number of epochs isn't defined by the user, then a
        default of 1000 will be used.

    Returns
    -------
    best_params: dictionary
        Best parameters. Each key corresponds to a parameter. The values
        correspond to the optimized parameter.

    References
    ----------
    [1] Katzman, Jared L., et al. "DeepSurv: personalized treatment
    recommender system using a Cox proportional hazards deep neural
    network." BMC medical research methodology 18.1 (2018): 24.
    """
    # Check if number of epochs was defined.
    if 'epochs' in params:
        # If yes, extract its value (and remove it from the dictionary,
        # since it won't be optimized).
        epochs = params['epochs'][0]
        params.pop('epochs')
    else:
        # If not, set a default value of 1000.
        epochs = 1000

    # Generating a list of dictionaries with all possible combinations.
    # Trick from https://stackoverflow.com/a/61335465/948768
    keys, values = zip(*params.items())
    params_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Compute important parameters.
    n_features = X.shape[1]
    n_combinations = len(params_list)
    if verbose:
        print(f"Optimizing {n_combinations} parameter combinations.")
    if verbose:
        started_at = datetime.datetime.now().replace(microsecond=0)
        print("Optimization started at: ", end='', flush=True)
        print(started_at.strftime("%Y-%m-%d %H:%M:%S"))

    # Initialize important variables.
    c_index_mean = []
    c_index_std = []

    # Loop through all possible parameter combinations.
    for ii, params_curr in enumerate(params_list):
        if verbose:
            print(f"Parameter set {ii+1}/{n_combinations}...")
            print(params_curr)

        # Create RepeatedKFold object.
        # NOTE(review): no random_state, so every parameter set is scored on
        # DIFFERENT splits — seeding this would make the comparison fairer.
        rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

        # To store results.
        c_index_param = []

        # Loop through different data partitions.
        for jj, (train_index, val_index) in enumerate(rkf.split(X, Y)):
            if verbose:
                print(f"\tIteration {jj+1}/{n_splits*n_repeats}...", end='', flush=True)

            # Perform data partition.
            X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
            Y_train, Y_val = Y.iloc[train_index, :], Y.iloc[val_index, :]
            E_train, E_val = E.iloc[train_index, :], E.iloc[val_index, :]

            # Create DSK model (with current loop's parameters)
            dsk = deepsurvk.DeepSurvK(n_features=n_features,
                                      E=E_train,
                                      **params_curr)
            loss = deepsurvk.negative_log_likelihood(E_train)
            dsk.compile(loss=loss)
            callbacks = deepsurvk.common_callbacks()

            # Fit model. Batch size is the whole training partition
            # (full-batch training) and shuffling is disabled.
            n_patients_train = X_train.shape[0]
            dsk.fit(X_train, Y_train,
                    batch_size=n_patients_train,
                    epochs=epochs,
                    callbacks=callbacks,
                    shuffle=False)

            # Generate predictions.
            Y_pred_val = np.exp(-dsk.predict(X_val))

            # Compute quality metric (c-index)
            c = deepsurvk.concordance_index(Y_val, Y_pred_val, E_val)
            c_index_param.append(c)
            if verbose:
                print(f"\tc-index = {c}")

        # Calculate c-index mean and STD for current parameter set.
        # nan-aware so a failed fold doesn't poison the whole parameter set.
        c_index_mean.append(np.nanmean(c_index_param))
        c_index_std.append(np.nanstd(c_index_param))

    if verbose:
        ended_at = datetime.datetime.now().replace(microsecond=0)
        print("Optimization ended at: ", end='', flush=True)
        print(ended_at.strftime("%Y-%m-%d %H:%M:%S"))
        print(f"Optimization took {ended_at-started_at}")

    # Find parameter combination with highest c-index.
    c_index_mean_max = max(c_index_mean)
    idx = c_index_mean.index(c_index_mean_max)
    best_params = params_list[idx]
    return best_params
y = y.values x = x.dropna() x = x.values x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True) ## Scale the dataset by removing mean and scaling to unit variance sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test) c_values = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01] param_grid = dict(C=c_values) cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=200889) model = linear_model.LogisticRegression(solver="lbfgs", multi_class="multinomial") ## Based on the chosen model, create a grid to search for the optimal model grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring="accuracy", n_jobs=-1) ## Get the grid results and fit to training set grid_result = grid.fit(x_train, y_train) print('Best C:', grid_result.best_estimator_.get_params()['C']) print('Best model:', grid_result.best_estimator_) print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
def train_model(model, param_grid=None, X=None, y=None, splits=5, repeats=5):
    """Evaluate (and optionally grid-search) `model` with repeated K-fold CV.

    Parameters
    ----------
    model : estimator
        sklearn-compatible estimator. When `param_grid` is empty it must
        already be fitted, since it is only cross-val-scored, not refitted.
    param_grid : dict or list, optional
        Grid for GridSearchCV; empty/None skips the search.
    X, y : array-like, optional
        Training data; when `y` is empty/None both are loaded via
        `get_trainning_data_omitoutliers()`.
    splits, repeats : int
        RepeatedKFold configuration.

    Returns
    -------
    (model, cv_score, grid_results) : fitted/best estimator, pandas Series
        with 'mean'/'std' of the CV MSE, and the grid results DataFrame
        (empty list when no grid search was run).
    """
    # BUG FIX: the original signature used mutable defaults ([]) — replace
    # with None sentinels (backward compatible; empty behaves identically).
    if param_grid is None:
        param_grid = []
    if X is None:
        X = []
    if y is None:
        y = []

    # get unmodified training data, unless data to use already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # search the grid
        gsearch.fit(X, y)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y,
                                     scoring="neg_mean_squared_error",
                                     cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    # NOTE(review): in the no-grid branch `model` is never fitted here;
    # this predict assumes the caller passed an already-fitted estimator.
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots: predictions, residuals and standardized-residual
    # histogram, with |z| > 3 flagged as outliers.
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results
from sklearn.model_selection import GridSearchCV coxph = CoxPHSurvivalAnalysis() grid_values = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]} grid_c = GridSearchCV(coxph, param_grid=grid_values, scoring=None) grid_c.fit(data_x, data_y) print('Grid best parameter (max c-index): ', grid_c.best_params_) print('Grid best score (c-index): ', grid_c.best_score_) # Apply Cox-PH model based on 3-fold 10-repeated CV using optimal alpha selected from grid search: from sklearn.model_selection import RepeatedKFold rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0) # 3-fold 10-repeated CV c_index_train, c_index_test = [], [] for train_index, test_index in rkf.split(data_x): x_train, x_test = data_x[train_index], data_x[test_index] y_train, y_test = data_y[train_index], data_y[test_index] coxph = CoxPHSurvivalAnalysis( alpha=float(grid_c.best_params_['alpha'])).fit(x_train, y_train) c_index_train.append(coxph.score(x_train, y_train)) c_index_test.append(coxph.score(x_test, y_test)) print("Averaged c-index from 3-fold 10 repeated CV(training): {:.3f}".format( np.mean(c_index_train))) print("Averaged c-index from 3-fold 10 repeated CV(test): {:.3f}".format( np.mean(c_index_test)))
from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold #creating a data frame containing our data, each column can be accessed by df['column name'] df = pd.read_csv( 'C:\\Users\\MuthaNagaVenkataSaty\\Desktop\\Python DL\\Python Lesson 4\\Python_Lesson6\\iris.csv' ) # df.loc[df.Species == 'Iris-setosa', 'class'] = 1 # df.loc[df.Species == 'Iris-versicolor', 'class'] = 2 # df.loc[df.Species == 'Iris-virginica', 'class'] = 3 # train data X = df.as_matrix( columns=['sepal length', 'sepal width', 'petal length', 'petal width']) y = df['Species'].values kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None) scores = [] for train_index, test_index in kf.split(X): #print("Train:", train_index, "Validation:",test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] gnb = GaussianNB() gnb.fit(X_train, y_train) train_predicted_values = gnb.predict(X_test) scores.append(metrics.accuracy_score(y_test, train_predicted_values) * 100) print(pd.np.mean(scores))
# @Email : [email protected] # @File : estimation_of_classify_model.py # @Software: PyCharm from sklearn.model_selection import RepeatedKFold from sklearn import metrics import ml_models as oldmethod_class from esti_pretreat_method import load_and_normalized from esti_pretreat_method import esti_NMF data, target_class = load_and_normalized(drug_id=134) data, target_class = esti_NMF(data=data, target_class=target_class, for_sklearn=True) rkf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1234567) # random state就是为了可重复而已 sum_fpr = [] sum_tpr = [] sum_AUC = [] for train, test in rkf.split(data): data_train, data_test, target_train, target_test = data[train], data[ test], target_class[train], target_class[test] model = oldmethod_class.RandomForestC(data_train, target_train) y_pred = model.predict_proba(data_test) y_score = [i[1] for i in y_pred] fpr, tpr, thresholds = metrics.roc_curve(y_true=target_test, y_score=y_score, pos_label=1) # pos_label一般都是1 sum_fpr.append(fpr) sum_tpr.append(tpr)
# Log transform data, note that +1 is to avoid zero values # dataSet.data = np.log(dataSet.data + 1).to_numpy() for algorithm in ALGORITHMS: precision_score = [] scores_list = [] auc_score = [] recall_score = [] f1_score = [] ext_precision_score = [] ext_scores_list = [] ext_auc_score = [] ext_recall_score = [] ext_f1_score = [] rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124) for train_index, test_index in rkf.split(dataSet.components): X_train, X_test, y_train, y_test = dataSet.components[train_index], dataSet.components[test_index], dataSet.target[train_index], dataSet.target[test_index] result = algorithm(X_train, X_test, y_train, y_test) precision_score.append(result.get('precision')) scores_list.append(result.get('accuracy')) auc_score.append(result.get('auc')) recall_score.append(result.get('recall')) f1_score.append(result.get('f1_measure')) trainedModel = result.get('algorithm') if checkExternalValidity: external_pred = trainedModel.predict(totalMinedData) ext_precision_score.append(metrics.precision_score(expectedResult, external_pred)) try:
df = pd.concat([df, tempDF], axis=1, join='inner') df['Ratio'] = df['key'] df.drop(labels=['key'], axis=1, inplace=True) df['Ratio'] = df['Ratio'].apply(lambda x: round(x / 5)) print(df.columns) df.head() # In[10]: X = df.iloc[:, :-1].values y = df.iloc[:, -1].values xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15) # In[11]: cv = RepeatedKFold(n_splits=6, n_repeats=3) param_grid = dict() param_grid['alpha'] = [0.0001, 0.001, 0.01] param_grid['l1_ratio'] = list(np.arange(0.2, 0.6, 0.05)) sgd = SGDClassifier(penalty='elasticnet', max_iter=100000, epsilon=0.001, learning_rate='optimal', loss='log', n_jobs=-1) model = GridSearchCV( sgd, param_grid=param_grid, cv=cv, n_jobs=-1, )
evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params) oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit) predictions_xgb += clf.predict( xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target))) # 将lgb和xgb的结果进行stacking train_stack = np.vstack([oof_lgb, oof_xgb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose() folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590) oof_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)): print("fold {}".format(fold_)) trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values clf_3 = BayesianRidge() clf_3.fit(trn_data, trn_y) oof_stack[val_idx] = clf_3.predict(val_data) predictions += clf_3.predict(test_stack) / 10
NA = [1, 2, 3, 4, 5, 6, 7, 8] ltden = list() lsden = list() lterr = list() lserr = list() lterrt = list() lserrt = list() ltpcc = list() lspcc = list() ltpcct = list() lspcct = list() lind1 = list() lind2 = list() for m in MA: for n in NA: rkf = RepeatedKFold(n_splits=4, n_repeats=6, random_state=random_state) if n == 1: clf = MLPRegressor(solver='lbfgs', alpha=alpha, hidden_layer_sizes=(m), shuffle=False, random_state=random_state) elif n == 2: clf = MLPRegressor(solver='lbfgs', alpha=alpha, hidden_layer_sizes=(m, m), shuffle=False, random_state=random_state) elif n == 3: clf = MLPRegressor(solver='lbfgs', alpha=alpha,
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [None]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be
         used (rounded to integer).  if cv_repeats is None,
         sqrt(len(groups))-1 will be used (rounded).
     5.  alpha is the regularization parameter. if alpha is None it will
         be set using LassoLarsCV

    Raises
    ------
      ValueError if any group lacks the `varname` attribute.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # BUG FIX: was `raise Value(...)`, which is itself a NameError.
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None
    rmse_cv = None
    if not skip_cv:
        # Default CV geometry derived from the number of groups (Note 4).
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        # Pick alpha by LassoLarsCV when the caller did not supply one.
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        # Cross-validated RMSE of the chosen model.
        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # NOTE(review): this branch is only reached when skip_cv is True and
    # alpha is None; plain Lasso/LassoLars do not expose `alpha_`, so it
    # looks dead or broken — confirm intent.
    if alpha is None:
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)
    ypred = model.predict(spectra)
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    # NOTE(review): `model.active_` exists for LassoLars but not for Lasso —
    # confirm the use_lars=False path is exercised anywhere.
    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred, alpha=alpha,
                 active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model,
                 varname=varname, arrayname=arrayname,
                 fit_intercept=fit_intercept, normalize=normalize,
                 groupnames=groupnames, keywords=kws)