from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import LeavePOut


def lpo_sklearn(X, y, regparam):
    # Leave-pair-out CV: every pair of samples is held out once; the two
    # held-out predictions are collected separately so they can later be
    # compared pairwise.
    lpo = LeavePOut(p=2)
    preda = []
    predb = []
    for train, test in lpo.split(X):
        # regparam is the ridge penalty; pass it through as alpha
        # (the original left it unused).
        rls = KernelRidge(alpha=regparam, kernel="rbf", gamma=0.01)
        rls.fit(X[train], y[train])
        p = rls.predict(X[test])
        preda.append(p[0])
        predb.append(p[1])
    return preda, predb
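# A minimal usage sketch of lpo_sklearn. The demo data and the
# pairwise-agreement computation below are illustrative additions, not part
# of the original:
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(20, 3)
y_demo = rng.randn(20)
preda, predb = lpo_sklearn(X_demo, y_demo, regparam=1.0)

# Fraction of held-out pairs whose predicted ordering matches the true
# ordering -- a leave-pair-out concordance estimate.
pairs = list(LeavePOut(p=2).split(X_demo))
agree = [(pa > pb) == (y_demo[te[0]] > y_demo[te[1]])
         for (pa, pb), (_, te) in zip(zip(preda, predb), pairs)]
print("pairwise agreement:", np.mean(agree))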
def main():
    path_boy = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boynew.txt"
    path_girl = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girlnew.txt"
    # height = []
    # weight = []
    # feetsize = []
    x_boy = []
    x_girl = []
    label_boy = []   # 1 = male, 0 = female
    label_girl = []
    readdata1(path_boy, x_boy, label_boy, 1)
    readdata1(path_girl, x_girl, label_girl, 0)
    x_boy = np.mat(x_boy)
    x_girl = np.mat(x_girl)

    # Fisher discriminant: class means, within-class scatter, projection W
    m1 = x_boy.mean(0)
    m0 = x_girl.mean(0)
    S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
    S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
    Sw = S1 + S0
    S_inverse = Sw.I
    W = S_inverse * (m1 - m0).T
    M1 = float(W.T * m1.T)
    M0 = float(W.T * m0.T)
    w_decision0 = (M0 + M1) / 2

    path_boy_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boy.txt"
    path_girl_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girl.txt"
    x = []
    label = []
    readdata1(path_boy_test, x, label, 1)
    readdata1(path_girl_test, x, label, 0)
    label_test = []
    y = np.mat(x) * W
    errorcount = 0
    for i in range(len(label)):
        if float(y[i]) > w_decision0:
            label_test.append(1)
            if label[i] != 1:
                errorcount = errorcount + 1
        else:
            label_test.append(0)
            if label[i] != 0:
                errorcount = errorcount + 1
    e_percentage = errorcount / len(label_test)
    print('Fisher test-set error rate: %f' % e_percentage)

    # Leave-one-out (LeavePOut with p=1)
    loo = LeavePOut(p=1)
    error = 0
    for train, test in loo.split(x, label):
        x_boy = []
        x_girl = []
        label_boy = []   # 1 = male, 0 = female
        label_girl = []
        for i in train:
            if label[i] == 1:
                x_boy.append(x[i])
                label_boy.append(1)
            else:
                x_girl.append(x[i])
                label_girl.append(0)
        x_boy = np.mat(x_boy)
        x_girl = np.mat(x_girl)
        m1 = x_boy.mean(0)
        m0 = x_girl.mean(0)
        S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
        S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
        Sw = S1 + S0
        S_inverse = Sw.I
        W = S_inverse * (m1 - m0).T
        M1 = float(W.T * m1.T)
        M0 = float(W.T * m0.T)
        w_decision0 = (M0 + M1) / 2
        for j in test:
            if float(np.mat(x[j]) * W) > w_decision0:
                if label[j] != 1:
                    error = error + 1
            else:
                if label[j] != 0:
                    error = error + 1
    print('Fisher leave-one-out error rate: %f' % (error / len(label)))

    figure(3)
    FPR, TPR = get_roc_fisher(W, w_decision0, x, label)
    plot(FPR, TPR, label='fisher')

    figure(5)
    x1 = np.arange(130, 190, 0.01)
    y1 = (w_decision0 - W[0] * x1) / W[1]
    plot(x1, array(y1)[0])
    plot(x1, x1 * float(W[1]) / float(W[0]))
    for i in range(len(label)):
        if label[i] == 1:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='r')
        else:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='g')
        # Foot of the perpendicular from each sample onto the projection axis
        a = (float(x[i][1]) + float(x[i][0]) * float(W[0]) / float(W[1])) / \
            (float(W[1]) / float(W[0]) + float(W[0]) / float(W[1]))
        b = a * float(W[1]) / float(W[0])
        plot([float(x[i][0]), a], [float(x[i][1]), b], '--', color='0.75')
    axis([140, 190, 35, 85])
    Bayes()
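# The script above assumes a helper readdata1(path, x, label, tag) that is
# not shown. A minimal sketch, assuming each line of the data files holds
# whitespace-separated numeric features (hypothetical -- the original helper
# may differ):
def readdata1(path, x, label, tag):
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                continue
            x.append([float(v) for v in fields[:2]])  # e.g. height, weight
            label.append(tag)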
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, LeavePOut

# Set random seed for reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load the dataset
    data = load_iris()

    p = 3
    lr = LogisticRegression()

    # Perform Leave-P-Out cross-validation. Note: LeavePOut(3) on the
    # 150-sample iris set yields C(150, 3) = 551,300 splits, so this run is
    # very slow and shown for illustration only.
    lpo_scores = cross_val_score(lr, data['data'], data['target'],
                                 cv=LeavePOut(p))

    print('LPO scores (first 100): {}'.format(lpo_scores[0:100]))
    print('Average LPO score: {}'.format(lpo_scores.mean()))
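# Before launching a run like the one above, it is worth checking how many
# fits LeavePOut will schedule; a small sanity check (illustrative):
from math import comb

from sklearn.datasets import load_iris
from sklearn.model_selection import LeavePOut

data = load_iris()
lpo = LeavePOut(3)
print(lpo.get_n_splits(data['data']))  # 551300
print(comb(len(data['data']), 3))      # the same count, C(150, 3)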
    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2))
    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0))

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3
    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(), ),
                                 (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(tokenize(cv))  # collect the tokens, not the cv objects
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol
    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
    with assert_dask_compute(False):
        # Assumed continuation of the truncated original: n_splits depends
        # only on the length of X, so a NumPy X requires no dask compute.
        assert compute_n_splits(cv, np_X, da_y, da_groups) == sol
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2018 David Arroyo Menéndez
# Author: David Arroyo Menéndez <*****@*****.**>
# Maintainer: David Arroyo Menéndez <*****@*****.**>

# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This file is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA,

from sklearn.model_selection import LeavePOut
import numpy as np

X = np.ones(4)
lpo = LeavePOut(p=2)
for train, test in lpo.split(X):
    print("train: %s, test: %s" % (train, test))
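# For reference, the loop above prints the C(4, 2) = 6 splits (this is the
# same example as in the scikit-learn documentation):
# train: [2 3], test: [0 1]
# train: [1 3], test: [0 2]
# train: [1 2], test: [0 3]
# train: [0 3], test: [1 2]
# train: [0 2], test: [1 3]
# train: [0 1], test: [2 3]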
        # (tail of a display helper; the matching loop over `train` that
        # builds `bar` and `output_train` precedes this fragment)
        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])
        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test: {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as
test data.

The following output shows the result of splitting some sample data by
Leave-One-Out and Leave-P-Out methods.

A bar displaying the current train-test split as well as the actual data
points are displayed for each split.

In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))

print("Leave-One-Out:\n")
    min_list = []
    if c == 'i':
        min_list = [0, 8, 9, 12]
    else:
        min_list = [3, 4, 7, 10, 11, 13]

    data = pd.read_csv('input_' + c + '_2_hrv_c.csv', header=None)

    decisionTree = DecisionTreeClassifier()
    knnClf = KNeighborsClassifier(n_neighbors=3)  # default: k = 5; set your own, e.g. KNeighborsClassifier(n_neighbors=10)
    svc = svm.SVC(kernel='linear', C=1)  # alternatives: (kernel='rbf'), (kernel='poly', degree=5)
    naive_bayes = GaussianNB()
    rand_forrest = RandomForestClassifier(n_estimators=25)

    lpo = LeavePOut(p=3)
    kf = KFold(n_splits=5)

    X_raw = data.iloc[:, :data.shape[1] - 1]
    y = data.iloc[:, data.shape[1] - 1]

    def my_validation(model, X_f, y_f):
        score = np.array([])
        if c == 'i':
            X_train = X_f.iloc[8:, :]
            y_train = y_f.iloc[8:]
            X_test = X_f.iloc[:8, :]
            y_test = y_f.iloc[:8]
        else:
            X_train = X_f.iloc[:8, :]
            y_train = y_f.iloc[:8]
import numpy as np
from sklearn.model_selection import LeavePOut

# Leave-P-Out: holds out a chosen number of samples for testing and trains
# on the rest.

X = np.array([[1, 11], [2, 12], [3, 13], [4, 14], [5, 15],
              [6, 16], [7, 17], [8, 18], [9, 19], [10, 20]])
y = np.array([[1], [0], [1], [1], [0], [1], [1], [0], [0], [1]])

lpo = LeavePOut(4)

print('number of splits = ', str(lpo.get_n_splits(X)))  # C(10, 4) = 210
print("----------------------------------------------------------")

folds = lpo.split(X)
for train_index, test_index in folds:
    print('train : ', train_index, ' test : ', test_index)
    print('X_train \n ', X[train_index])
    print('X_test \n ', X[test_index])
    print('y_train \n ', y[train_index])
    print('y_test \n ', y[test_index])
    print("----------------------------------------------------------")
def train(X, y, k_cross_validation_ratio, testing_size, optimal_k=True,
          min_range_k=0, max_range_k=0):
    X0_train, X_test, y0_train, y_test = train_test_split(
        X, y, test_size=testing_size, random_state=7)

    # A scaler is needed to bring all the inputs into a similar range
    scaler = StandardScaler()
    scaler = scaler.fit(X0_train)
    X0_train = scaler.transform(X0_train)
    X_test = scaler.transform(X_test)
    # X_train, X_eval, y_train, y_eval = train_test_split(X0_train, y0_train, test_size=100/k_cross_validation_ratio, random_state=7)

    # Search for the optimal value of k either within the specified range
    # (user input) or within our default range
    if optimal_k and min_range_k > 0 and max_range_k > min_range_k:
        k_range = range(min_range_k, max_range_k)
    else:
        k_range = range(1, 50)

    scores = {}
    scores_list = []
    # Find the optimal number of neighbors
    for k in tqdm(k_range):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X0_train, y0_train)
        y_pred = knn.predict(X_test)
        scores[k] = metrics.accuracy_score(y_test, y_pred)
        scores_list.append(metrics.accuracy_score(y_test, y_pred))

    # scores_list is 0-indexed, so map the best index back into k_range
    # (the original used the raw list index, which is off by k_range.start)
    k_optimal = k_range[scores_list.index(max(scores_list))]
    model = KNeighborsClassifier(n_neighbors=k_optimal)
    eval_score_list = []

    # Evaluation using cross-validation. lpo (leave-p-out) is constructed
    # here but unused; StratifiedKFold is what is actually applied below.
    from sklearn.model_selection import StratifiedKFold
    lpo = LeavePOut(p=1)
    accuracys = []
    skf = StratifiedKFold(n_splits=10, random_state=None)
    skf.get_n_splits(X0_train, y0_train)
    for train_index, test_index in skf.split(X0_train, y0_train):
        # print("TRAIN:", train_index, "Validation:", test_index)
        X_train, X_eval = pd.DataFrame(X0_train).iloc[train_index], \
            pd.DataFrame(X0_train).iloc[test_index]
        y_train, y_eval = pd.DataFrame(y0_train).iloc[train_index], \
            pd.DataFrame(y0_train).iloc[test_index]
        # Fit on the fold's training split (the original fit on the whole
        # training set here, which defeats the purpose of the CV loop)
        model.fit(X_train, y_train.values.ravel())
        predictions = model.predict(X_eval)
        score = accuracy_score(y_eval, predictions)
        accuracys.append(score)

    # scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    # eval_score_list.append(scores.mean())
    # eval_accuracy = np.mean(eval_score_list)
    eval_accuracy = np.mean(accuracys)

    # Save the pretrained model:
    model_name = 'pretrained_knn_model'
    pickle.dump(model, open(model_name, 'wb'))

    return eval_accuracy, model, X0_train, y0_train, X_test, y_test
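# Hypothetical usage of train() above, assuming the module-level imports the
# original file would have had (numpy, pandas, sklearn, tqdm, pickle):
from sklearn.datasets import load_iris

iris = load_iris()
eval_accuracy, model, X0_train, y0_train, X_test, y_test = train(
    iris.data, iris.target, k_cross_validation_ratio=10, testing_size=0.2,
    optimal_k=True, min_range_k=1, max_range_k=20)
print('validation accuracy:', eval_accuracy)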
        4: weights[4]
    }
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train = over.fit_resample(X_train, y_train)

    # undersample classes with more samples than the average
    ratio_under = {
        0: average_samples,
        1: average_samples,
        2: average_samples,
        3: average_samples,
        4: average_samples
    }
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train = under.fit_resample(X_train, y_train)

    # NOTE: LeavePOut(2) produces C(n, 2) inner splits, so this inner CV
    # makes the grid search extremely expensive for all but tiny datasets.
    cv_inner = LeavePOut(2)

    model = KerasClassifier(build_fn=create_model, verbose=1)

    batch_size = [8, 16, 32]
    neurons = [30, 40, 50]
    hidden_layers = [1, 2, 3]
    epochs = [10, 50, 100]
    activation = ['softmax', 'relu', 'tanh']
    param_grid = dict(batch_size=batch_size,
                      neurons=neurons,
                      hidden_layers=hidden_layers,
                      epochs=epochs,
                      activation=activation)

    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        n_jobs=-2,
                        cv=cv_inner)  # assumed continuation; the original snippet is truncated here
def nestedCVClassifier(df, outcomeVar, predVars, model, params={},
                       nFolds=10, LPO=None, scorer='log_loss', n_jobs=1):
    """Apply model to df in a nested cross-validation framework,
    with inner folds to optimize hyperparameters
    and outer test folds to evaluate performance.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    model : sklearn model
    params : dict
        Keys of model hyperparameters with values to try in a grid search.
    nFolds : int
        N-fold stratified cross-validation
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    scorer : str
        'log_loss' or 'accuracy' for the inner grid search.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr: (100, ) average FPR for ROC
        tpr: (100, ) average TPR for ROC
        AUC: (outerFolds, ) AUC of ROC for each outer test fold
        mAUC: (1, ) AUC of the average ROC
        ACC: (outerFolds, ) accuracy across outer test folds
        mACC: (1, ) mean accuracy across outer folds
        CVres: (outerFolds, ) cv_results_ from each inner grid search
        optimalScores: (outerFolds, ) best inner score per outer fold
        optimalParams: (outerFolds, ) optimal params from each inner CV
        finalParams: params (averaged over outer folds) used for the final fit
        finalResult: final fitted model with predict() exposed
        prob: (N,) pd.Series of predicted probabilities avg over outer folds
        params: the pre-specified hyperparameter grid
        Xvars: list of all vars in X
        Yvar: name of outcome variable
        N: total number of rows/instances in the model"""
    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    if scorer == 'log_loss':
        scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                                 greater_is_better=False,
                                                 needs_proba=True,
                                                 needs_threshold=False,
                                                 labels=[0, 1])
    elif scorer == 'accuracy':
        scorerFunc = sklearn.metrics.make_scorer(
            sklearn.metrics.accuracy_score,
            greater_is_better=True,
            needs_proba=False,
            needs_threshold=False)

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    probs = []
    optimalParams = []
    optimalScores = []
    cvResults = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        clf = GridSearchCV(estimator=model, param_grid=params, cv=innerCV,
                           refit=True, scoring=scorerFunc, n_jobs=n_jobs)
        clf.fit(Xtrain, ytrain)

        cvResults.append(clf.cv_results_)
        optimalParams.append(clf.best_params_)
        optimalScores.append(clf.best_score_)

        prob = clf.predict_proba(Xtest)
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, 1])
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:, 1]),
                                                   normalize=True)
        probs.append(pd.Series(prob[:, 1], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)

    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    """Select "outer" optimal param for the final model: average each
    hyperparameter over the outer folds in log10 space."""
    avgFunc = lambda v: 10**np.mean(np.log10(v))
    # avgFunc = lambda v: np.mean(v)

    optP = {k: avgFunc([o[k] for o in optimalParams])
            for k in optimalParams[0].keys()}
    for k, v in optP.items():
        setattr(model, k, v)

    result = model.fit(X=X, y=y)
    rocRes = rocStats(y, np.round(probS))

    outD = {'fpr': fpr,
            'tpr': meanTPR,
            'AUC': auc,
            'mAUC': meanAUC,
            'mACC': np.mean(acc),
            'ACC': acc,
            'CVres': cvResults,
            'optimalScores': np.array(optimalScores),
            'optimalParams': optimalParams,
            'finalParams': optP,
            'finalResult': result,  # final fitted model with predict() exposed
            'prob': probS,  # (N,) pd.Series of predicted probabilities avg over outer folds
            'Xvars': predVars,
            'Yvar': outcomeVar,
            'N': tmp.shape[0],
            'params': params}
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
def logisticL1NestedCV(df, outcomeVar, predVars, nFolds=10, LPO=None,
                       Cs=10, n_jobs=1):
    """Apply logistic regression with L1-regularization (LASSO) to df.

    Uses a nested cross-validation framework with inner folds to optimize C
    and outer test folds to evaluate performance.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold stratified cross-validation
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    Cs : int or list
        Each of the values in Cs describes the inverse of regularization
        strength. If Cs is an int, a grid of Cs values is chosen on a
        logarithmic scale between 1e-4 and 1e4. Smaller values specify
        stronger regularization.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr: (100, ) average FPR for ROC
        tpr: (100, ) average TPR for ROC
        AUC: (outerFolds, ) AUC of ROC for each outer test fold
        mAUC: (1, ) AUC of the average ROC
        ACC: (outerFolds, ) accuracy across outer test folds
        mACC: (1, ) mean accuracy across outer folds
        scores: (outerFolds, innerFolds, Cs) log-likelihood for each C
            across inner and outer CV folds
        optimalCs: (outerFolds, ) optimal C from each set of inner CV
        C: (1, ) geometric mean of the optimal Cs, used for the final fit
        finalResult: final fitted model with predict() exposed
        prob: (N,) pd.Series of predicted probabilities avg over outer folds
        varList: (Nvars, ) list of vars with non-zero coef in final model
        Cs: (Cs, ) pre-specified grid of Cs
        coefs: (outerFolds, predVars) refit with optimalC in each fold
        paths: (outerFolds, Cs, predVars + intercept) avg across inner folds
        Xvars: list of all vars in X
        Yvar: name of outcome variable
        N: total number of rows/instances in the model"""
    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                             greater_is_better=False,
                                             needs_proba=True,
                                             needs_threshold=False,
                                             labels=[0, 1])

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    paths = []
    coefs = []
    probs = []
    optimalCs = np.nan * np.zeros(nFolds)
    scores = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        model = sklearn.linear_model.LogisticRegressionCV(Cs=Cs,
                                                          cv=innerCV,
                                                          penalty='l1',
                                                          solver='liblinear',
                                                          scoring=scorerFunc,
                                                          refit=True,
                                                          n_jobs=n_jobs)
        """With refit = True, the scores are averaged across all folds, and
        the coefs and the C that corresponds to the best score is taken, and
        a final refit is done using these parameters."""
        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest,
                                                        prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:, class1Ind]),
                                                   normalize=True)
        optimalCs[outi] = results.C_[0]
        scores.append(results.scores_[1])
        paths.append(results.coefs_paths_[1])
        coefs.append(results.coef_)
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    meanC = 10**np.mean(np.log10(optimalCs))
    paths = np.concatenate([p.mean(axis=0, keepdims=True) for p in paths],
                           axis=0)
    scores = np.concatenate([s[None, :, :] for s in scores], axis=0)

    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    """Refit all the data with the optimal C for variable selection and
    classification of holdout data"""
    model = sklearn.linear_model.LogisticRegression(C=meanC,
                                                    penalty='l1',
                                                    solver='liblinear')
    result = model.fit(X=X, y=y)
    varList = np.array(predVars)[result.coef_.ravel() != 0].tolist()

    rocRes = rocStats(y, np.round(probS))

    outD = {'fpr': fpr,  # (100, ) average FPR for ROC
            'tpr': meanTPR,  # (100, ) average TPR for ROC
            'AUC': auc,  # (outerFolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,  # (1, ) AUC of the average ROC
            'ACC': acc,  # (outerFolds, ) accuracy across outer test folds
            'mACC': np.mean(acc),
            'scores': scores,  # (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
            'optimalCs': optimalCs,  # (outerFolds, ) optimal C from each set of inner CV
            'C': meanC,
            'finalResult': result,  # final fitted model with predict() exposed
            'prob': probS,  # (N,) pd.Series of predicted probabilities avg over outer folds
            'varList': varList,  # list of vars with non-zero coef in final model
            'Cs': Cs,  # pre-specified grid of Cs
            'coefs': np.concatenate(coefs),  # (outerFolds, predVars) refit with optimalC in each fold
            'paths': paths,  # (outerFolds, Cs, predVars + intercept) avg across inner folds
            'Xvars': predVars,
            'Yvar': outcomeVar,
            'N': tmp.shape[0]}
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
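# A hypothetical call of logisticL1NestedCV on synthetic data. This assumes
# the module context the function relies on (sklearn/numpy/pandas imports
# and the rocStats helper); the column names and sizes are made up:
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
df_demo = pd.DataFrame(rng.randn(n, 5),
                       columns=['v%d' % i for i in range(5)])
df_demo['outcome'] = (df_demo['v0'] + 0.5 * df_demo['v1']
                      + rng.randn(n) > 0).astype(float)

res = logisticL1NestedCV(df_demo, 'outcome',
                         ['v%d' % i for i in range(5)], nFolds=5, Cs=5)
print(res['mAUC'], res['varList'])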
def palatability_identity_calculations(rec_dir, pal_ranks=None, params=None,
                                       shell=False):
    warnings.filterwarnings('ignore', category=UserWarning)
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    dat = load_dataset(rec_dir)
    dim = dat.dig_in_mapping
    if 'palatability_rank' in dim.columns:
        pass
    elif pal_ranks is None:
        dim = get_palatability_ranks(dim, shell=shell)
    else:
        dim['palatability_rank'] = dim['name'].map(pal_ranks)

    dim = dim.dropna(subset=['palatability_rank'])
    dim = dim[dim['palatability_rank'] > 0]
    dim = dim.reset_index(drop=True)
    num_tastes = len(dim)
    taste_names = dim.name.to_list()

    trial_list = dat.dig_in_trials.copy()
    trial_list = trial_list[[x in taste_names for x in trial_list.name]]
    num_trials = trial_list.groupby('channel').count()['name'].unique()
    if len(num_trials) > 1:
        raise ValueError('Unequal number of trials for tastes to be used')
    else:
        num_trials = num_trials[0]

    dim['num_trials'] = num_trials

    # Get which units to use; guard against params=None (the default)
    unit_table = h5io.get_unit_table(rec_dir)
    unit_types = ['Single', 'Multi', 'All', 'Custom']
    unit_type = params.get('unit_type') if params is not None else None
    if unit_type is None:
        q = userIO.ask_user('Which units do you want to use for taste '
                            'discrimination and palatability analysis?',
                            choices=unit_types, shell=shell)
        unit_type = unit_types[q]

    if unit_type == 'Single':
        chosen_units = unit_table.loc[unit_table['single_unit'],
                                      'unit_num'].to_list()
    elif unit_type == 'Multi':
        chosen_units = unit_table.loc[unit_table['single_unit'] == False,
                                      'unit_num'].to_list()
    elif unit_type == 'All':
        chosen_units = unit_table['unit_num'].to_list()
    else:
        selection = userIO.select_from_list('Select units to use:',
                                            unit_table['unit_num'],
                                            'Select Units',
                                            multi_select=True)
        chosen_units = list(map(int, selection))

    num_units = len(chosen_units)
    unit_table = unit_table.loc[chosen_units]

    # Enter parameters
    if params is None or params.keys() != default_pal_id_params.keys():
        params = default_pal_id_params.copy()
        params = userIO.confirm_parameter_dict(params,
                                               ('Palatability/Identity '
                                                'Calculation Parameters'
                                                '\nTimes in ms'),
                                               shell=shell)

    win_size = params['window_size']
    win_step = params['window_step']
    print('Running palatability/identity calculations with parameters:\n%s'
          % pt.print_dict(params))

    with tables.open_file(dat.h5_file, 'r+') as hf5:
        trains_dig_in = hf5.list_nodes('/spike_trains')
        time = trains_dig_in[0].array_time[:]
        bin_times = np.arange(time[0], time[-1] - win_size + win_step,
                              win_step)
        num_bins = len(bin_times)

        palatability = np.empty((num_bins, num_units, num_tastes*num_trials),
                                dtype=int)
        identity = np.empty((num_bins, num_units, num_tastes*num_trials),
                            dtype=int)
        unscaled_response = np.empty((num_bins, num_units,
                                      num_tastes*num_trials),
                                     dtype=np.dtype('float64'))
        response = np.empty((num_bins, num_units, num_tastes*num_trials),
                            dtype=np.dtype('float64'))
        laser = np.empty((num_bins, num_units, num_tastes*num_trials, 2),
                         dtype=float)

        # Fill arrays with data
        print('Filling data arrays...')
        onesies = np.ones((num_bins, num_units, num_trials))
        for i, row in dim.iterrows():
            idx = range(num_trials*i, num_trials*(i+1))
            palatability[:, :, idx] = row.palatability_rank * onesies
            identity[:, :, idx] = row.channel * onesies
            for j, u in enumerate(chosen_units):
                for k, t in enumerate(bin_times):
                    t_idx = np.where((time >= t) & (time <= t+win_size))[0]
                    unscaled_response[k, j, idx] = \
                        np.mean(trains_dig_in[i].spike_array[:, u, t_idx],
                                axis=1)
                    try:
                        # The original wrote to `lasers` (a NameError), so
                        # the except branch always fired; write to `laser`
                        # and transpose to the (trials, 2) slot shape.
                        laser[k, j, idx] = \
                            np.vstack((trains_dig_in[i].laser_durations[:],
                                       trains_dig_in[i].laser_onset_lag[:])).T
                    except Exception:
                        laser[k, j, idx] = np.zeros((num_trials, 2))

        # Scaling was not done, so:
        response = unscaled_response.copy()

        # Make ancillary_analysis node and put in arrays
        if '/ancillary_analysis' in hf5:
            hf5.remove_node('/ancillary_analysis', recursive=True)

        hf5.create_group('/', 'ancillary_analysis')
        hf5.create_array('/ancillary_analysis', 'palatability', palatability)
        hf5.create_array('/ancillary_analysis', 'identity', identity)
        hf5.create_array('/ancillary_analysis', 'laser', laser)
        hf5.create_array('/ancillary_analysis', 'scaled_neural_response',
                         response)
        hf5.create_array('/ancillary_analysis', 'window_params',
                         np.array([win_size, win_step]))
        hf5.create_array('/ancillary_analysis', 'bin_times', bin_times)
        hf5.create_array('/ancillary_analysis', 'unscaled_neural_response',
                         unscaled_response)
        # for backwards compatibility
        hf5.create_array('/ancillary_analysis', 'params',
                         np.array([win_size, win_step]))
        hf5.create_array('/ancillary_analysis', 'pre_stim',
                         np.array(time[0]))
        hf5.flush()

        # Get unique laser (duration, lag) combinations
        print('Organizing trial data...')
        unique_lasers = np.vstack(list({tuple(row)
                                        for row in laser[0, 0, :, :]}))
        unique_lasers = unique_lasers[unique_lasers[:, 1].argsort(), :]
        num_conditions = unique_lasers.shape[0]
        trials = []
        for row in unique_lasers:
            tmp_trials = [j for j in range(num_trials * num_tastes)
                          if np.array_equal(laser[0, 0, j, :], row)]
            trials.append(tmp_trials)

        trials_per_condition = [len(x) for x in trials]
        if not all(x == trials_per_condition[0]
                   for x in trials_per_condition):
            raise ValueError('Different number of trials for each laser '
                             'condition')

        # assumes the same number of trials per taste per condition
        trials_per_condition = int(trials_per_condition[0] / num_tastes)
        print('Detected:\n    %i tastes\n    %i laser conditions\n'
              '    %i trials per condition per taste'
              % (num_tastes, num_conditions, trials_per_condition))
        trials = np.array(trials)

        # Store laser conditions and indices of trials per condition in
        # trial x taste space
        hf5.create_array('/ancillary_analysis', 'trials', trials)
        hf5.create_array('/ancillary_analysis', 'laser_combination_d_l',
                         unique_lasers)
        hf5.flush()

        # Taste similarity calculation
        neural_response_laser = np.empty((num_conditions, num_bins,
                                          num_tastes, num_units,
                                          trials_per_condition),
                                         dtype=np.dtype('float64'))
        taste_cosine_similarity = np.empty((num_conditions, num_bins,
                                            num_tastes, num_tastes),
                                           dtype=np.dtype('float64'))
        taste_euclidean_distance = np.empty((num_conditions, num_bins,
                                             num_tastes, num_tastes),
                                            dtype=np.dtype('float64'))

        # Re-format neural responses from bin x unit x (trial*taste) to
        # laser_condition x bin x taste x unit x trial
        print('Reformatting data arrays...')
        for i, trial in enumerate(trials):
            for j, _ in enumerate(bin_times):
                for k, _ in dim.iterrows():
                    idx = np.where((trial >= num_trials*k) &
                                   (trial < num_trials*(k+1)))[0]
                    neural_response_laser[i, j, k, :, :] = \
                        response[j, :, trial[idx]].T

        # Compute taste cosine similarity and euclidean distances
        print('Computing taste cosine similarity and euclidean distances...')
        for i, _ in enumerate(trials):
            for j, _ in enumerate(bin_times):
                for k, _ in dim.iterrows():
                    for l, _ in dim.iterrows():
                        taste_cosine_similarity[i, j, k, l] = \
                            np.mean(cosine_similarity(
                                neural_response_laser[i, j, k, :, :].T,
                                neural_response_laser[i, j, l, :, :].T))
                        taste_euclidean_distance[i, j, k, l] = \
                            np.mean(cdist(
                                neural_response_laser[i, j, k, :, :].T,
                                neural_response_laser[i, j, l, :, :].T,
                                metric='euclidean'))

        hf5.create_array('/ancillary_analysis', 'taste_cosine_similarity',
                         taste_cosine_similarity)
        hf5.create_array('/ancillary_analysis', 'taste_euclidean_distance',
                         taste_euclidean_distance)
        hf5.flush()

        # Taste responsiveness calculations
        bin_params = [params['num_comparison_bins'],
                      params['comparison_bin_size']]
        discrim_p = params['discrim_p']

        responsive_neurons = []
        discriminating_neurons = []
        taste_responsiveness = np.zeros((bin_params[0], num_units, 2))
        new_bin_times = np.arange(0, np.prod(bin_params), bin_params[1])
        baseline = np.where(bin_times < 0)[0]
        print('Computing taste responsiveness and taste discrimination...')
        for i, t in enumerate(new_bin_times):
            places = np.where((bin_times >= t) &
                              (bin_times <= t+bin_params[1]))[0]
            for j, u in enumerate(chosen_units):
                # Check taste responsiveness
                f, p = f_oneway(np.mean(response[places, j, :], axis=0),
                                np.mean(response[baseline, j, :], axis=0))
                if np.isnan(f):
                    f = 0.0
                    p = 1.0

                if p <= discrim_p and u not in responsive_neurons:
                    responsive_neurons.append(u)
                    taste_responsiveness[i, j, 0] = 1

                # Check taste discrimination
                taste_idx = [np.arange(num_trials*k, num_trials*(k+1))
                             for k in range(num_tastes)]
                taste_responses = [np.mean(response[places, j, :][:, k],
                                           axis=0)
                                   for k in taste_idx]
                f, p = f_oneway(*taste_responses)
                if np.isnan(f):
                    f = 0.0
                    p = 1.0

                if p <= discrim_p and u not in discriminating_neurons:
                    discriminating_neurons.append(u)

        responsive_neurons = np.sort(responsive_neurons)
        discriminating_neurons = np.sort(discriminating_neurons)

        # Write taste responsive and taste discriminating units to text file
        save_file = os.path.join(rec_dir,
                                 'discriminative_responsive_neurons.txt')
        with open(save_file, 'w') as f:
            print('Taste discriminative neurons', file=f)
            for u in discriminating_neurons:
                print(u, file=f)

            print('Taste responsive neurons', file=f)
            for u in responsive_neurons:
                print(u, file=f)

        # (array name kept as stored in existing files, typo and all)
        hf5.create_array('/ancillary_analysis',
                         'taste_disciminating_neurons',
                         discriminating_neurons)
        hf5.create_array('/ancillary_analysis', 'taste_responsive_neurons',
                         responsive_neurons)
        hf5.create_array('/ancillary_analysis', 'taste_responsiveness',
                         taste_responsiveness)
        hf5.flush()

        # Get time course of taste discriminability
        print('Getting taste discrimination time course...')
        p_discrim = np.empty((num_conditions, num_bins, num_tastes,
                              num_tastes, num_units),
                             dtype=np.dtype('float64'))
        for i in range(num_conditions):
            for j, t in enumerate(bin_times):
                for k in range(num_tastes):
                    for l in range(num_tastes):
                        for m in range(num_units):
                            _, p = ttest_ind(
                                neural_response_laser[i, j, k, m, :],
                                neural_response_laser[i, j, l, m, :],
                                equal_var=False)
                            if np.isnan(p):
                                p = 1.0

                            p_discrim[i, j, k, l, m] = p

        hf5.create_array('/ancillary_analysis', 'p_discriminability',
                         p_discrim)
        hf5.flush()

        # Palatability rank order calculation (if > 2 tastes)
        t_start = params['pal_deduce_start_time']
        t_end = params['pal_deduce_end_time']
        if num_tastes > 2:
            print('Deducing palatability rank order...')
            palatability_rank_order_deduction(rec_dir,
                                              neural_response_laser,
                                              unique_lasers, bin_times,
                                              [t_start, t_end])

        # Palatability calculation
        r_spearman = np.zeros((num_conditions, num_bins, num_units))
        p_spearman = np.ones((num_conditions, num_bins, num_units))
        r_pearson = np.zeros((num_conditions, num_bins, num_units))
        p_pearson = np.ones((num_conditions, num_bins, num_units))
        f_identity = np.ones((num_conditions, num_bins, num_units))
        p_identity = np.ones((num_conditions, num_bins, num_units))
        lda_palatability = np.zeros((num_conditions, num_bins))
        lda_identity = np.zeros((num_conditions, num_bins))
        r_isotonic = np.zeros((num_conditions, num_bins, num_units))
        id_pal_regress = np.zeros((num_conditions, num_bins, num_units, 2))
        pairwise_identity = np.zeros((num_conditions, num_bins, num_tastes,
                                      num_tastes))
        print('Computing palatability metrics...')
        for i, t in enumerate(trials):
            for j in range(num_bins):
                for k in range(num_units):
                    ranks = rankdata(response[j, k, t])
                    r_spearman[i, j, k], p_spearman[i, j, k] = \
                        spearmanr(ranks, palatability[j, k, t])
                    r_pearson[i, j, k], p_pearson[i, j, k] = \
                        pearsonr(response[j, k, t], palatability[j, k, t])
                    if np.isnan(r_spearman[i, j, k]):
                        r_spearman[i, j, k] = 0.0
                        p_spearman[i, j, k] = 1.0

                    if np.isnan(r_pearson[i, j, k]):
                        r_pearson[i, j, k] = 0.0
                        p_pearson[i, j, k] = 1.0

                    # Isotonic regression of firing against palatability
                    model = IsotonicRegression(increasing='auto')
                    model.fit(palatability[j, k, t], response[j, k, t])
                    r_isotonic[i, j, k] = model.score(palatability[j, k, t],
                                                      response[j, k, t])

                    # Multiple regression of firing rate against
                    # palatability and identity.
                    # Regress palatability on identity
                    tmp_id = identity[j, k, t].reshape(-1, 1)
                    tmp_pal = palatability[j, k, t].reshape(-1, 1)
                    tmp_resp = response[j, k, t].reshape(-1, 1)
                    model_pi = LinearRegression()
                    model_pi.fit(tmp_id, tmp_pal)
                    pi_residuals = tmp_pal - model_pi.predict(tmp_id)

                    # Regress identity on palatability
                    model_ip = LinearRegression()
                    model_ip.fit(tmp_pal, tmp_id)
                    ip_residuals = tmp_id - model_ip.predict(tmp_pal)

                    # Regress firing on identity
                    model_fi = LinearRegression()
                    model_fi.fit(tmp_id, tmp_resp)
                    fi_residuals = tmp_resp - model_fi.predict(tmp_id)

                    # Regress firing on palatability
                    model_fp = LinearRegression()
                    model_fp.fit(tmp_pal, tmp_resp)
                    fp_residuals = tmp_resp - model_fp.predict(tmp_pal)

                    # Get partial correlation coefficient of response with
                    # identity (ravel: pearsonr expects 1-D arrays)
                    idp_reg0, p = pearsonr(fp_residuals.ravel(),
                                           ip_residuals.ravel())
                    if np.isnan(idp_reg0):
                        idp_reg0 = 0.0

                    idp_reg1, p = pearsonr(fi_residuals.ravel(),
                                           pi_residuals.ravel())
                    if np.isnan(idp_reg1):
                        idp_reg1 = 0.0

                    id_pal_regress[i, j, k, 0] = idp_reg0
                    id_pal_regress[i, j, k, 1] = idp_reg1

                    # Identity calculation
                    samples = []
                    for _, row in dim.iterrows():
                        taste = row.channel
                        samples.append([trial for trial in t
                                        if identity[j, k, trial] == taste])

                    tmp_resp = [response[j, k, sample] for sample in samples]
                    f_identity[i, j, k], p_identity[i, j, k] = \
                        f_oneway(*tmp_resp)
                    if np.isnan(f_identity[i, j, k]):
                        f_identity[i, j, k] = 0.0
                        p_identity[i, j, k] = 1.0

                # Linear discriminant analysis for palatability
                X = response[j, :, t]
                Y = palatability[j, 0, t]
                test_results = []
                c_validator = LeavePOut(1)  # leave-one-out
                for train, test in c_validator.split(X, Y):
                    model = LDA()
                    model.fit(X[train, :], Y[train])
                    tmp = np.mean(model.predict(X[test]) == Y[test])
                    test_results.append(tmp)

                lda_palatability[i, j] = np.mean(test_results)

                # Linear discriminant analysis for identity
                Y = identity[j, 0, t]
                test_results = []
                c_validator = LeavePOut(1)  # leave-one-out
                for train, test in c_validator.split(X, Y):
                    model = LDA()
                    model.fit(X[train, :], Y[train])
                    tmp = np.mean(model.predict(X[test]) == Y[test])
                    test_results.append(tmp)

                lda_identity[i, j] = np.mean(test_results)

                # Pairwise identity calculation
                for ti1, r1 in dim.iterrows():
                    for ti2, r2 in dim.iterrows():
                        t1 = r1.channel
                        t2 = r2.channel
                        tmp_trials = np.where((identity[j, 0, :] == t1) |
                                              (identity[j, 0, :] == t2))[0]
                        idx = [trial for trial in t if trial in tmp_trials]
                        X = response[j, :, idx]
                        Y = identity[j, 0, idx]
                        test_results = []
                        c_validator = StratifiedShuffleSplit(n_splits=10,
                                                             test_size=0.25,
                                                             random_state=0)
                        for train, test in c_validator.split(X, Y):
                            model = GaussianNB()
                            model.fit(X[train, :], Y[train])
                            tmp_score = model.score(X[test, :], Y[test])
                            test_results.append(tmp_score)

                        pairwise_identity[i, j, ti1, ti2] = \
                            np.mean(test_results)

        hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson)
        hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman)
        hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson)
        hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman)
        hf5.create_array('/ancillary_analysis', 'lda_palatability',
                         lda_palatability)
        hf5.create_array('/ancillary_analysis', 'lda_identity', lda_identity)
        hf5.create_array('/ancillary_analysis', 'r_isotonic', r_isotonic)
        hf5.create_array('/ancillary_analysis', 'id_pal_regress',
                         id_pal_regress)
        hf5.create_array('/ancillary_analysis', 'f_identity', f_identity)
        hf5.create_array('/ancillary_analysis', 'p_identity', p_identity)
        hf5.create_array('/ancillary_analysis', 'pairwise_NB_identity',
                         pairwise_identity)
        hf5.flush()

    warnings.filterwarnings('default', category=UserWarning)
    warnings.filterwarnings('default', category=RuntimeWarning)
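# The LDA-with-LeavePOut(1) scoring pattern used above, isolated as a
# runnable sketch on synthetic data (all names and sizes illustrative):
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import LeavePOut

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(15, 4), rng.randn(15, 4) + 1.5])
Y = np.repeat([0, 1], 15)

test_results = []
for train, test in LeavePOut(1).split(X, Y):  # LeavePOut(1) == leave-one-out
    model = LDA()
    model.fit(X[train, :], Y[train])
    test_results.append(np.mean(model.predict(X[test]) == Y[test]))

print('leave-one-out decoding accuracy:', np.mean(test_results))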
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X):
    print("%s %s" % (train, test))

# Leave One Out (LOO)
from sklearn.model_selection import LeaveOneOut

X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s %s" % (train, test))

# Leave P Out (LPO)
# Example of Leave-2-Out on a dataset with 4 samples:
from sklearn.model_selection import LeavePOut

X = np.ones(4)
lpo = LeavePOut(p=2)
for train, test in lpo.split(X):
    print("%s %s" % (train, test))

## Cross validation of time series data
# Example of 3-split time series cross-validation on a dataset with 6 samples:
from sklearn.model_selection import TimeSeriesSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)  # TimeSeriesSplit(max_train_size=None, n_splits=3)
for train, test in tscv.split(X):
    print("%s %s" % (train, test))

#### Cross validation and model selection
### Model evaluation: Quantifying the quality of prediction
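# For reference, the three time-series splits printed above are (the
# training window grows and the test sample always follows it in time):
# [0 1 2] [3]
# [0 1 2 3] [4]
# [0 1 2 3 4] [5]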
import numpy as np
from sklearn.model_selection import LeavePOut

# ----------------------------------------------------
'''
class sklearn.model_selection.LeavePOut(p)
'''
# ----------------------------------------------------

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])

# lpo = LeavePOut(1)  # C(4, 1) = 4 splits
# lpo = LeavePOut(2)  # C(4, 2) = 6 splits
lpo = LeavePOut(3)    # C(4, 3) = 4 splits

print(lpo.get_n_splits(X))
print(lpo)

lpo = LeavePOut(p=2)
for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('X_train \n', X_train)
    print('X_test \n', X_test)
    print('y_train \n', y_train)
    print('y_test \n', y_test)
    print('*********************')
    permutations_path = op.join(permutations_dir,
                                subject + '_permutations.jl')
    subj_permuts = joblib.load(permutations_path)
    if subj_ind == 0:
        allsubj_permuts = subj_permuts
    else:
        shift = allsubj_permuts.shape[1]
        allsubj_permuts = np.hstack([allsubj_permuts, shift + subj_permuts])

print(allsubj_permuts.shape)
n_permuts = allsubj_permuts.shape[0]
"""

modality_list = ['A', 'V']

# Leave-n-subjects-out CV, implemented as LeavePOut over the subject list
lnso_cv = LeavePOut(n_leftout_subjects)
n_splits = lnso_cv.get_n_splits(subjects_list, subjects_list, subjects_list)
print(n_splits)

allsplits_xval_inds = []
for split_ind, (trainsubj_inds, testsubj_inds) in enumerate(
        lnso_cv.split(subjects_list, subjects_list, subjects_list)):
    # initialize struct for storing all train and test inds for this split
    xval_inds = dict()
    for modality in modality_list:
        xval_inds['train_{}'.format(modality)] = []
        xval_inds['test_{}'.format(modality)] = []

    shift_ind = 0
df_hi = df[df['Conc'] == 'Hi']

working_dir = '../results'
working_data = glob(os.path.join(working_dir, '*all_words.csv'))

label_map = {'Living': 0, 'Nonliving': 1}

for lan in ['en', 'es', 'eu']:
    f = [item for item in working_data if (f'{lan}_all_words' in item)][0]
    df_words = pd.read_csv(f, encoding='latin-1')
    word_vecs = np.array([df_words[word].values for word in df_hi[lan]])

    clf = make_pipeline(StandardScaler(),
                        LogisticRegression(C=1,
                                           solver='liblinear',
                                           multi_class='auto'))
    labels = np.array([label_map[item] for item in df_hi['Living']])

    # Leave-2-words-out: every pair of words is held out once
    cv = LeavePOut(p=2)
    results = dict(fold=[],
                   score=[],
                   test_word1=[],
                   test_word2=[],
                   )
    groups = df_hi[lan].values
    for fold, (idx_train, idx_test) in enumerate(
            cv.split(word_vecs, labels, groups=groups)):
        X_train, y_train = word_vecs[idx_train], labels[idx_train]
        X_test, y_test = word_vecs[idx_test], labels[idx_test]
        X_train, y_train = shuffle(X_train, y_train)
        test_pairs = groups[idx_test]
        clf = make_pipeline(StandardScaler(),
                            LogisticRegression(C=1,
                                               solver='liblinear',
                                               multi_class='auto'))  # assumed continuation; the original snippet is truncated here
    min_list = [0, 8, 9, 12]
else:
    min_list = [3, 4, 7, 10, 11, 13]

# fit a CART model to the data
data = pd.read_csv('input_' + c + '_2_hrv_c.csv', header=None)

decisionTree = DecisionTreeClassifier()
knnClf = KNeighborsClassifier(n_neighbors=3)  # default: k = 5; set your own, e.g. KNeighborsClassifier(n_neighbors=10)
svc = svm.SVC(kernel='linear', C=1)  # alternatives: (kernel='rbf'), (kernel='poly', degree=5)
naive_bayes = GaussianNB()
rand_forrest = RandomForestClassifier(n_estimators=25)

lpo = LeavePOut(p=3)

X_raw = data.iloc[:, :data.shape[1] - 1]
y = data.iloc[:, data.shape[1] - 1]
# X = X_raw.iloc[:, best_features_list]
# lsvc = LinearSVC(C=0.7, penalty="l1", dual=False).fit(X_old, y)
# model = SelectFromModel(lsvc, prefit=True)
# X = model.transform(X_old)
# print X.shape

# model_name_list = ['decision tree', 'knn', 'svm', 'naive bayes']  # , 'random forrest']
# model_list = [decisionTree, knnClf, svc, naive_bayes]  # , rand_forrest]
model_name_list = ['knn']  # , 'random forrest']
model_list = [svc]  # , rand_forrest]  NOTE: labeled 'knn' above, but the model actually used is the SVC

from sklearn.model_selection import ShuffleSplit
lower_bound = lower_bound - (lower_bound % 100)
df_animal = df_animal[df_animal['picked']]
df_object = df_object[df_object['picked']]
df_animal = df_animal.nlargest(lower_bound, 'Mean\nFamiliarity')
df_object = df_object.nlargest(lower_bound, 'Mean\nFamiliarity')
df_final = pd.concat([df_animal, df_object])
df_final = df_final.sort_values(['Category', 'Word'])

base_clf = make_pipeline(StandardScaler(),
                         LogisticRegression(C=1,
                                            solver='liblinear',
                                            multi_class='auto'))
word_vecs = np.array([model_word2vec[word] for word in df_final['Word']])
labels = np.array([label_map[item] for item in df_final['Category']])

cv = LeavePOut(p=2)
groups = df_final['Word'].values
results = dict(fold=[],
               score=[],
               test_word1=[],
               test_word2=[],
               )
for fold, (idx_train, idx_test) in tqdm(enumerate(
        cv.split(word_vecs, labels, groups=groups))):
    X_train, y_train = word_vecs[idx_train], labels[idx_train]
    X_test, y_test = word_vecs[idx_test], labels[idx_test]
    X_train, y_train = shuffle(X_train, y_train)
    test_pairs = groups[idx_test]
    clf = clone(base_clf)
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])
loo = LeaveOneOut()
print(loo)
for train_index, test_index in loo.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# LeavePOut
import numpy as np
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])
lpo = LeavePOut(p=2)
print(lpo)
for train_index, test_index in lpo.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Random permutation splitting
# ShuffleSplit
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
print(rs)
# ===== K-fold, leave-one-out, leave-p-out, and shuffle-split cross-validation =====
# K-fold split
kf = KFold(n_splits=2)
for train, test in kf.split(iris.data):
    print("K-fold split: %s %s" % (train.shape, test.shape))
    break

# Leave-one-out split
loo = LeaveOneOut()
for train, test in loo.split(iris.data):
    print("Leave-one-out split: %s %s" % (train.shape, test.shape))
    break

# Leave-p-out split
lpo = LeavePOut(p=2)
for train, test in lpo.split(iris.data):  # was loo.split: use the LeavePOut splitter
    print("Leave-p-out split: %s %s" % (train.shape, test.shape))
    break

# Shuffle-split (random permutation) split
ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss.split(iris.data):
    # print the current loop's indices (the original reused stale variables)
    print("Shuffle split: %s %s" % (train_index.shape, test_index.shape))
    break

# ===== Stratified K-fold and stratified shuffle-split cross-validation =====
skf = StratifiedKFold(n_splits=3)  # class proportions per fold roughly match the full dataset
for train, test in skf.split(iris.data, iris.target):
    print("Stratified K-fold split: %s %s" % (train.shape, test.shape))
    break
X = data.data
y = data.target

clf = linear_model.LogisticRegression()

loocv = LeaveOneOut()
train_index, test_index = next(loocv.split(X, y))  # just one split
y.size, train_index.size, test_index.size  # look at the sizes

scores = cross_val_score(clf, X, y, cv=loocv)  # LeaveOneOut
scores.mean() * 100, scores.std() * 100, scores.size

loocv = LeavePOut(2)
# scores = cross_val_score(clf, X, y, cv=loocv)  # LeavePOut never finishes! On the order of C(n, 2) fits
# scores.mean(), scores.std(), scores.size

group = np.array(list(range(50)) * 12)
group = np.sort(group[:y.size])
group.size
group

loocv = LeaveOneGroupOut()
for train_index, test_index in loocv.split(X, y, group):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
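# For completeness, the group-aware splitter can also be driven directly
# through cross_val_score; a sketch reusing the variable names above:
scores = cross_val_score(clf, X, y, groups=group, cv=LeaveOneGroupOut())
scores.mean() * 100, scores.std() * 100, scores.size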
from sklearn.pipeline import Pipeline
from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.transformations.panel.pca import PCATransformer
from sktime.utils._testing.estimator_checks import _make_args

DATA_ARGS = [
    {"return_numpy": True, "n_columns": 2},
    {"return_numpy": False, "n_columns": 2},
]

# StratifiedGroupKFold(n_splits=2) removed; not available in sklearn 0.24
CROSS_VALIDATION_METHODS = [
    KFold(n_splits=2),
    RepeatedKFold(n_splits=2, n_repeats=2),
    LeaveOneOut(),
    LeavePOut(p=5),
    ShuffleSplit(n_splits=2, test_size=0.25),
    StratifiedKFold(n_splits=2),
    StratifiedShuffleSplit(n_splits=2, test_size=0.25),
    GroupKFold(n_splits=2),
    LeavePGroupsOut(n_groups=5),
    GroupShuffleSplit(n_splits=2, test_size=0.25),
    TimeSeriesSplit(n_splits=2),
]
PARAMETER_TUNING_METHODS = [
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    HalvingRandomSearchCV,
]
COMPOSITE_ESTIMATORS = [
# ### Leave-p-out
#
# This is a type of validation in which no percentage is set aside for the
# validation set; instead, a number $p$ of samples is held out for
# validation and the remaining $n-p$ are used for training. The number of
# repetitions is therefore given by the number of possible combinations.

# In[19]:

X = np.random.randn(10, 2)

# In[20]:

from sklearn.model_selection import LeavePOut
lpo = LeavePOut(2)
lpo.get_n_splits(X)

# Which corresponds to the number of possible combinations, N choose 2.

# In[21]:

from itertools import combinations
len(list(combinations(range(X.shape[0]), 2)))

# LeavePOut(p=1) is the same as LeaveOneOut()

# ## Validation methodology for imbalanced problems
#
#
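# In[22]:

# A quick check of that equivalence (an illustrative addition):
# LeavePOut(1) and LeaveOneOut() generate exactly the same
# train/test index sequences.
from sklearn.model_selection import LeaveOneOut
splits_lpo = [(list(tr), list(te)) for tr, te in LeavePOut(1).split(X)]
splits_loo = [(list(tr), list(te)) for tr, te in LeaveOneOut().split(X)]
assert splits_lpo == splits_loo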
                if np.isnan(r_spearman[i, j, k]):
                    r_spearman[i, j, k] = 0.0
                    p_spearman[i, j, k] = 1.0
                if np.isnan(r_pearson[i, j, k]):
                    r_pearson[i, j, k] = 0.0
                    p_pearson[i, j, k] = 1.0

    # Move to linear discriminant analysis
    lda_palatability = np.zeros((unique_lasers.shape[0], identity.shape[0]))
    for i in range(unique_lasers.shape[0]):
        for j in range(identity.shape[0]):
            X = response[j, :, trials[i]]
            Y = palatability[j, 0, trials[i]]
            # Use leave-one-out cross-validation (LeavePOut with p=1)
            test_results = []
            c_validator = LeavePOut(1)
            for train, test in c_validator.split(X, Y):
                model = LDA()
                model.fit(X[train, :], Y[train])
                # Test on the left-out trial: compare to its actual class
                # and store the result
                test_results.append(np.mean(model.predict(X[test]) == Y[test]))

            lda_palatability[i, j] = np.mean(test_results)

    # Save these arrays to file
    hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson)
    hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson)
    hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman)
    hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman)
    hf5.create_array('/ancillary_analysis', 'lda_palatability',
                     lda_palatability)
    hf5.flush()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

data = list(range(1, 11))
print(data)

print(train_test_split(data, train_size=.8))

kf = KFold(n_splits=5)
for train, validate in kf.split(data):
    print(train, validate)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train, validate in kf.split(data):
    print(train, validate)

loo = LeaveOneOut()
for train, validate in loo.split(data):
    print(train, validate)

lpo = LeavePOut(p=2)
for train, validate in lpo.split(data):
    print(train, validate)

ss = ShuffleSplit(n_splits=3, test_size=2, random_state=0)
for train, validate in ss.split(data):
    print(train, validate)

tscv = TimeSeriesSplit(n_splits=5)
for train, validate in tscv.split(data):
    print(train, validate)
                       3, 25, 31, 45, 80, 94, 95, 98],
                      [3, 38, 43, 45, 49, 67, 80, 81, 86, 87, 98, 99, 107, 109],
                      [45, 49, 53, 64, 65, 81, 87, 89, 90]]

# fit a CART model to the data
data = pd.read_csv('input_i_2_hrv_c.csv', header=None)

decisionTree = DecisionTreeClassifier()
knnClf = KNeighborsClassifier(n_neighbors=3)  # default: k = 5; set your own, e.g. KNeighborsClassifier(n_neighbors=10)
svc = svm.SVC(kernel='linear', C=1)  # alternatives: (kernel='rbf'), (kernel='poly', degree=5)
naive_bayes = GaussianNB()
rand_forrest = RandomForestClassifier(n_estimators=25)

lpo = LeavePOut(p=3)

X_raw = data.iloc[:, :data.shape[1] - 1]
y = data.iloc[:, data.shape[1] - 1]
# X = X_raw.iloc[:, best_features_list]
# lsvc = LinearSVC(C=0.7, penalty="l1", dual=False).fit(X_old, y)
# model = SelectFromModel(lsvc, prefit=True)
# X = model.transform(X_old)
# print X.shape

model_name_list = ['decision tree', 'knn', 'svm', 'naive bayes']  # , 'random forrest']
model_list = [decisionTree, knnClf, svc, naive_bayes]  # , rand_forrest]

# ----------------------------------------------------------------------------
# this part is for selecting the best features; every iteration selects the
# feature combination with the highest score
# example with KFold cross-validation
from sklearn.model_selection import KFold
crossval_method = KFold(n_splits=3)

crossvalidated = cross_validate(classifier,
                                donnee.loc[:, donnee.columns != "target"],
                                donnee.target,
                                cv=crossval_method)
crossvalidated.get("test_score").mean()

# run one of these and then run crossvalidated at the end
from sklearn.model_selection import RepeatedKFold
crossval_method = RepeatedKFold(n_splits=2, n_repeats=2)

from sklearn.model_selection import LeaveOneOut
crossval_method = LeaveOneOut()

from sklearn.model_selection import LeavePOut
crossval_method = LeavePOut(p=1)

from sklearn.model_selection import ShuffleSplit
crossval_method = ShuffleSplit(n_splits=3, test_size=0.3)

from sklearn.model_selection import StratifiedKFold
crossval_method = StratifiedKFold(n_splits=3)

crossvalidated = cross_validate(classifier,
                                donnee.loc[:, donnee.columns != "target"],
                                donnee.target,
                                cv=crossval_method)
crossvalidated.get("test_score").mean()

# see also
# from sklearn.model_selection import GroupKFold, LeaveOneGroupOut, LeavePGroupsOut, GroupShuffleSplit, TimeSeriesSplit

# from now on, with cross-validation, train and test sets will be built from
# X_train and y_train. X_test and y_test are now really validation sets.
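# A self-contained version of the pattern above. `classifier` and `donnee`
# are assumptions here (iris data and a decision tree), since the original
# defines them elsewhere:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
donnee = pd.DataFrame(iris.data, columns=iris.feature_names)
donnee["target"] = iris.target

classifier = DecisionTreeClassifier(random_state=0)
crossval_method = KFold(n_splits=3)
crossvalidated = cross_validate(classifier,
                                donnee.loc[:, donnee.columns != "target"],
                                donnee.target,
                                cv=crossval_method)
print(crossvalidated["test_score"].mean())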