def findBestK(data, x_cols, y_cols):
    """Pick the k in 2..10 whose k-NN classifier has the best leave-one-out accuracy.

    Non-nested approach to k-NN. Also useful for quick accuracy testing.
    Prints the average accuracy for every k and the final winner.

    Arguments:
        data {DataFrame} -- Data (indexed with ``.loc`` / ``.iloc``)
        x_cols {array} -- x (feature) columns
        y_cols {array} -- y (label) columns

    Returns:
        int -- the k with the highest mean accuracy; ties keep the smaller k,
        and 0 is returned when no k scores above zero.
    """
    best_k = 0
    best_accu = 0
    x = data.loc[:, x_cols]
    y = data.loc[:, y_cols]
    # The splitter is stateless, so a single instance serves every k.
    loo = LeaveOneOut()

    # Picking best k
    for k in range(2, 11):  # from 2 to 10
        knnClassifier = KNeighborsClassifier(n_neighbors=k,
                                             weights="uniform",
                                             metric="euclidean")
        accuracy_a = []
        for train_index, test_index in loo.split(data):
            # Each row is test data once
            xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
            knnClassifier.fit(xtrain, ytrain.values.ravel())
            ypred = knnClassifier.predict(xtest)
            accuracy_a.append(accuracy_score(ytest, ypred))

        avg_acc = np.mean(accuracy_a)
        print(k, ": average accuracy ", avg_acc)
        if avg_acc > best_accu:
            # Updating best_k if accuracy is better
            best_accu = avg_acc
            best_k = k

    print("Best k=", best_k)
    print("Best accuracy=", best_accu)
    return best_k
def element_check(X, Y, num):
    """Evaluate a binary classifier with leave-one-out CV and print a report.

    The confusion-matrix cells of every single-sample fold are summed, and the
    report prints their shares, precision, recall, the average of the per-fold
    F1 scores, and the accumulated fit / predict wall-clock times.

    Args:
        X: feature array, indexed with integer index arrays.
        Y: label array holding 0/1 labels, indexed the same way.
        num: classifier selector; 1 is k-NN (k=3), 2 is the MLP.

    Raises:
        ValueError: if ``num`` is neither 1 nor 2.
    """
    if num == 2:
        clf = MLPClassifier(max_iter=500, alpha=1.0, random_state=21,
                            tol=0.000000001)
    elif num == 1:
        clf = KNeighborsClassifier(n_neighbors=3)
    else:
        # Previously fell through and failed later on an unbound `clf`.
        raise ValueError(
            "num must be 1 (k-NN) or 2 (MLP), got {!r}".format(num))

    TN = 0
    FP = 0
    FN = 0
    TP = 0
    F1 = 0
    Educate = 0.0
    Test = 0.0

    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        start_time = time.time()
        clf.fit(X[train_index], Y[train_index])
        Educate += time.time() - start_time

        start_time = time.time()
        proba = clf.predict(X[test_index])
        Test += time.time() - start_time

        # labels=[0, 1] forces a 2x2 matrix even for a one-sample fold.
        tn, fp, fn, tp = confusion_matrix(Y[test_index], proba,
                                          labels=[0, 1]).ravel()
        TN += tn
        FP += fp
        FN += fn
        TP += tp
        F1 += f1_score(Y[test_index], proba, average='binary')

    summ = TP + TN + FP + FN
    # Precision and recall refer to the positive class (label 1), which is
    # also the class f1_score(average='binary') scores by default.
    precision = TP / (TP + FP) if (TP + FP) else float('nan')
    recall = TP / (TP + FN) if (TP + FN) else float('nan')

    print('TP: ', TP / summ)
    print('TN: ', TN / summ)
    print('FP: ', FP / summ)
    print('FN: ', FN / summ)
    print('Точность (Precision): ', precision)
    print('Полнота(Recall)', recall)
    print('F-мера: ', F1 / len(Y))
    print('Время обучения: ', Educate)
    print('Время тестирования: ', Test)
def save_regression_leave_one_out(X, y, classification_model, classification_model_name):
    """Run leave-one-out CV for a model and write its report to a text file.

    Args:
        X: feature DataFrame (rows are selected with ``.iloc``).
        y: targets, indexed with integer index arrays.
        classification_model: estimator whose ``fit`` returns an object with
            ``predict``.
        classification_model_name: used as the report file name under
            ``./results/leave_one_out/``.
    """
    y_list = []
    y_preds_list = []

    for train_index, test_index in LeaveOneOut().split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regressionFunction = classification_model.fit(X_train, y_train)
        y_list.append(y_test)
        y_preds_list.append(regressionFunction.predict(X_test))

    # Build the report first so a failure does not leave a truncated file.
    report = get_model_report_for_multi_y_pred(y_list, y_preds_list)
    # The context manager guarantees the handle is flushed and closed.
    with open(f"./results/leave_one_out/{classification_model_name}.txt", "w") as f:
        f.write(report)
def k_fold(reg, x_train, y_train, k=5):
    """Cross-validate ``reg`` and draw the mean ROC curve on the current figure.

    Each fold's ROC curve is interpolated onto a common 100-point FPR grid.
    The mean curve, a +/- 1 standard-deviation band and the chance diagonal
    are plotted with ``plt``.

    Args:
        reg: classifier exposing ``fit`` and ``predict_proba``.
        x_train: feature DataFrame (accessed through ``.values``).
        y_train: labels, indexed with integer index arrays.
        k: number of folds; ``-1`` selects leave-one-out.
            NOTE(review): with leave-one-out each test fold has one sample,
            so a per-fold ROC curve is not well defined - confirm that mode
            is really intended.

    Returns:
        ``reg``, as fitted on the last training fold.
    """
    kf = LeaveOneOut() if k == -1 else KFold(n_splits=k)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    for train_index, test_index in kf.split(x_train):
        _x_train = x_train.values[train_index, :]
        _x_test = x_train.values[test_index, :]
        _y_train, _y_test = y_train[train_index], y_train[test_index]

        probas_ = reg.fit(_x_train, _y_train).predict_proba(_x_test)
        fpr, tpr, thresholds = roc_curve(_y_test, probas_[:, 1])
        # np.interp is the function the old scipy `interp` alias forwarded to.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # pin every curve to the origin
        aucs.append(auc(fpr, tpr))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the mean curve to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr,
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    return reg
def find_squared_losses(N, X, Y, degree, btype):
    """Return the mean leave-one-out test loss and the full-data squared loss.

    For every leave-one-out split the parameters are estimated on the
    training part and the squared loss is measured on the held-out point.

    Args:
        N: unused; kept for interface compatibility.
        X: inputs, indexed with integer index arrays.
        Y: targets, indexed the same way.
        degree: forwarded to ``build_design_matrix``.
        btype: forwarded to ``build_design_matrix``.

    Returns:
        tuple ``(mean_squared_loss_test, squared_loss_full)``. The second
        value is the squared loss of the model fitted on the whole dataset
        (the original author used it as the maximum-likelihood estimate of
        the variance).
    """
    # init leave-one-out validator
    loo = LeaveOneOut()
    nb_splits = loo.get_n_splits(X)

    # run leave-one-out validation to calculate the losses
    squared_losses_test = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Fit on the training part only. Fitting on the single held-out
        # sample (as before) does not measure generalization.
        dm_train = build_design_matrix(X_train, degree, btype)
        ot_train = find_optimal_parameters(dm_train, y_train)

        # Squared loss of that fit on the held-out point.
        dm_test = build_design_matrix(X_test, degree, btype)
        squared_loss_test = calculate_squared_loss(y_test, dm_test, ot_train)
        # np.asscalar was removed from NumPy; .item() is what it called.
        squared_losses_test.append(np.asarray(squared_loss_test).item())

    mean_squared_loss_test = sum(squared_losses_test) / nb_splits

    # squared loss of the fit on the whole dataset
    dm = build_design_matrix(X, degree, btype)
    ot = find_optimal_parameters(dm, Y)
    squared_loss_mle_var = calculate_squared_loss(Y, dm, ot)
    squared_loss_mle_var_scalar = np.asarray(squared_loss_mle_var).item()

    return mean_squared_loss_test, squared_loss_mle_var_scalar
def LOOCV(ohe_df, use_previous_years=False):
    '''
    Leave one out cross validation to check performance on multiple
    regression models.

    The last column of the frame is the target and all others are features.
    For every model the RMSE and MAE over the folds are printed.

    Args:
        ohe_df: DataFrame of features plus target. When
            ``use_previous_years`` is False it is replaced by
            ``categorical_columns`` joined with two ordinal columns of ``df``.
            NOTE(review): ``df`` and ``categorical_columns`` are not defined
            in this function - presumably module-level globals; confirm.
        use_previous_years: keep ``ohe_df`` as passed when True.
    '''
    models = {
        'RFR': RandomForestRegressor(n_estimators=50, random_state=0),
        'GBR': GradientBoostingRegressor(max_depth=1, random_state=0),
        'LIR': LinearRegression(),
        'SVR': SVR(kernel='linear')
    }

    if use_previous_years is False:
        ordinal_columns = df[['INCOME_BINS', '2017 RATING']]
        ohe_df = pd.concat([categorical_columns, ordinal_columns], axis=1)

    df_x = ohe_df.iloc[:, :-1]
    df_y = ohe_df.iloc[:, -1]

    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error']
    # Pass a real splitter. The old unbound call
    # `LeaveOneOut.get_n_splits(df_x, df_y)` only produced an integer.
    loo = LeaveOneOut()

    for name, model in models.items():
        scores = cross_validate(model, df_x, df_y, cv=loo, scoring=scoring)
        rmse = (-1*mean(scores['test_neg_mean_squared_error']))**0.5
        mae = -1*mean(scores['test_neg_mean_absolute_error'])
        print(f'{name} RMSE: {rmse: 0.4f}, MAE: {mae: 0.4f}')
def intra_set_cross_validation(self, pred_metric, target_metric):
    """Compute leave-one-out distances inside the predicted and target sets.

    For every sample, ``utils.c_dist`` is evaluated between that sample and
    all remaining samples, once on ``pred_metric`` and once on
    ``target_metric``.

    Returns:
        tuple of two arrays, each of shape
        ``(num_samples, 1, num_samples - 1)``: predicted-set distances and
        target-set distances.
    """
    n_samples = self.num_samples
    out_shape = (n_samples, 1, n_samples - 1)
    pred_dists = np.zeros(out_shape)
    target_dists = np.zeros(out_shape)

    for others, held_out in LeaveOneOut().split(np.arange(n_samples)):
        row = held_out[0]
        pred_dists[row, 0] = utils.c_dist(pred_metric[held_out],
                                          pred_metric[others])
        target_dists[row, 0] = utils.c_dist(target_metric[held_out],
                                            target_metric[others])

    return pred_dists, target_dists
def loocv_logistic_retrain(subj_censored, features, DV, best_c):
    """Refit an L1 logistic regression per leave-one-out fold and collect predictions.

    Args:
        subj_censored: dataset handed to ``loocv_train_test_split_ith``.
        features: feature column selector.
        DV: name of the outcome column.
        best_c: inverse regularization strength for every fold's model.

    Returns:
        DataFrame with one row per fold and columns ``HCPID``, ``ytrue``,
        ``yhat`` and ``yprob`` (probability of the second class).
    """
    started_at = time.time()
    n_folds = LeaveOneOut().get_n_splits(subj_censored)

    rows = []
    for fold in range(n_folds):
        # i-th train / held-out partition
        train_data, test_data = loocv_train_test_split_ith(subj_censored, fold)

        # fresh model per fold
        model = LogisticRegression(penalty='l1',
                                   solver='saga',
                                   C=best_c,
                                   fit_intercept=False)
        model.fit(train_data[features], train_data[DV])

        # hard label and class-1 probability for the held-out row
        held_out_x = test_data[features]
        yhat = model.predict(held_out_x)
        yprob = model.predict_proba(held_out_x)[:, 1]

        rows.append([
            test_data['HCPID'].values[0],
            test_data[DV].values[0],
            yhat[0],
            yprob[0],
        ])

    res = pd.DataFrame(rows, columns=['HCPID', 'ytrue', 'yhat', 'yprob'])
    print('Time Usage (s)', round((time.time() - started_at), 4))
    return res
def call_SVM_LOOCV(X, y, verbose=0):
    """
    Apply a standardize + linear-SVM pipeline to the data and return its
    leave-one-out accuracy.

    Created by: Loukas Serafeim, Nov 2017

    Args:
        X: A numpy array of the input features
        y: A numpy array of the target values.
           Note: this should have shape = [n_samples, ]
        verbose: when truthy, also run an explicit fold loop and print its
           confusion matrix and accuracy next to the ``cross_val_score``
           result.

    Returns:
        The mean LOOCV accuracy from ``cross_val_score``.
    """
    # Scaling lives inside the pipeline so it is refitted on every training fold.
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('clf', SVC(kernel='linear', random_state=1))])
    loo = LeaveOneOut()

    if verbose:
        print("The number of splits is:{}\n".format(loo.get_n_splits(X)))

        # 1st way: explicit loop. Its result is only ever printed, so it is
        # skipped when not verbose to avoid n redundant model fits.
        test_fold_predictions = []
        y_test_all = []
        for i, j in loo.split(X):
            pipe.fit(X[i], y[i])
            test_fold_predictions.append(pipe.predict(X[j]))
            y_test_all.append(y[j])

        print('Confusion matrix \n{}\n'.format(
            metrics.confusion_matrix(y_test_all, test_fold_predictions)))
        print("Accuracy is %r \n" %
              metrics.accuracy_score(y_test_all, test_fold_predictions))

    # 2nd way: sklearn built-in helper. It clones the pipeline per fold.
    scores = cross_val_score(pipe, X, y, cv=loo, scoring="accuracy")
    if verbose:
        print("Accuracy of 2nd way is %r\n" % np.mean(scores))

    return np.mean(scores)
def LeaveOneOut_Onemodel(X, y, model):
    """Leave-one-out evaluation of one model; prints predictions and the LOO R^2.

    Args:
        X: feature DataFrame (rows are selected with ``.iloc``).
        y: 1-D targets (array-like or Series) in the same row order as ``X``.
        model: estimator with ``fit`` and ``predict``.

    Prints every held-out prediction, then ``1 - SSE / SST`` labelled "LLO".
    """
    # Re-index the targets 0..n-1 so that selection and the final subtraction
    # are positional. A Series with its own index would otherwise be
    # label-aligned against the predictions.
    targets = pd.Series(y).reset_index(drop=True)

    Prediction = []
    for train_index, test_index in LeaveOneOut().split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        model.fit(X_train, targets.iloc[train_index])
        pre = model.predict(X_test)[0]
        print(pre)
        Prediction.append(pre)

    pred = pd.Series(Prediction)
    SST = sum((targets - targets.mean())**2)
    SSE = sum((pred - targets)**2)
    print("LLO", 1 - SSE / SST)
def leave_one_out(data, classlabel, n=3, loop=True):
    """
    tests leave-one-out accuracy of a k-nearest-neighbours model with k = n
    :param data: the test data without the class identifier (DataFrame)
    :param classlabel: the label of the class for each instance (Series in
        the same row order as data)
    :param n: the number of neighbors to be tested, default is 3
    :param loop: loop true is normal, loop false is for testing (returns a
        random value between 0.0 and 1.0 instead of evaluating)
    :return: the share of held-out instances whose prediction equals their
        label, i.e. the overall accuracy for leave one out
    """
    # NOTE(review): this is a regressor, so a prediction only "matches" when
    # it is exactly equal to the held-out label - confirm a classifier was
    # not intended.
    knn = KNeighborsRegressor(n_neighbors=n)
    loo = LeaveOneOut()
    size = loo.get_n_splits(data, classlabel)

    if not loop:
        return random.uniform(1.0, 0.0)

    hits = 0
    for training, testing in loo.split(data):
        x_train, x_test = data.iloc[training], data.iloc[testing]
        # Positional selection, matching data.iloc above. Plain
        # classlabel[...] is label-based on a Series.
        y_train, y_test = classlabel.iloc[training], classlabel.iloc[testing]
        knn.fit(x_train, y_train)
        if knn.predict(x_test)[0] == y_test.iloc[0]:
            hits = hits + 1
    return hits / size
def classify2(X, y):
    """Leave-one-out evaluation of a linear SVM; prints accuracy and micro P/R/F.

    Args:
        X: feature array, indexed with integer index arrays.
        y: label array, indexed the same way.
    """
    from sklearn import svm
    from sklearn.model_selection import LeaveOneOut

    clf = svm.LinearSVC(C=1.0,
                        dual=True,
                        fit_intercept=True,
                        intercept_scaling=1,
                        loss='squared_hinge',
                        max_iter=1000,
                        multi_class='ovr',
                        penalty='l2',
                        random_state=None,
                        tol=0.0001,
                        verbose=0)

    target = []
    pred = []
    for train_index, test_index in LeaveOneOut().split(X):
        clf.fit(X[train_index], y[train_index])
        pred.append(clf.predict(X[test_index]))
        target.append(y[test_index])

    scorers = sklearn.metrics
    f_micro = scorers.f1_score(target, pred, average='micro')
    p_micro = scorers.precision_score(target, pred, average='micro')
    r_micro = scorers.recall_score(target, pred, average='micro')
    accuracy = scorers.accuracy_score(target, pred)

    banner = '*' * 10
    print('Accuracy=%f' % accuracy)
    print(banner + ' Micro Score ' + banner)
    print('p=%f' % p_micro)
    print('r=%f' % r_micro)
    print('f-score=%f' % f_micro)
def evaluate(columns):
    """Score a column selection by the best leave-one-out grid-search result.

    ``modify("data.csv", translate(columns))`` is called first, then
    "modified.csv" is read back (presumably written by ``modify`` - confirm).
    Column 0 is the integer label and the remaining columns are standardized
    features. On a random 90% training split, logistic regression, SVC and
    k-NN are each grid-searched with leave-one-out CV.

    Returns:
        1-tuple holding the highest ``best_score_`` of the three searches.
    """
    modify("data.csv", translate(columns))
    dataset = pd.read_csv("modified.csv", header=0).values

    X = dataset[:, 1:]
    y = dataset[:, 0].astype('int')
    X_std = StandardScaler().fit(X).transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        train_size=.9)
    loo = LeaveOneOut()

    # (estimator, parameter grid), searched in this order
    searches = [
        (LogisticRegression(multi_class='auto', max_iter=1000),
         {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
          'C': [1, 100, 1000]}),
        (SVC(gamma="scale"),
         {'kernel': ['rbf', 'linear', 'poly'], 'C': [1, 100, 1000]}),
        (KNeighborsClassifier(),
         {'n_neighbors': [2, 3, 4, 5, 6], 'p': [1, 2]}),
    ]

    best = None
    for estimator, grid in searches:
        clf = GridSearchCV(estimator, grid, cv=loo)
        clf.fit(X_train, y_train)
        # The first search always sets the baseline. Later ones must beat it.
        if best is None or clf.best_score_ > best:
            best = clf.best_score_

    return best,
def loo_risk(X, y, regmod):
    """
    Leave-one-out squared-error risk of a regression model.

    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar LOO risk (mean of the per-fold squared errors)
    """
    def held_out_loss(train_idx, test_idx):
        # refit on everything but one row, score on that row
        regmod.fit(X[train_idx], y[train_idx])
        residual = regmod.predict(X[test_idx]) - y[test_idx]
        return np.sum(residual**2)

    fold_losses = [
        held_out_loss(train_idx, test_idx)
        for train_idx, test_idx in LeaveOneOut().split(X)
    ]
    return np.mean(fold_losses)
def LeaveOneOut_test(dataset, min_support, min_threshold):
    """Leave-one-out accuracy of association rules mined by ``ARM_train``.

    For every fold the rules are mined on all transactions but one and
    scored on the held-out transaction with ``ARM_test``.

    Returns:
        float: accumulated score divided by the number of folds.
    """
    transactions = numpy.array(dataset)

    total_score = 0
    n_folds = 0
    for train_index, test_index in LeaveOneOut().split(transactions):
        rules = ARM_train(transactions[train_index], min_support,
                          min_threshold)
        held_out = transactions[test_index]  # the active user's transactions
        if not rules.empty:
            total_score += ARM_test(rules, held_out[0])
        # NOTE(review): every fold is counted in the denominator, including
        # folds where no rules were mined - confirm this is intended.
        n_folds += 1

    return float(total_score) / n_folds  # this is accuracy
def knn(n):
    """Leave-one-out evaluation of an n-neighbour k-NN classifier on ``getData()``.

    Prints the true labels, the held-out predictions, and recall, precision
    and accuracy scores.
    """
    x, y = getData()
    labels = np.array(y)
    model = KNeighborsClassifier(n_neighbors=n)

    y_pred = []
    for train_index, test_index in LeaveOneOut().split(x):
        model.fit(x[train_index], labels[train_index])
        y_pred.extend(model.predict(x[test_index]))

    print("KNN com N =", n)
    print("y: ")
    print(y)
    print("y_pred: ")
    print(y_pred)

    print("recall score:")
    for caption, averaging in (("macro:", 'macro'), ("micro:", 'micro')):
        print(caption, recall_score(y, y_pred, average=averaging))
    print(recall_score(y, y_pred, average=None))

    print("precision score:")
    for caption, averaging in (("macro:", 'macro'), ("micro", 'micro'),
                               ("weighted:", 'weighted')):
        print(caption, precision_score(y, y_pred, average=averaging))
    print(precision_score(y, y_pred, average=None))

    print("accuracy score:")
    print("normalizado:", accuracy_score(y, y_pred))
    print("nao normalizado:", accuracy_score(y, y_pred, normalize=False),
          "\n")
def classification_within_modality(dataFrame, categoria, exposure):
    '''
    Per-person leave-one-out classification of ``group`` from the trial features.

    For each value of the ``people`` column, every row is held out once. The
    remaining rows are mean-centred, reduced with PCA (99% explained
    variance) and used to fit a Gaussian naive Bayes classifier, which then
    predicts the held-out row.

    Args:
        dataFrame: DataFrame with ``trial``, ``group`` and ``people`` columns
            plus the feature columns.
        categoria: copied unchanged into every result row.
        exposure: copied unchanged into every result row.

    Returns:
        list of ``[person, y_pred, y_true, categoria, exposure]`` rows, one
        per held-out trial.
    '''
    dataFrame_result = []
    loo = LeaveOneOut()
    # One progress tick per held-out row over all persons.
    pbar = tqdm(total=loo.get_n_splits(dataFrame))
    try:
        for ind, pearson in dataFrame.groupby('people'):
            # `axis` must be passed by keyword on pandas >= 2.0.
            X = pearson.drop(['trial', 'group', 'people'], axis=1)
            y = pearson['group']

            for train_index, test_index in loo.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                # Normalize with statistics of the training fold only.
                train_mean = average(X_train, axis=0)
                X_train_without_mean = subtract(X_train, train_mean)
                X_test_without_mean = subtract(X_test, train_mean)

                pca = PCA(random_state=42, svd_solver='full',
                          n_components=0.99).fit(X_train_without_mean)
                X_train_pca = pca.transform(X_train_without_mean)
                X_test_pca = pca.transform(X_test_without_mean)

                # NOTE(review): the old code set `clf.class_prior_` to six
                # uniform values before fitting. fit() overwrites that
                # attribute, so the assignment had no effect and is dropped.
                # If uniform priors are wanted, pass
                # `GaussianNB(priors=...)` - confirm.
                clf = GaussianNB().fit(X_train_pca, y_train)
                y_pred = clf.predict(X_test_pca)

                dataFrame_result.append(
                    [ind, y_pred, y_test.values, categoria, exposure])
                pbar.update(1)
    finally:
        # Release the progress bar even when a fold fails.
        pbar.close()

    return dataFrame_result
def main():
    """Demo: train/test split, KFold and LeaveOneOut on small example data.

    Fits a linear regression on the diabetes dataset and prints its
    predictions and score, then prints the index sets produced by
    ``KFold(n_splits=2)`` and ``LeaveOneOut`` on toy arrays.
    """
    columns = "age sex bmi map tc ldl hdl tch ltg glu".split()
    diabetes = datasets.load_diabetes()
    print(diabetes)
    print(columns)
    df = pd.DataFrame(diabetes.data, columns=columns)
    y = diabetes.target

    # create training and testing vars
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    lm = linear_model.LinearRegression()
    model = lm.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)

    # the linear model
    print('score', model.score(X_test, y_test))

    # KFold split example
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([1, 2, 3, 4])
    kf = KFold(n_splits=2)
    print(kf)
    for train_index, test_index in kf.split(X):
        print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # leave one out cross validation
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 2])
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    """Leave-one-out evaluation of a grid-searched LinearGAM on selected genes.

    A ``LinearGAM`` is grid-searched once on all samples (``n_splines`` in
    10..49 and five ``lam`` values). That model is then refitted on every
    leave-one-out training set and used to predict the held-out sample.

    Args:
        index_set: positions of the gene columns to use as predictors.
        gene_expression: DataFrame of expression values.
        cell_count_aa: response values, indexed with integer index arrays.

    Returns:
        tuple ``(mean lam, mean n_splines, rho, pval)`` where ``rho`` and
        ``pval`` come from the Spearman correlation between held-out values
        and their predictions.
    """
    y = cell_count_aa
    X = gene_expression[gene_expression.columns[index_set]]

    gam = LinearGAM().gridsearch(X, y,
                                 n_splines=np.arange(10, 50),
                                 lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    predicted = []
    observed = []
    splines_per_fold = []
    lam_per_fold = []

    # outer loop: each sample is held out once
    for train_index, test_index in LeaveOneOut().split(X):
        y_test = y[test_index]

        # refit the tuned model on the training part of this fold
        regr = gam.fit(X.iloc[train_index], y[train_index])
        prediction_val = regr.predict(X.iloc[test_index])[0]

        predicted.append(prediction_val)
        observed.append(y_test[0])
        splines_per_fold.append(regr.n_splines)
        lam_per_fold.append(regr.lam)

        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))

    # rank correlation over all held-out predictions
    rho, pval = spearmanr(observed, predicted)

    lams_mean = np.array(lam_per_fold).mean()
    n_splines_mean = np.array(splines_per_fold).mean()
    return lams_mean, n_splines_mean, rho, pval
def run3():
    """Print the leave-one-out mean squared error of ridge and lasso regression.

    Reads "AdmissionDataset/data.csv", takes the first eight columns as
    features and the ninth as target, and scales the features with the
    overall mean and standard deviation of the feature matrix. Every row is
    held out once for each of ``RidgeRegression`` and ``LassoRegression``.
    """
    import warnings
    # Silence warnings before the heavier imports below.
    warnings.filterwarnings("ignore")
    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import LeaveOneOut

    df = pd.read_csv("AdmissionDataset/data.csv")
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    X = df.iloc[:, 0:8].values
    y = df.iloc[:, 8:9].values
    y = y.reshape((y.shape[0],))

    # NOTE(review): mean and std are taken over the whole matrix, not per
    # column - confirm that is intended.
    d_m = np.mean(X)
    d_s = np.std(X)
    d_n = (X - d_m) / d_s

    kf = LeaveOneOut()
    runs = (
        ("Mean Error for Ridge Regression : ", RidgeRegression),
        ("Mean Error for Lasso Regression : ", LassoRegression),
    )
    for caption, regression_cls in runs:
        msel = []
        for train_index, test_index in kf.split(d_n):
            X_train, X_test = d_n[train_index], d_n[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model = regression_cls(0.0001, iters=3000, lrate=0.001)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            msel.append(mean_squared_error(y_test, y_pred))
        print(caption + str(sum(msel) / len(msel)))
def get_cross_validation_predictions(data_obj, data, target, tags, method):
    """Leave-one-out predictions that also drop the held-out item's thread from training.

    Args:
        data_obj: passed through to ``get_all_questions_belonging_to_thread``.
        data: feature rows (converted to a numpy array).
        target: targets (converted to a numpy array).
        tags: passed through to ``get_all_questions_belonging_to_thread``.
        method: callable ``(train_data, train_target, test_data, q_tag)``
            returning a sequence whose first element is the prediction.

    Returns:
        tuple ``(preds, target)``: list of predictions and the target array.
    """
    import numpy as np
    from sklearn.model_selection import LeaveOneOut

    data = np.array(data)
    target = np.array(target)

    preds = []
    for train_index, test_index in LeaveOneOut().split(data):
        held_out = list(test_index)[0]
        thread_members, q_tag = get_all_questions_belonging_to_thread(
            data_obj, tags, index=held_out)
        # NOTE(review): np.delete removes by *position* within train_index,
        # not by value - confirm the helper returns positions that are valid
        # for this fold's train_index.
        kept = np.delete(train_index, thread_members, 0)

        pred = method(data[kept], target[kept], data[test_index], q_tag)
        preds.append(pred[0])

    return preds, target
def runTest(X, Y):
    """Return leave-one-out predictions of an ``SVC(C=2)`` as a uint8 array.

    Args:
        X: feature array, indexed with integer index arrays.
        Y: label array, indexed the same way.

    Returns:
        uint8 array of length ``len(Y)`` with each sample's held-out prediction.
    """
    Y_pred = np.zeros((len(Y),), dtype='uint8')
    splitter = LeaveOneOut()
    print("LOO : Total {:d} tests to perform".format(
        splitter.get_n_splits(X)))

    for train_index, test_index in splitter.split(X):
        # a fresh classifier for every fold
        fold_clf = SVC(C=2)
        fold_clf.fit(X[train_index], Y[train_index])
        Y_pred[test_index] = fold_clf.predict(X[test_index])

    return Y_pred
def loadVideos(self):
    """
    Load the video data, extract features and train the HMM models.

    For every category one model is trained on all of its videos and stored
    in ``self.fullDataTrainHmm``. In addition one model per leave-one-out
    split is stored in ``self.model[category]`` together with its two
    scalers and the held-out video. Finally ``self.target_names`` is set to
    ``self.categories``.
    """
    gait = sio.loadmat('data/gait.mat')['gait']

    def features_for(key):
        # Pull one video out of the .mat structure and extract its features.
        video = gait[key][0][0]
        if self.args.mhi:
            return self.extractMhiFeature(video)
        return self.extractFeature(video)

    for category_name in self.categories:
        images = []
        for person in self.persons:
            clip = person + category_name
            if person == 'lena_' and category_name in ('run', 'skip', 'walk'):
                # Lena has two takes of run, skip and walk.
                images.append(features_for(clip + '1'))
                images.append(features_for(clip + '2'))
            else:
                images.append(features_for(clip))

        if len(images) == 0:
            continue

        loo = LeaveOneOut()
        images = np.array(images)

        # model trained on every video of this category
        full_model, std_scale, std_scale1 = self.train(images)
        self.fullDataTrainHmm[category_name] = full_model

        self.model[category_name] = {}
        entry = self.model[category_name]
        for key in ('hmm', 'std_scale', 'std_scale1', 'data'):
            entry[key] = []

        print(loo.get_n_splits(images))
        for train, test in loo.split(images):
            markov_model, std_scale, std_scale1 = self.train(images[train])
            entry['hmm'].append(markov_model)
            entry['std_scale'].append(std_scale)
            entry['std_scale1'].append(std_scale1)
            entry['data'].append(images[test])

    self.target_names = self.categories
def calculate_BAcc(features_array):
    '''Leave-one-out balanced accuracy of a 1-nearest-neighbour classifier.

    The given feature(s) are transposed so that rows become the items to
    classify. The labels are ``1 * c_mic.T``, taken from the enclosing scope.
    '''
    samples = features_array.T  # feature subset, one row per item
    true_labels = 1 * c_mic.T  # multiplying by 1 turns booleans into 0/1

    nearest = KNeighborsClassifier(n_neighbors=1)
    y_pred = []
    for train_index, test_index in LeaveOneOut().split(samples):
        nearest.fit(samples.iloc[train_index], true_labels[train_index])
        y_pred.append(nearest.predict(samples.iloc[test_index]))

    return balanced_accuracy_score(true_labels, y_pred)
def get_loocv_accuracy(model, data, labels):
    """Return the mean leave-one-out accuracy of ``model`` on ``data`` / ``labels``.

    Both ``data`` and ``labels`` are indexed with integer index arrays. The
    model is refitted once per sample.
    """
    splitter = LeaveOneOut()
    print(
        f"\nCalculating LOOCV accuracy with {splitter.get_n_splits(data)} iterations..."
    )

    fold_scores = []
    for training_indices, testing_indices in splitter.split(data):
        model.fit(data[training_indices], labels[training_indices])
        predicted = model.predict(data[testing_indices])
        fold_scores.append(accuracy_score(labels[testing_indices], predicted))

    return sum(fold_scores) / splitter.get_n_splits(data)
def spatial_kfold(central_shape, year, thresh):
    """Spatially buffered leave-one-out regression of log population density on log call density.

    Each area is held out once. Training areas whose centroid lies within
    ``thresh`` of the held-out centroid are dropped before a linear
    regression is fitted.

    Args:
        central_shape: geo data frame with ``Call_density``, ``geometry`` and
            ``<year>_pop_density`` columns.
        year: selects the ``<year>_pop_density`` column.
        thresh: minimum centroid distance (in the units of the geometry) for
            a training area to be kept.

    Returns:
        DataFrame with one row per held-out area and columns ``Alpha``
        (intercept), ``Beta`` (slope), ``R^2`` (on the training set),
        ``RMSE_train`` and ``RMSE_test``.
    """
    pop_col = '{}_pop_density'.format(year)
    data = central_shape[['Call_density', 'geometry',
                          pop_col]].reset_index(drop=True)
    # Get the centroid for each TA
    data['centroid'] = data['geometry'].centroid

    loo = LeaveOneOut()
    fold_rows = []
    for train_index, test_index in loo.split(data):
        print("TRAIN:", train_index, "TEST:", test_index)
        train = data.loc[train_index].reset_index(drop=True)
        test = data.loc[test_index].reset_index(drop=True)

        # Keep only training areas farther than `thresh` from the held-out
        # area. One vectorized distance call replaces the row-by-row
        # DataFrame.append loop (removed in pandas 2.0).
        far_enough = train.centroid.distance(test.centroid.iloc[0]) > thresh
        train_new = train[far_enough]

        # Get the train and test datasets
        X_train = pd.DataFrame(
            np.log(train_new['Call_density'])).reset_index(drop=True)
        y_train = pd.DataFrame(
            np.log(train_new[pop_col])).reset_index(drop=True)
        X_test = pd.DataFrame(
            np.log(test['Call_density'])).reset_index(drop=True)
        y_test = pd.DataFrame(np.log(test[pop_col])).reset_index(drop=True)

        # Fit regression model
        lm = LinearRegression()
        lm.fit(X_train, y_train)

        # Get RMSE train and test
        RMSEtrain = np.sqrt(mean_squared_error(y_train, lm.predict(X_train)))
        RMSEtest = np.sqrt(mean_squared_error(y_test, lm.predict(X_test)))

        # intercept_ and coef_ are 1-element arrays here. Take the scalar
        # explicitly instead of calling float() on an array.
        fold_rows.append({
            'Alpha': float(np.ravel(lm.intercept_)[0]),
            'Beta': float(np.ravel(lm.coef_)[0]),
            'R^2': lm.score(X_train, y_train),
            'RMSE_train': RMSEtrain,
            'RMSE_test': RMSEtest,
        })

    # Build the frame once from all fold records.
    coef_sp = pd.DataFrame(
        fold_rows,
        columns=['Alpha', 'Beta', 'R^2', 'RMSE_train', 'RMSE_test'])
    return coef_sp
def makeItemCrossValidation(dataset, labels):
    """Leave-one-out evaluation of a random forest on a binary (0/1) problem.

    Confusion-matrix cells, per-fold F1 scores and fit / predict wall-clock
    times are accumulated over all folds and passed to ``resultTesting``.

    Args:
        dataset: feature array, indexed with integer index arrays.
        labels: 0/1 label array, indexed the same way.
    """
    print('[*] Item cross validation has started')

    myClassifier = RandomForestClassifier(min_samples_leaf=2,
                                          random_state=17,
                                          criterion='entropy')

    cells = {'tn': 0, 'fp': 0, 'fn': 0, 'tp': 0}
    trainingTime = 0.0
    testingTime = 0.0
    fscore = 0.0

    for trainIndex, testIndex in LeaveOneOut().split(dataset):
        # Training
        started = time.time()
        myClassifier.fit(dataset[trainIndex], labels[trainIndex])
        trainingTime += time.time() - started

        # Testing
        started = time.time()
        y_pred = myClassifier.predict(dataset[testIndex])
        testingTime += time.time() - started

        # labels=[0, 1] keeps the matrix 2x2 for a single held-out sample
        fold_cells = confusion_matrix(labels[testIndex], y_pred,
                                      labels=[0, 1]).ravel()
        for key, value in zip(('tn', 'fp', 'fn', 'tp'), fold_cells):
            cells[key] += value
        fscore += f1_score(labels[testIndex], y_pred, average='binary')

    resultTesting(cells['tp'], cells['tn'], cells['fp'], cells['fn'],
                  len(dataset), fscore, len(dataset), trainingTime,
                  testingTime)
def test_LeaveOneOut():
    '''
    Demonstrate how LeaveOneOut is used.

    Fits a LinearSVC on the digits dataset once per held-out sample, prints
    the squared error of every held-out prediction and finally their average.

    A minimal illustration of the splitter on a 4-row array:

        loo = LeaveOneOut()
        for train_index, test_index in loo.split(X):
            print("Train Index:", train_index)
            print("Test Index:", test_index)

    :return: None
    '''
    from sklearn.datasets import load_digits
    from sklearn.svm import LinearSVC
    from sklearn.metrics import mean_squared_error

    digits = load_digits()  # load a dataset meant for classification problems
    X, y = digits.data, digits.target
    print(y)
    print(len(y))

    classifier = LinearSVC()
    fold_errors = []
    for train_index, test_index in LeaveOneOut().split(X):
        classifier.fit(X[train_index], y[train_index])
        prediction = classifier.predict(X[test_index])
        fold_error = mean_squared_error(y[test_index], prediction)
        print(fold_error)
        fold_errors.append(fold_error)

    print(np.average(fold_errors))
def meta_model(combined_meta, metric_df, algorithm_name):
    """Leave-one-out SVR predictions of one algorithm's metric from meta-features.

    For every held-out row the meta-features of the remaining rows are
    projected onto 3 principal components and an RBF-kernel SVR is fitted on
    them. The held-out row is projected with the same PCA and predicted.

    Args:
        combined_meta: DataFrame of meta-features, one row per dataset.
        metric_df: DataFrame holding the target metric in the column
            ``algorithm_name``, with rows in the same order.
        algorithm_name: column of ``metric_df`` to predict.

    Returns:
        DataFrame with the single column ``algorithm_name``, indexed like
        ``metric_df``, holding the held-out predictions.
    """
    target_col = metric_df.columns.get_loc(algorithm_name)
    n_rows = combined_meta.shape[0]

    pca = PCA(n_components=3)
    y_pred = np.zeros(shape=(n_rows, 1))

    for train_index, test_index in LeaveOneOut().split(combined_meta):
        # the projection is fitted on the training rows only
        train_components = pca.fit_transform(combined_meta.iloc[train_index, :])
        test_components = pca.transform(combined_meta.iloc[test_index, :])

        model = SVR(C=1, epsilon=0.1, gamma='scale')
        model.fit(train_components, metric_df.iloc[train_index, target_col])
        y_pred[test_index] = model.predict(test_components)

    data = pd.DataFrame(y_pred, index=metric_df.index,
                        columns=[algorithm_name])
    return data
def DecisionTree():
    """Leave-one-out evaluation of a default decision tree on ``getData()``.

    Prints the true labels, the held-out predictions, and recall, precision
    and accuracy scores.
    """
    print("Decision Tree")
    x, y = getData()
    labels = np.array(y)
    clf = DecisionTreeClassifier()

    y_pred = []
    for train_index, test_index in LeaveOneOut().split(x):
        clf.fit(x[train_index], labels[train_index])
        y_pred.extend(clf.predict(x[test_index]))

    print(y)
    print(y_pred)

    print("recall score:")
    for caption, averaging in (("macro:", 'macro'), ("micro:", 'micro')):
        print(caption, recall_score(y, y_pred, average=averaging))
    print(recall_score(y, y_pred, average=None))

    print("precision score:")
    for caption, averaging in (("macro:", 'macro'), ("micro", 'micro'),
                               ("weighted:", 'weighted')):
        print(caption, precision_score(y, y_pred, average=averaging))
    print(precision_score(y, y_pred, average=None))

    print("accuracy score:")
    print("normalizado:", accuracy_score(y, y_pred))
    print("nao normalizado:", accuracy_score(y, y_pred, normalize=False),
          "\n")
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics in a cross-validation framework.

    Rows with a missing outcome or predictor are dropped first.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        ``fit`` must return an object exposing ``predict_proba``,
        ``classes_`` and ``coef_``.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold stratified, shuffled cross-validation (ignored for LOO).
    LOO : bool
        Use leave-one-out instead of stratified K-fold.
        NOTE(review): each LOO test fold holds one sample, so the per-fold
        ROC/AUC values are not meaningful in that mode - confirm usage.

    Returns
    -------
    outD : dict
        fpr : (100,) FPR grid, ``np.linspace(0, 1, 100)``
        tpr : (100,) mean interpolated TPR over the test folds
        AUC : (nFolds,) AUC of each test fold
        mAUC : AUC of the mean ROC curve
        ACC : (nFolds,) accuracy of each test fold
        mACC : mean of ``ACC``
        finalResult : model refitted on all rows
        prob : pd.Series of test-fold probabilities of class 1, averaged per
            index label
        coefs : concatenated per-fold ``coef_`` arrays
        Xvars, Yvar, nFolds, LOO, N : inputs and row count, for bookkeeping
        plus ``Sensitivity`` and ``Specificity`` taken from ``rocStats``."""
    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(y)
        # split() requires X; the previous `cv.split(y=y)` raised TypeError.
        cv_iter = cv.split(X)
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []

    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        # column of predict_proba that belongs to class 1
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        # put every fold's curve on the common FPR grid
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:, class1Ind]),
                                                   normalize=True)
        coefs.append(results.coef_[None, :])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1  # pin the mean curve to (0, 0) and (1, 1)
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)

    # Compute mean probability over test predictions in CV
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    # Refit all the data for final model
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))

    outD = {'fpr': fpr,                 # (100, ) average FPR for ROC
            'tpr': meanTPR,             # (100, ) average TPR for ROC
            'AUC': auc,                 # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,            # (1, ) AUC of the average ROC
            'mACC': meanACC,
            'ACC': acc,                 # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,      # final fitted model with predict() exposed
            'prob': probS,              # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs': np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars': predVars,
            'Yvar': outcomeVar,
            'nFolds': nFolds,
            'LOO': 'Yes' if LOO else 'No',
            'N': tmp.shape[0]}
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD