def _iter_cv(n):
    """Return leave-one-out ``(train, test)`` index splits for ``n`` samples.

    Works with both scikit-learn APIs:

    * ``sklearn.model_selection.LeaveOneOut`` (>= 0.18), which exposes a
      ``split`` method and is given a dummy ``(n, 1)`` array to split;
    * the older ``LeaveOneOut`` (no ``split`` attribute), which is
      constructed with the sample count and is returned as-is for the caller
      to iterate.

    :param n: number of samples to split.
    :return: an iterable of ``(train_indices, test_indices)`` pairs.
    """
    # XXX support sklearn < 0.18
    if hasattr(LeaveOneOut, 'split'):
        cv = LeaveOneOut()
        return cv.split(np.zeros((n, 1)))
    # The original passed ``len(data)`` here, but ``data`` is not defined in
    # this function; the sample count is the ``n`` argument.
    return LeaveOneOut(n)
def run_leave_one_out_cv(features, labels, classifier=None):
    """
    Runs leave one out CV.

    :param features: Features shape(epoch, feature)
    :param labels: list of labels of length num epochs
    :param classifier: Sklearn classifier. When ``None`` (the default) a fresh
        ``LinearDiscriminantAnalysis`` is created for this call, so no
        estimator instance is shared between calls.
    :return: A list of cross validation scores, one per held-out epoch.
        Use np.average on the result to find the average score.
    """
    if classifier is None:
        classifier = LinearDiscriminantAnalysis()

    # Convert once instead of twice per fold.
    labels = np.asarray(labels)

    loo = LeaveOneOut()
    scores = []
    for train_indexes, test_indexes in loo.split(features, labels):
        # Assert our split covers every epoch exactly once
        CCDLAssert.assert_equal(len(train_indexes) + len(test_indexes), features.shape[0])

        X_train, X_test = features[train_indexes, :], features[test_indexes, :]
        Y_train, Y_test = labels[train_indexes], labels[test_indexes]

        # Assert our X_train and X_test have the same number of features
        CCDLAssert.assert_equal(X_train.shape[1], X_test.shape[1])

        # Fit our classifier to the training fold and score the held-out epoch
        classifier.fit(X_train, Y_train)
        scores.append(classifier.score(X_test, Y_test))
    return scores
def main(argv):
    """Compare cache-vs-full-database selection strategies with leave-one-out CV.

    ``argv`` is ``[filename, threshold, dup]``:

    * ``filename`` -- CSV file under ``../../data/cache_selection_structured/``;
    * ``threshold`` -- float handed to ``train_lr`` as its decision threshold;
    * ``dup`` -- integer flag; when non-zero, rows labelled 1 (``full`` better
      than ``cache``) are appended a second time before evaluation.

    For each held-out row the precision obtained by six strategies is
    accumulated in the order printed in the header: cache, db, ml (model
    prediction), ql (query-likelihood heuristic), best (oracle label) and
    rand (coin flip).  Averages are printed for the "bad" rows (cache worse
    than full) and for all rows.
    """
    filename = argv[0]
    t = float(argv[1])  # threshold for logistic regression (default=0.5)
    dup = int(argv[2])  # if 1, bad queries will be duplicated
    subset = 'cache'  # column title for precision of cache
    full = 'full'  # column title for precision of full db

    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis=1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        df = pd.concat([df, bads], ignore_index=True)

    X = df.drop(['label'], axis=1)
    y = df['label']
    df = df.drop(['label'], axis=1)

    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    loo = LeaveOneOut()
    bad_counter = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The precision columns are outcomes, not features: read them for the
        # held-out row, then remove them from both feature frames.
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)

        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                          X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100

        # learn the model
        print(X_train.shape)
        print(df.columns.shape)
        y_pred = train_lr(X_train, y_train, X_test, y_test, t)
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100

        p20_mean += [p12, p100, ml, ql, best, rnd]
        if is_bad:
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1

    print('final results:')
    print('\t'.join(map(str, ['set', 'cache', 'db', 'ml', 'ql', 'best', 'rand'])))
    # map() returns an iterator on Python 3, so it must be materialised before
    # it can be concatenated with a list.
    print('\t'.join(['bad'] + list(map(str, np.round(bad_mean[0] / bad_counter, 2)))))
    print('\t'.join(['all'] + list(map(str, np.round(p20_mean[0] / df.shape[0], 2)))))
def roc_data(X, Y, clf, n_iter=50, test_size=0.1):
    """Collect cross-validated labels and predicted probabilities per label.

    :param X: feature array indexed as ``X[rows, :]``.
    :param Y: 2-D label array; one column per label.
    :param clf: estimator with ``fit`` and ``predict_proba``.
    :param n_iter: number of random shuffle splits.
    :param test_size: test fraction for the shuffle splits.  When both
        ``n_iter`` and ``test_size`` are ``None`` leave-one-out is used.
    :return: ``(Y_cv, p, p_1, p_0)`` -- four dicts keyed by label column index
        holding, respectively, the true test labels, the predicted
        probabilities, and the predicted probabilities restricted to test
        samples whose true label is 1 and 0.
    """
    if n_iter is None and test_size is None:
        cv = LeaveOneOut()
    else:
        # The splitter is used through ``.split()``, i.e. the
        # ``sklearn.model_selection`` API, whose keyword is ``n_splits``
        # (``n_iter`` belongs to the removed ``cross_validation`` module).
        cv = ShuffleSplit(n_splits=n_iter, test_size=test_size)

    n_labels = Y.shape[1]
    Y_cv = {i: [] for i in range(n_labels)}
    p = {i: [] for i in range(n_labels)}
    p_1 = {i: [] for i in range(n_labels)}
    p_0 = {i: [] for i in range(n_labels)}

    for train, test in cv.split(Y):
        clf.fit(X[train, :], Y[train, :])
        Y_predicted = clf.predict_proba(X[test, :])
        for i in range(n_labels):
            if isinstance(Y_predicted, list):
                # One probability array per label: take 1 - P(first class).
                p_ = 1 - Y_predicted[i][:, 0]
            else:
                p_ = Y_predicted[:, i]
            Y_cv[i] += list(Y[test, i])
            p[i] += list(p_)
            p_1[i] += list(p_[np.where(Y[test, i] == 1)[0]])
            p_0[i] += list(p_[np.where(Y[test, i] == 0)[0]])
    return Y_cv, p, p_1, p_0
def _print_classification_results(classifier, regressors, response, regressors_test, response_test, regressor_names,
                                  messages):
    """Fit ``classifier`` and report its quality through ``messages.AddMessage``.

    The classifier is first scored by leave-one-out cross-validation on
    ``regressors``/``response`` and then fitted on the full training data.
    The score, confusion matrix and AUC are reported on the test set when
    both ``regressors_test`` and ``response_test`` are given, and on the
    training data otherwise.  Finally the strictly positive feature
    importances are listed in descending order.
    """
    report = messages.AddMessage

    folds = LeaveOneOut().split(regressors)
    cv_score = cross_val_score(classifier, regressors, response, cv=folds)
    classifier.fit(regressors, response)
    report("Adaboost classifier with {!s} estimators and learning rate {!s}".format(
        classifier.n_estimators, classifier.learning_rate))

    if regressors_test is not None and response_test is not None:
        t_set = "Test"
    else:
        # No held-out data supplied: evaluate on the training data instead.
        regressors_test, response_test = regressors, response
        t_set = "Train"

    report("Score ({} Set):{!s}".format(t_set, classifier.score(regressors_test, response_test)))
    report("Score (Leave one Out):{!s}".format(cv_score.mean()))

    report("Confusion Matrix ({} Set):".format(t_set))
    confusion = confusion_matrix(response_test, classifier.predict(regressors_test))
    labels = ["Non Prospective", "Prospective"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    report(row_format.format("", "", "Predicted", ""))
    report(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        report(row_format.format("", label, *row))

    auc_value = roc_auc_score(response_test, classifier.decision_function(regressors_test))
    report("Area Under the curve (AUC):{!s}".format(auc_value))

    report("Feature importances: ")
    ranked = sorted(zip(regressor_names, classifier.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    for name, weight in ranked:
        if weight > 0:
            report(name + ": \t" + str(weight * 100) + "%")
return outputs l1 = add_layers(xs, 7, 20, activation_function=tf.nn.sigmoid) predict = add_layers(l1, 20, 8, activation_function=tf.nn.softmax) loss = tf.nn.softmax_cross_entropy_with_logits(labels=ys, logits=predict) loss = tf.reduce_mean(loss) train = tf.train.AdamOptimizer(0.01).minimize(loss) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) output_res = [] for i in range(20): loo = LeaveOneOut() lo = loo.split(X, y) pred = [] for train_index, test_index in lo: train_X, train_Y = X[train_index], y[train_index] test_X, test_Y = X[test_index], y[test_index] sess.run(train, feed_dict={xs: train_X, ys: train_Y}) test_res = tf.argmax(sess.run(predict, feed_dict={xs: test_X}), 1) res = sess.run(test_res) for r in res: pred.append(r) test_accuracy = accuracy_score(label, pred) # print('----%s times------%f%%'%(i,test_accuracy*100)) print(test_accuracy * 100) output_res.append(test_accuracy)
# allergies_distance_matrix(distance='spearman', clustering='spectral')
#
# Purpose: split the columns of the module-level ``df`` into two clusters and
# evaluate, per cluster, how well its PCA-reduced columns predict the
# 'disease' column with leave-one-out XGBoost.
#
# Steps visible in the code:
#   1. Fill ``dist_mat`` pairwise over the columns of ``df``: the absolute
#      Spearman correlation rounded to 4 decimals when distance == 'spearman',
#      otherwise the Euclidean norm of the column difference.
#   2. Cluster on ``dist_mat.values`` with SpectralClustering (n_clusters=2,
#      precomputed affinity) when clustering == 'spectral', otherwise with
#      AgglomerativeClustering (precomputed affinity, average linkage).
#   3. Collect column names with label 1 into ``bact_label1`` and all others
#      into ``bact_label0``; build ``df1``/``df0`` from them.
#   4. For ``df0`` and then ``df1``: fit a PCA, pick the number of components
#      from the running sum of explained-variance ratios (threshold 0.5,
#      minimum 1), call ``apply_pca``, join ``mapping_file``, run leave-one-out
#      with an XGBClassifier, print train and overall precision/recall and
#      plot normalized confusion matrices.
# The function has no ``return`` statement; results are only printed/plotted.
#
# Relies on names defined outside this block: df, dist_mat, level, dict_bact,
# taxonomy, mapping_file, apply_pca, plot_confusion_matrix.
#
# NOTE(review): ``bact_label`` is created as {0: [], 1: []} and is never
#   filled in this block (only bact_label0/bact_label1 are), so the taxonomy
#   loop over ``bact_label[k]`` iterates nothing and ``bact_label_name`` stays
#   empty -- confirm whether it should be built from bact_label0/1.
# NOTE(review): the local name ``sum`` shadows the builtin, ``accuracy`` is
#   assigned but never used, and the ``except:`` in the taxonomy loop is bare.
# NOTE(review): X is sliced positionally (``.iloc``) while y is sliced with
#   ``y[train_index]``; these agree only if y's index allows positional
#   lookup -- confirm against the index produced by the join.
# NOTE(review): an absolute correlation is a similarity; it is also passed
#   to AgglomerativeClustering as a precomputed matrix -- confirm that is
#   intended.
# NOTE(review): the df0 and df1 evaluation sections are copy-pasted and look
#   like candidates for one shared helper.
def allergies_distance_matrix(distance='spearman', clustering='spectral'): for i in range(0, df.shape[1]): for j in range(0, df.shape[1]): #Spearman correlation if distance == 'spearman': dist_mat.at[df.columns[i], df.columns[j]] = abs( round( scipy.stats.spearmanr( np.array(df.iloc[:, i]).astype(float), np.array(df.iloc[:, j]).astype(float))[0], 4)) #Euclidean distance else: dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm( np.array(df.iloc[:, i]).astype(float) - np.array(df.iloc[:, j]).astype(float)) if clustering == 'spectral': clustering = SpectralClustering(n_clusters=2, affinity='precomputed', assign_labels='discretize', random_state=0) else: clustering = AgglomerativeClustering(affinity='precomputed', linkage='average') clustering.fit(dist_mat.values) bact_label1 = [] bact_label0 = [] bact_label = {0: [], 1: []} for i in range(0, df.shape[1]): if clustering.labels_[i] == 1: bact_label1.append(df.columns[i]) else: bact_label0.append(df.columns[i]) bact_label_name = {0: [], 1: []} bact_label_tmp = {0: [], 1: []} bact_level = level - 1 for k in [0, 1]: for i in bact_label[k]: for key, value in dict_bact.items(): for j in value: if i == j: bact_label_tmp[k].append(key) bact_label_tmp[k] = set(bact_label_tmp[k]) for i in bact_label_tmp[k]: if i != 'else': for j in taxonomy: try: if j.split(';')[bact_level] == i: bact_label_name[k].append(','.join( j.split(';')[0:bact_level + 1])) break except: continue else: bact_label_name[k].append('else') bact_label_name[k] = set(bact_label_name[k]) df1 = df[bact_label1] df0 = df[bact_label0] pca = PCA(n_components=min(round(df0.shape[1] / 2) + 1, df0.shape[0])) pca.fit(df0) sum = 0 num_comp = 0 for (i, component) in enumerate(pca.explained_variance_ratio_): if sum <= 0.5: sum += component else: num_comp = i break if num_comp == 0: num_comp += 1 otu_after_pca0, _ = apply_pca(df0, n_components=num_comp, print_data=False) merged_data0 = otu_after_pca0.join(mapping_file) X = merged_data0.drop(['disease'], axis=1) y = 
merged_data0['disease'] loo = LeaveOneOut() accuracy = [] y_pred_list = [] for train_index, test_index in loo.split(X): train_index = list(train_index) X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y[train_index], y[test_index] model = XGBClassifier(max_depth=5, n_estimators=300, learning_rate=15 / 100, objective='binary:logistic', scale_pos_weight=(np.sum(y_train == 0) / np.sum(y_train == 1)), reg_lambda=450) model.fit(X_train, y_train) y_pred = model.predict(X_test) y_pred_list.append(y_pred) y_pred_train = model.predict(X_train) print('Train Precision: ' + str(round(precision_score(y_train, y_pred_train), 2))) print('Train Recall: ' + str(round(recall_score(y_train, y_pred_train), 2))) cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train) class_names = ['Control', 'GVHD'] plt.figure() plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True, title='Normalized confusion matrix') plt.show() print('Precision: ' + str(round(precision_score(y, y_pred_list), 2))) print('Recall: ' + str(round(recall_score(y, y_pred_list), 2))) cnf_matrix = metrics.confusion_matrix(y, y_pred_list) # # Plot normalized confusion matrix plt.figure() plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True, title='Normalized confusion matrix') plt.show() # pca = PCA(n_components=min(round(df1.shape[1] / 2) + 1, df1.shape[0])) pca.fit(df1) sum = 0 num_comp = 0 for (i, component) in enumerate(pca.explained_variance_ratio_): if sum <= 0.5: sum += component else: num_comp = i break if num_comp == 0: num_comp += 1 otu_after_pca1, _ = apply_pca(df1, n_components=num_comp, print_data=False) merged_data1 = otu_after_pca1.join(mapping_file) X = merged_data1.drop(['disease'], axis=1) y = merged_data1['disease'] loo = LeaveOneOut() accuracy = [] y_pred_list = [] for train_index, test_index in loo.split(X): train_index = list(train_index) X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = 
y[train_index], y[test_index] model = XGBClassifier(max_depth=5, n_estimators=300, learning_rate=15 / 100, objective='binary:logistic', scale_pos_weight=(np.sum(y_train == 0) / np.sum(y_train == 1)), reg_lambda=450) model.fit(X_train, y_train) y_pred = model.predict(X_test) y_pred_list.append(y_pred) y_pred_train = model.predict(X_train) print('Train Precision: ' + str(round(precision_score(y_train, y_pred_train), 2))) print('Train Recall: ' + str(round(recall_score(y_train, y_pred_train), 2))) cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train) class_names = ['Control', 'GVHD'] plt.figure() plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True, title='Normalized confusion matrix') plt.show() print('Precision: ' + str(round(precision_score(y, y_pred_list), 2))) print('Recall: ' + str(round(recall_score(y, y_pred_list), 2))) cnf_matrix = metrics.confusion_matrix(y, y_pred_list) # # Plot normalized confusion matrix plt.figure() plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True, title='Normalized confusion matrix') plt.show()
def pmo(alg_no, rm_38, type_filter, feature_selection):
    """Leave-one-out regression on ``data-PMO.csv`` with a predicted-vs-true plot.

    :param alg_no: regressor selector -- 0 ``LinearRegression``, 1 ``SVR``,
        2 ``KNeighborsRegressor`` (distance-weighted, ``hp`` neighbours),
        anything else ``DecisionTreeRegressor`` (``max_depth=hp``).
    :param rm_38: when truthy, drop rows whose column 12 equals 38.98.
    :param type_filter: list of indicator-column indices; only rows with a 1
        in one of these columns are kept, and the columns are also used as
        features.
    :param feature_selection: list of further feature-column indices.

    Column 12 of the standardized data is the regression target.  Prints
    ``1 - SSE / total sum of squares`` and the selected hyper-parameter, then
    shows a scatter plot of true vs. predicted target, one colour per
    material type present in ``type_filter``.
    """
    # (indicator column, colour, legend label) for each material type.
    series = (
        (0, 'r', '20CrMnTi'),
        (1, 'g', '45#'),
        (2, 'b', '60Si2Mn'),
        (3, 'k', 'AM2'),
        (4, 'y', 'GCr15'),
        (5, 'c', 'SA-210C'),
        (6, 'm', 'ZTM-S2'),
    )

    raw_data = np.loadtxt('data-PMO.csv', delimiter=',', skiprows=1)
    if rm_38:
        raw_data = raw_data[np.where(raw_data[:, 12] != 38.98)]
    index = np.array([], dtype=int)
    for it in type_filter:
        index = np.append(index, np.where(raw_data[:, it] == 1))
    raw_data = raw_data[index, :]

    data = preprocessing.StandardScaler().fit(raw_data).transform(raw_data)
    X = data[:, type_filter + feature_selection]
    y = data[:, 12]
    tot = np.linalg.norm(y - np.mean(y))**2

    # Start from +inf so the first candidate is always accepted; a finite
    # sentinel would leave ``py_opt`` empty whenever the error exceeded it.
    mse_min = float('inf')
    hp_opt = 0
    py_opt = []
    for hp in range(1, 2):
        py = []
        ry = []
        loo = LeaveOneOut()
        for train, test in loo.split(X, y):
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            if alg_no == 0:
                # Ordinary least squares (the original comment said "Lasso").
                model = linear_model.LinearRegression()
            elif alg_no == 1:  # SVR
                model = svm.SVR()
            elif alg_no == 2:  # KNR
                model = neighbors.KNeighborsRegressor(hp, weights='distance')
            else:  # DTR
                model = tree.DecisionTreeRegressor(max_depth=hp, random_state=0)
            model.fit(X_train, y_train)
            py.append(model.predict(X_test))
            ry.append(y_test)
        # Sum of squared leave-one-out errors for this hyper-parameter.
        mse = np.linalg.norm(np.array(py) - np.array(ry))**2
        if mse < mse_min:
            mse_min = mse
            hp_opt = hp
            py_opt = py

    print(1 - mse_min / tot)
    print(hp_opt)

    # Identity line spanning the joint range of true and predicted values.
    low = min(np.min(y), np.min(py_opt))
    high = max(np.max(y), np.max(py_opt))
    plt.plot([low, high], [low, high])

    py_opt = np.array(py_opt)
    for column, colour, label in series:
        if column in type_filter:
            rows = np.where(raw_data[:, column] == 1)
            plt.scatter(y[rows], py_opt[rows], c=colour, label=label)
    plt.legend()
    plt.show()
# trainModelFV_LOOCV_Fusion(self, extension='*.*')
#
# Purpose: leave-one-out evaluation of a classifier trained on the
# concatenation of two Fisher-vector encodings (one per feature source).
#
# Steps visible in the code:
#   1. Read the label dictionaries from ``self.label_path`` and the file lists
#      from ``self.base_path`` and ``self.base_path2``; both lists are
#      converted to arrays and sorted along axis 0.
#   2. Load every descriptor file of both sources and fit one diagonal-
#      covariance GMM per source (``self.no_clusters`` components).
#   3. For each leave-one-out split over ``features_nd1``:
#        - reload the training descriptors of both sources and derive the
#          training labels from the parent directory name of each file;
#        - compute one Fisher vector per training video with the GMMs, then
#          StandardScaler -> power_normalize(0.5) -> L2_normalize;
#        - encode the held-out file of each source the same way, using the
#          scalers fitted on the training fold;
#        - concatenate both encodings, fit ``self.classifier_helper.clf`` and
#          predict the held-out sample;
#        - update hit/error counters and print progress; a progress mail is
#          sent when c == 1 or c % 25 == 0;
#        - if more than 40 errors have accumulated, set ``save = False`` and
#          stop.
#   4. When ``save`` is still True, call ``self.saveResults``.
#
# NOTE(review): the two GMMs are fitted once, before the loop, on descriptors
#   of ALL files, including the sample later held out in each fold -- confirm
#   this leakage is acceptable for the reported accuracy.
# NOTE(review): ``features_nd1`` and ``features_nd2`` are sorted independently
#   and then indexed with the same split indices; this presumes both sources
#   list the same samples in the same sorted order -- verify.
# NOTE(review): exact nesting of the statements after the prediction (error
#   counter, mail, abort check) cannot be read from this collapsed text.
def trainModelFV_LOOCV_Fusion(self, extension='*.*'): """ This method contains the entire module required for training the Bag of Poses model Use of helper functions will be extensive. """ self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile( self.label_path) # read file. prepare file lists. self.files1, self.trainFilesCount1 = self.file_helper.getFilesFromDirectory( self.base_path, self.datasets, extension) self.files2, self.trainFilesCount2 = self.file_helper.getFilesFromDirectory( self.base_path2, self.datasets, extension) save = True self.parameters += 'Classifier Parameters\n' self.parameters += '%s' % self.classifier_helper.clf features_nd1 = np.asarray(self.files1) features_nd2 = np.asarray(self.files2) features_nd1.sort(axis=0) features_nd2.sort(axis=0) # build GMMs self.descriptor_list1 = [] self.descriptor_list2 = [] for f in features_nd1: feature = f[0] des1 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list1.append(des1) for f in features_nd2: feature = f[0] des2 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list2.append(des2) ft1 = self.classifier_helper.formatND(self.descriptor_list1) ft2 = self.classifier_helper.formatND(self.descriptor_list2) gmm1 = GMM(n_components=self.no_clusters, covariance_type='diag', verbose=0) gmm1.fit(ft1) gmm2 = GMM(n_components=self.no_clusters, covariance_type='diag', verbose=0) gmm2.fit(ft2) # Train Classifier loo = LeaveOneOut() predictions = [] pre = [] lab = [] hits = 0 c = 0 for train, test in loo.split(features_nd1): feature_test_file1 = str(features_nd1[test][0][0]) feature_test_file2 = str(features_nd2[test][0][0]) class_name_test = feature_test_file1.split(os.sep)[-2] c += 1 currenInvDate = datetime.datetime.now().strftime( "%d/%m/%Y %H:%M:%S") print('Step: %i/%i - %s\n%s\n%s' % (c, features_nd1.shape[0], currenInvDate, feature_test_file1, feature_test_file2)) if c == 1 or c % 25 == 0: self.mail_helper.sendMail( "Progress: %s - %s" % 
(self.test_name, self.OsName), "Samples processed: %i" % c) self.descriptor_list1 = [] self.descriptor_list2 = [] self.train_labels = [] for feature in features_nd1[train]: feature = feature[0] label_number = self.number_dict[feature.split(os.sep)[-2]] self.train_labels = np.append(self.train_labels, label_number) des1 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list1.append(des1) for feature in features_nd2[train]: feature = feature[0] des2 = self.file_helper.loadFeaturesFromFile(feature) self.descriptor_list2.append(des2) # format data as nd array ft1 = self.classifier_helper.formatND(self.descriptor_list1) ft2 = self.classifier_helper.formatND(self.descriptor_list2) fv_dim1 = self.no_clusters + 2 * self.no_clusters * ft1.shape[1] fv_dim2 = self.no_clusters + 2 * self.no_clusters * ft2.shape[1] print(fv_dim1, fv_dim2) n_videos = train.shape[0] features1 = np.array([np.zeros(fv_dim1) for i in range(n_videos)]) features2 = np.array([np.zeros(fv_dim2) for i in range(n_videos)]) count1 = 0 count2 = 0 for i in range(n_videos): len_video1 = len(self.descriptor_list1[i]) fv1 = fisher_vector(ft1[count1:count1 + len_video1], gmm1) features1[i] = fv1 count1 += len_video1 len_video2 = len(self.descriptor_list2[i]) fv2 = fisher_vector(ft2[count2:count2 + len_video2], gmm2) features2[i] = fv2 count2 += len_video2 print(features1.shape) print('Data normalization. 1') scaler1 = StandardScaler() # train normalization features1 = scaler1.fit_transform(features1) features1 = power_normalize(features1, 0.5) features1 = L2_normalize(features1) print(features2.shape) print('Data normalization. 
2') scaler2 = StandardScaler() # train normalization features2 = scaler2.fit_transform(features2) features2 = power_normalize(features2, 0.5) features2 = L2_normalize(features2) # real label lab.extend( [self.number_dict[feature_test_file1.split(os.sep)[-2]]]) # test features 1 feature_test1 = self.file_helper.loadFeaturesFromFile( feature_test_file1) test_fv1 = fisher_vector(feature_test1, gmm1) # train normalization test_fv1 = test_fv1.reshape(1, -1) test_fv1 = scaler1.transform(test_fv1) test_fv1 = power_normalize(test_fv1, 0.5) test_fv1 = L2_normalize(test_fv1) # test features 2 feature_test2 = self.file_helper.loadFeaturesFromFile( feature_test_file2) test_fv2 = fisher_vector(feature_test2, gmm2) # train normalization test_fv2 = test_fv2.reshape(1, -1) test_fv2 = scaler2.transform(test_fv2) test_fv2 = power_normalize(test_fv2, 0.5) test_fv2 = L2_normalize(test_fv2) ## concatenate two fv test feature_test = np.concatenate((test_fv1, test_fv2), axis=1).reshape(1, -1) ## concatenate two fv train feature_train = np.concatenate((features1, features2), axis=1) # train classifiers self.classifier_helper.clf.fit(feature_train, self.train_labels) cl = int(self.classifier_helper.clf.predict(feature_test)[0]) class_name_predict = self.name_dict[str(cl)] if class_name_test == class_name_predict: hits += 1 error = c - hits msg_progress = 'Hits: %i/%i - Accuracy: %.4f - Error: %i\n\n' % ( hits, c, hits / c, error) print(msg_progress) if c % 25 == 0: self.mail_helper.sendMail( "Progress: %s - %s" % (self.test_name, self.OsName), msg_progress) if error > 40: save = False print('Error excedded') break # predicted label pre.extend([cl]) predictions.append({ 'image1': feature_test_file1, 'image2': feature_test_file2, 'class': cl, 'object_name': self.name_dict[str(cl)] }) if save: self.saveResults(predictions, pre, lab, features_nd1.shape[0])
bar[i] = "T" output_test = "{}({}: {}) ".format(output_test, i, data[i]) print("[ {} ]".format(" ".join(bar))) print("Train: {}".format(output_train)) print("Test: {}\n".format(output_test)) # Create some data to split with data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]]) # Our two methods loocv = LeaveOneOut() lpocv = LeavePOut(p=P_VAL) split_loocv = loocv.split(data) split_lpocv = lpocv.split(data) print("""\ The Leave-P-Out method works by using every combination of P points as test data. The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods. A bar displaying the current train-test split as well as the actual data points are displayed for each split. In the bar, "-" is a training point and "T" is a test point. """) print("Data:\n{}\n".format(data)) print("Leave-One-Out:\n") print_result(split_loocv)
def main():
    """Leave-one-out evaluation of an RBF SVM on ``x_189.txt`` / ``y.txt``.

    ``x_189.txt`` holds one feature row per sample; ``y.txt`` is read as a
    string in which every character is one label.  For each held-out sample
    an ``SVC`` (fixed ``C`` and ``gamma``, probability estimates enabled) is
    fitted on the remaining samples.  Accuracy, AUC, sensitivity,
    specificity, F1 and MCC over all held-out predictions are printed.

    Raises ``ZeroDivisionError`` if the labels contain no positives or no
    negatives (sensitivity / specificity are then undefined).
    """
    feature_array_all = np.loadtxt('x_189.txt', dtype=np.float32)
    # Context manager guarantees the handle is closed even if decoding fails.
    with open("y.txt", "rb") as f:
        label_text = f.read().decode()
    label_vector = np.array(list(label_text), dtype=np.float32)

    loo = LeaveOneOut()
    X = feature_array_all
    y = label_vector

    # Collect per-fold outputs in lists; appending to an ndarray inside the
    # loop copies the whole array on every iteration.
    predicted_labels = []
    positive_probabilities = []
    for train_index, test_index in loo.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = svm.SVC(probability=True, C=20.6913808111479, gamma=0.25118864315095824)
        clf = clf.fit(X_train, y_train)
        predicted_labels.extend(clf.predict(X_test))
        # Column 1 is the probability of the second class in clf.classes_.
        positive_probabilities.extend(clf.predict_proba(X_test)[:, 1])
        print('Sequence ' + str(test_index[0] + 1) + ' has finished. (1805 sequences in total)')

    predict_y_test = np.asarray(predicted_labels, dtype=float)
    predictions_test = np.asarray(positive_probabilities, dtype=float)

    y_validation = np.array(y, dtype=int)
    y_predicted = np.array(predict_y_test, int)

    # Confusion-matrix cells, counted exactly as the 0/1 integer comparison.
    TP = int(np.sum((y_validation == 1) & (y_predicted == 1)))
    FN = int(np.sum((y_validation == 1) & (y_predicted == 0)))
    TN = int(np.sum((y_validation == 0) & (y_predicted == 0)))
    FP = int(np.sum((y_validation == 0) & (y_predicted == 1)))

    Sn = float(TP) / (TP + FN)
    Sp = float(TN) / (TN + FP)
    ACC = float(TP + TN) / (TP + TN + FP + FN)

    fpr, tpr, thresholds = metrics.roc_curve(y_validation, predictions_test, pos_label=1)
    roc_auc = auc(fpr, tpr)
    F1 = metrics.f1_score(y_validation, y_predicted)
    MCC = metrics.matthews_corrcoef(y_validation, y_predicted)

    print('svm ACC:%s' % ACC)
    print('svm AUC:%s' % roc_auc)
    print('svm Sn:%s' % Sn)
    print('svm Sp:%s' % Sp)
    print('svm F1:%s' % F1)
    print('svm MCC:%s' % MCC)
def calculate_concrete_IH(X, y, full, clfList):
    """Estimate per-sample instance hardness from classifier mistakes.

    Instance hardness of a sample is the fraction of evaluations in which a
    classifier mislabels it when the sample is held out.

    :param X: feature array, indexable by integer index arrays.
    :param y: label array aligned with ``X``.
    :param full: when ``True`` use leave-one-out cross-validation; otherwise
        5 repetitions of shuffled 10-fold cross-validation (repetition ``r``
        seeds NumPy's global RNG with ``r``).
    :param clfList: classifier ids to use -- 0 kNN (k = sqrt(n)/2),
        1 decision tree (max depth 5), 2 Gaussian naive Bayes, 3 logistic
        regression, 4 LDA, 5 QDA.
    :return: ``(ih_vector, misclf)``.  ``ih_vector`` has one hardness value
        per sample.  ``misclf`` has shape ``(n, len(clfList))`` in
        leave-one-out mode and ``(n, len(clfList), 5)`` otherwise, with 1
        marking a misclassification.

    NOTE(review): the second axis of ``misclf`` is indexed by classifier id,
    not by position in ``clfList``, so every id must be smaller than
    ``len(clfList)`` -- confirm this is what callers rely on.
    """
    ndata = X.shape[0]
    numClf = len(clfList)  # Num of classifiers

    # k = sqrt(n)/2.  scikit-learn requires an integer >= 1; the original
    # passed the float returned by np.floor, which can also be 0 for n < 4.
    n_neighbors = max(1, int(np.floor(np.sqrt(ndata) / 2)))
    classifiers = {
        0: KNeighborsClassifier(n_neighbors),
        1: DecisionTreeClassifier(max_depth=5),
        2: GaussianNB(),
        3: LogisticRegression(),
        4: LinearDiscriminantAnalysis(),
        5: QuadraticDiscriminantAnalysis(),
    }
    # Requested classifiers, always evaluated in ascending id order.
    active = [(clf_id, clf) for clf_id, clf in classifiers.items() if clf_id in clfList]

    # If full = True, perform Leave-one-out cross validation for all classifiers
    if full == True:  # noqa: E712 -- keep the original equality semantics
        # Matrix that records misclassification
        misclf_matrix = np.zeros((ndata, numClf))
        loo = LeaveOneOut()
        for train_index, test_index in loo.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for clf_id, clf in active:
                clf.fit(X_train, y_train)
                # Single held-out sample: a one-element comparison.
                if clf.predict(X_test) != y_test:
                    misclf_matrix[test_index[0]][clf_id] = 1
        ih_vector = misclf_matrix.sum(axis=1) / numClf
        return ih_vector, misclf_matrix

    # else perform niter by nfolds (default is 5 by 10) fold cross validation
    niter = 5  # Num of iterations
    nfolds = 10
    # For each sample, misclassification by each classifier on each iteration
    misclf = np.zeros((ndata, numClf, niter))
    for randseed in range(niter):
        np.random.seed(randseed)
        kf = KFold(n_splits=nfolds, shuffle=True)
        for tr_idx, test_idx in kf.split(X):
            X_train, X_test = X[tr_idx], X[test_idx]
            y_train, y_test = y[tr_idx], y[test_idx]
            for clf_id, clf in active:
                clf.fit(X_train, y_train)
                wrong = np.asarray(clf.predict(X_test) != y_test)
                misclf[test_idx[wrong], clf_id, randseed] = 1
    # Avg of matrix with numClf classifiers and niter iterations
    ih_vector = misclf.sum(axis=(1, 2)) / (numClf * niter)
    return ih_vector, misclf
# fit(self, df)
#
# Purpose: train ``self.model`` on the per-individual time series in ``df``
# using a leave-one-individual-out loop.
#
# Steps visible in the code:
#   - Column 0 is treated as the timestamp, the last five columns as the
#     label columns and everything in between as features.
#   - LeaveOneOut is run over ``range(number of outer-index groups)``; the
#     resulting positions are used as outer-index labels to slice ``df`` into
#     ``df_train`` and ``df_test``.
#   - A forecast time is drawn as the last held-out timestamp plus a random
#     integer in [1, 60].
#   - For every training individual the timestamps are shifted to start at 1
#     and each feature row is multiplied by its shifted timestamp.
#   - The model is then updated one row at a time (input reshaped to
#     (1, 1, n_features), target to (1, 1, n_labels)) with Adam (lr=0.005) and
#     ``nn.BCELoss``.
#   - ``self.forecast(df_test, df_test_t)`` is called with the held-out data.
#
# NOTE(review): slicing with ``pd.IndexSlice[train, :]`` presumes the
#   individual IDs are exactly 0..n-1 -- confirm against the data.
# NOTE(review): the docstring says features and labels contain NaNs, but no
#   NaN handling is visible before the tensors reach the model/loss.
# NOTE(review): ``device``, ``batch_size``, ``df_timestamps``,
#   ``df_features``, ``df_output``, ``df_test_features`` and
#   ``df_test_output`` are assigned but not used in this block; the local
#   name ``tuple`` shadows the builtin.
# NOTE(review): the optimizer and model state are created once and carried
#   across leave-one-out folds, so later folds start from a model that has
#   already seen their held-out individual -- confirm this is intended.
# NOTE(review): whether ``self.forecast`` runs once per fold or once at the
#   end cannot be read from this collapsed text.
def fit(self, df): """Train the model using the given Pandas dataframe df as input. The dataframe has a hierarchical index where the outer index (ID) is over individuals, and the inner index (Time) is over time points. Many features are available. There are five binary label columns: ['label:LYING_DOWN', 'label:SITTING', 'label:FIX_walking', 'label:TALKING', 'label:OR_standing'] The dataframe contains both missing feature values and missing label values indicated by nans. Your goal is to design and fit a probabilistic forecasting model that when given a dataframe containing a sequence of incomplete observations and a time stamp t, outputs the probability that each label is active (e.g., equal to 1) at time t. Arguments: df: A Pandas data frame containing the feature and label data """ """#print(df.to_string()) print(df) print(df.shape) #df.to_excel("data.xlsx") print(df.columns[1:-5].values) #features -- without timestamp print(df.index.names) #with pd.option_context('display.max_rows', None, 'display.max_columns', None): print(df.loc[pd.IndexSlice[2, :],:'discrete:time_of_day:between21and3']) #print(df.loc(axis=0)[pd.IndexSlice[0, :]])""" df_timestamps, df_features, df_output = df.iloc[:, 0], df.iloc[:, 1: -5], df.iloc[:, -5:] loo = LeaveOneOut() batch_size = 1 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') lr = 0.005 criterion = nn.BCELoss() optimizer = torch.optim.Adam(self.model.parameters(), lr=lr) for train, test in loo.split(range(len(df.groupby(level=0)))): df_train, df_test = df.loc[pd.IndexSlice[train, :], :], df.loc[ pd.IndexSlice[test, :], :] df_test_timestamps, df_test_features, df_test_output = df_test.iloc[:, 0].values, df_test.iloc[:, 1: -5].values, df_test.iloc[:, -5:].values df_test_t = df_test_timestamps[-1] + torch.randint( low=1, high=61, size=(1, )) #sub = df_test_timestamps[0] #df_test_timestamps = [i - sub + 1 for i in df_test_timestamps] #df_test_features = (df_test_features.T*df_test_timestamps).T 
#df_train_timestamps, df_train_features, df_train_output = df_train.iloc[:,0], df_train.iloc[:,1:-5], df_train.iloc[:,-5:] for train_index in train: df_train_timestamps, df_train_features, df_train_output = df_train.loc[ pd. IndexSlice[train_index, :], :].values[:, 0], df_train.loc[ pd.IndexSlice[ train_index, :], :].values[:, 1:-5], df_train.loc[ pd.IndexSlice[train_index, :], :].values[:, -5:] print(train_index) sub = df_train_timestamps[0] df_train_timestamps = [ i - sub + 1 for i in df_train_timestamps ] df_train_features = (df_train_features.T * df_train_timestamps).T #print(df_train_features.shape) #print(df_train_output.shape) for train_tuple in range(df_train_features.shape[0]): tuple = np.reshape(df_train_features[train_tuple, :], (1, 1, df_train_features.shape[1])) result = np.reshape(df_train_output[train_tuple, :], (1, 1, df_train_output.shape[1])) #print(df_train_output.shape) self.model.train() optimizer.zero_grad() out = self.model(torch.tensor(tuple).float()) loss = criterion(out, torch.tensor(result).float()) #print(loss) loss.backward() optimizer.step() #input() self.forecast(df_test, df_test_t)
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics in a cross-validation framework.

    Rows with a missing outcome or predictor are dropped before fitting.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        ``fit`` must return an object exposing ``predict_proba``,
        ``classes_`` and ``coef_``.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold (stratified, shuffled) cross-validation; ignored for LOO,
        where it is replaced by the number of samples.
    LOO : bool
        Use leave-one-out cross-validation instead of stratified K-fold.

    Returns
    -------
    outD : dict
        'fpr' : np.ndarray
            Pre-specified vector of FPR thresholds, ``np.linspace(0, 1, 100)``.
        'tpr' : np.ndarray
            Mean interpolated true-positive rate over test folds.
        'AUC' : np.ndarray
            Area under the ROC curve for each test fold.
        'mAUC' : float
            Area under the mean ROC curve.
        'ACC' : np.ndarray
            Accuracy for each test fold.
        'mACC' : float
            Mean accuracy over test folds.
        'finalResult' : returned by model.fit()
            Model refitted on all complete rows.
        'prob' : pd.Series
            Mean predicted class-1 probability on test data, indexed like df.
        'coefs' : np.ndarray
            Concatenated ``coef_`` arrays, one per fold.
        'Xvars', 'Yvar', 'nFolds', 'LOO', 'N'
            Inputs and the number of complete rows used.
        'Sensitivity', 'Specificity'
            Taken from ``rocStats`` on the rounded test probabilities.
    """
    if not isinstance(predVars, list):
        predVars = list(predVars)
    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(y)
        # ``split`` requires X; calling it with only ``y=`` raises TypeError.
        cv_iter = cv.split(X=X, y=y)
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []

    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        # Column of predict_proba that corresponds to outcome == 1.
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        # Interpolate onto the common FPR grid so folds can be averaged.
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest, np.round(prob[:, class1Ind]), normalize=True)
        coefs.append(results.coef_[None, :])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    # Pin the end points of the mean ROC curve to (0, 0) and (1, 1).
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)

    # Compute mean probability over test predictions in CV
    probS = pd.concat(probs).groupby(level=0).mean()
    probS.name = 'Prob'

    # Refit all the data for final model
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))

    outD = {'fpr': fpr,                 # (100, ) average FPR for ROC
            'tpr': meanTPR,             # (100, ) average TPR for ROC
            'AUC': auc,                 # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,            # (1, ) AUC of the average ROC
            'mACC': meanACC,
            'ACC': acc,                 # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,      # final fitted model with predict() exposed
            'prob': probS,              # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs': np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars': predVars,
            'Yvar': outcomeVar,
            'nFolds': nFolds,
            'LOO': 'Yes' if LOO else 'No',
            'N': tmp.shape[0]}
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
# Iris data cross-validation: 10-fold accuracy followed by a manual
# leave-one-out accuracy for the same logistic-regression model.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn import datasets
import numpy as np
import pandas as pd

#tmp=pd.read_csv('iris.data',sep=',')
#iris=np.loadtxt('iris.data', delimiter=',')
iris = datasets.load_iris()

# NOTE(review): the slice 0:149 keeps 149 rows; if the data set has 150 rows
# the last one is silently dropped - confirm this is intended.
x = iris['data'][0:149]
y = iris['target'][0:149]

log_model = LogisticRegression()
m = np.shape(x)[0]

# 10-fold cross-validated predictions and their accuracy
y_pred = cross_val_predict(log_model, x, y, cv=10)
print(metrics.accuracy_score(y, y_pred))
#print(y_pred)

# Leave-one-out: refit on all-but-one sample and count the correct
# predictions for the single held-out sample.
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(x):
    log_model.fit(x[train], y[train])
    y_pred1 = log_model.predict(x[test])
    if y_pred1 == y[test]:
        accuracy += 1

print(accuracy / m)
y = iris.target
knn = KNeighborsClassifier()

# ==== Leave-one-out validation ====
from sklearn.model_selection import LeaveOneOut

# Instantiate the `LeaveOneOut` class. See [here](http://scikit-learn.org/stable/modules/cross_validation.html#leave-one-out-loo)
# and [here](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html#sklearn.model_selection.LeaveOneOut)
# for more details.
loo = LeaveOneOut()

# Keep track of successful predictions
successes = []

# the `split` method generates indices to split data into training and test set.
for train_index, test_index in loo.split(X):
    # `fit` classifier on training indices
    knn.fit(X[train_index], y[train_index])
    # `score` classifier on testing indices; since there will be only one
    # test index, the score will be either 1 (for a correct prediction) or
    # 0 (for an incorrect prediction).
    fold_score = knn.score(X[test_index], y[test_index])
    successes.append(fold_score)

# The mean of the per-sample scores is the fraction of correct predictions.
print("Accuracy for iris dataset with Leave-One-Out validation is {}.\n".format(
    np.mean(successes)))

# ==== Random permutation cross validation ====
from sklearn.model_selection import ShuffleSplit

# Instantiate ShuffleSplit class with `n_splits` (number of repetitions) and
# `test_size` (percentage of dataset to withhold for the test data). See
# # # Plot normalized confusion matrix # plt.figure() # plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True, # title='Normalized confusion matrix') # # plt.show() # scores = np.array(metrics.accuracy_score(y, y_pred_list)) # print(md, ne, lr, round(scores.mean(), 2), round(scores.std(), 2) * 2) #SVM clf = svm.SVC(kernel='linear', probability=True) auc = [] y_pred1 = [] y_test1 = [] for train_index, test_index in loo.split(X): X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_test)[:, 1] y_pred1.append(y_pred) y_test1.append(y_test.values[0]) W = clf.coef_[0] try: df = pd.DataFrame({ 'Taxonome': preproccessed_data.columns[3:], 'Coefficients': np.dot(clf.coef_[0], pca_components) }) except: df = pd.DataFrame({ 'Taxonome': preproccessed_data.columns[3:],
# The three presets share one description; only the name of the scheme differs.
_CV_DESCRIPTION = (
    "To determine the hyper-parameter (e.g. the number of features) of model, "
    "we applied cross validation with {scheme} on the training data set. "
    "The hyper-parameters were set according to the model performance on the "
    "validation data set. ")

CrossValidation5Fold = BaseCrossValidation(
    n_split='5',
    description=_CV_DESCRIPTION.format(scheme='5-fold'))

CrossValidation10Fold = BaseCrossValidation(
    n_split='10',
    description=_CV_DESCRIPTION.format(scheme='10-fold'))

CrossValidationLOO = BaseCrossValidation(
    n_split='all',
    description=_CV_DESCRIPTION.format(scheme='leave-one-out'))

if __name__ == '__main__':
    import numpy as np

    # Small demo: print every leave-one-out split of 100 random samples
    # (60 labelled 1 followed by 40 labelled 0).
    data = np.random.random((100, 10))
    label = np.concatenate((np.ones((60, )), np.zeros((40, ))), axis=0)

    cv = LeaveOneOut()
    for train, val in cv.split(data, label):
        print(train, val, '', sep='\n')
# Pick the evaluation protocol from args.fold:
#   < 0  -> a single split with a shuffled 5% sample held out
#   == 0 -> leave-one-out
#   > 0  -> K-fold with args.fold folds
if args.fold < 0:
    logger.info("Validation set")
    random.shuffle(duplicate_reports)
    sample_size = int(len(duplicate_reports) * 0.05)
    # Both parts are ordered by numeric bug id
    data_alpha = sorted(duplicate_reports[:sample_size], key=int)
    remaining_reports = sorted(duplicate_reports[sample_size:], key=int)
    splits = [(remaining_reports, data_alpha)]
elif args.fold == 0:
    logger.info("Leave one out")
    loo = LeaveOneOut()
    splits = loo.split(duplicate_reports)
else:
    logger.info("K-folds: {}".format(args.fold))
    kf = KFold(n_splits=args.fold)
    splits = kf.split(duplicate_reports)

dataset_stem = path.splitext(path.split(args.dt)[1])[0]
base_filename = dataset_stem + '_' + args.l

max_bug_id = max(map(int, trainingDataset.bugIds))
masterSetById = bugReportDatabase.getMasterSetById(trainingDataset.bugIds)

map_by_alpha = []
n_queries = 0

# Preprocess the reports before writing REP input file
rep_reports = generate_input_vec(bugReportDatabase, max_bug_id)
model[j].add(Conv2D(j*32+32,kernel_size=5,activation='relu')) """ model[j].add(MaxPool2D()) model[j].add(Flatten()) model[j].add(Dense(256, activation='relu')) model[j].add(Dense(2, activation='softmax')) model[j].compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) #LEAVE ONE OUT CROSS-VALIDATION history = [0] * nets names = ["8 maps","16 maps","24 maps","32 maps","48 maps","64 maps"] cv_result = [] for j in range(nets): cv_result = [] for train_index, test_index in loo.split(X_train): clf = model[j] X_train2, X_val2 = X_train[train_index], X_train[test_index] Y_train2, Y_val2 = Y_train[train_index], Y_train[test_index] acc = clf.fit(X_train2,Y_train2, batch_size=None, epochs=5, verbose=0, validation_data=(X_val2, Y_val2), workers=1, callbacks=[annealer]) cv_result.append(acc.history['val_accuracy']) history[j] = np.mean(cv_result) print("CNN {0}: Validation accuracy={1:.5f}".format(names[j], history[j])) #EXPERIMENT 3 nets = 4 model = [0] *nets for j in range(4): model[j] = Sequential() model[j].add(Conv2D(j*8+8,kernel_size=5,activation='relu',input_shape=(64,64,1)))
def main():
    """Tune a random forest on the Train915 PSSM features and report metrics.

    Steps, in order:
    1. Load negative and positive feature matrices and label them -1 / +1.
    2. Grid-search ``max_features`` with 10-fold CV.
    3. Evaluate the best estimator with leave-one-out, once through
       ``cross_val_score`` and once manually to collect per-sample
       predictions and probabilities.
    4. Print ROC AUC, specificity, sensitivity, precision, MCC, F-score and
       accuracy derived from the leave-one-out confusion matrix.
    5. Refit on the full training set and print the accuracy on Test850.
    """
    negatives = getMatrix("data/Train915/result/negative/pssm_profile_uniref50")
    positives = getMatrix("data/Train915/result/positive/pssm_profile_uniref50")
    x = np.vstack((negatives, positives))
    # Negatives first (-1), then positives (+1), matching the vstack order
    y = np.array([-1] * negatives.shape[0] + [1] * positives.shape[0])

    # Candidate values for max_features, all derived from the feature count
    N = x.shape[1]
    max_features_grid = [
        int(sqrt(N).real),
        N // 5,
        int(log(N, 2).real),
        N // 3,
        N // 2,
        N // 4,
        N // 10,
    ]
    print(*max_features_grid)

    gs = GridSearchCV(RandomForestClassifier(n_estimators=1000,
                                             random_state=1),
                      {'max_features': max_features_grid},
                      cv=10)
    gs.fit(x, y)
    print(gs.best_estimator_)
    print(gs.best_score_)

    clf = gs.best_estimator_
    loo = LeaveOneOut()
    score = cross_val_score(clf, x, y, cv=loo).mean()
    print("LOO:{}".format(score))

    # Manual leave-one-out pass to keep every held-out prediction
    loo_probas_y, loo_test_y, loo_predict_y = [], [], []
    for train, test in loo.split(x):
        clf.fit(x[train], y[train])
        loo_predict_y.extend(clf.predict(x[test]))
        loo_probas_y.extend(clf.predict_proba(x[test]))
        loo_test_y.extend(y[test])
    loo_probas_y = np.array(loo_probas_y)
    loo_test_y = np.array(loo_test_y)
    print(loo_probas_y.shape)
    #np.savetxt("915-RFclassification-DWT-LOO-probas_y.csv", loo_probas_y, delimiter=",")
    #np.savetxt("915-RFclassification-DWT-LOO-test_y.csv", loo_test_y, delimiter=",")

    # Index 0 is read as the negative class and index 1 as the positive class
    # (presumably the sorted label order -1, +1 - confirm).
    confusion = sklearn.metrics.confusion_matrix(loo_test_y, loo_predict_y)
    TN, FP = confusion[0, 0], confusion[0, 1]
    FN, TP = confusion[1, 0], confusion[1, 1]

    print("ROC:{}".format(roc_auc_score(loo_test_y, loo_probas_y[:, 1])))
    print("SP:{}".format(TN / (TN + FP)))
    print("SN:{}".format(TP / (TP + FN)))
    mcc_denominator = ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))**0.5
    n = (TP * TN - FP * FN) / mcc_denominator
    print("PRE:{}".format(TP / (TP + FP)))
    print("MCC:{}".format(n))
    print("F-score:{}".format((2 * TP) / (2 * TP + FP + FN)))
    print("ACC:{}".format((TP + TN) / (TP + FP + TN + FN)))

    # Independent test set, labelled the same way as the training set
    test_negatives = getMatrix(
        "data/Test850/result/negative/pssm_profile_uniref50")
    test_positives = getMatrix(
        "data/Test850/result/positive/pssm_profile_uniref50")
    test_x = np.vstack((test_negatives, test_positives))
    test_y = [-1] * test_negatives.shape[0] + [1] * test_positives.shape[0]

    clf = gs.best_estimator_
    clf.fit(x, y)
    predict_y = clf.predict(test_x)
    print("IND:{}".format(accuracy_score(test_y, predict_y)))
# Experiment 7: report the training MSE of the linear, cubic and quintic fits.
print('\nExperiment 7 - Fit a 5th order polynomial to female400')
print("Mean square Error - Female400 - Linear : %.3f"
      % mean_squared_error(train_time_female400, pred_time_female400_all))
print("Mean square Error - Female400 - Degree 3 : %.3f"
      % mean_squared_error(train_time_female400, pred_time_female400_all_3))
print("Mean square Error - Female400 - Degree 5 : %.3f"
      % mean_squared_error(train_time_female400, pred_time_female400_all_5))
print("The error does not improve,"
      "And slightly increases from degree 3 to degree 5")

# Experiment 8
# Use LOOCV for both 3rd and 5th order polynomials
# Reference - http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html
print('\nExperiment 8 - Use LOOCV for both 3rd and 5th order polynomials')
poly_model_3_loo = make_pipeline(PolynomialFeatures(degree=3),
                                 LinearRegression(fit_intercept=False))

# One squared-error value per held-out year
results_3 = []
loo = LeaveOneOut()
for train_index, test_index in loo.split(train_year_female400):
    train_year_f400_loo = train_year_female400[train_index]
    test_year_f400_loo = train_year_female400[test_index]
    train_time_f400_loo = train_time_female400[train_index]
    test_time_f400_loo = train_time_female400[test_index]

    poly_model_3_loo.fit(train_year_f400_loo, train_time_f400_loo)
    held_out_prediction = poly_model_3_loo.predict(test_year_f400_loo)
    results_3.append(mean_squared_error(test_time_f400_loo,
                                        held_out_prediction))

formatted_result_3 = ["%.3f" % item for item in results_3]

# NOTE(review): at this point the pipeline holds the fit of the *last* fold,
# so the curve and the MSE printed below describe that single model on the
# full training data, not the mean of `results_3` - confirm this is intended.
pred_time_f400_all_3_loo = poly_model_3_loo.predict(train_year_female400)

# Plot to check fit
plt.scatter(train_year_female400, train_time_female400, c='r')
plt.plot(train_year_female400, pred_time_f400_all_3_loo, c='b')
plt.xlabel('Year')
plt.ylabel('Time(seconds)')
plt.title('Exp 8 - Use LOOCV for 3rd order polynomial')
plt.show()

print("Mean square Error - Female400 - Degree 3 - LOOCV : %.3f"
      % mean_squared_error(train_time_female400, pred_time_f400_all_3_loo))
def _load_brain_stack(subject_ids):
    """Load one 90x90 CSV matrix per subject id and stack them on axis 2."""
    stack = numpy.zeros((90, 90, len(subject_ids)))
    for k, subject_id in enumerate(subject_ids):
        # Giving loadtxt the path (instead of an open() handle) lets numpy
        # close the file as soon as it has been read.
        stack[:, :, k] = numpy.loadtxt("%d.csv" % subject_id, delimiter=",")
    return stack


def main():
    """Leave-one-out SVM on brain-network measures and plot the ROC curve.

    Loads the per-subject CSV matrices of both groups from the current
    directory, reduces them with ``caculate_matrices``, trains a linear SVC
    in a leave-one-out loop, and plots a ROC curve built from the held-out
    class-1 probabilities over 30 evenly spaced thresholds.
    """
    ######################### Defining_Autism_brains #########################
    autism_ids = [
        28853, 28855, 28856, 28857, 28859, 28860, 28861, 28864, 28865, 28866,
        28871, 28872, 28873, 28874, 28875, 28876, 28879, 28885, 28887, 28890,
        28896, 28897, 28898, 28899, 28901, 28903, 28905, 28906, 28907, 28908,
        28909,
    ]
    n = len(autism_ids)
    A = _load_brain_stack(autism_ids)

    ######################### Defining_Normal_brains #########################
    normal_ids = [
        28854, 28858, 28862, 28863, 28867, 28868, 28870, 28877, 28878, 28880,
        28881, 28882, 28883, 28886, 28888, 28889, 28891, 28892, 28893, 28894,
        28895, 28900, 28902, 28904,
    ]
    m = len(normal_ids)
    B = _load_brain_stack(normal_ids)

    # One row of measures per brain, autism rows first
    Autism = caculate_matrices(A, n)
    Normal = caculate_matrices(B, m)
    All_brains_matrices = numpy.concatenate((Autism, Normal), axis=0)
    print("All_Brain_Matrices is ", All_brains_matrices)
    print("Dim_All_Brain_Matrices is", All_brains_matrices.shape)

    X = All_brains_matrices
    # Labels follow the concatenation order: 1 = autism, 0 = normal
    y = numpy.zeros(n + m)
    y[:n] = 1

    ############################## Leave-One-Out ##############################
    loo = LeaveOneOut()
    score = numpy.zeros(len(y))
    for count, (train_index, test_index) in enumerate(loo.split(X)):
        clf = svm.SVC(kernel='linear', C=1, probability=True)
        clf.fit(X[train_index], y[train_index])
        # Index with the array (not test_index[0]) so the held-out sample
        # stays 2-D, as predict_proba requires.
        probs = clf.predict_proba(X[test_index])
        # The ROC below counts y == 1 as positive, so the score must be the
        # probability of class 1; look its column up instead of assuming 0.
        positive_col = list(clf.classes_).index(1)
        score[count] = probs[0, positive_col]

    ########################### Alessandro's ROC ###########################
    thr = numpy.linspace(min(score), max(score), 30)
    P = sum(y)
    N = len(y) - P
    is_positive = (y == 1)
    is_negative = (y == 0)

    roc_x = []
    roc_y = []
    for T in thr:
        above = score > T
        TP = numpy.count_nonzero(above & is_positive)
        FP = numpy.count_nonzero(above & is_negative)
        roc_x.append(FP / float(N))
        roc_y.append(TP / float(P))
    roc_auc = auc(roc_x, roc_y)

    ###########################################################################
    # Plot of a ROC curve for a specific class
    lw = 2
    plt.plot(roc_x,
             roc_y,
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
"SDML was not able to converge in less than {} attempts. ". format(max_runs)) continue else: is_sdml = False loo = LeaveOneOut() estimated_parameters = [] true_parameters_list = [] parameter_distances = [] embedding_distances = [] estimated_KL = [] estimated_TV = [] for ref_index, obs_index in loo.split( test_data, test_labels): # this returns the indices # split the dataset n_samples = len(ref_index) output_ref = test_data[ref_index] param1_ref = param1_test[ref_index] param2_ref = param2_test[ref_index] observation = test_data[obs_index] param1_obs = param1_test[obs_index] param2_obs = param2_test[obs_index] true_parameters = np.array([param1_obs, param2_obs]).reshape(-1) if name == 'true': output_ref_transformed = np.column_stack(
# Group-aware K-fold done by hand: split the unique customer ids, then map
# every record to the side its customer id landed on.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    # Split the customer ids into train / validation groups
    tr_groups = unique_user_ids[tr_group_idx]
    va_groups = unique_user_ids[va_group_idx]

    # Assign each record according to the group its customer id belongs to
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]

# (Reference) The GroupKFold class is awkward to use because neither
# shuffling nor a random seed can be specified.
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, user_id):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

# -----------------------------------
# leave-one-out
# -----------------------------------
# Pretend that only 100 records are available
train_x = train_x.iloc[:100, :].copy()
# -----------------------------------
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
for tr_idx, va_idx in loo.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
from sklearn.model_selection import LeaveOneOut
# importing the logistic regression module from logisticRegression.py
from LogisticRegression import LogisticRegression
import numpy as np

# Keep rows from index 50 onwards and drop the first two feature columns.
irisdata = datasets.load_iris()
data = np.delete(irisdata['data'][50:], [0, 1], axis=1)
labels = irisdata.target[50:].ravel()

lv = LeaveOneOut()
lv.get_n_splits(data)

# One entry per held-out sample: 1 minus the "prediction matches" flag
error = []
for train_index, test_index in lv.split(data):
    train_data, train_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    # a fresh model is trained for every split
    model = LogisticRegression()
    model.train(train_data, train_labels)

    predicted = model.test(test_data, test_labels)[0]
    error.append(1 - (test_labels == predicted))

print(np.mean(error))
def trainModelFV_LOOCV_Classifiers(self, extension='*.txt'):
    """Leave-one-out evaluation of Fisher-vector features with the listed classifiers.

    For every feature file in turn, a GMM is fitted on the descriptors of all
    the other files, Fisher vectors are computed and normalised, each
    classifier is trained on them, and the held-out file is classified.
    Running hit counts are printed (and mailed on the first and every 25th
    step); the collected predictions are handed to ``saveResults`` at the end.

    :param extension: file pattern passed to ``getFilesFromDirectory``.
    """
    print('trainModelFV_LOOCV_Classifiers')
    names = ["Linear SVM"]
    classifiers = [SVC(kernel='linear')]

    label_info = self.file_helper.getLabelsFromFile(self.label_path)
    self.name_dict, self.number_dict, self.count_class = label_info

    # read file. prepare file lists.
    self.images, self.trainImageCount = self.file_helper.getFilesFromDirectory(
        self.base_path, self.datasets, extension)

    self.parameters += 'Classifier Parameters\n'
    self.parameters += '%s' % self.classifier_helper.clf

    features_nd = np.asarray(self.images)
    #features_nd.sort(axis=0)
    total_samples = features_nd.shape[0]

    loo = LeaveOneOut()
    predictions = {name: [] for name in names}
    predicted_labels = {name: [] for name in names}
    hits = {name: 0 for name in names}
    true_labels = []

    for step, (train, test) in enumerate(loo.split(features_nd), start=1):
        feature_test_file = str(features_nd[test][0][0])
        # the class name is the parent directory of the feature file
        class_name_test = feature_test_file.split(os.sep)[-2]

        timestamp = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        print('Step: %i/%i - %s - %s' %
              (step, total_samples, timestamp, feature_test_file))
        # if c == 1 or c % 25 == 0:
        #     self.mail_helper.sendMail("Progress: %s - %s" % (self.test_name, self.OsName), "Samples processed: %i" % c)

        # Descriptors and numeric labels of the training files of this fold
        self.descriptor_list = []
        self.train_labels = []
        for entry in features_nd[train]:
            train_file = entry[0]
            label_number = self.number_dict[train_file.split(os.sep)[-2]]
            self.train_labels = np.append(self.train_labels, label_number)
            self.descriptor_list.append(
                self.file_helper.loadFeaturesFromFile(train_file))

        # format data as nd array
        self.classifier_helper.formatND(self.descriptor_list)
        stacked = self.classifier_helper.descriptor_vstack

        gmm = GMM(n_components=self.no_clusters, covariance_type='diag')
        gmm.fit(stacked)

        fv_dim = self.no_clusters + 2 * self.no_clusters * stacked.shape[1]
        print(fv_dim)

        # One Fisher vector per training file; `offset` walks through the
        # stacked descriptors file by file.
        n_videos = train.shape[0]
        features = np.array([np.zeros(fv_dim) for _ in range(n_videos)])
        offset = 0
        for row in range(n_videos):
            len_video = len(self.descriptor_list[row])
            features[row] = fisher_vector(stacked[offset:offset + len_video],
                                          gmm)
            offset += len_video
        print(features.shape)

        print('Data normalization.')
        scaler = StandardScaler()
        # train normalization
        features = L2_normalize(
            power_normalize(scaler.fit_transform(features), 0.5))

        # real label
        true_labels.append(self.number_dict[class_name_test])

        # test features, normalised with the scaler fitted on the train set
        feature_test = self.file_helper.loadFeaturesFromFile(feature_test_file)
        test_fv = fisher_vector(feature_test, gmm).reshape(1, -1)
        test_fv = L2_normalize(power_normalize(scaler.transform(test_fv), 0.5))

        # train classifiers
        for name, clf in zip(names, classifiers):
            print(name)
            clf.fit(features, self.train_labels)
            cl = int(clf.predict(test_fv)[0])
            class_name_predict = self.name_dict[str(cl)]
            if class_name_predict == class_name_test:
                hits[name] += 1

            # predicted label
            predicted_labels[name].append(cl)
            predictions[name].append({
                'image': feature_test_file,
                'class': cl,
                'object_name': class_name_predict
            })

        msg_progress = ''.join(
            'Classifier: %s - Hits:%i/%i - Accuracy: %.4f\n' %
            (name.ljust(20), hits[name], step, hits[name] / step)
            for name in names)
        print(msg_progress)
        print('\n\n')

        if step == 1 or step % 25 == 0:
            self.mail_helper.sendMail(
                "Progress: %s - %s" % (self.test_name, self.OsName),
                msg_progress)

    for name in names:
        print(name)
        self.saveResults(predictions[name],
                         predicted_labels[name],
                         true_labels,
                         total_samples,
                         classifier_name=name)
# Leave-one-out comparison of a polynomial-kernel and an RBF-kernel SVM.
print("Importing data")
train_x, train_y = mmi()
print("Imported data, initiating training")

# The first sample is discarded from both features and targets
train_x, train_y = train_x[1:], train_y[1:]

loo = LeaveOneOut()

# Held-out targets and the matching predictions of each model
vals_y = []
Poly_preds_y = []
Gaussian_preds_y = []

SVM1 = sklearn.svm.SVC(C=5.0, kernel='poly', coef0=1.0)
SVM2 = sklearn.svm.SVC(C=20.0, kernel='rbf')

for train_idx, val_idx in loo.split(train_x):
    X_train, X_val = train_x[train_idx], train_x[val_idx]
    y_train, y_val = train_y[train_idx], train_y[val_idx]

    SVM1.fit(X_train, y_train)
    SVM2.fit(X_train, np.ravel(y_train))

    Poly_pred_y = SVM1.predict(X_val)
    Gaussian_pred_y = SVM2.predict(X_val)

    vals_y.append(list(y_val))
    Poly_preds_y.append(list(Poly_pred_y))
    Gaussian_preds_y.append(list(Gaussian_pred_y))

# Flatten the per-fold lists into 1-D arrays
vals_y = np.ravel(vals_y)
Poly_preds_y = np.ravel(Poly_preds_y)
Gaussian_preds_y = np.ravel(Gaussian_preds_y)
def balance_tpr(cfg, featdata):
    """
    Find the threshold of class index 0 that yields equal number of true
    positive samples of each class.
    Currently only available for binary classes.

    The threshold is the median of the pooled out-of-fold values returned by
    get_predict_proba() over all cross-validation folds.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()

    Returns
    =======
    The median value described above (0.5 if only one value was collected).

    Raises
    ======
    ValueError: cfg.CLASSIFIER is not one of GB, XGB, RF, LDA, rLDA.
    NotImplementedError: cfg.CV_PERFORM is not LeaveOneOut or
        StratifiedShuffleSplit.
    """
    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        print('balance_tpr(): Using %d cores' % n_jobs)

    # Init a classifier
    if cfg.CLASSIFIER == 'GB':
        cls = GradientBoostingClassifier(loss='deviance',
                                         learning_rate=cfg.GB['learning_rate'],
                                         n_estimators=cfg.GB['trees'],
                                         subsample=1.0,
                                         max_depth=cfg.GB['max_depth'],
                                         random_state=cfg.GB['seed'],
                                         max_features='sqrt',
                                         verbose=0,
                                         warm_start=False,
                                         presort='auto')
    elif cfg.CLASSIFIER == 'XGB':
        cls = XGBClassifier(loss='deviance',
                            learning_rate=cfg.GB['learning_rate'],
                            n_estimators=cfg.GB['trees'],
                            subsample=1.0,
                            max_depth=cfg.GB['max_depth'],
                            random_state=cfg.GB['seed'],
                            max_features='sqrt',
                            verbose=0,
                            warm_start=False,
                            presort='auto')
    elif cfg.CLASSIFIER == 'RF':
        cls = RandomForestClassifier(n_estimators=cfg.RF['trees'],
                                     max_features='auto',
                                     max_depth=cfg.RF['max_depth'],
                                     n_jobs=cfg.N_JOBS,
                                     random_state=cfg.RF['seed'],
                                     oob_score=True,
                                     class_weight='balanced_subsample')
    elif cfg.CLASSIFIER == 'LDA':
        cls = LDA()
    elif cfg.CLASSIFIER == 'rLDA':
        cls = rLDA(cfg.RLDA_REGULARIZE_COEFF)
    else:
        raise ValueError('Unknown classifier type %s' % cfg.CLASSIFIER)

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.PSD['wlen'] is None:
        # Fall back to the window length stored with the features
        cfg.PSD['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    if cfg.CV_PERFORM == 'LeaveOneOut':
        print('\n>> %d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif cfg.CV_PERFORM == 'StratifiedShuffleSplit':
        print(
            '\n>> %d-fold stratified cross-validation with test set ratio %.2f'
            % (cfg.CV_FOLDS, cfg.CV_TEST_RATIO))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(Y_data[:, 0],
                                        cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
        else:
            cv = StratifiedShuffleSplit(n_splits=cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
    else:
        raise NotImplementedError('%s is not supported yet. Sorry.' %
                                  cfg.CV_PERFORM)
    print('%d trials, %d samples per trial, %d feature dimension' %
          (ntrials, nsamples, fsize))

    # For classifier itself, single core is usually faster
    cls.n_jobs = 1

    # The old API iterates the splitter object itself; the new one needs split()
    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])

    # The pool is created only after the configuration has been validated,
    # and the finally-clause releases the workers even if a fold raises.
    pool = mp.Pool(n_jobs) if n_jobs > 1 else None
    Y_preds = []
    try:
        results = []
        for cnum, (train, test) in enumerate(splits, start=1):
            # Merge the selected trials into one sample axis
            X_train = np.concatenate(X_data[train])
            X_test = np.concatenate(X_data[test])
            Y_train = np.concatenate(Y_data[train])
            Y_test = np.concatenate(Y_data[test])
            job_args = [cls, X_train, Y_train, X_test, Y_test, cnum]
            if pool is not None:
                results.append(pool.apply_async(get_predict_proba, job_args))
            else:
                Y_preds.append(get_predict_proba(*job_args))

        # Aggregate predictions
        if pool is not None:
            pool.close()
            pool.join()
            Y_preds.extend(r.get() for r in results)
    finally:
        if pool is not None:
            pool.terminate()

    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find threshold for class index 0: the median of the sorted values
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5  # should not reach here in normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx - 1] + (Y_preds[mid_idx] -
                                        Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
scoring=scoring, cv=5, return_train_score=True) sorted(scores.keys()) print('测试结果:', scores) # scores类型为字典。包含训练得分,拟合次数, score-times (得分次数) # ==================================K折交叉验证、留一交叉验证、留p交叉验证、随机排列交叉验证========================================== # k折划分子集 kf = KFold(n_splits=2) for train, test in kf.split(iris.data): print("k折划分:%s %s" % (train.shape, test.shape)) break # 留一划分子集 loo = LeaveOneOut() for train, test in loo.split(iris.data): print("留一划分:%s %s" % (train.shape, test.shape)) break # 留p划分子集 lpo = LeavePOut(p=2) for train, test in loo.split(iris.data): print("留p划分:%s %s" % (train.shape, test.shape)) break # 随机排列划分子集 ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0) for train_index, test_index in ss.split(iris.data): print("随机排列划分:%s %s" % (train.shape, test.shape)) break
# Rows 50..149 of the seaborn iris table: columns 0-3 are the features,
# column 4 is the label.
iris = sns.load_dataset("iris")
X = iris.values[50:150, 0:4]
y = iris.values[50:150, 4]

# Logistic regression
log_model = LogisticRegression()
m = np.shape(X)[0]

# 10-fold cross-validation
y_pred = cross_val_predict(log_model, X, y, cv=10)
print(metrics.accuracy_score(y, y_pred))

# Leave-one-out: count how often the single held-out sample is predicted
# correctly.
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X):
    log_model.fit(X[train], y[train])  # fitting
    y_p = log_model.predict(X[test])
    if y_p == y[test]:
        accuracy += 1
print(accuracy / m)

''' transfusion-blood dats set analysis '''
# import numpy as np # for matrix calculation
dataset_transfusion = np.loadtxt('../data/transfusion.data',
                                 delimiter=",",
                                 skiprows=1)
# Columns 0-3 are the features, column 4 is the label
X2 = dataset_transfusion[:, 0:4]
y2 = dataset_transfusion[:, 4]
print c,ytest,a if a>200: test_pred = [5] else: test_pred = [2] if(test_pred[0] == ytest[0]): return 1 else: return 0 from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() # xall = x3 # yall = y3 xall = np.array(x1.tolist()+x2.tolist()+x3.tolist()) yall = np.array(y1.tolist()+y2.tolist()+y3.tolist()) score = [0,0,0,0,0] total = [0,0,0,0,0] for a,b in loo.split(xall,yall): #trainindices = a #testindex = b xtrain, xtest = xall[a], xall[b] ytrain, ytest = yall[a], yall[b] score[ytest[0]-1] +=get_score(xtrain,xtest,ytrain,ytest) total[ytest[0]-1] +=1 print ['Transport','Weather','Inflation','Fuel','Hoarding'] print 'correct = ', score, 'overall = ', sum(score) print 'total = ', total, 'total = ', sum(total) print 'accuracy = ',[score[i]*1.0/total[i] for i in range(0,5)], 'overall = ', sum(score)*1.0/sum(total)
# evaluate_model_CNN_LSTM(self, df): leave-one-out loop over the level-0 groups
# of `df`; for each split it slices train/test frames, builds a torch
# Sequential model, runs `epochs` optimisation steps and computes an absolute
# error sum (`mae`) on the held-out group.
# NOTE(review): the f-string in the epoch loop reads `n_epochs`, but only
#   `epochs` is assigned in this function - presumably a NameError unless a
#   global of that name exists; confirm.
# NOTE(review): an un-commented `input()` call follows `print(y_)`, so every
#   epoch appears to wait for console input - looks like debugging residue.
# NOTE(review): `verbose`, `batch_size`, the train-side `temp` and the
#   `df_*_timestamps/_features/_output` slices are assigned but never read in
#   the visible code, and `mae` is computed without being returned or stored
#   here - the function may continue beyond this excerpt.
def evaluate_model_CNN_LSTM(self, df): verbose, epochs, batch_size = 0, 25, 64 loo = LeaveOneOut() for train, test in loo.split(range(len(df.groupby(level=0)))): df_train, df_test = df.loc[pd.IndexSlice[train, :], :], df.loc[ pd.IndexSlice[test, :], :] df_test_timestamps, df_test_features, df_test_output = df_test.iloc[:, 0], df_test.iloc[:, 1: -5], df_test.iloc[:, -5:] df_train_timestamps, df_train_features, df_train_output = df_train.iloc[:, 0], df_train.iloc[:, 1: -5], df_train.iloc[:, -5:] #df_train_timestamps.reset_index(level=1, drop=True, inplace=True) #df_train_features.index = df_train_features.index.droplevel(1) #df_train_output.index = df_train_output.index.droplevel(1) #print(df_train.loc[pd.IndexSlice[2, :],:]) for train_index in train: [w, h] = (df_train.loc[pd.IndexSlice[ train_index, :], :'raw_acc:magnitude_stats:time_entropy'] ).shape temp = (df_train.loc[pd.IndexSlice[ train_index, :], :'raw_acc:magnitude_stats:time_entropy'] ).values.reshape(1, -1, w, h) print([w, h]) #input() model = torch.nn.Sequential( torch.nn.Conv2d(1, 32, 3, stride=2, bias=True, padding=3), torch.nn.ReLU(), torch.nn.Dropout(0.5), torch.nn.MaxPool2d(3), torch.nn.Conv2d(32, 64, 3, stride=2, bias=True, padding=3), torch.nn.ReLU(), torch.nn.Dropout(0.5), torch.nn.MaxPool2d(3), torch.nn.Flatten(0, 1), torch.nn.LSTM(w, h, 100), torch.nn.Dropout(0.5), torch.nn.Linear(100, 5), torch.nn.ReLU(), torch.nn.Linear(100, 5), torch.nn.Softmax()) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters()) for epoch in range(epochs): model = model.float() model.train() optimizer.zero_grad() y_ = model( torch.tensor(df_train.loc[ pd.IndexSlice[train_index, :], : 'raw_acc:magnitude_stats:time_entropy'].values. 
reshape(1, -1, w, h)).float()) print(y_) input() loss = criterion(y_, df_train.iloc[0, -5:].values) loss.backward() optimizer.step() print(f"Epoch {epoch+1}/{n_epochs}, loss = {loss.item()}") [w, h] = (df_test.loc[pd.IndexSlice[ test, :], :'raw_acc:magnitude_stats:time_entropy']).shape temp = (df_test.loc[pd.IndexSlice[ test, :], :'raw_acc:magnitude_stats:time_entropy'] ).values.reshape(1, -1, w, h) y_test_predict = model(temp) mae = np.sum( np.absolute((df_test.loc[pd.IndexSlice[test, :], 'label:LYING_DOWN':]).values - y_test_predict))
def func(x, a, b, k):
    """Model curve fitted below: ``k * log((b - a) / (b - x))``."""
    log_ratio = np.log((b - a) / (b - x))
    return np.multiply(log_ratio, k)


LAI_test_pred = []
NDVI_test = []
LAI_test = []
LAI_ALL_P = []
# Fitted parameter vectors, plus each component collected separately
param = []
A = []
B = []
K = []

for train_index, test_index in loo.split(NDVI):
    X_train, X_test = NDVI[train_index], NDVI[test_index]
    y_train, y_test = LAI[train_index], LAI[test_index]

    # Parameter bounds: the first list holds the lower bound of every
    # parameter, the second list the upper bounds.
    param_bounds = ([0.01, 0.93, 1.3], [0.15, 0.95, 1.8])
    popt, pcov = curve_fit(func, X_train, y_train, bounds=param_bounds)
    param.append(popt)

    a, b, k = popt
    A.append(a)
    B.append(b)
    K.append(k)

    NDVI_test.append(X_test)
    LAI_test.append(y_test)
def evaluate_by_loo(energies_train, energies_target, regr=None):
    """Evaluate a regressor with leave-one-out cross-validation.

    For every fold the regressor is refit on all samples but one, predictions
    for the training part and for the held-out sample are obtained through
    ``execute`` (together with the module-level ``features_statistics``), and
    train R^2, train RMSE and test RMSE are recorded. A summary of the
    averaged scores is printed.

    :param energies_train: features, indexed positionally with integer index
        arrays (``energies_train[idx]``).
    :param energies_target: targets, indexed with ``.iloc`` and read through
        ``.values`` (a pandas object).
    :param regr: estimator exposing ``fit`` and ``score``. Defaults to a new
        ``LinearRegression()`` per call, so no fitted estimator is shared
        between calls. A caller-supplied estimator is refit in place.
    :return: tuple ``(actual_powers, predicted_powers)`` of NumPy arrays with
        the held-out target and its prediction for every fold.
    """
    if regr is None:
        regr = LinearRegression()

    loo = LeaveOneOut()
    train_r2_scores = []
    train_rmse_scores = []
    test_rmse_scores = []
    predicted_powers = np.array([])
    actual_powers = np.array([])

    for train_index, test_index in loo.split(energies_train):
        x_train, x_test = energies_train[train_index], \
            energies_train[test_index]
        y_train, y_test = energies_target.iloc[train_index], \
            energies_target.iloc[test_index]

        regr.fit(x_train, y_train)

        y_train_pred = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=x_train,
            prediction=True)
        y_test_pred = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=x_test,
            prediction=True)

        train_r2_scores.append(regr.score(x_train, y_train))
        train_rmse_scores.append(rmse(y_train, y_train_pred))
        test_rmse_scores.append(rmse(y_test.values, y_test_pred))

        # np.append flattens its arguments, so the returned arrays stay 1-D.
        actual_powers = np.append(actual_powers, y_test.values[0])
        predicted_powers = np.append(predicted_powers, y_test_pred[0])

    # R^2 is not defined for a single sample, so scoring each one-sample fold
    # separately only yields NaN. Score the pooled out-of-fold predictions
    # instead; the message text is kept unchanged for existing log readers.
    test_r2 = r2_score(actual_powers, predicted_powers)

    # The standard deviation of the target is the natural baseline for RMSE.
    print("Train average RMSE: {}\tTest average RMSE:{}".format(
        np.average(train_rmse_scores), np.average(test_rmse_scores)))
    print("Train average R^2: {}\tTest average R^2:{}".format(
        np.average(train_r2_scores), test_r2))

    return actual_powers, predicted_powers
def regress_loo(features, grades, method='ridge', standard=False, use_intercept=True,
                groups=None, convert='none', alpha=1.0):
    """Calculates linear regression with leave-one-out split and regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    method : str
        Regression model used. ``'ridge'`` (default) selects ``Ridge``; any
        other value selects ``Lasso``.
    standard : bool
        Choice whether to center features by the mean of training split.
        Defaults to false, since whitened PCA is assumed to be centered.
    use_intercept : bool
        Choice whether to use intercept term on the model.
    groups : ndarray
        Patient groups. Currently not used: the split is always a plain
        leave-one-out over samples.
    convert : str
        ``'exp'`` or ``'log'`` transforms the ground truth before fitting;
        any other value leaves it unchanged.
    alpha : float
        Regularization strength passed to the regression model.

    Returns
    -------
    Array of model predictions (one per left-out sample), and the
    coefficients and intercept term of the model fitted in the LAST fold.
    """
    # Convert grades
    if convert == 'exp':
        grades = np.exp(grades)
    elif convert == 'log':
        grades = np.log(grades)

    # NOTE(review): the `normalize` argument was deprecated in scikit-learn
    # 1.0 and removed in 1.2 -- confirm the pinned version supports it.
    model_class = Ridge if method == 'ridge' else Lasso

    predictions = []
    loo = LeaveOneOut()
    for train_idx, test_idx in loo.split(features):
        x_train, x_test = features[train_idx], features[test_idx]
        y_train = grades[train_idx]

        # Center both parts with the mean of the training split only.
        # Out-of-place subtraction also works for integer-typed features,
        # where in-place `-=` with a float mean raises a casting error.
        if standard:
            train_mean = x_train.mean(0)
            x_test = x_test - train_mean
            x_train = x_train - train_mean

        model = model_class(alpha=alpha, normalize=True, random_state=42,
                            fit_intercept=use_intercept)
        model.fit(x_train, y_train)

        # Evaluate on the left-out sample
        predictions.append(model.predict(x_test))

    return np.array(predictions).squeeze(), model.coef_, model.intercept_
# pass None as weights to np.average: uniform mean multioutput = None return np.average(output_errors, weights=multioutput) '''Q1-1-1 PCA+PCR''' import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut from sklearn import metrics loo = LeaveOneOut() ytests = [] ypreds = [] for train_idx, test_idx in loo.split(X_pca): X_train, X_test = X_pca[train_idx], X_pca[test_idx] #requires arrays y_train, y_test = y[train_idx], y[test_idx] model = LinearRegression() model.fit(X=X_train, y=y_train) y_pred = model.predict(X_test) # there is only one y-test and y-pred per iteration over the loo.split, # so to get a proper graph, we append them to respective lists. ytests += list(y_test) ypreds += list(y_pred) Score_R2 = metrics.r2_score(ytests, ypreds) Score_MAE = metrics.mean_absolute_error(ytests, ypreds) Score_MSE = metrics.mean_squared_error(ytests, ypreds)
# Fit a linear SVM on the training split and predict the test split.
clf = LinearSVC(C=10)
start_time = time.time()
clf = clf.fit(x_tr, y_tr)
predictions_tr = clf.predict(x_ts)

# 10-fold cross-validation on the training split; cross_val_score fits
# clones, so `clf` itself is not refit here.
scores = cross_val_score(clf, x_tr, y_tr, cv=10)
test_acc = accuracy_score(y_ts, predictions_tr)
print("Training Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Test Accuracy: %0.4f" % test_acc)
print("--- %s seconds ---" % (time.time() - start_time))

# Leave-one-out validation
if leave_one_out_validation:
    from sklearn.base import clone

    loo_train_acc = []
    loo = LeaveOneOut()
    # Validate with an unfitted copy so that `clf`, which is saved below,
    # stays fitted on the full training split and not on the last LOO fold.
    loo_clf = clone(clf)
    for train_index, test_index in loo.split(x_tr):
        X_train, X_test = x_tr[train_index], x_tr[test_index]
        y_train, y_test = y_tr[train_index], y_tr[test_index]
        loo_clf.fit(X_train, y_train)
        predictions = loo_clf.predict(X_test)
        loo_train_acc.append(accuracy_score(y_test, predictions))
    loo_train_accuracy = np.asarray(loo_train_acc)
    print("LOO Accuracy: %0.4f" % loo_train_accuracy.mean())

# Save the model; the context manager closes the file even if pickling fails.
with open(f'{model_save_path}/model1_inception_svm.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)
def _print_train_results(classifier_name, classifier, regressors, response, regressor_names, leave_one_out):
    """
        _print_train_results
            Performs validation tests of the model and prints the results

    The cross-validation score, timing, train-set confusion matrix and AUC
    are reported through ``MESSAGES.AddMessage``. The confusion matrix and
    AUC call ``classifier.predict`` / ``decision_function`` /
    ``predict_proba`` directly, so ``classifier`` is presumably already
    fitted by the caller -- TODO confirm.

    :param classifier_name: Name of the classifier method
    :param classifier: Classifier object
    :param regressors: numpy array with the regressors used to train the model
    :param response: numpy array with the response used to train the model
    :param regressor_names: List with the name of the regressors
    :param leave_one_out: Boolean, true to perform leave-one-out cross-validation, otherwise perform default cross
            validation
    :return: None
    """
    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(classifier_name,
                                                                      str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
        end = timer()
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response)
        end = timer()
        # The default number of folds depends on the scikit-learn version,
        # so report the number of scores that were actually returned.
        MESSAGES.AddMessage("Score ({}-Fold):".format(len(cv_score)) + str(cv_score.mean()))

    # cross_val_score returns one score per fit, in both branches.
    n_tests = len(cv_score)

    # Print validation time
    MESSAGES.AddMessage("Testing time: {:.3f} seconds, {:.3f} seconds per test".format(end - start,
                                                                                     (end - start) / n_tests))

    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")
    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have decision_function attribute but count with predict_proba instead
    # TODO: Generalize to anything that does not have decision_function "Easier to ask for forgiveness than permission"
    if classifier_name in ["Random Forest"]:
        # Keep only the probability column of the class labelled 1.
        des_fun = classifier.predict_proba(regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(roc_auc_score(response, des_fun)))

    # Give the importance of the features if it is supported
    # TODO: Generalize to anything that does have feature_importances_ "Easier to ask for forgiveness than permission"
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        importances = [[name, val*100] for name, val in zip(regressor_names, classifier.feature_importances_)]
        # Pad the name column to the longest regressor name.
        long_word = max(len(x) for x in regressor_names)
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

# Logistic-regression model from the library; m is the number of samples.
log_model = LogisticRegression()
m = np.shape(X)[0]

# 10-fold CV: accuracy of the out-of-fold predictions
y_pred = cross_val_predict(log_model, X, y, cv=10)
print(metrics.accuracy_score(y, y_pred))

# LOOCV: refit on every "all but one" split and count the correct hits
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X):
    # fit() returns the estimator itself, so the prediction can be chained
    y_p = log_model.fit(X[train], y[train]).predict(X[test])
    accuracy += 1 if y_p == y[test] else 0

# X is not modified above, so m still equals the sample count
print(accuracy / m)

# Alternative kept for reference: LOOCV expressed as m-fold CV
# m = np.shape(X)[0]
# scores_loo = cross_val_score(log_model, X, y, cv=m)
# print(scores_loo)
# # prediction using 10-folds
# y_pred_loo = cross_val_predict(log_model, X, y, cv=m)
# print(metrics.accuracy_score(y, y_pred_loo))

'''
transfusion-blood dats set analysis
'''