def _iter_cv(n):
    """Return a leave-one-out cross-validation iterator over ``n`` samples.

    Works with both scikit-learn APIs: the ``model_selection`` one
    (``LeaveOneOut().split(X)``) and the legacy one where ``LeaveOneOut``
    takes the number of samples and is itself iterable.

    :param n: number of samples to split.
    :return: an iterable yielding ``(train_indices, test_indices)`` pairs.
    """
    if hasattr(LeaveOneOut, 'split'):
        # Only the number of rows matters for the split, so a dummy
        # (n, 1) array is enough.
        return LeaveOneOut().split(np.zeros((n, 1)))
    # XXX support sklearn < 0.18: the legacy class takes the sample count.
    # The original passed ``len(data)``, but no ``data`` exists in this scope.
    return LeaveOneOut(n)
def run_leave_one_out_cv(features, labels, classifier=None):
    """
    Runs leave one out CV.

    :param features: Features shape(epoch, feature)
    :param labels: list of labels of length num epochs
    :param classifier: Sklearn classifier (Defaults to a fresh LDA per call).
        The classifier is fitted in place once per fold.
    :return: A list of cross validation scores, one per held-out epoch.
        Use np.average on the result to find the average score.
    """
    if classifier is None:
        # Build the default here rather than in the signature, so a single
        # estimator instance is not shared (and refitted) across calls.
        classifier = LinearDiscriminantAnalysis()

    # Convert once; the original converted the labels twice per fold.
    label_array = np.asarray(labels)

    loo = LeaveOneOut()
    scores = []
    for train_indexes, test_indexes in loo.split(features, labels):
        # Assert our split covers every epoch exactly once
        CCDLAssert.assert_equal(len(train_indexes) + len(test_indexes), features.shape[0])

        X_train, X_test = features[train_indexes, :], features[test_indexes, :]
        Y_train, Y_test = label_array[train_indexes], label_array[test_indexes]

        # Assert our X_train and X_test have the same number of features
        CCDLAssert.assert_equal(X_train.shape[1], X_test.shape[1])

        # Fit on the training epochs and score on the held-out one
        classifier.fit(X_train, Y_train)
        scores.append(classifier.score(X_test, Y_test))
    return scores
def main(argv):
    """Compare cache-vs-full-database selection strategies with leave-one-out CV.

    For every query (row) the function picks either the cache precision or the
    full-database precision according to several strategies and accumulates
    the chosen precision:

    * ``cache`` / ``db``: always the cache / always the full database,
    * ``ml``: the prediction returned by ``train_lr``,
    * ``ql``: a query-likelihood heuristic on the ``ql_*`` columns,
    * ``best``: the true label (oracle),
    * ``rand``: a coin flip.

    Averages over all queries and over the "bad" queries (cache precision
    lower than full precision) are printed as a tab-separated table.

    :param argv: ``[filename, threshold, dup]`` where ``filename`` is a CSV
        under ``../../data/cache_selection_structured/``, ``threshold`` is the
        logistic-regression threshold passed to ``train_lr``, and a non-zero
        ``dup`` duplicates the bad queries before cross-validation.
    """
    filename = argv[0]
    t = float(argv[1])  # threshold for logistic regression (default=0.5)
    dup = int(argv[2])  # if 1, bad queries will be duplicated
    subset = 'cache'  # column title for precision of cache
    full = 'full'  # column title for precision of full db

    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis=1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df = pd.concat([df, bads], ignore_index=True)

    y = df['label']
    df = df.drop(['label'], axis=1)
    X = df

    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    bad_counter = 0
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The precision columns are outcomes, not features: read them for the
        # held-out query and remove them from both design matrices.
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)

        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                          X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100

        # learn the model
        print(X_train.shape)
        print(df.columns.shape)
        y_pred = train_lr(X_train, y_train, X_test, y_test, t)
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100

        p20_mean += [p12, p100, ml, ql, best, rnd]
        if is_bad:
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1

    print('final results:')
    print('\t'.join(map(str, ['set', 'cache', 'db', 'ml', 'ql', 'best', 'rand'])))
    # map() returns an iterator on Python 3, so it must be materialised
    # before it can be concatenated with a list.
    print('\t'.join(['bad'] + list(map(str, np.round(bad_mean[0] / bad_counter, 2)))))
    print('\t'.join(['all'] + list(map(str, np.round(p20_mean[0] / df.shape[0], 2)))))
def roc_data(X, Y, clf, n_iter=50, test_size=0.1):
    """Collect cross-validated labels and predicted probabilities per label.

    ``clf`` is refitted on every training split and ``predict_proba`` is
    evaluated on the matching test split. Results are pooled over splits.

    :param X: feature array, indexed as ``X[rows, :]``.
    :param Y: 2-D label array, one column per label.
    :param clf: estimator exposing ``fit`` and ``predict_proba``.
    :param n_iter: number of shuffle-split iterations.
    :param test_size: test fraction for the shuffle split. When both
        ``n_iter`` and ``test_size`` are ``None``, leave-one-out is used.
    :return: ``(Y_cv, p, p_1, p_0)``, four dicts keyed by label index holding
        the true labels, the predicted probabilities, and the predicted
        probabilities restricted to true label 1 and to true label 0.
    """
    if n_iter is None and test_size is None:
        cv = LeaveOneOut()
    else:
        # The model_selection API (used via cv.split below) names this
        # parameter n_splits; n_iter belonged to the removed
        # sklearn.cross_validation API.
        cv = ShuffleSplit(n_splits=n_iter, test_size=test_size)

    n_labels = Y.shape[1]
    Y_cv = {i: [] for i in range(n_labels)}
    p = {i: [] for i in range(n_labels)}
    p_1 = {i: [] for i in range(n_labels)}
    p_0 = {i: [] for i in range(n_labels)}

    for train, test in cv.split(Y):
        clf.fit(X[train, :], Y[train, :])
        Y_predicted = clf.predict_proba(X[test, :])
        for i in range(n_labels):
            if isinstance(Y_predicted, list):
                # One (n_samples, n_classes) array per label: take the
                # complement of the first class column.
                p_ = 1 - Y_predicted[i][:, 0]
            else:
                p_ = Y_predicted[:, i]
            truth = Y[test, i]
            Y_cv[i] += list(truth)
            p[i] += list(p_)
            p_1[i] += list(p_[np.where(truth == 1)[0]])
            p_0[i] += list(p_[np.where(truth == 0)[0]])
    return Y_cv, p, p_1, p_0
def _print_classification_results(classifier, regressors, response, regressors_test, response_test,
                                  regressor_names, messages):
    """Fit ``classifier`` and report its quality through ``messages``.

    The report contains the leave-one-out cross-validation score on the
    training data, the score, confusion matrix and AUC on the evaluation set,
    and the positive feature importances in decreasing order. The evaluation
    set is the test set when both ``regressors_test`` and ``response_test``
    are given, otherwise the training set itself.

    :param classifier: estimator exposing ``n_estimators``, ``learning_rate``,
        ``decision_function`` and ``feature_importances_``.
    :param regressors: training features.
    :param response: training labels.
    :param regressors_test: optional test features, or None.
    :param response_test: optional test labels, or None.
    :param regressor_names: feature names, aligned with the importances.
    :param messages: object whose ``AddMessage(str)`` receives each line.
    """
    splitter = LeaveOneOut()
    cv_score = cross_val_score(classifier, regressors, response, cv=splitter.split(regressors))
    classifier.fit(regressors, response)

    emit = messages.AddMessage
    emit("Adaboost classifier with " + str(classifier.n_estimators) +
         " estimators and learning rate " + str(classifier.learning_rate))

    # Fall back to the training data unless a complete test set was supplied.
    if regressors_test is not None and response_test is not None:
        t_set = "Test"
    else:
        regressors_test, response_test = regressors, response
        t_set = "Train"

    emit("Score (" + t_set + " Set):" + str(classifier.score(regressors_test, response_test)))
    emit("Score (Leave one Out):" + str(cv_score.mean()))

    emit("Confusion Matrix (" + t_set + " Set):")
    confusion = confusion_matrix(response_test, classifier.predict(regressors_test))
    labels = ["Non Prospective", "Prospective"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    emit(row_format.format("", "", "Predicted", ""))
    emit(row_format.format("True", "", *labels))
    for row_label, counts in zip(labels, confusion):
        emit(row_format.format("", row_label, *counts))

    auc = roc_auc_score(response_test, classifier.decision_function(regressors_test))
    emit("Area Under the curve (AUC):" + str(auc))

    emit("Feature importances: ")
    ranked = sorted(zip(regressor_names, classifier.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    for feature, weight in ranked:
        if weight > 0:
            emit(feature + ": \t" + str(weight*100) + "%")
def redraw(self):
    """Fit the selected OLS model and draw the plot chosen in the dialog.

    The model terms come from the interaction list (when "include all" is
    checked) or from the selected list. The fitted model is stored in
    ``DOE.res``. Leave-one-out cross-validation provides CV predictions,
    CV residuals and a jackknife-style spread of the coefficients. One of
    the following is then drawn, depending on the checked radio button:
    coefficients with confidence intervals, coefficients with CV-based
    intervals, fitted vs. measured, CV-predicted vs. measured, leverage
    contour, response-surface contour, dispersion matrix, or inflation
    factors. Axis label, grid and tick options from the dialog are applied
    last and the figure is handed to ``self.addmpl``.

    Returns ``()`` early (after an error box where applicable) when no
    variable is selected, when there are more factors than observations, or
    when the surface-table dialog is cancelled or has X and Y on the same
    variable. Otherwise returns ``None``.
    """
    if self.includeallcheckBox.isChecked():
        source = self.interactionlistWidget
    else:
        source = self.selectedlistWidget
    variables = [source.item(i).text() for i in range(source.count())]
    nX = len(variables)
    if nX < 1:
        QtWidgets.QMessageBox.critical(self, 'Error', "Too few variables selected!",
                                       QtWidgets.QMessageBox.Ok)
        return ()

    Yname = self.YcomboBox.currentText()
    Lc = DS.Lc[DS.Ic]
    Gc = DS.Gc[DS.Ic]
    Lcy = Lc[Gc]
    # NOTE(review): assumes DS.Gc is a boolean mask (it is used as one on the
    # line above). Unary minus on boolean arrays is rejected by current
    # numpy; ``~`` is the logical complement the original relied on.
    Lcx = Lc[~Gc]
    data = DS.Raw.loc[DS.Ir, DS.Ic]
    Y = data[Lcy]
    X = data[Lcx]
    if nX > X.shape[0]:
        QtWidgets.QMessageBox.critical(self, 'Error', "Factors > Observation! \n Reduce factors.",
                                       QtWidgets.QMessageBox.Ok)
        return ()
    ny = self.YcomboBox.currentIndex()
    Y = Y.values.astype('float')
    X = X.values.astype('float')
    Y = Y[:, ny]
    nr = len(Y)

    # Build the patsy model description term by term.
    basey = [Term([LookupFactor(Yname)])]
    basex = []
    if 'Intercept' in variables:
        basex = [INTERCEPT]
        # Build a new list instead of removing items from the list being
        # iterated, which silently skips the element after each removal.
        variables = [term for term in variables if term != 'Intercept']
    for term in variables:
        vterm = term.split(':')
        term_lookup = [LookupFactor(name) for name in vterm]
        if len(term_lookup) > 1 and vterm[0] == vterm[1]:
            # "a:a" denotes the quadratic term of a.
            term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
        basex.append(Term(term_lookup))
    desc = ModelDesc(basey, basex)

    data = np.column_stack((X, Y))
    columns = Lcx.tolist()
    columns.append(Yname)
    data = pd.DataFrame(data, columns=columns)
    y, mx = dmatrices(desc, data, return_type='dataframe')
    # Dispersion matrix (X'X)^-1 of the design matrix.
    dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
    DOE.res = OLS(y, mx).fit()
    params = DOE.res.params

    # calculation of cross-validation
    ypcv = []
    rcv = []
    bres = []
    for train_index, test_index in LeaveOneOut().split(mx):
        # The splitter yields positions, so index positionally (.iloc);
        # the removed ``.ix`` indexer was label-based on integer indexes.
        rescv = OLS(y.iloc[train_index, :], mx.iloc[train_index, :]).fit()
        predicted = rescv.predict(mx.iloc[test_index, :]).values[0]
        ypcv.append(predicted)
        rcv.append(predicted - y.iloc[test_index, :].values[0])
        bres.append((rescv.params - params).values**2)
    bres = pd.DataFrame(bres)
    bres = bres.sum() * nr / (nr - 1)
    bres = np.sqrt(bres.values)
    tres = np.abs(params.values / bres)
    # NOTE(review): this uses the density (pdf) of ``t`` where a p-value
    # would normally use a tail probability - confirm the intent.
    pt = 2 * t.pdf(tres, nr)

    def contour_plot(npts, evaluate):
        """Draw a filled contour of ``evaluate(design_matrix)`` over two factors.

        The factors, their ranges and the constants come from the surface
        table dialog. Returns ``(figure, axes)``, or ``None`` when the dialog
        was cancelled or X and Y were put on the same variable.
        """
        Ftable = surtabDlg.launch(None)
        if len(np.shape(Ftable)) == 0:
            return None
        if np.argmax(Ftable['X axis'].values) == np.argmax(Ftable['Y axis'].values):
            QtWidgets.QMessageBox.critical(self, 'Error', "Two variables on the same axis",
                                           QtWidgets.QMessageBox.Ok)
            return None
        cfig = plt.figure()
        cax = cfig.add_subplot(111)
        xname = Ftable[(Ftable['X axis'] == True).values].index[0]
        yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
        cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
        cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value'].values.tolist()
        gx = np.linspace(float(Ftable['min'][xname]), float(Ftable['max'][xname]), npts)
        gy = np.linspace(float(Ftable['min'][yname]), float(Ftable['max'][yname]), npts)
        # Full factorial grid with x varying slowest, matching the nested
        # loops of the original implementation.
        px = np.repeat(gx, npts)
        py = np.tile(gy, npts)
        # The response column is only a placeholder so that the model
        # description can be evaluated on the grid.
        grid = pd.DataFrame({xname: px, yname: py, Yname: px})
        xtitle = ''
        for name, value in zip(cname, cvalue):
            xtitle = xtitle + name + ' = ' + str(value)
            grid[name] = np.ones(npts**2) * float(value)
        gmx = dmatrices(desc, grid, return_type='dataframe')[1]
        pz = np.asarray(evaluate(gmx))
        # The points already lie on the regular (gx, gy) grid, so no
        # interpolation is needed (matplotlib.mlab.griddata was removed in
        # Matplotlib 3.1). contour() expects rows = y and columns = x.
        z = pz.reshape(npts, npts).T
        plt.contour(gx, gy, z, 15, linewidths=0.5, colors='k')
        plt.contourf(gx, gy, z, 15, cmap=plt.cm.rainbow)
        plt.colorbar()
        cax.set_xlabel(xname)
        cax.set_ylabel(yname)
        cax.set_title(xtitle)
        cax.set_xlim([px.min(), px.max()])
        cax.set_ylim([py.min(), py.max()])
        return cfig, cax

    fig = Figure()
    ax = fig.add_subplot(111)
    if self.coefradioButton.isChecked():
        conf = DOE.res.conf_int()
        if params.index[0] == 'Intercept':
            ind = np.arange(1, len(params))
            vcol = ['red' if DOE.res.pvalues.iloc[i] < 0.05 else 'blue' for i in ind]
            ax.bar(ind, params.iloc[1:], align='center', color=vcol)
            ax.set_title('Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                conf.iloc[0, 0], params.iloc[0], conf.iloc[0, 1]))
            ax.set_xticklabels(params.index[1:], rotation='vertical')
            cmin = params.iloc[1:] - conf.iloc[1:, 0]
            cmax = conf.iloc[1:, 1] - params.iloc[1:]
            ax.errorbar(ind, params.iloc[1:], yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(params) + 1)
            ax.bar(ind, params, align='center')
            ax.set_title('Coefficient Value : None Intercept')
            ax.set_xticklabels(params.index[0:], rotation='vertical')
            # Error-bar lengths must be non-negative: estimate minus lower
            # bound (the original subtracted in the opposite order here).
            cmin = params - conf.iloc[:, 0]
            cmax = conf.iloc[:, 1] - params
            ax.errorbar(ind, params, yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        ax.set_xticks(ind)
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.coefpredradioButton.isChecked():
        half_width = tres * bres / np.sqrt(nr)
        if params.index[0] == 'Intercept':
            ind = np.arange(1, len(params))
            vcol = ['red' if pt[i] < 0.05 else 'blue' for i in ind]
            ax.bar(ind, params.iloc[1:], align='center', color=vcol)
            ax.set_title('Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                params.iloc[0] - half_width[0], params.iloc[0],
                params.iloc[0] + half_width[0]))
            ax.set_xticklabels(params.index[1:], rotation='vertical')
            ax.errorbar(ind, params.iloc[1:], yerr=half_width[1:],
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(params) + 1)
            ax.bar(ind, params, align='center')
            ax.set_title('Coefficient Value : None Intercept')
            ax.set_xticklabels(params.index[0:], rotation='vertical')
            ax.errorbar(ind, params, yerr=half_width, fmt='o', ecolor='green')
        ax.set_xticks(ind)
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.fitradioButton.isChecked():
        yf = DOE.res.fittedvalues.tolist()
        resid = DOE.res.resid.tolist()
        ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('Fitted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        # Same range on both axes so the identity line is the diagonal.
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        df = DOE.res.df_resid
        vares = np.sum(DOE.res.resid**2) / df
        rmsef = np.sqrt(vares)
        vary = np.var(y.values)
        evar = (1 - vares / vary) * 100
        ax.set_title('df {:3.0f}; RMSEF {:6.2f}; Exp.Var.{:5.1f}%'.format(df, rmsef, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        ax.set_xlabel('Measured Values')
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i, 0], yf[i]))
    elif self.predradioButton.isChecked():
        ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('CV Predicted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('CV Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        ax.set_xlabel('Measured Values')
        df = DS.Raw.shape[0]
        varcv = np.sum(np.array(rcv)**2) / df
        rmsecv = np.sqrt(varcv)
        vary = np.var(y.values)
        evar = (1 - varcv / vary) * 100
        ax.set_title('df {:3.0f}; RMSECV {:6.2f}; Exp.Var.{:5.1f}%'.format(df, rmsecv, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i, 0], ypcv[i]))
    elif self.levradioButton.isChecked():
        # Leverage: diagonal of X (X'X)^-1 X' evaluated on a 20 x 20 grid.
        drawn = contour_plot(20, lambda gmx: np.diag(np.dot(np.dot(gmx, dism), gmx.T)))
        if drawn is None:
            return ()
        fig, ax = drawn
    elif self.surradioButton.isChecked():
        # Response surface: model prediction on a 100 x 100 grid.
        drawn = contour_plot(100, DOE.res.predict)
        if drawn is None:
            return ()
        fig, ax = drawn
    elif self.dismradioButton.isChecked():
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(dism)
        fig.colorbar(cax)
        ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
    elif self.inflradioButton.isChecked():
        mxc = preprocessing.scale(mx.values, with_mean=True, with_std=False)
        mxc2 = mxc**2
        infl = np.sum(mxc2, axis=0) * np.diag(dism)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
        fig.colorbar(cax)
        ax.yaxis.grid(False)
        # Booleans, not 'off': non-empty strings are truthy in current
        # Matplotlib and would leave the ticks visible.
        ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        ax.set_xlabel('Inlaction Factor')

    # Common axis decorations driven by the dialog options.
    if self.XcheckBox.isChecked():
        if self.XlineEdit.text():
            ax.set_xlabel(self.XlineEdit.text())
    else:
        ax.set_xlabel('')
    if self.YcheckBox.isChecked():
        if self.YlineEdit.text():
            ax.set_ylabel(self.YlineEdit.text())
    else:
        ax.set_ylabel('')
    ax.xaxis.grid(bool(self.XGcheckBox.isChecked()))
    ax.yaxis.grid(bool(self.YGcheckBox.isChecked()))
    if not self.XMcheckBox.isChecked():
        ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    if not self.YMcheckBox.isChecked():
        ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
    self.rmmpl()
    self.addmpl(fig)
import numpy as np
from sklearn.model_selection import LeaveOneOut

# ----------------------------------------------------
# class sklearn.model_selection.LeaveOneOut()
#
# Small demonstration: every sample is used once as the
# test set while the remaining samples form the training set.
# ----------------------------------------------------

X = np.array([1, 2, 3, 4])
y = np.array([5, 6, 7, 8])

# Number of splits equals the number of samples.
loo = LeaveOneOut()
print(loo.get_n_splits(X))
print(loo)

loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    for _caption, _values in (('X_train \n', X_train),
                              ('X_test \n', X_test),
                              ('y_train \n', y_train),
                              ('y_test \n', y_test)):
        print(_caption, _values)
    print('*********************')
def train(self):
    """Fine-tune and evaluate a binary BERT classifier, leaving one book out.

    For each element of ``self.dfs`` held out as the test set, the sentences
    and labels of the remaining elements are used to fine-tune a fresh
    ``bert-base-uncased`` sequence classifier, which is then evaluated on the
    held-out element.

    Side effects per fold:

    * progress, label mappings, the classification report and the confusion
      matrix are written to ``log.txt`` in ``self.results_folder``
      (the file is truncated at the start of the run),
    * the training-loss curve is saved as
      ``self.img_folder + 'train<k>.png'``,
    * the model weights are saved as ``BERT_Novel_test<k>.bin`` in
      ``self.models_folder``,

    where ``<k>`` is the index of the held-out element.

    NOTE(review): test labels are encoded with the encoder fitted on the
    training labels, so a label that only occurs in the held-out element
    makes ``LabelEncoder.transform`` raise - confirm this cannot happen.
    """
    MAX_LEN = 128
    batch_size = 32
    # Number of training epochs (authors recommend between 2 and 4)
    epochs = 10
    # Use the GPU when available; the original called model.cuda()
    # unconditionally, which defeats this CPU fallback.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def encode(sentences):
        """Tokenize ``sentences``; return padded input ids and attention masks."""
        # Special tokens are added at the beginning and end of each sentence
        # for BERT to work properly.
        encoded = [
            self.tokenizer.encode_plus(sent,
                                       add_special_tokens=True,
                                       max_length=MAX_LEN)
            for sent in sentences
        ]
        # Pad our input tokens
        input_ids = pad_sequences([item["input_ids"] for item in encoded],
                                  maxlen=MAX_LEN,
                                  truncating="post",
                                  padding="post")
        # Mask of 1s for each token followed by 0s for padding
        masks = [[float(token > 0) for token in seq] for seq in input_ids]
        return input_ids, masks

    with open(os.path.join(self.results_folder, "log.txt"), "w") as f_log:
        for train_idx, test_idx in LeaveOneOut().split(self.dfs):
            held_out = test_idx[0]
            train_set = [self.dfs[i] for i in train_idx]
            test_set = self.dfs[held_out]

            # Create sentence and label lists
            sentences_list = []
            labels_list = []
            for book in train_set:
                sentences_list.extend(book.sentence.values)
                labels_list.extend(book.label.values)
                # Log the number of sentences of this book (the original
                # logged the character length of an unrelated sentence).
                f_log.write("Length book: " + str(len(book.sentence.values)) + '\n')
            f_log.write("Sentences: " + str(len(sentences_list)) +
                        ", labels:" + str(len(labels_list)) + '\n')

            f_log.write(str(labels_list[:10]) + '\n')
            f_log.write('Analyze labels' + '\n')
            le = LabelEncoder()
            le.fit(labels_list)
            le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
            f_log.write(str(le_name_mapping) + '\n')
            labels_train = le.transform(labels_list)

            input_ids_train, attention_masks_train = encode(sentences_list)

            # Convert all of our data into torch tensors, the required
            # datatype for our model
            train_inputs = torch.tensor(input_ids_train).to(torch.int64)
            train_labels = torch.tensor(labels_train).to(torch.int64)
            train_masks = torch.tensor(attention_masks_train).to(torch.int64)

            # Shuffle the training examples and feed them in batches.
            train_data = TensorDataset(train_inputs, train_masks, train_labels)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=batch_size)

            torch.cuda.empty_cache()

            # BINARY CLASSIFIER
            model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased", num_labels=2)
            model.to(device)

            # No weight decay for bias / normalisation parameters.
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate': 0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate': 0.0
            }]
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=2e-5,
                                 warmup=.1)

            train_loss_set = []
            for _ in trange(epochs, desc="Epoch"):
                # Set our model to training mode (as opposed to evaluation mode)
                model.train()
                tr_loss = 0
                nb_tr_steps = 0
                # Train the data for one epoch
                for batch in train_dataloader:
                    # Move the batch to the selected device and unpack it
                    b_input_ids, b_input_mask, b_labels = tuple(
                        tensor.to(device) for tensor in batch)
                    # Clear out the gradients (by default they accumulate)
                    optimizer.zero_grad()
                    # Forward pass.
                    # NOTE(review): assumes the model returns the loss
                    # tensor directly when labels are given - confirm
                    # against the BERT library version in use.
                    loss = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
                    train_loss_set.append(loss.item())
                    # Backward pass
                    loss.backward()
                    # Update parameters using the computed gradient
                    optimizer.step()
                    tr_loss += loss.item()
                    nb_tr_steps += 1
                f_log.write("Train loss: {}".format(tr_loss / nb_tr_steps) + '\n')

            plt.figure(figsize=(15, 8))
            plt.title("Training loss")
            plt.xlabel("Batch")
            plt.ylabel("Loss")
            plt.plot(train_loss_set)
            plt.savefig(self.img_folder + 'train' + str(held_out) + '.png')
            # Release the figure; otherwise one figure per fold stays alive.
            plt.close()

            WEIGHTS_NAME = "BERT_Novel_test" + str(held_out) + ".bin"
            output_model_file = os.path.join(self.models_folder, WEIGHTS_NAME)
            f_log.write(str(output_model_file) + '\n')
            torch.save(model.state_dict(), output_model_file)
            # Reload the weights that were just written before evaluating.
            state_dict = torch.load(output_model_file)
            model.load_state_dict(state_dict)

            sentences_eval = test_set.sentence.values
            f_log.write(str(len(sentences_eval)) + '\n')

            f_log.write('Analyze labels test' + '\n')
            f_log.write(str(le_name_mapping) + '\n')
            # Reuse the training encoder: refitting on the test labels can
            # assign different integers to the same class (for example when
            # the held-out book contains only one class).
            labels_test = le.transform(test_set.label.values)

            input_ids_test, attention_masks_test = encode(sentences_eval)
            f_log.write(str(len(attention_masks_test[0])) + '\n')

            prediction_inputs = torch.tensor(input_ids_test).to(torch.int64)
            prediction_masks = torch.tensor(attention_masks_test).to(torch.int64)
            prediction_labels = torch.tensor(labels_test).to(torch.int64)
            prediction_data = TensorDataset(prediction_inputs,
                                            prediction_masks,
                                            prediction_labels)
            prediction_sampler = SequentialSampler(prediction_data)
            prediction_dataloader = DataLoader(prediction_data,
                                               sampler=prediction_sampler,
                                               batch_size=batch_size)

            # Prediction on test set
            # Put model in evaluation mode
            model.eval()
            predictions, true_labels = [], []
            for batch in prediction_dataloader:
                b_input_ids, b_input_mask, b_labels = tuple(
                    tensor.to(device) for tensor in batch)
                # Do not compute or store gradients: saves memory and
                # speeds up prediction
                with torch.no_grad():
                    # Forward pass, calculate logit predictions
                    logits = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
                # Move logits and labels to CPU and store them
                predictions.append(logits.detach().cpu().numpy())
                true_labels.append(b_labels.to('cpu').numpy())

            f_log.write(
                str(len(predictions)) + ' ' + str(len(true_labels)) + '\n')
            f_log.write(str(predictions[0][0]) + '\n')

            # Flatten the per-batch predictions and true values for the
            # aggregate evaluation on the whole held-out element
            flat_predictions = [
                item for sublist in predictions for item in sublist
            ]
            flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
            flat_true_labels = [
                item for sublist in true_labels for item in sublist
            ]

            # Convert each operand to str before concatenating; the original
            # added ints / arrays to a str, which raises TypeError.
            f_log.write(
                str(len(flat_predictions)) + ' ' +
                str(len(flat_true_labels)) + '\n')
            f_log.write(
                str(flat_predictions[989:994]) + ' ' +
                str(flat_true_labels[989:994]) + '\n')
            f_log.write(
                str(flat_predictions[0:11]) + ' ' +
                str(flat_true_labels[0:11]) + '\n')

            f_log.write('Classification Report' + '\n')
            f_log.write(
                str(classification_report(flat_true_labels,
                                          flat_predictions)) + '\n')
            f_log.write(
                str(confusion_matrix(flat_true_labels, flat_predictions)) +
                '\n')
def testXGBoostPredictions(data, parameters, x_cols, y_cols, plots=False):
    """
    Estimate XGBoost prediction accuracy with leave-one-out cross-validation.

    Every row of the data is held out once; a default XGBClassifier is fitted
    on the remaining rows and used to predict the held-out row. The average
    accuracy and a classification report are printed.

    Arguments:
        data {DataFrame} -- Labeled data for classifier testing.
        parameters {namedtuple} -- Parameters for the tree classifier.
                                   Accepted but not read by this function.
        x_cols {array} -- x (feature) columns
        y_cols {array} -- y (label) columns

    Keyword Arguments:
        plots {bool} -- When true, save and show the confusion-matrix heatmap
                        (default: {False})

    Returns:
        tuple -- (average accuracy, true-label DataFrame,
                  predicted-label DataFrame)
    """
    features = data.loc[:, x_cols]
    targets = data.loc[:, y_cols]

    splitter = LeaveOneOut()
    splitter.get_n_splits(data)
    folds = splitter.split(data)

    xgbClassifier = xgb.XGBClassifier()
    fold_accuracy = []
    observed = []
    predicted = []
    # Each row is test data once
    for fit_rows, held_rows in folds:
        fit_x, held_x = features.iloc[fit_rows], features.iloc[held_rows]
        fit_y, held_y = targets.iloc[fit_rows], targets.iloc[held_rows]

        # Fitting train data, then predicting the held-out row
        xgbClassifier = xgbClassifier.fit(fit_x, fit_y.values.ravel())
        guess = xgbClassifier.predict(held_x)

        predicted.append(guess)
        observed.append(held_y.values)
        fold_accuracy.append(accuracy_score(held_y, guess))

    # Forming the dataframes, one row per held-out sample
    pred_label_df = pd.DataFrame(columns=["label"])
    real_label_df = pd.DataFrame(columns=["label"])
    for row, guess in enumerate(predicted):
        pred_label_df.loc[row] = guess[0]
    for row, truth in enumerate(observed):
        real_label_df.loc[row] = truth[0][0]

    if plots:
        # Plotting accuracy heatmap
        matrix = confusion_matrix(real_label_df, pred_label_df)
        matrix_df = pd.DataFrame(matrix, ["Fall", "Normal"], ["Fall", "Normal"])
        sn.set(font_scale=1.5)
        sn.heatmap(matrix_df, annot=True, annot_kws={"size": 32}, fmt='d')
        plt.savefig("../figs/xgboost_heatmap.png",
                    facecolor="w",
                    bbox_inches="tight")
        plt.show()

    avg_acc = np.mean(fold_accuracy)
    # Checking accuracy, rounded to 2 decimals
    print("Tree average accuracy: ", round(avg_acc, 2))
    # More detailed report
    print(classification_report(real_label_df, pred_label_df))
    return (avg_acc, real_label_df, pred_label_df)
def main():
    """Run one ElasticNet experiment from the command line and save its results.

    Parses the command-line options, loads the contrast maps and target
    variable with ``load_data``, runs the leave-one-out experiment (``run``,
    or ``run_transform`` when ``--transform`` is given), prints the
    correlation, R2 and MSE between true and predicted targets, and stores
    the predictions and models with ``save_data`` under
    ``<output_dir>/[<transform>_]<target_var>/<con_type>``.

    Raises:
        FileNotFoundError: if ``--data_dir`` does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Run a particular experiment using an ElasticNet estimator'
    )
    parser.add_argument('--data_dir',
                        dest="data_dir",
                        type=str,
                        required=True,
                        help='Directory where data are located')
    parser.add_argument('--output_dir',
                        dest="output_dir",
                        type=str,
                        required=True,
                        help='Directory where we are going to save the resuts')
    parser.add_argument('--con_type',
                        dest="con_type",
                        type=str,
                        required=True,
                        choices=['look_neg_look_neut', 'reg_neg_look_neg'],
                        help='Which contrast maps to take as input')
    parser.add_argument('--target_var',
                        dest="target_var",
                        type=str,
                        required=True,
                        help='Which variable to take as target')
    parser.add_argument('--n_alphas',
                        dest="n_alphas",
                        type=int,
                        default=1000,
                        help='Number of alphas to try for optimization')
    parser.add_argument('--transform',
                        dest="transform",
                        type=str,
                        choices=['yeo-johnson', 'box-cox'],
                        help='Transform target variable')
    opts = parser.parse_args()

    if opts.transform:
        msg = "Experiment to predict %s transformed %s from %s contrast maps with %d alphas" % (
            opts.transform, opts.target_var, opts.con_type, opts.n_alphas)
    else:
        msg = "Experiment to predict untransformed %s from %s contrast maps with %d alphas" % (
            opts.target_var, opts.con_type, opts.n_alphas)
    print(msg)

    data_dir = os.path.abspath(opts.data_dir)
    if not Path(data_dir).exists():
        # The original did ``raise print(...)``, i.e. ``raise None``, which
        # fails with an unrelated TypeError.
        raise FileNotFoundError("input directory does not exist: %s" % data_dir)

    # Load data
    print("Loading data...")
    X, y = load_data(data_dir, opts.con_type, opts.target_var)

    # Outer cross-validation scheme
    cv_outer = LeaveOneOut()

    print("Running experiment...")
    if opts.transform:
        y_pred, y_true, list_models = run_transform(X,
                                                    y,
                                                    opts.transform,
                                                    cv_outer=cv_outer,
                                                    n_alphas=opts.n_alphas)
    else:
        y_pred, y_true, list_models = run(X,
                                          y,
                                          cv_outer=cv_outer,
                                          n_alphas=opts.n_alphas)

    from sklearn.metrics import r2_score, mean_squared_error
    r = np.corrcoef(y_true, y_pred)[0, 1]
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print("experiment gives r =%.3f, R2 = %.3f, MSE = %.3f" % (r, r2, mse))

    print("Saving results...")
    # Create output_directory for the given case (target->Input)
    if opts.transform:
        output_dir = opj(opts.output_dir,
                         opts.transform + "_" + opts.target_var,
                         opts.con_type)
    else:
        output_dir = opj(opts.output_dir, opts.target_var, opts.con_type)
    output_dir = os.path.abspath(output_dir)
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    save_data(output_dir, y_pred, y_true, list_models)
def compute_acc_conf(x,
                     y,
                     confounds,
                     verbose=False,
                     balanced=True,
                     loo=False,
                     nfolds=10,
                     gs_kfolds=5,
                     optimize=True,
                     C=.01):
    """Cross-validate a linear SVC on ``x``/``y`` with optional confound removal.

    For every fold a ``ConfoundsRm`` model is fitted on the training rows
    (skipped when ``confounds`` is empty), a linear ``SVC`` is optionally
    tuned with ``plib.grid_search`` and fitted, and accuracy, precision and
    recall are collected on the held-out rows.

    :param x: feature matrix indexed as ``x[rows, :]``
    :param y: labels, indexable by fold index arrays
    :param confounds: confound matrix, or an empty sequence to skip removal
    :param verbose: print per-fold diagnostics
    :param balanced: use ``class_weight='balanced'`` for the SVC
    :param loo: leave-one-out instead of stratified k-fold
    :param nfolds: number of stratified folds when ``loo`` is False
    :param gs_kfolds: folds handed to ``plib.grid_search``
    :param optimize: run the grid search before fitting
    :param C: SVC regularisation constant
    :return: ``(1., 0., len(y))`` when ``y`` holds a single class; otherwise
        ``[mean_score, std, len(y)]`` plus the per-sample predictions as a
        fourth element when ``loo`` is True
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)

    # NOTE(review): LeaveOneOut(n), StratifiedKFold(y=..., n_folds=...) and
    # iterating the CV object directly match the pre-0.18 scikit-learn API.
    if loo:
        cv = LeaveOneOut(len(y))
    else:
        cv = StratifiedKFold(y=encoder.transform(y), n_folds=nfolds)

    total_test_score = []
    y_pred = []
    prec = []
    recall = []

    if len(np.unique(y)) == 1:
        print('Unique class: 100%', np.sum(encoder.transform(y) == 0) / len(y))
        return (1., 0., len(y))

    for train, test in cv:
        select_x = x.copy()

        if balanced:
            clf = SVC(kernel='linear', class_weight='balanced', C=C)
        else:
            clf = SVC(kernel='linear', C=C)

        if len(confounds) == 0:
            xtrain = select_x[train, :]
            xtest = select_x[test, :]
        else:
            # Fit the confound model on training rows only, then apply it to
            # both partitions.
            crm = ConfoundsRm(confounds[train, :], select_x[train, :])
            xtrain = crm.transform(confounds[train, :], select_x[train, :])
            xtest = crm.transform(confounds[test, :], select_x[test, :])

        ytrain = encoder.transform(y[train])
        ytest = encoder.transform(y[test])

        if optimize:
            clf, score = plib.grid_search(clf,
                                          xtrain,
                                          ytrain,
                                          n_folds=gs_kfolds,
                                          verbose=verbose)
        clf.fit(xtrain, ytrain)

        # Predict once per fold and reuse the result below.
        fold_pred = clf.predict(xtest)
        fold_score = clf.score(xtest, ytest)
        total_test_score.append(fold_score)
        prec.append(metrics.precision_score(ytest, fold_pred))
        recall.append(metrics.recall_score(ytest, fold_pred))

        if loo:
            y_pred.append(fold_pred)
        if verbose:
            print('nSupport: ', clf.n_support_)
            print("Train:", clf.score(xtrain, ytrain))
            print("Test :", fold_score)
            print("Prediction :", fold_pred)
            print("Real Labels:", ytest)
            print('Precision:', prec[-1], 'Recall:', recall[-1])

    if loo:
        # Each fold predicted exactly one sample; flatten to one value per
        # sample. This used to run for k-fold as well, where y_pred is empty
        # and the 2-D indexing raised IndexError.
        y_pred = np.array(y_pred)[:, 0]
        total_std_test_score = estimate_std(
            metrics.accuracy_score(encoder.transform(y), np.array(y_pred)),
            len(y))
        print('Mean:', np.mean(total_test_score), 'Std:',
              total_std_test_score, 'AvgPrecision:', np.mean(prec),
              'AvgRecall:', np.mean(recall))
        return [
            np.mean(total_test_score), total_std_test_score,
            len(y), y_pred
        ]

    print('Mean:', np.mean(total_test_score), 'Std:',
          np.std(total_test_score), 'AvgPrecision:', np.mean(prec),
          'AvgRecall:', np.mean(recall))
    return [np.mean(total_test_score), np.std(total_test_score), len(y)]
cls(n_splits=3, random_state=0)) assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=3, random_state=2)) assert tokenize(cls(n_splits=3, random_state=0)) != tokenize( cls(n_splits=4, random_state=0)) cv = cls(n_splits=3) assert compute_n_splits(cv, np_X, np_y, np_groups) == 3 with assert_dask_compute(False): assert compute_n_splits(cv, da_X, da_y, da_groups) == 3 @pytest.mark.parametrize("cvs", [(LeaveOneOut(), ), (LeavePOut(2), LeavePOut(3))]) def test_leave_out(cvs): tokens = [] for cv in cvs: assert tokenize(cv) == tokenize(cv) tokens.append(cv) assert len(set(tokens)) == len(tokens) cv = cvs[0] sol = cv.get_n_splits(np_X, np_y, np_groups) assert compute_n_splits(cv, np_X, np_y, np_groups) == sol with assert_dask_compute(True): assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
# 'min_samples_split': [2, 3, 4, 5, 10, 15], # 'max_depth': [3, 4, 5, 10], # "criterion": ["gini"] # } # clf = GridSearchCV( # RandomForestClassifier(), # parameters, # cv=3, # n_jobs=-1 # ) # clf.fit(all_data, all_label) # print(clf.best_estimator_) # L法 print("--- L法 ---") loo = LeaveOneOut() count = [[0, 0], [0, 0]] for train_index, test_index in loo.split(all_data): train_data = [all_data[i] for i in train_index] train_label = [all_label[i] for i in train_index] test_data = [all_data[i] for i in test_index] test_label = [all_label[i] for i in test_index] clf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=10, max_features=6, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
def _print_train_results(classifier_name, classifier, regressors, response,
                         regressor_names, leave_one_out):
    """ _print_train_results
            Performs validation tests of the model and prints the results

    :param classifier_name: Name of the classifier method
    :param classifier: Classifier object (already fitted; it is used directly
        for the train-set confusion matrix and AUC)
    :param regressors: numpy array with the regressors used to train the model
    :param response: numpy array with the response used to train the model
    :param regressor_names: List with the name of the regressors
    :param leave_one_out: Boolean, true to perform leave-one-out
        cross-validation, otherwise perform 3-fold cross validation
    :return: None
    """
    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(
        classifier_name,
        str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier,
                                   regressors,
                                   response,
                                   cv=loo.split(regressors))
        end = timer()
        n_tests = len(response)
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        # Pin the fold count: the message and the per-test timing below both
        # assume 3 folds, but the library default is version dependent.
        n_tests = 3
        start = timer()
        cv_score = cross_val_score(classifier,
                                   regressors,
                                   response,
                                   cv=n_tests)
        end = timer()
        MESSAGES.AddMessage("Score (3-Fold):" + str(cv_score.mean()))

    # Print validation time
    MESSAGES.AddMessage(
        "Testing time: {:.3f} seconds, {:.3f} seconds per test".format(
            end - start, (end - start) / n_tests))

    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")
    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have decision_function attribute but count with predict_proba instead
    # TODO: Generalize to anything that does not have decision_function "Easier to ask for forgiveness than permission"
    if classifier_name in ["Random Forest"]:
        des_fun = classifier.predict_proba(
            regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(
        roc_auc_score(response, des_fun)))

    # Give the importance of the features if it is supported
    # TODO: Generalize to anything that does have feature_importances_ "Easier to ask for forgiveness than permission"
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        importances = [[name, val * 100] for name, val in zip(
            regressor_names, classifier.feature_importances_)]
        long_word = max(len(x) for x in regressor_names)
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1],
                           reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))

    return
stop_ind = start_ind + test_size train_targets = followup_total_PANSS[train_index] test_targets = followup_total_PANSS[test_index] test_subjects = test_subjects + list(np.array(subjectids)[test_index]) # do supervised site correction? if site_correction == 'comBat_supervised': # generate (outer) train and test data and metadata train_data = logm_connectivity_data[train_index, :] test_data = logm_connectivity_data[test_index, :] train_metadata = metadata.iloc[train_index, :] # generate loo training prediction loo = LeaveOneOut() inner_predictions = np.zeros((train_size, )) inner_train_indices, inner_test_indices = min_two_test_CV( site[train_index]) #for j in range(len(inner_train_indices)) : #for inner_train_index, inner_test_index in loo.split(train_data) : skf = StratifiedKFold(n_splits=3) for inner_train_index, inner_test_index in skf.split( train_data, site[train_index]): # print ('i = ' +str(i)) # print ('j = ' +str(j)) # # inner_train_index = inner_train_indices[j] # inner_test_index = inner_test_indices[j] #
def main(argv):
    """Compare cache/full-database selection strategies with leave-one-out.

    Reads ``../../data/cache_selection_structured/<filename>``, labels a query
    1 when the full database beats the cache, and for every held-out query
    records the precision obtained by: always cache, always full db, a
    balanced logistic regression (``ml``), a query-likelihood heuristic
    (``ql``), the oracle (``best``) and a coin flip (``rand``). Averages over
    all queries and over the "bad" queries (cache worse than full db) are
    printed as tab-separated rows.

    :param argv: ``[filename, threshold, dup]``; ``threshold`` is the
        probability cut-off for the logistic regression and ``dup`` (0/1)
        duplicates the bad queries before cross-validation
    """
    filename = argv[0]
    t = float(argv[1])  # threshold for logistic regression (default=0.5)
    dup = int(argv[2])  # if 1, bad queries will be duplicated
    subset = 'cache'  # column title for precision of cache
    full = 'full'  # column title for precision of full db
    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis=1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df = pd.concat([df, bads], ignore_index=True)
    X = df.drop(['label'], axis=1)
    y = df['label']
    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    loo = LeaveOneOut()
    bad_counter = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)

        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                          X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100

        # learn the model (scaler is fitted on the training fold only)
        sc = MinMaxScaler().fit(X_train)
        X_train = sc.transform(X_train)
        X_test = sc.transform(X_test)
        lr = linear_model.LogisticRegression(class_weight='balanced')
        lr.fit(X_train, y_train)
        y_prob = lr.predict_proba(X_test)
        y_pred = y_prob[:, 1] > t
        y_pred = y_pred.astype('uint8')

        # compute ML based effectiveness
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100
        p20_mean += [p12, p100, ml, ql, best, rnd]
        if is_bad:
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1

    print('final results:')
    print('\t'.join(
        map(str, ['set', 'cache', 'db', 'ml', 'ql', 'best', 'rand'])))
    # On Python 3 `map` returns an iterator and cannot be added to a list, so
    # build real lists before concatenating.
    bad_row = [str(v) for v in np.round(bad_mean[0] / bad_counter, 2)]
    all_row = [str(v) for v in np.round(p20_mean[0] / df.shape[0], 2)]
    print('\t'.join(['bad'] + bad_row))
    print('\t'.join(['all'] + all_row))
def HADA(sourceGraph, targetGraph, labels, settings):
    """Predict each subject's target graph from its source graph (leave-one-out).

    For every held-out subject the training target graphs are aligned to the
    source domain with ``Encoder.erun`` over ``H`` hierarchical levels,
    similarity manifolds are learned with SIMLR, and the target graph is
    reconstructed as a trust-score/similarity weighted average of the
    ``TS_bestNb`` nearest neighbours.

    :param sourceGraph: 2-D array indexed as ``sourceGraph[subjects]``
    :param targetGraph: 2-D array with one row per subject
    :param labels: per-subject labels, indexable by the fold's test indices
    :param settings: settings object handed to ``Encoder``
    :return: ``(mae, dataset_source_and_predicted_target, testlabel)`` where
        ``mae`` is the column-wise mean of the per-subject error table,
        the dataset is held-out source rows concatenated with their predicted
        target rows, and ``testlabel`` lists ``labels[test_index]`` per fold
    """
    # initialisation
    # One result row per subject. This used to be hard-coded to 150, which
    # diluted the mean for smaller cohorts and overflowed for larger ones.
    subject = sourceGraph.shape[0]
    overallResult_TSW = np.zeros((subject, 32))
    allSV = np.empty((0, sourceGraph.shape[1]), int)
    allpredTV = np.empty((0, targetGraph.shape[1]), int)
    testlabel = []

    # H denotes the number of hierarchical levels
    H = 2
    # number of neighbors for trust score
    TS_bestNb = 5

    # Create training and testing sets
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(sourceGraph):
        n_train = sourceGraph[train_index].shape[0]
        # Columns are subjects: training subjects first, held-out one last.
        rearrangedTargetView = np.concatenate(
            (np.transpose(targetGraph[train_index]),
             np.transpose(targetGraph[test_index])),
            axis=1)

        ## Domain Alignment (DA) using ARGA and Similarity matrix learning using SIMLR
        simlr = SIMLR.SIMLR_LARGE(1, 50, 0)
        enc = Encoder(settings)

        ## STEP 1: Hierarchical Domain Alignment for training samples
        print("Hierarchical Domain Alignment for traing samples")
        print("level 1")
        Simlarity2, _, _, _ = simlr.fit(targetGraph[train_index])
        encode_S_T = enc.erun(Simlarity2, sourceGraph[train_index])

        temporary = encode_S_T
        for number in range(1, H):
            print("level ", number + 1)
            temporary = enc.erun(Simlarity2, temporary)
        # Bound even when H == 1 (the loop above then does not run).
        encode_train__TV_A = temporary

        ## STEP 2: Target Graph Prediction
        ## STEP 2.1: Source graph embedding of training and testing subjects
        test__train__SV = np.vstack(
            (sourceGraph[train_index], sourceGraph[test_index]))
        print("Source graph embedding of training and testing subjects...")
        Simlarity1, _, _, _ = simlr.fit(test__train__SV)
        encode_test__train__SV = enc.erun(Simlarity1, test__train__SV)

        ## STEP 2.2: Connectomic Manifold Learning using SIMLR
        print("SIMLR...")
        SALL, _, _, _ = simlr.fit(encode_test__train__SV)
        SY, _, _, _ = simlr.fit(encode_train__TV_A)

        # get the best neighbors in the learned manifold of the regularized
        # source graph embeddings
        sall = SALL.todense()
        Index_ALL = np.argsort(-sall, axis=0)
        des = np.sort(-sall, axis=0)
        Bvalue_ALL = -des

        # get the best neighbors in the learned manifold of the hierarchically
        # aligned source and target graph embeddings
        sy = SY.todense()
        Index_Y = np.argsort(-sy, axis=0)

        # make prediction for each testing subject
        for testingSubject in range(1, 2):
            # Single formatted argument prints the same text on Python 2 and
            # 3; the former `print "..."` statement is a Python 3 SyntaxError.
            print("testing subject: {}".format(test_index))
            # get this testing subject's rearranged index and original index
            # NOTE(review): evaluates to n_train - 1, i.e. the last training
            # column rather than the appended test column; confirm intent.
            tSubjectIndex = (n_train - 2) + testingSubject
            tSubjectOriginalIndex = test_index

            # compute Tscore for each neighbor
            trustScore = np.ones((TS_bestNb, TS_bestNb))
            newWeight_TSW = np.ones(TS_bestNb)
            for neighbor in range(0, TS_bestNb):
                neighborIndex = Index_ALL[tSubjectIndex, neighbor]
                temp_counter = 0
                while (neighborIndex > n_train):
                    # best neighbor is a testing data
                    temp_counter = temp_counter + 1
                    neighborIndex = Index_ALL[tSubjectIndex,
                                              (TS_bestNb + temp_counter)]
                if (temp_counter != 0):
                    neighborSequence = TS_bestNb + temp_counter
                else:
                    neighborSequence = neighbor

                # get top nb neighbors in mappedX
                neighborListX = Index_ALL[neighborIndex, 0:TS_bestNb]
                # get top nb neighbors in mappedY
                neighborListY = Index_Y[neighborIndex, 0:TS_bestNb]
                # calculate trust score
                trustScore[TS_bestNb - 1, neighbor] = len(
                    np.intersect1d(np.array(neighborListX),
                                   np.array(neighborListY)))
                # calculate new weight (TS * Similarity)
                newWeight_TSW[neighbor] = exp(
                    trustScore[TS_bestNb - 1, neighbor] / TS_bestNb *
                    Bvalue_ALL[tSubjectIndex, neighborSequence])

            # reconstruct with Tscore and similarity weight
            innerPredict_TSW = np.zeros(
                sourceGraph[train_index].shape[1])[np.newaxis]
            # summing up the best neighbors
            for j1 in range(0, TS_bestNb):
                tr = (rearrangedTargetView[:, Index_ALL[tSubjectIndex,
                                                        j1]])[np.newaxis]
                if j1 == 0:
                    # first term also turns the accumulator into a column
                    innerPredict_TSW = innerPredict_TSW.T + tr.T * newWeight_TSW[
                        j1]
                else:
                    innerPredict_TSW = innerPredict_TSW + tr.T * newWeight_TSW[
                        j1]

            # scale weight to 1
            Scale_TSW = sum(newWeight_TSW)
            innerPredict_TSW = np.divide(innerPredict_TSW, Scale_TSW)

            # calculate result (MAE)
            tr2 = (rearrangedTargetView[:, tSubjectIndex])[np.newaxis]
            iMAE_TSW = mean_absolute_error(tr2.T, innerPredict_TSW)
            overallResult_TSW[tSubjectOriginalIndex,
                              TS_bestNb] = overallResult_TSW[
                                  tSubjectOriginalIndex, TS_bestNb] + iMAE_TSW

            allSV = np.append(allSV, sourceGraph[test_index], axis=0)
            testlabel.append(labels[test_index])
            allpredTV = np.append(allpredTV, innerPredict_TSW.T, axis=0)
            print(test_index)

    dataset_source_and_predicted_target = np.concatenate((allSV, allpredTV),
                                                         axis=1)
    print('END')
    mae = np.mean(overallResult_TSW, axis=0)
    print("Mean Absolute Error: ")
    print(mae[np.nonzero(mae)])
    return mae, dataset_source_and_predicted_target, testlabel
mu, sigma = 0.22266368090882432, 0.027202072213276744 # mean and standard deviation sourceGraph = np.random.normal(mu, sigma, (150, 595)) mu, sigma = 0.08308065685993601, 0.01338490182696101 targetGraph = np.random.normal(mu, sigma, (150, 595)) labels = np.concatenate((np.zeros((1, 75)), np.ones((1, 75))), axis=None) ## HADA execution model = 'arga_ae' #autoencoder/variational autoencoder settings = settings.get_settings_new(model) mae, dataset_source_and_predicted_target, testlabel = HADA( sourceGraph, targetGraph, labels, settings) ## STEP 3: Disease Classification using Random Forest classes = testlabel label = np.array(classes) loo = LeaveOneOut() actual_label = [] predicted_sv_predtv_label = [] RF = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=400,
# Split the header row off the table; the remaining rows are numeric.
header = data_table[0]
del data_table[0]

# Copy the parsed rows into a float matrix, cell by cell.
data = np.zeros((len(data_table), len(data_table[0])))
for i, tmp in enumerate(data_table):
    for j, cell in enumerate(tmp):
        data[i, j] = float(cell)

# Column 0 holds the label, columns 1-4 the features.
feature = data[:, [1, 2, 3, 4]]
labels = data[:, [0]]

# Perform Leave One Out Validation on just the Decision Tree Classifier.
LOO = LeaveOneOut()
number_of_iterations = LOO.get_n_splits(feature)
total_score = 0
d3 = tree.DecisionTreeClassifier()
for train_index, test_index in LOO.split(feature):
    train_features = feature[train_index]
    test_features = feature[test_index]
    train_labels = labels[train_index]
    test_labels = labels[test_index]
    # A fresh tree per fold, scored on the single held-out row.
    d3 = tree.DecisionTreeClassifier()
    clf = d3.fit(train_features, train_labels)
    total_score = total_score + clf.score(test_features, test_labels)

mean_score = total_score / number_of_iterations
score = mean_score
print("D3 + leave one cross validation:", score)
def findBestAlpha(data, x_cols, y_cols, parameters, alphas):
    """
    Pick the MLP regularisation strength with the best leave-one-out accuracy.

    Arguments:
        data {DataFrame} -- Labeled data
        x_cols {array} -- feature column names
        y_cols {array} -- label column names
        parameters {namedTuple} -- remaining MLPClassifier settings
        alphas {array} -- candidate alpha values to evaluate

    Returns the first alpha reaching the highest average accuracy, or 0 when
    no candidate scores above 0.
    """
    features = data.loc[:, x_cols]
    targets = data.loc[:, y_cols]

    winner = 0
    winner_accuracy = 0
    for candidate in alphas:
        folds = LeaveOneOut().split(data)
        model = MLPClassifier(
            hidden_layer_sizes=parameters.hidden_layer_sizes,
            solver=parameters.solver,
            alpha=candidate,
            batch_size=parameters.batch_size,
            learning_rate=parameters.learning_rate,
            learning_rate_init=parameters.learning_rate_init,
            max_iter=parameters.max_iter,
            random_state=parameters.random_state,
            verbose=parameters.verbose,
            early_stopping=parameters.early_stopping,
            validation_fraction=parameters.validation_fraction)

        fold_scores = []
        for fit_rows, held_out in folds:
            # Every row serves as the test sample exactly once.
            model.fit(features.iloc[fit_rows],
                      targets.iloc[fit_rows].values.ravel())
            guess = model.predict(features.iloc[held_out])
            fold_scores.append(
                accuracy_score(targets.iloc[held_out], guess))

        mean_score = np.mean(fold_scores)
        print(candidate, ": average accuracy ", mean_score)
        # Strict comparison keeps the earliest alpha on ties.
        if mean_score > winner_accuracy:
            winner_accuracy = mean_score
            winner = candidate

    print("Best alpha=", winner)
    print("Best accuracy=", winner_accuracy)
    return winner
def testMlp(data, parameters, x_cols, y_cols, plots=False):
    """
    Evaluate an MLP classifier with leave-one-out cross-validation.

    Arguments:
        data {DataFrame} -- Labeled data
        parameters {namedtuple} -- MLPClassifier settings
        x_cols {array} -- x columns
        y_cols {array} -- y columns

    Keyword Arguments:
        plots {bool} -- Draw and save the confusion-matrix heatmap
            (default: {False})

    Returns:
        tuple -- (average accuracy, true-label DataFrame,
            predicted-label DataFrame)
    """
    x = data.loc[:, x_cols]
    y = data.loc[:, y_cols]

    loo = LeaveOneOut()
    n = loo.split(data)

    mlpClassifier = MLPClassifier(
        hidden_layer_sizes=parameters.hidden_layer_sizes,
        solver=parameters.solver,
        alpha=parameters.alpha,
        batch_size=parameters.batch_size,
        learning_rate=parameters.learning_rate,
        learning_rate_init=parameters.learning_rate_init,
        max_iter=parameters.max_iter,
        random_state=parameters.random_state,
        verbose=parameters.verbose,
        early_stopping=parameters.early_stopping,
        validation_fraction=parameters.validation_fraction)

    accuracy_a = []
    real_label = []
    pred_label = []

    for train_index, test_index in n:
        # Each row is test data once
        xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
        ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
        mlpClassifier.fit(xtrain, ytrain.values.ravel())
        ypred = mlpClassifier.predict(xtest)
        pred_label.append(ypred)
        real_label.append(ytest.values)
        acc = accuracy_score(ytest, ypred)
        accuracy_a.append(acc)

    avg_acc = np.mean(accuracy_a)

    pred_label_df = pd.DataFrame(columns=["label"])
    real_label_df = pd.DataFrame(columns=["label"])

    # Forming the dataframes: one row per held-out sample
    for row in range(0, len(pred_label)):
        label_str = pred_label[row][0]
        pred_label_df.loc[row] = label_str

    for row in range(0, len(real_label)):
        # ytest.values is 2-D (1 row x len(y_cols)); take its first cell
        label_str = real_label[row][0][0]
        real_label_df.loc[row] = label_str

    if (plots):
        cm = confusion_matrix(real_label_df, pred_label_df)
        cm_df = pd.DataFrame(cm, ["Fall", "Normal"], ["Fall", "Normal"])
        sn.set(font_scale=1.5)
        sn.heatmap(cm_df, annot=True, annot_kws={"size": 32}, fmt='d')
        # NOTE(review): the file name still says "svm" although this is the
        # MLP evaluation; kept unchanged because other tooling may read it.
        plt.savefig("../figs/svm_heatmap.png",
                    facecolor="w",
                    bbox_inches="tight")
        plt.show()

    # Checking accuracy (label previously read "SVM", a copy-paste slip)
    print("MLP average accuracy: ", round(avg_acc, 2))  # 2 decimals

    # More detailed report
    print(classification_report(real_label_df, pred_label_df))

    return (avg_acc, real_label_df, pred_label_df)
def pca_graph_pvals_less_than():
    """Leave-one-out XGBoost accuracy as a function of the PCA size.

    For 2..29 PCA components and for every leave-one-out fold: keep the taxa
    whose Spearman p-value against the diagnosis (on the training fold) is
    <= 0.01, reduce them with ``apply_pca``, add the clinical covariates and
    fit an ``XGBClassifier``. Appends to the module-level lists ``pcas``,
    ``train_accuracy`` and ``test_accuracy``.

    NOTE(review): ``apply_pca`` is applied to all rows, including the
    held-out one, exactly as in the original code.
    """
    covariates = [
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]
    data = preproccessed_data.join(mapping_file[covariates])
    X = data.drop(covariates, axis=1)
    y = data['DiagnosisGroup']

    # Characters that are replaced by "_" in feature names before fitting.
    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    forbidden = set(('[', ']', '<'))

    for n_comp in range(2, 30):
        pcas.append(n_comp)
        loo = LeaveOneOut()
        y_pred_list = []
        auc_train = []
        for train_index, test_index in loo.split(X):
            train_index = list(train_index)
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # p-value of every taxon against the label; NaN counts as 1.
            most_corelated_taxon = {}
            for i in range(X_train.shape[1]):
                p_val = scipy.stats.spearmanr(X_train.iloc[:, i], y_train)[1]
                if math.isnan(p_val):
                    most_corelated_taxon[X_train.columns[i]] = 1
                else:
                    most_corelated_taxon[X_train.columns[i]] = p_val
            sorted_taxon = sorted(most_corelated_taxon.items(),
                                  key=operator.itemgetter(1))
            most_corelated_taxon = [i for i in sorted_taxon if i[1] <= 0.01]
            bact = [i[0] for i in most_corelated_taxon if i[0] != 1]

            new_data = X[bact]
            otu_after_pca, _ = apply_pca(new_data, n_components=n_comp)
            new_data = otu_after_pca.join(data[covariates], how='inner')

            X_new = new_data.drop(['DiagnosisGroup'], axis=1)
            y_new = new_data['DiagnosisGroup']
            X_new.columns = [
                regex.sub("_", col) if any(x in str(col)
                                           for x in forbidden) else col
                for col in X_new.columns.values
            ]

            X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
            y_train, y_test = y_new[train_index], y_new[test_index]

            model = XGBClassifier(max_depth=4,
                                  n_estimators=150,
                                  learning_rate=15 / 100,
                                  objective='multi:softmax')
            model.fit(X_train, y_train)
            pred_train = model.predict(X_train)
            auc_train.append(metrics.accuracy_score(y_train, pred_train))
            y_pred = model.predict(X_test)
            y_pred_list.append(y_pred[0])

        # Score once all held-out predictions exist. The old code retried
        # inside the loop under a bare `except: pass`, which hid real errors.
        auc = metrics.accuracy_score(y, y_pred_list)
        print('PCA components' + str(n_comp), round(auc, 2))
        scores = round(auc, 2)
        scores_train = round(np.array(auc_train).mean(), 2)
        train_accuracy.append(scores_train)
        # `scores` is a scalar; calling .mean() on it fails for plain floats.
        test_accuracy.append(round(scores, 2))
# Grid-search the lasso penalty with 16-fold cross-validation.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regressor = GridSearchCV(lasso,
                               parameters,
                               scoring='neg_mean_squared_error',
                               cv=16)
lasso_regressor.fit(X, y)
lasso_regressor.best_params_
lasso_regressor.best_score_
# performs worse compared to linear regression
##
#%% try with leave one out
import numpy as np
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
# Subjects that survive the mask; the folds below index into THIS array.
masked_subjects = np.array(subject_list[0:18])[mask]
#loo.get_n_splits(X)
loo.get_n_splits(masked_subjects)
# choose subject number
t1 = creatSubCor(subject_list, '1', 'speficTrial_0_trauma.npy')
df_clinical.describe()
X = np.array(subjectPosEdges[0:18])[mask].reshape(-1, 1)
y = np.array(diffPCL)[mask]  #.reshape(-1,1)
print(loo)
for train_index, test_index in loo.split(masked_subjects):
    # should insert matrix thresholding for all subjects picked and
    # calculate (each iteration) the sum of positive edges
    # The fold indices are positions in the masked array, so look the
    # subjects up there (indexing the unmasked list named the wrong ones).
    print(masked_subjects[train_index])
    print("TRAIN:", train_index, "TEST:", test_index)
def lda_project(spike_times,
                spike_clusters,
                event_times,
                event_groups,
                pre_time=0,
                post_time=0.5,
                cross_validation='kfold',
                num_splits=5,
                prob_left=None,
                custom_validation=None):
    """
    Project population vectors onto the LDA axis separating the event groups.

    Spike counts are taken per cluster in the window
    [event_time - pre_time, event_time + post_time]. With cross-validation the
    LDA is fitted on the training trials of each split and the held-out
    trials are projected with that fit.

    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events
    event_groups : 1D array
        group identity of every event (integers or strings)
    pre_time : float
        seconds before each event included in the counting window
    post_time : float
        seconds after each event included in the counting window
    cross_validation : string
        'none'           fit and transform on all data
        'kfold'          K-fold cross-validation (see num_splits)
        'leave-one-out'  leave out the trial that is being decoded
        'block'          leave out the block the decoded trial is in
        'custom'         splits supplied through custom_validation
    num_splits : integer
        ** only for 'kfold' ** number of folds
    prob_left : 1D array
        ** only for 'block' ** probability of the stimulus appearing on the
        left for each trial; runs of equal values define the blocks
    custom_validation : generator
        ** only for 'custom' ** generator yielding
        (train_idxs, test_idxs) pairs

    Returns
    -------
    lda_projection : array
        position along the LDA projection axis for each trial (the direct
        output of ``fit_transform`` when cross_validation is 'none')
    """
    # Check input
    assert cross_validation in [
        'none', 'kfold', 'leave-one-out', 'block', 'custom'
    ]
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # One counting window per event, then trials x neurons
    windows = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(
        spike_times, spike_clusters, windows)
    pop_vector = pop_vector.T

    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    # Without cross-validation the projection comes from a single fit
    if cross_validation == 'none':
        return lda.fit_transform(pop_vector, event_groups)

    # Build the iterator of (train, test) index pairs
    if cross_validation == 'leave-one-out':
        cv = LeaveOneOut().split(pop_vector)
    elif cross_validation == 'kfold':
        cv = KFold(n_splits=num_splits).split(pop_vector)
    elif cross_validation == 'block':
        # Consecutive trials with the same prob_left form one block
        block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
        blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
        cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
    elif cross_validation == 'custom':
        cv = custom_validation

    for train_index, test_index in cv:
        train_groups = [event_groups[j] for j in train_index]
        lda.fit(pop_vector[train_index], train_groups)
        # Keep only the first discriminant for the held-out trials
        held_out = lda.transform(pop_vector[test_index])
        lda_projection[test_index] = held_out[:, 0]

    return lda_projection
from keras.models import Sequential from keras.layers import Dense from keras.optimizers import SGD from sklearn.datasets import load_iris from sklearn.model_selection import LeaveOneOut data, classifier = load_iris(return_X_y=True) data = np.delete(data, 0, 1) data = np.delete(data, 0, 1) score = np.zeros([len(data)]) acc = np.zeros([len(data)]) loocv = LeaveOneOut() for train_index, test_index in loocv.split(data): Xtrain, Xtest = data[train_index], data[test_index] Ytrain, Ytest = classifier[train_index], classifier[test_index] #holdout = 0.2; #Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, classifier, test_size=holdout) model = Sequential() model.add(Dense(30, activation='sigmoid', input_shape=(2, ))) model.add(Dense(30, activation='sigmoid')) model.add(Dense(1)) sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True) model.compile(loss='mean_squared_error', optimizer=sgd,
def allergies_distance_matrix(distance='spearman', clustering='spectral'):
    """Cluster the columns of ``df`` in two groups and evaluate each group.

    Fills the module-level ``dist_mat`` with a pairwise column measure
    (absolute Spearman correlation or Euclidean distance), clusters the
    columns with it, then for each of the two column groups reduces the data
    with PCA, joins ``mapping_file`` and runs a leave-one-out XGBoost
    classification of ``disease``, printing precision/recall and plotting
    confusion matrices.

    Relies on module-level names not visible here: ``df``, ``dist_mat``,
    ``level``, ``dict_bact``, ``taxonomy``, ``mapping_file``, ``apply_pca``,
    ``plot_confusion_matrix``.

    NOTE(review): the source was flattened onto single lines; the nesting
    below is reconstructed. In particular the train-set report is assumed to
    run once after each leave-one-out loop -- confirm against the original.

    :param distance: 'spearman' for |Spearman rho|, anything else for
        Euclidean distance
    :param clustering: 'spectral' for SpectralClustering, anything else for
        AgglomerativeClustering
    """
    # Pairwise measure between every pair of columns of df
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    # From here on `clustering` holds the estimator, not the option string
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)
    bact_label1 = []
    bact_label0 = []
    # Never appended to in this function, so the taxonomy-name loops below
    # iterate over empty lists.
    bact_label = {0: [], 1: []}
    # Split the column names by cluster label (1 vs. everything else)
    for i in range(0, df.shape[1]):
        if clustering.labels_[i] == 1:
            bact_label1.append(df.columns[i])
        else:
            bact_label0.append(df.columns[i])
    bact_label_name = {0: [], 1: []}
    bact_label_tmp = {0: [], 1: []}
    bact_level = level - 1
    for k in [0, 1]:
        # Collect the dict_bact keys whose value list contains the column
        for i in bact_label[k]:
            for key, value in dict_bact.items():
                for j in value:
                    if i == j:
                        bact_label_tmp[k].append(key)
        bact_label_tmp[k] = set(bact_label_tmp[k])
        # Turn each key into the ';'-separated taxonomy prefix up to
        # bact_level, taken from the first matching taxonomy entry
        for i in bact_label_tmp[k]:
            if i != 'else':
                for j in taxonomy:
                    try:
                        if j.split(';')[bact_level] == i:
                            bact_label_name[k].append(','.join(
                                j.split(';')[0:bact_level + 1]))
                            break
                    except:
                        # entries with fewer levels (or non-strings) are skipped
                        continue
            else:
                bact_label_name[k].append('else')
        bact_label_name[k] = set(bact_label_name[k])
    df1 = df[bact_label1]
    df0 = df[bact_label0]

    # ---- cluster 0 ----
    pca = PCA(n_components=min(round(df0.shape[1] / 2) + 1, df0.shape[0]))
    pca.fit(df0)
    # Accumulate explained variance until it exceeds 0.5; the index at which
    # that is detected becomes the component count (at least 1).
    # NOTE: `sum` shadows the builtin for the rest of the function.
    sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if sum <= 0.5:
            sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1
    otu_after_pca0, _ = apply_pca(df0, n_components=num_comp, print_data=False)
    merged_data0 = otu_after_pca0.join(mapping_file)
    X = merged_data0.drop(['disease'], axis=1)
    y = merged_data0['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        # scale_pos_weight = (#label 0) / (#label 1) in the training fold
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)
    # Train-set report; uses the model and training fold of the last split
    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))
    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    # Held-out report over all leave-one-out predictions
    print('Precision: ' + str(round(precision_score(y, y_pred_list), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_list), 2)))
    cnf_matrix = metrics.confusion_matrix(y, y_pred_list)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    #
    # ---- cluster 1: same procedure on the other column group ----
    pca = PCA(n_components=min(round(df1.shape[1] / 2) + 1, df1.shape[0]))
    pca.fit(df1)
    sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if sum <= 0.5:
            sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1
    otu_after_pca1, _ = apply_pca(df1, n_components=num_comp, print_data=False)
    merged_data1 = otu_after_pca1.join(mapping_file)
    X = merged_data1.drop(['disease'], axis=1)
    y = merged_data1['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)
    # Train-set report; uses the model and training fold of the last split
    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))
    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    print('Precision: ' + str(round(precision_score(y, y_pred_list), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_list), 2)))
    cnf_matrix = metrics.confusion_matrix(y, y_pred_list)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
def fit(self, X, y):
    """Fit the model using X as training data and y as target values.

    Parameters
    ----------
    X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])
        NOTE(review): the body reads ``X.iloc[:, 0]`` right after
        ``check_X_y``, so it relies on ``check_X_y`` handing back a
        DataFrame -- confirm this holds for ndarray input.
    y : {array-like, sparse matrix}
        Target values of shape = [n_samples]

    Returns
    -------
    Whatever ``self._fit(X)`` returns.
    """
    X, y = check_X_y(X, y, enforce_univariate=True)
    y = np.asarray(y)
    # Turn the single column of series into an array with one
    # (len(series), 1) entry per case.
    X = np.array(
        [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])
    check_classification_targets(y)

    # if internal cv is desired, the relevant flag forces a grid search
    # to evaluate the possible values,
    # find the best, and then set this classifier's params to match
    if self._cv_for_params:
        grid = GridSearchCV(
            estimator=KNeighborsTimeSeriesClassifier(
                metric=self.metric, n_neighbors=1, algorithm="brute"),
            param_grid=self._param_matrix,
            cv=LeaveOneOut(),
            scoring='accuracy')
        grid.fit(X, y)
        self.metric_params = grid.best_params_['metric_params']

    if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
        if y.ndim != 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array "
                "was expected. Please change the shape of y to "
                "(n_samples, ), for example using ravel().",
                DataConversionWarning, stacklevel=2)

        self.outputs_2d_ = False
        y = y.reshape((-1, 1))
    else:
        self.outputs_2d_ = True

    # Encode every output column as integer class indices.
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
    # same dtype and works on every NumPy version.
    self.classes_ = []
    self._y = np.empty(y.shape, dtype=int)
    for k in range(self._y.shape[1]):
        classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
        self.classes_.append(classes)

    if not self.outputs_2d_:
        self.classes_ = self.classes_[0]
        self._y = self._y.ravel()

    # Temporarily swap the code object behind ``check_array`` for the
    # time-series variant while ``_fit`` runs. The ``finally`` guarantees
    # the original is restored even when ``_fit`` raises; otherwise the
    # patch would leak into every later ``check_array`` call.
    original_code = check_array.__wrapped__.__code__
    check_array.__wrapped__.__code__ = _check_array_ts.__code__
    try:
        fx = self._fit(X)
    finally:
        check_array.__wrapped__.__code__ = original_code

    self._is_fitted = True
    return fx
def testTreePredictions(data, parameters, x_cols, y_cols, plots=False):
    """Score a decision tree with leave-one-out cross-validation.

    Every row of ``data`` is used once as the single test sample while the
    tree is fitted on all remaining rows.

    Arguments:
        data {DataFrame} -- Labeled data for classifier testing.
        parameters {namedtuple} -- Settings for the tree classifier; the
            attributes read are listed in ``option_names`` below.
        x_cols {array} -- Feature column labels.
        y_cols {array} -- Target column labels.

    Keyword Arguments:
        plots {bool} -- When true, export the tree as a PNG and draw the
            confusion-matrix heatmap (default: {False})

    Returns:
        tuple -- (mean accuracy, DataFrame of true labels,
            DataFrame of predicted labels)
    """
    features = data.loc[:, x_cols]
    targets = data.loc[:, y_cols]

    splitter = LeaveOneOut()
    splitter.get_n_splits(data)
    folds = splitter.split(data)

    # Collect the classifier settings from the parameter tuple.
    option_names = (
        "class_weight",
        "criterion",
        "max_depth",
        "max_features",
        "max_leaf_nodes",
        "min_samples_leaf",
        "min_samples_split",
        "min_weight_fraction_leaf",
        "presort",
        "random_state",
        "splitter",
    )
    options = {name: getattr(parameters, name) for name in option_names}
    clf = tree.DecisionTreeClassifier(**options)

    fold_accuracy = []
    truths = []
    predictions = []
    for train_index, test_index in folds:
        # One held-out row per fold.
        x_fit, x_held = features.iloc[train_index], features.iloc[test_index]
        y_fit, y_held = targets.iloc[train_index], targets.iloc[test_index]

        clf = clf.fit(x_fit, y_fit)
        guess = clf.predict(x_held)

        predictions.append(guess)
        truths.append(y_held.values)
        fold_accuracy.append(accuracy_score(y_held, guess))

    # Flatten the per-fold results into single-column frames.
    pred_label_df = pd.DataFrame(columns=["label"])
    for row, guess in enumerate(predictions):
        pred_label_df.loc[row] = guess[0]

    real_label_df = pd.DataFrame(columns=["label"])
    for row, truth in enumerate(truths):
        real_label_df.loc[row] = truth[0][0]

    if plots:
        # tree.plot_tree was reported missing by the original author, so
        # the tree is exported through graphviz/pydot instead.
        dot_buffer = io.StringIO()
        tree.export_graphviz(clf, out_file=dot_buffer)
        (graph, ) = pydot.graph_from_dot_data(dot_buffer.getvalue())
        graph.write_png("../figs/treeClassifier.png")

        class_labels = ["Fall", "Normal"]
        matrix = confusion_matrix(real_label_df, pred_label_df)
        matrix_df = pd.DataFrame(matrix, class_labels, class_labels)
        sn.set(font_scale=1.5)
        sn.heatmap(matrix_df, annot=True, annot_kws={"size": 32}, fmt='d')
        plt.savefig("../figs/tree_heatmap.png",
                    facecolor="w",
                    bbox_inches="tight")
        plt.show()

    avg_acc = np.mean(fold_accuracy)
    # Summary rounded to two decimals, followed by the detailed report.
    print("Tree average accuracy: ", round(avg_acc, 2))
    print(classification_report(real_label_df, pred_label_df))
    return (avg_acc, real_label_df, pred_label_df)
# Iris data cross-validation: 10-fold CV followed by leave-one-out CV of a
# logistic regression model.
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_predict

iris = datasets.load_iris()
# NOTE(review): the slice stops at 149, so one sample fewer than 0:150 is
# used -- confirm this is intended.
x = iris['data'][:149]
y = iris['target'][:149]

log_model = LogisticRegression()
m = x.shape[0]

# 10-fold cross-validated predictions and their accuracy.
y_pred = cross_val_predict(log_model, x, y, cv=10)
print(metrics.accuracy_score(y, y_pred))

# Leave-one-out: count how many held-out samples are predicted correctly.
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(x):
    log_model.fit(x[train], y[train])
    y_pred1 = log_model.predict(x[test])
    if y_pred1 == y[test]:
        accuracy = accuracy + 1
print(accuracy / m)
def _cost_fn(argd, X, y, EX_list, valid_size, n_folds, shuffle, random_state,
             use_partial_fit, info, timeout, _conn, loss_fn=None,
             continuous_loss_fn=False, best_loss=None, n_jobs=1):
    '''Calculate the loss function and send the outcome through ``_conn``.

    The learner and preprocessing steps are read from ``argd`` (either at
    the top level or under ``argd['model']``), a cross-validation iterator
    is chosen from ``n_folds`` / ``shuffle`` / ``valid_size``, a fresh copy
    of the learner is trained on every fold and the pooled validation
    predictions are scored.

    Nothing is returned. A ``(rtype, rval)`` tuple is sent on ``_conn``:
    ``rtype`` is ``'return'`` with a hyperopt result dict, or ``'raise'``
    with the exception instance the caller is expected to re-raise.

    :param n_folds: ``None`` for a single hold-out split of size
        ``valid_size``, ``-1`` for leave-one-out, otherwise the K of K-fold.
    :param loss_fn: optional callable ``loss_fn(y_true, y_pred)``; when it
        is ``None``, ``1 - accuracy`` (classifier) or ``1 - R2`` (regressor)
        is used.
    :param continuous_loss_fn: when true, ``predict_proba`` output is
        pooled instead of ``predict`` output.
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor

        # Set n_jobs parameter if available for given learner
        if hasattr(learner, 'n_jobs'):
            # https://github.com/hyperopt/hyperopt-sklearn/issues/82#issuecomment-430963445
            learner.n_jobs = n_jobs

        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator.
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                try:
                    cv_iter = LeaveOneOut().split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = LeaveOneOut(len(y))
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = StratifiedKFold(n_splits=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state
                                              ).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedKFold(y, n_folds=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state)
            else:
                info('Will use K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = KFold(n_splits=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = KFold(len(y), n_folds=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                # ``np.int`` was removed in NumPy 1.24; the builtin ``int``
                # yields the same dtype on every NumPy version.
                valid_fold = np.ones(len(y), dtype=int)
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                try:
                    cv_iter = PredefinedSplit(valid_fold).split()
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = PredefinedSplit(valid_fold)
            elif is_classif:
                # Adjacent literals instead of a backslash continuation, so
                # the source indentation does not end up inside the message.
                info('Will use stratified shuffle-and-split with validation '
                     'portion:', valid_size)
                try:
                    cv_iter = StratifiedShuffleSplit(
                        1, test_size=valid_size, random_state=random_state
                    ).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedShuffleSplit(
                        y, 1, test_size=valid_size,
                        random_state=random_state)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                try:
                    cv_iter = ShuffleSplit(
                        n_splits=1, test_size=valid_size,
                        random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = ShuffleSplit(
                        len(y), 1, test_size=valid_size,
                        random_state=random_state)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [
                    (EX[train_index], EX[valid_index]) for EX in EX_list
                ]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(
                Xfit, info, preprocessings, Xval,
                EXfit_list, ex_pps_list, EXval_list
            )
            # Every fold starts from an untouched copy of the learner.
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(
                    learner, is_classif, XEXfit, yfit, info,
                    best_loss=best_loss, XEXval=XEXval, yval=yval,
                    timeout=timeout, t_start=t_start
                )
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            # A ``None`` learner is treated as "this fold did not finish";
            # the failure result is built after the loop.
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            if continuous_loss_fn:
                cv_pred_pool = np.append(cv_pred_pool,
                                         learner.predict_proba(XEXval))
            else:
                cv_pred_pool = np.append(cv_pred_pool,
                                         learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' % (
                        100 * (1 - loss),
                        100 * np.sqrt(lossvar))
                    )
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                'iterations': (cv_n_iters.max()
                               if (hasattr(learner, "partial_fit") and
                                   use_partial_fit)
                               else None),
            }
            rtype = 'return'
        # The for loop exit with break, one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on '
                           'all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature,) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError,) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError,) as exc:
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            # Only announce the k-means failure when it really is that
            # error; unrelated AttributeErrors are passed on untouched.
            print('Failing due to k_means_ weirdness')
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics
    in a cross-validation framework.

    Rows with a missing outcome or predictor are dropped before fitting.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        Model must have fit and predict_proba methods; the fitted object
        must expose classes_ and coef_.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold cross-validation (ignored for LOO, where it is replaced by
        the number of splits)
    LOO : bool
        Use leave-one-out instead of shuffled stratified K-fold.

    Returns
    -------
    outD : dict
        fpr : (100, ) FPR grid used for interpolation, np.linspace(0, 1, 100)
        tpr : (100, ) mean true-positive rate over the test folds
        AUC : (nFolds, ) AUC of the ROC of each test fold
        mAUC : AUC of the mean ROC curve
        ACC : (nFolds, ) accuracy in each test fold
        mACC : mean of ACC
        finalResult : return value of model.fit() on all the data
        prob : pd.Series of test-fold probabilities of class 1, averaged
            per index label of df
        coefs : concatenated coef_ of each fold's fit
        Xvars, Yvar, nFolds, LOO, N : bookkeeping about the run
        plus the 'Sensitivity' and 'Specificity' entries of rocStats()"""
    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(X)
        # split() takes the data as its first, required argument; calling
        # it with only y= raises TypeError.
        # NOTE(review): each LOO test fold holds a single sample, so the
        # per-fold ROC/AUC below are probably degenerate -- confirm that
        # per-fold (rather than pooled) metrics are wanted here.
        cv_iter = cv.split(X)
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []
    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        # Column of predict_proba that belongs to class label 1.
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest,
                                                        prob[:, class1Ind])

        # Interpolate onto the shared FPR grid so folds can be averaged.
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(
            ytest, np.round(prob[:, class1Ind]), normalize=True)
        coefs.append(results.coef_[None, :])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    # Pin the end points of the averaged curve to (0, 0) and (1, 1).
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)

    # Compute mean probability over test predictions in CV
    probS = pd.concat(probs).groupby(level=0).mean()
    probS.name = 'Prob'

    # Refit all the data for final model
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))

    outD = {'fpr': fpr,            # (100, ) average FPR for ROC
            'tpr': meanTPR,        # (100, ) average TPR for ROC
            'AUC': auc,            # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,       # (1, ) AUC of the average ROC
            'mACC': meanACC,
            'ACC': acc,            # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,  # final fitted model with predict() exposed
            'prob': probS,         # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs': np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars': predVars,
            'Yvar': outcomeVar,
            'nFolds': nFolds,
            'LOO': 'Yes' if LOO else 'No',
            'N': tmp.shape[0]}
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
import pandas as pd
TimeSeriesSplit)

# Demo script: print the train/validation parts produced by several
# scikit-learn splitters for one small list.

# Toy dataset: the integers 1..10.
data = list(range(1, 11))
print(data)

# Single hold-out split with 80% of the samples in the training part.
print(train_test_split(data, train_size=.8))

# K-fold with 5 splits, no shuffling.
kf = KFold(n_splits=5)
for train, validate in kf.split(data):
    print(train, validate)

# K-fold with 5 splits, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train, validate in kf.split(data):
    print(train, validate)

# Leave-one-out.
loo = LeaveOneOut()
for train, validate in loo.split(data):
    print(train, validate)

# Leave-p-out with p=2.
lpo = LeavePOut(p=2)
for train, validate in lpo.split(data):
    print(train, validate)

# Three shuffle splits with test_size=2 and a fixed seed.
ss = ShuffleSplit(n_splits=3, test_size=2, random_state=0)
for train, validate in ss.split(data):
    print(train, validate)

# Time-series split with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)
for train, validate in tscv.split(data):
    print(train, validate)
from sklearn import datasets
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Load the iris measurements and their class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

knn = KNeighborsClassifier()

# ==== Leave-one-out validation ====
# See http://scikit-learn.org/stable/modules/cross_validation.html#leave-one-out-loo
# and http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html#sklearn.model_selection.LeaveOneOut
# for more details on `LeaveOneOut`.
loo = LeaveOneOut()

# One entry per split: the score on the single held-out sample, i.e. 1 for
# a correct prediction and 0 for an incorrect one.
successes = []
for train_index, test_index in loo.split(X):
    fitted = knn.fit(X[train_index], y[train_index])
    held_out_score = fitted.score(X[test_index], y[test_index])
    successes.append(held_out_score)

# The mean of the per-sample scores is the overall accuracy.
# (`np` is expected to be imported elsewhere in this file.)
mean_score = np.mean(successes)
print("Accuracy for iris dataset with Leave-One-Out validation is {}.\n".
      format(mean_score))
'DiagnosisGroup' ]], how='inner') new_df2 = new_df2.fillna(0) X = new_df2.drop(['DiagnosisGroup'], axis=1) regex = re.compile(r"\[|\]|<", re.IGNORECASE) X.columns = [ regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values ] y = new_df2['DiagnosisGroup'] loo = LeaveOneOut() y_pred_list = [] auc = [] auc_train = [] for train_index, test_index in loo.split(X): train_index = list(train_index) # print("%s %s" % (train_index, test_index)) X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y[train_index], y[test_index] model = XGBClassifier( max_depth=4, n_estimators=150, learning_rate=15 / 100, objective='multi:softmax', reg_lambda=150 #objective='binary:logistic',
output_train = "{}({}: {}) ".format(output_train, i, data[i]) for i in test: bar[i] = "T" output_test = "{}({}: {}) ".format(output_test, i, data[i]) print("[ {} ]".format(" ".join(bar))) print("Train: {}".format(output_train)) print("Test: {}\n".format(output_test)) # Create some data to split with data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]]) # Our two methods loocv = LeaveOneOut() lpocv = LeavePOut(p=P_VAL) split_loocv = loocv.split(data) split_lpocv = lpocv.split(data) print("""\ The Leave-P-Out method works by using every combination of P points as test data. The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods. A bar displaying the current train-test split as well as the actual data points are displayed for each split. In the bar, "-" is a training point and "T" is a test point. """) print("Data:\n{}\n".format(data))
frases = []
# Each line of the file is "<class>>><sentence>". The context manager
# closes the file once reading is done (it was left open before).
with open("intensoes.txt", "r") as f:
    for x in f:
        if not x.strip():
            # Skip blank lines instead of failing on the unpack below.
            continue
        # Split on the first ">>" only, so a sentence may contain ">>".
        classe, texto = x.split(">>", 1)
        intensao.append(classe)
        frases.append(texto.rstrip())  # rstrip removes the trailing \n

# Convert the sentences to a TF-IDF bag-of-words matrix
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             strip_accents='unicode')
intensaoBow = vectorizer.fit_transform(frases)
intensaoNumpy = np.array(intensao)

# Leave-one-out: predict the class of every sentence with a 1-nearest-
# neighbour model fitted on all the other sentences.
leaveOneOut = LeaveOneOut()
result = []
for train_index, test_index in leaveOneOut.split(intensaoBow):
    X_train, X_test = intensaoBow[train_index], intensaoBow[test_index]
    y_train, y_test = intensaoNumpy[train_index], intensaoNumpy[test_index]

    # KNN
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(X_train, y_train)
    resultado = model.predict(X_test)[0]
    result.append(resultado)
def compare_Estimators_fscore():
    """Compare four classifiers with leave-one-out cross-validation.

    Each classifier gets leave-one-out predictions over ``X_train`` /
    ``y_train``; its per-category precision and recall are drawn as bars in
    its own subplot and written to a per-classifier CSV. The combined
    figure is saved as PNG, and as PDF when ``exportPDF`` is truthy.

    Takes no arguments and returns nothing. It reads these names from the
    enclosing scope: ``RS``, ``X_train``, ``y_train``, ``n_samples``,
    ``num_categories``, ``categories``, ``fc``, ``exportDir``, ``fname``,
    ``dpi_all`` and ``exportPDF``.

    NOTE(review): the original indentation was lost; the extent of the
    classifier loop below is reconstructed -- confirm against the source
    file.
    """
    from sklearn.svm import LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import LeaveOneOut
    # Estimators to compare; `names` below holds the matching display
    # names in the same order.
    classifiers = [
        (LinearSVC(random_state=RS, tol=1e-5, C=0.025)),
        (KNeighborsClassifier(3)),
        (GradientBoostingClassifier(n_estimators=200,
                                    random_state=RS,
                                    learning_rate=0.05)),
        GaussianNB(),
    ]
    names = [
        'Linear SVC', 'K-Nearest Neighbors', 'Gradient Boosting',
        'Gaussian Naive Bayes'
    ]
    # One subplot per classifier, side by side.
    columns = 4
    fig, axs = plt.subplots(1, columns, figsize=(16, 6))
    axs = axs.ravel()
    loo = LeaveOneOut()
    outputFile = exportDir / '{0}_compare_classifiers'.format(fname)
    # iterate over classifiers
    for i, clf in enumerate(classifiers):
        # Leave-one-out prediction for every sample.
        y_pred = np.zeros(n_samples)
        for train_index, test_index in loo.split(X_train):
            print("TRAIN:", train_index, "TEST:", test_index)
            clf.fit(X_train[train_index, :], y_train[train_index])
            y_pred[test_index] = clf.predict(X_train[test_index, :])
        precision, recall, fscore, support = precision_recall_fscore_support(
            y_train, y_pred)
        accuracy = accuracy_score(y_train, y_pred)
        index = np.arange(num_categories)
        bar_width = 0.3
        # Precision bar and (more transparent) recall bar per category.
        for c in range(num_categories):
            axs[i].bar(index[c],
                       precision[c],
                       bar_width,
                       alpha=1,
                       color=plt.cm.tab20(i))
            axs[i].bar(index[c] + bar_width,
                       recall[c],
                       bar_width,
                       alpha=0.6,
                       color=plt.cm.tab20(i))
            # (An F-score bar is disabled here.)
        axs[i].set_xticks(np.arange(num_categories) + bar_width)
        axs[i].set_xticklabels(categories, rotation=45, ha='right')
        axs[i].set_xlabel('')
        axs[i].set_ylabel('Score')
        axs[i].set_title(names[i] + r"$\bf{" + ' | Accuracy: ' +
                         str(round(accuracy, 2)) + "}$")
        plt.tight_layout()
        from matplotlib.patches import Patch
        # Legend entries for the two bar kinds (F-score entry disabled).
        legend_elements = [
            Patch(facecolor=fc, label='Precision'),
            Patch(facecolor=fc, alpha=0.6, label='Recall'),
        ]
        plt.legend(handles=legend_elements, loc='lower right')
        # Per-classifier CSV: one row per category, precision and recall
        # as columns.
        df = pd.DataFrame([precision, recall],
                          columns=categories,
                          index=['Precision', 'Recall']).transpose()
        df.to_csv(Path(str(outputFile) + names[i] + ".csv"),
                  index=True,
                  header=True,
                  sep=',')
    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)
    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)
def perform_plot_LOO():
    """Run leave-one-sample-out cross-validation of ``clf_main`` and export
    the results.

    Two artefacts are produced, each as PNG + CSV (and PDF when
    ``exportPDF`` is truthy):

    * a heat map of the class probabilities predicted for every left-out
      sample, with samples ordered by descending ``y_train``;
    * the per-category precision/recall plot drawn by
      ``plotPrecisionRecall``.

    Takes no arguments and returns nothing. It reads these names from the
    enclosing scope: ``X_train``, ``y_train``, ``labels_train``,
    ``clf_main``, ``num_categories``, ``categories``, ``exportDir``,
    ``fname``, ``dpi_all`` and ``exportPDF``.
    """
    n_samples, _ = X_train.shape
    y_pred = np.zeros(n_samples)
    # the probability of assigning each left out sample to each of the classes
    class_probs = np.zeros([n_samples, np.unique(y_train).size])

    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        clf_main.fit(X_train[train_index, :], y_train[train_index])
        y_pred[test_index] = clf_main.predict(X_train[test_index, :])
        try:
            class_probs[test_index, :] = clf_main.predict_proba(
                X_train[test_index, :])
        except Exception:
            # Deliberately best-effort: when probabilities cannot be
            # obtained or stored for this sample, its row stays all zeros
            # and the run continues.
            pass

    precision, recall, _, _ = precision_recall_fscore_support(
        y_train, y_pred)
    accuracy = accuracy_score(y_train, y_pred)

    ## MAKE CLASS PROBABILITY PLOT
    plt.figure()
    # Order samples by descending class so each class forms one band.
    descending = y_train.argsort()[::-1]
    labels_train_temp = labels_train.reset_index(drop=True)
    labels_train_sorted = labels_train_temp[descending]
    prob_loo = class_probs[descending]
    plt.imshow(prob_loo,
               cmap=plt.cm.coolwarm,
               interpolation='none',
               aspect='auto')
    plt.grid(True)
    plt.yticks(np.arange(n_samples),
               labels_train_sorted[0:n_samples],
               fontsize=2,
               rotation=0)
    plt.xticks(np.arange(num_categories),
               categories,
               fontsize=8,
               rotation=45,
               ha='right')
    ax = plt.gca()
    ax.grid(color='w', linestyle='-', linewidth=0)
    plt.colorbar()
    plt.tight_layout()

    outputFile = exportDir / '{0}_class_probs_leave_one_out'.format(fname)
    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)
    # The rows of prob_loo are in sorted order, so they must be labelled
    # with the sorted sample labels (as the plot's y ticks are). Using the
    # unsorted labels paired each row with the wrong sample name.
    df = pd.DataFrame(prob_loo,
                      index=np.asarray(labels_train_sorted),
                      columns=categories)
    df.to_csv(Path(str(outputFile) + ".csv"),
              index=True,
              header=True,
              sep=',')
    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)

    ## PRECISION RECALL PLOT
    plotPrecisionRecall(precision, recall, categories, accuracy)
    outputFile = exportDir / '{0}_precision_recall_training'.format(fname)
    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)
    data = {'Precision': precision, 'Recall': recall}
    df = pd.DataFrame(data, index=categories)
    df.to_csv(Path(str(outputFile) + ".csv"),
              index=True,
              header=True,
              sep=',')
    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)
def _print_train_results(classifier_name, classifier, regressors, response, regressor_names, leave_one_out):
    """
    _print_train_results
        Performs validation tests of the model and prints the results

    The cross-validation score, the validation time, the confusion matrix
    on the training data, the AUC and (for Adaboost) the feature
    importances are reported through ``MESSAGES.AddMessage``.

    NOTE(review): ``classifier.predict`` is called directly on the object
    passed in, so the caller presumably fits it beforehand -- confirm.

    :param classifier_name: Name of the classifier method
    :param classifier: Classifier object
    :param regressors: numpy array with the regressors used to train the model
    :param response: numpy array with the response used to train the model
    :param regressor_names: List with the name of the regressors
    :param leave_one_out: Boolean, true to perform leave-one-out cross-validation, otherwise perform 3-fold
        cross validation
    :return: None
    """
    global MESSAGES

    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(classifier_name,
                                                                      str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
        end = timer()
        n_tests = len(response)
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        # Request three folds explicitly. The library default is 5 since
        # scikit-learn 0.22, which would contradict both the "3-Fold"
        # label and the per-test timing below.
        n_folds = 3
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response, cv=n_folds)
        end = timer()
        n_tests = n_folds
        MESSAGES.AddMessage("Score (3-Fold):" + str(cv_score.mean()))

    # Print validation time
    MESSAGES.AddMessage("Testing time: {:.3f} seconds, {:.3f} seconds per test".format(end - start,
                                                                                       (end - start) / n_tests))

    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")
    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have decision_function attribute but count with predict_proba instead.
    # "Random Forest" keeps its explicit branch; any other classifier without decision_function now
    # falls back to predict_proba as well instead of failing with AttributeError.
    if classifier_name in ["Random Forest"] or not hasattr(classifier, "decision_function"):
        # Probability of the class labelled 1
        des_fun = classifier.predict_proba(regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(roc_auc_score(response, des_fun)))

    # Give the importance of the features if it is supported
    # TODO: Generalize to anything that does have feature_importances_ "Easier to ask for forgiveness than permission"
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        # Importances are reported as percentages
        importances = [[name, val*100] for name, val in zip(regressor_names, classifier.feature_importances_)]
        long_word = max(len(x) for x in regressor_names)
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))
''' from sklearn.linear_model import LogisticRegression from sklearn import metrics from sklearn.model_selection import cross_val_predict # log-regression lib model log_model = LogisticRegression() m = np.shape(X)[0] # 10-folds CV y_pred = cross_val_predict(log_model, X, y, cv=10) print(metrics.accuracy_score(y, y_pred)) # LOOCV from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() accuracy = 0; for train, test in loo.split(X): log_model.fit(X[train], y[train]) # fitting y_p = log_model.predict(X[test]) if y_p == y[test] : accuracy += 1 print(accuracy / np.shape(X)[0]) # m = np.shape(X)[0] # scores_loo = cross_val_score(log_model, X, y, cv=m) # print(scores_loo) # # prediction using 10-folds # y_pred_loo = cross_val_predict(log_model, X, y, cv=m) # print(metrics.accuracy_score(y, y_pred_loo)) '''