Example #1
 def _iter_cv(n):  # XXX support sklearn < 0.18
     if hasattr(LeaveOneOut, 'split'):
         cv = LeaveOneOut()
         return cv.split(np.zeros((n, 1)))
     else:
         cv = LeaveOneOut(n)
         return cv
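A minimal usage sketch (not from the original project), assuming _iter_cv is available at module level and a modern scikit-learn where LeaveOneOut.split exists; the sample count 4 is arbitrary:

import numpy as np
from sklearn.model_selection import LeaveOneOut

# Iterate the folds produced by the compatibility helper above.
for train_idx, test_idx in _iter_cv(4):
    print("train:", train_idx, "test:", test_idx)
# first fold: train: [1 2 3] test: [0]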
Example #2
def run_leave_one_out_cv(features, labels, classifier=LinearDiscriminantAnalysis()):
    """
    Runs leave-one-out cross-validation.
    :param features: Feature array of shape (epoch, feature)
    :param labels: List of labels of length num epochs
    :param classifier: Sklearn classifier (defaults to LDA)
    :return: A list of cross-validation scores.  Use np.average on the result to find the average score.
    """
    loo = LeaveOneOut()
    scores = []
    for train_indexes, test_indexes in loo.split(features, labels):

        # Assert our split covers every epoch exactly once (train + test == total)
        CCDLAssert.assert_equal(len(train_indexes) + len(test_indexes), features.shape[0])

        X_train, X_test = features[train_indexes, :], features[test_indexes, :]
        Y_train, Y_test = np.asarray(labels)[train_indexes], np.asarray(labels)[test_indexes]

        # Assert our X_train and X_test have the same number of features
        CCDLAssert.assert_equal(X_train.shape[1], X_test.shape[1])

        # Fit our classifier to our training data
        classifier.fit(X_train, Y_train)

        score = classifier.score(X_test, Y_test)
        scores.append(score)

    return scores
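A hedged usage sketch with synthetic data (not part of the original project); it assumes run_leave_one_out_cv, its sklearn imports, and the project's CCDLAssert helper are all importable:

import numpy as np

rng = np.random.RandomState(0)
features = rng.randn(20, 5)                 # shape (epoch, feature)
labels = list(rng.randint(0, 2, size=20))   # one label per epoch

scores = run_leave_one_out_cv(features, labels)   # default LDA classifier
print("average LOO accuracy:", np.average(scores))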
Example #3
def main(argv):
    filename = argv[0]
    t = float(argv[1]) # threshold for logistic regression (default=0.5)
    dup = int(argv[2]) # if 1, bad queries will be duplicated
    subset = 'cache' # column title for precision of cache
    full = 'full' # column title for precision of full db
    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis = 1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        df = df.append(bads, ignore_index=True)
    X = df.drop(['label'], axis = 1)
    y = df['label']
    df = df.drop(['label'], axis = 1)
    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    ml_average_rare = 0
    ql_average_rare = 0
    best_average_rare = 0
    loo = LeaveOneOut()
    bad_counter = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)
        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                           X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        #ql_pred = X_test['ql_0_1'].iloc[0] < X_test['ql_rest_0_1'].iloc[0]
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100
        # learn the model
        print(X_train.shape)
        print(df.columns.shape)
        # y_pred = train_lr(X_train, y_train, X_test, y_test, t, df.columns.values[:-2])
        y_pred = train_lr(X_train, y_train, X_test, y_test, t)
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100
        p20_mean += [p12, p100, ml, ql,
                     best, rnd]
        if is_bad:
            #bad_mean += [p12[0], p100[0], ml[0], ql[0], best[0], rnd[0]]
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1
    print('final results:')
    print('\t'.join(map(str,['set', 'cache', 'db', 'ml', 'ql', 'best',
                              'rand'])))
    print('\t'.join(['bad'] + list(map(str, np.round(bad_mean[0] / bad_counter, 2)))))
    print('\t'.join(['all'] + list(map(str, np.round(p20_mean[0] / df.shape[0], 2)))))
Example #4
def roc_data(X,Y,clf,n_iter=50,test_size=0.1):
    if n_iter is None and test_size is None:
        cv = LeaveOneOut()
    else:
        cv = ShuffleSplit(n_splits=n_iter, test_size=test_size)
    n_labels = Y.shape[1]
    Y_cv = {i:[] for i in range(n_labels)}
    p = {i:[] for i in range(n_labels)}
    p_1 = {i:[] for i in range(n_labels)}
    p_0 = {i:[] for i in range(n_labels)}
    for train, test in cv.split(Y):
        clf.fit(X[train,:], Y[train,:])
        Y_predicted = clf.predict_proba(X[test,:])
        for i in range(Y.shape[1]):
            if type(Y_predicted) is list:
                p_ = 1 - Y_predicted[i][:,0]
            else:
                p_ = Y_predicted[:,i]
            Y_cv[i] += list(Y[test,i])
            p[i] += list(p_)
            p_1[i] += list(p_[np.where(Y[test,i]==1)[0]])
            p_0[i] += list(p_[np.where(Y[test,i]==0)[0]])
    return Y_cv, p, p_1, p_0
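A hedged usage sketch for roc_data with synthetic multilabel data (none of this appears in the original project); it assumes the sklearn imports used by roc_data are in scope. RandomForestClassifier is chosen because its multilabel predict_proba returns the list form the function already handles:

from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier

X, Y = make_multilabel_classification(n_samples=100, n_classes=3, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0)

Y_cv, p, p_1, p_0 = roc_data(X, Y, clf, n_iter=10, test_size=0.2)
print(len(Y_cv[0]), len(p[0]))   # pooled true labels and scores for label 0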
Example #5
def _print_classification_results(classifier, regressors, response, regressors_test, response_test, regressor_names,
                                  messages):

    loo = LeaveOneOut()
    cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
    classifier.fit(regressors, response)
    messages.AddMessage("Adaboost classifier with " + str(classifier.n_estimators) + " estimators and learning rate "
                        + str(classifier.learning_rate))

    if regressors_test is None or response_test is None:
        regressors_test = regressors
        response_test = response
        t_set = "Train"
    else:
        t_set = "Test"

    messages.AddMessage("Score (" + t_set + " Set):" + str(classifier.score(regressors_test, response_test)))
    messages.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    messages.AddMessage("Confusion Matrix (" + t_set + " Set):")

    confusion = confusion_matrix(response_test, classifier.predict(regressors_test))
    labels = ["Non Prospective", "Prospective"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    messages.AddMessage(row_format.format("", "", "Predicted", ""))
    messages.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        messages.AddMessage(row_format.format("", label, *row))
    messages.AddMessage("Area Under the curve (AUC):" + str(roc_auc_score(response_test,
                                                            classifier.decision_function(regressors_test))))

    messages.AddMessage("Feature importances: ")
    importances = [[name, val] for name, val in zip(regressor_names, classifier.feature_importances_)]
    for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
        if elem[1] > 0:
            messages.AddMessage(elem[0] + ": \t" + str(elem[1]*100) + "%")
    return
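A self-contained, hedged variant of the leave-one-out scoring step above (synthetic data and default AdaBoost settings, not the original geoprocessing tool). Note that cross_val_score also accepts the splitter object itself in place of loo.split(regressors):

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

rng = np.random.RandomState(0)
regressors = rng.randn(30, 4)
response = rng.randint(0, 2, size=30)

classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
cv_score = cross_val_score(classifier, regressors, response, cv=LeaveOneOut())
print("Score (Leave one Out):", cv_score.mean())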
Example #6
 def redraw(self):
     variables = []
     if self.includeallcheckBox.isChecked():
         for i in range(self.interactionlistWidget.count()):
             variables.append(self.interactionlistWidget.item(i).text())
     else:
         for i in range(self.selectedlistWidget.count()):
             variables.append(self.selectedlistWidget.item(i).text())
     nX = len(variables)
     if nX < 1:
         QtWidgets.QMessageBox.critical(self,'Error',"Too few variables selected!",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     Yname = self.YcomboBox.currentText()
     Lc = DS.Lc[DS.Ic]
     Gc = DS.Gc[DS.Ic]
     Lcy = Lc[Gc]
     Lcx = Lc[-Gc]
     data = DS.Raw.loc[DS.Ir, DS.Ic]
     Y = data[Lcy]
     X = data[Lcx]
     if nX > X.shape[0]:
         QtWidgets.QMessageBox.critical(self,'Error',"Factors > Observation! \n Reduce factors.",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     ny = self.YcomboBox.currentIndex()
     Y = Y.values.astype('float')
     X = X.values.astype('float')
     Y = Y[:, ny]
     nr = len(Y)
     basey = [Term([LookupFactor(Yname)])]
     basex = []
     for term in variables:
         if term == 'Intercept':
             basex = [INTERCEPT]
             variables.remove(term)
     for term in variables:
         vterm = term.split(':')
         term_lookup = [LookupFactor(x) for x in vterm]
         if len(term_lookup) > 1:
             if vterm[0] == vterm[1]:
                 term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
         basex.append(Term(term_lookup))
     desc = ModelDesc(basey, basex)
     data = np.column_stack((X, Y))
     columns = Lcx.tolist()
     columns.append(Yname)
     data = pd.DataFrame(data, columns=columns)
     y, mx = dmatrices(desc, data, return_type='dataframe')
     dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
     mod = OLS(y, mx)
     DOE.res = mod.fit()
     # calculation of cross-validation
     ypcv = list()
     rcv = list()
     bres = list()
     loo = LeaveOneOut()
     loo.get_n_splits(mx)
     for train_index, test_index in loo.split(mx):
         mx_train = mx.ix[train_index, :]
         mx_test = mx.ix[test_index, :]
         y_train = y.ix[train_index, :]
         y_test = y.ix[test_index, :]
         modcv = OLS(y_train, mx_train)
         rescv = modcv.fit()
         ypcv.append(rescv.predict(mx_test).values[0])
         rcv.append(rescv.predict(mx_test).values[0] - y_test.values[0])
         bres.append((rescv.params - DOE.res.params).values**2)
     bres = pd.DataFrame(bres)
     bres = bres.sum() * nr / (nr - 1)
     bres = np.sqrt(bres.values)
     tres = np.abs(DOE.res.params.values / bres)
     pt = 2 * t.pdf(tres, nr)
     fig = Figure()
     ax = fig.add_subplot(111)
     if self.coefradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (DOE.res.pvalues[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title('Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.\
             format(DOE.res.conf_int().ix[0,0],DOE.res.params[0],DOE.res.conf_int().ix[0,1]))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             cmin = DOE.res.params[1:] - DOE.res.conf_int().ix[1:, 0]
             cmax = DOE.res.conf_int().ix[1:, 1] - DOE.res.params[1:]
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             cmin = DOE.res.conf_int().ix[0:, 0] - DOE.res.params[0:]
             cmax = DOE.res.conf_int().ix[0:, 1] - DOE.res.params[0:]
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.coefpredradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (pt[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title(
                 'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                 format(DOE.res.params[0] - tres[0] * bres[0] / np.sqrt(nr),
                        DOE.res.params[0], DOE.res.params[0] +
                        tres[0] * bres[0] / np.sqrt(nr)))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.fitradioButton.isChecked():
         yf = DOE.res.fittedvalues.tolist()
         resid = DOE.res.resid.tolist()
         ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('Fitted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         df = DOE.res.df_resid
         vares = np.sum(DOE.res.resid**2) / df
         rmsef = np.sqrt(vares)
         vary = np.var(y.values)
         evar = (1 - vares / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSEF {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsef, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         ax.set_xlabel('Measured Values')
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                 ax.annotate(str(txt), (y.ix[i], yf[i]))
     elif self.predradioButton.isChecked():
         ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('CV Predicted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('CV Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         ax.set_xlabel('Measured Values')
         df = DS.Raw.shape[0]
         varcv = np.sum(np.array(rcv)**2) / df
         rmsecv = np.sqrt(varcv)
         vary = np.var(y.values)
         evar = (1 - varcv / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSECV {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsecv, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                 ax.annotate(str(txt), (y.ix[i], ypcv[i]))
     elif self.levradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 20
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = np.diag(np.dot(np.dot(mx, dism), mx.T))
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.surradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 100
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = DOE.res.predict(mx)
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.dismradioButton.isChecked():
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(dism)
         fig.colorbar(cax)
         ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
     elif self.inflradioButton.isChecked():
         mxc = preprocessing.scale(mx.values,
                                   with_mean=True,
                                   with_std=False)
         mxc2 = mxc**2
         infl = np.sum(mxc2, axis=0) * np.diag(dism)
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
         fig.colorbar(cax)
         ax.yaxis.grid(False)
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
          ax.set_xlabel('Inflation Factor')
     if self.XcheckBox.isChecked():
         if self.XlineEdit.text():
             ax.set_xlabel(self.XlineEdit.text())
     else:
         ax.set_xlabel('')
     if self.YcheckBox.isChecked():
         if self.YlineEdit.text():
             ax.set_ylabel(self.YlineEdit.text())
     else:
         ax.set_ylabel('')
     if self.XGcheckBox.isChecked():
         ax.xaxis.grid(True)
     else:
         ax.xaxis.grid(False)
     if self.YGcheckBox.isChecked():
         ax.yaxis.grid(True)
     else:
         ax.yaxis.grid(False)
     if not self.XMcheckBox.isChecked():
         ax.tick_params(axis='x',
                        which='both',
                        bottom='off',
                        top='off',
                        labelbottom='off')
     if not self.YMcheckBox.isChecked():
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
     self.rmmpl()
     self.addmpl(fig)
Example #7
import numpy as np
from sklearn.model_selection import LeaveOneOut
# ----------------------------------------------------
'''

class sklearn.model_selection.LeaveOneOut()
'''
# ----------------------------------------------------
X = np.array([1, 2, 3, 4])
y = np.array([5, 6, 7, 8])

loo = LeaveOneOut()

print(loo.get_n_splits(X))
print(loo)

loo = LeaveOneOut()

for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('X_train \n', X_train)
    print('X_test \n', X_test)
    print('y_train \n', y_train)
    print('y_test \n', y_test)
    print('*********************')
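A short follow-on sketch (not part of the original example): the same splitter can be handed to cross_val_score; mean absolute error is used because R^2 is undefined on single-sample test folds:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X2 = X.reshape(-1, 1)   # estimators expect a 2-D feature matrix
scores = cross_val_score(LinearRegression(), X2, y, cv=LeaveOneOut(),
                         scoring="neg_mean_absolute_error")
print(scores)           # one score per left-out sample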
Example #8
    def train(self):
        with open(os.path.join(self.results_folder, "log.txt"), "w") as f_log:
            for train, test in LeaveOneOut().split(self.dfs):
                train_set = [self.dfs[i] for i in train]
                test_set = self.dfs[test[0]]
                # Create sentence and label lists
                sentences_list = []
                labels_list = []
                for i, book in enumerate(train_set):
                    sentences_list.extend(book.sentence.values)
                    labels_list.extend(book.label.values)
                    f_log.write("Length book: " + str(len(sentences_list[i])) +
                                '\n')
                f_log.write("Sentences: " + str(len(sentences_list)) +
                            ", labels:" + str(len(labels_list)) + '\n')

                MAX_LEN = 128
                # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
                sentences_train = [
                    self.tokenizer.encode_plus(sent,
                                               add_special_tokens=True,
                                               max_length=MAX_LEN)
                    for i, sent in enumerate(sentences_list)
                ]

                le = LabelEncoder()
                labels_train = labels_list
                f_log.write(str(labels_train[:10]) + '\n')
                f_log.write('Analyze labels' + '\n')
                le.fit(labels_train)
                le_name_mapping = dict(
                    zip(le.classes_, le.transform(le.classes_)))
                f_log.write(str(le_name_mapping) + '\n')
                labels_train = le.fit_transform(labels_train)

                # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
                input_ids_train = [
                    inputs["input_ids"] for inputs in sentences_train
                ]

                # Pad our input tokens
                input_ids_train = pad_sequences(input_ids_train,
                                                maxlen=MAX_LEN,
                                                truncating="post",
                                                padding="post")
                # Create attention masks
                attention_masks_train = []

                # Create a mask of 1s for each token followed by 0s for padding
                for seq in input_ids_train:
                    seq_mask_train = [float(i > 0) for i in seq]
                    attention_masks_train.append(seq_mask_train)

                # Use train_test_split to split our data into train and validation sets for training
                train_inputs, train_labels = input_ids_train, labels_train
                train_masks, _ = attention_masks_train, input_ids_train

                # Convert all of our data into torch tensors, the required datatype for our model
                train_inputs = torch.tensor(train_inputs).to(torch.int64)
                train_labels = torch.tensor(train_labels).to(torch.int64)
                train_masks = torch.tensor(train_masks).to(torch.int64)

                batch_size = 32
                # Create an iterator of our data with torch DataLoader. This helps save on memory during training
                # because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into
                # memory
                train_data = TensorDataset(train_inputs, train_masks,
                                           train_labels)
                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data,
                                              sampler=train_sampler,
                                              batch_size=batch_size)
                torch.cuda.empty_cache()

                # BINARY CLASSIFIER
                model = BertForSequenceClassification.from_pretrained(
                    "bert-base-uncased", num_labels=2)
                model.cuda()
                param_optimizer = list(model.named_parameters())
                no_decay = ['bias', 'gamma', 'beta']
                optimizer_grouped_parameters = [{
                    'params': [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay_rate':
                    0.01
                }, {
                    'params': [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay_rate':
                    0.0
                }]

                # This variable contains all of the hyperparemeter information our training loop needs
                optimizer = BertAdam(optimizer_grouped_parameters,
                                     lr=2e-5,
                                     warmup=.1)

                train_loss_set = []

                # Number of training epochs (authors recommend between 2 and 4)
                epochs = 10

                device = torch.device(
                    "cuda" if torch.cuda.is_available() else "cpu")
                torch.cuda.get_device_name(0)

                for _ in trange(epochs, desc="Epoch"):
                    # Training
                    # Set our model to training mode (as opposed to evaluation mode)
                    model.train()

                    # Tracking variables
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0

                    # Train the data for one epoch
                    for step, batch in enumerate(train_dataloader):
                        # Add batch to GPU
                        batch = tuple(t.to(device) for t in batch)
                        # Unpack the inputs from our dataloader
                        b_input_ids, b_input_mask, b_labels = batch
                        # Clear out the gradients (by default they accumulate)
                        optimizer.zero_grad()
                        # Forward pass
                        loss = model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                        train_loss_set.append(loss.item())
                        # Backward pass
                        loss.backward()
                        # Update parameters and take a step using the computed gradient
                        optimizer.step()

                        # Update tracking variables
                        tr_loss += loss.item()
                        nb_tr_examples += b_input_ids.size(0)
                        nb_tr_steps += 1

                    f_log.write("Train loss: {}".format(tr_loss /
                                                        nb_tr_steps) + '\n')

                plt.figure(figsize=(15, 8))
                plt.title("Training loss")
                plt.xlabel("Batch")
                plt.ylabel("Loss")
                plt.plot(train_loss_set)
                plt.savefig(self.img_folder + 'train' + str(test[0]) + '.png')

                model_to_save = model
                WEIGHTS_NAME = "BERT_Novel_test" + str(test[0]) + ".bin"
                OUTPUT_DIR = self.models_folder
                output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
                f_log.write(str(output_model_file) + '\n')
                torch.save(model_to_save.state_dict(), output_model_file)
                state_dict = torch.load(output_model_file)
                model.load_state_dict(state_dict)

                sentences6 = test_set.sentence.values
                f_log.write(str(len(sentences6)) + '\n')
                labels6 = test_set.label.values

                labels_test = labels6
                sentences11 = sentences6
                sentences_test = [
                    self.tokenizer.encode_plus(sent,
                                               add_special_tokens=True,
                                               max_length=MAX_LEN)
                    for i, sent in enumerate(sentences11)
                ]

                f_log.write('Analyze labels test' + '\n')
                le.fit(labels_test)
                le_name_mapping = dict(
                    zip(le.classes_, le.transform(le.classes_)))
                f_log.write(str(le_name_mapping) + '\n')
                labels_test = le.fit_transform(labels_test)
                MAX_LEN = 128

                # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
                input_ids1 = [inputs["input_ids"] for inputs in sentences_test]
                # Pad our input tokens
                input_ids1 = pad_sequences(input_ids1,
                                           maxlen=MAX_LEN,
                                           truncating="post",
                                           padding="post")
                # Create attention masks
                attention_masks1 = []

                # Create a mask of 1s for each token followed by 0s for padding
                for seq in input_ids1:
                    seq_mask1 = [float(i > 0) for i in seq]
                    attention_masks1.append(seq_mask1)

                f_log.write(str(len(attention_masks1[0])) + '\n')

                prediction_inputs = torch.tensor(input_ids1).to(torch.int64)
                prediction_masks = torch.tensor(attention_masks1).to(
                    torch.int64)

                prediction_labels = torch.tensor(labels_test).to(torch.int64)

                batch_size = 32
                prediction_data = TensorDataset(prediction_inputs,
                                                prediction_masks,
                                                prediction_labels)
                prediction_sampler = SequentialSampler(prediction_data)
                prediction_dataloader = DataLoader(prediction_data,
                                                   sampler=prediction_sampler,
                                                   batch_size=batch_size)

                # Prediction on test set
                # Put model in evaluation mode
                model.eval()
                # Tracking variables
                predictions, true_labels = [], []
                # Predict
                for batch in prediction_dataloader:
                    # Add batch to GPU
                    batch = tuple(t.to(device) for t in batch)
                    # Unpack the inputs from our dataloader
                    b_input_ids, b_input_mask, b_labels = batch
                    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
                    with torch.no_grad():
                        # Forward pass, calculate logit predictions
                        logits = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask)

                    # Move logits and labels to CPU
                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    # Store predictions and true labels
                    predictions.append(logits)
                    true_labels.append(label_ids)

                f_log.write(
                    str(len(predictions)) + ' ' + str(len(true_labels)) + '\n')
                f_log.write(str(predictions[0][0]) + '\n')

                # Import and evaluate each test batch using Matthew's correlation coefficient
                matthews_set = []

                for i in range(len(true_labels)):
                    matthews = matthews_corrcoef(
                        true_labels[i],
                        np.argmax(predictions[i], axis=1).flatten())
                    matthews_set.append(matthews)

                # Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
                flat_predictions = [
                    item for sublist in predictions for item in sublist
                ]
                flat_predictions = np.argmax(flat_predictions,
                                             axis=1).flatten()
                flat_true_labels = [
                    item for sublist in true_labels for item in sublist
                ]

                f_log.write(
                    str(len(flat_predictions)) + ' ' + str(len(flat_true_labels)) +
                    '\n')
                f_log.write(
                    str(flat_predictions[989:994]) + ' ' +
                    str(flat_true_labels[989:994]) + '\n')
                f_log.write(
                    str(flat_predictions[0:11]) + ' ' +
                    str(flat_true_labels[0:11]) + '\n')
                f_log.write('Classification Report' + '\n')
                f_log.write(
                    str(
                        classification_report(flat_true_labels,
                                              flat_predictions)) + '\n')
                f_log.write(
                    str(confusion_matrix(flat_true_labels, flat_predictions)) +
                    '\n')
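The outer loop above treats each book's DataFrame as a single leave-one-out sample; a tiny hedged sketch of just that pattern (stand-in strings instead of DataFrames, no BERT involved):

from sklearn.model_selection import LeaveOneOut

dfs = ["book_a", "book_b", "book_c"]      # stand-ins for per-book DataFrames
for train, test in LeaveOneOut().split(dfs):
    train_set = [dfs[i] for i in train]
    test_set = dfs[test[0]]
    print("train on", train_set, "-> test on", test_set)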
Example #9
    def testXGBoostPredictions(data, parameters, x_cols, y_cols, plots=False):
        """
        Testing XGBoost prediction accuracies.

        Arguments:
            data {array} -- Labeled data for classifier testing.
            x_cols {array} -- x columns
            y_cols {array} -- y columns
            parameters {namedtuple} -- Parameters for the tree classifier. Using named tuple to keep things tidy.

        Keyword Arguments:
            plots {bool} -- Used for plotting (default: {False})
        """
        x = data.loc[:, x_cols]
        y = data.loc[:, y_cols]

        loo = LeaveOneOut()
        loo.get_n_splits(data)
        n = loo.split(data)

        xgbClassifier = xgb.XGBClassifier()

        accuracy_a = []
        real_label = []
        pred_label = []

        for train_index, test_index in n:  #Each row is test data once
            xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

            #Fitting train data
            xgbClassifier = xgbClassifier.fit(xtrain, ytrain.values.ravel())
            #Predictions
            ypred = xgbClassifier.predict(xtest)
            pred_label.append(ypred)
            real_label.append(ytest.values)
            #Accuracy
            acc = accuracy_score(ytest, ypred)
            accuracy_a.append(acc)

        pred_label_df = pd.DataFrame(columns=["label"])
        real_label_df = pd.DataFrame(columns=["label"])

        #Forming the dataframes
        for row in range(0, len(pred_label)):
            label_str = pred_label[row][0]
            pred_label_df.loc[row] = label_str

        for row in range(0, len(real_label)):
            label_str = real_label[row][0][0]
            real_label_df.loc[row] = label_str

        if (plots):  #Plotting tree and accuracy heatmap

            cm = confusion_matrix(real_label_df, pred_label_df)
            cm_df = pd.DataFrame(cm, ["Fall", "Normal"], ["Fall", "Normal"])

            sn.set(font_scale=1.5)
            sn.heatmap(cm_df, annot=True, annot_kws={"size": 32}, fmt='d')
            plt.savefig("../figs/xgboost_heatmap.png",
                        facecolor="w",
                        bbox_inches="tight")
            plt.show()

        avg_acc = np.mean(accuracy_a)

        #Checking accuracy
        print("Tree average accuracy: ", round(avg_acc, 2))  #2 decimals

        #More detailed report
        print(classification_report(real_label_df, pred_label_df))

        return (avg_acc, real_label_df, pred_label_df)
Example #10
def main():

    parser = argparse.ArgumentParser(
        description='Run a particular experiment using an ElasticNet estimator'
    )
    parser.add_argument('--data_dir',
                        dest="data_dir",
                        type=str,
                        required=True,
                        help='Directory where data are located')
    parser.add_argument('--output_dir',
                        dest="output_dir",
                        type=str,
                        required=True,
                        help='Directory where we are going to save the results')
    parser.add_argument('--con_type',
                        dest="con_type",
                        type=str,
                        required=True,
                        choices=['look_neg_look_neut', 'reg_neg_look_neg'],
                        help='Which contrast maps to take as input')
    parser.add_argument('--target_var',
                        dest="target_var",
                        type=str,
                        required=True,
                        help='Which variable to take as target')

    parser.add_argument('--n_alphas',
                        dest="n_alphas",
                        type=int,
                        default=1000,
                        help='Number of alphas to try for optimization')

    parser.add_argument('--transform',
                        dest="transform",
                        type=str,
                        choices=['yeo-johnson', 'box-cox'],
                        help='Transform target variable')
    opts = parser.parse_args()

    if opts.transform:
        msg = "Experiment to predict %s transformed %s from %s contrast maps with %d alphas" % (
            opts.transform, opts.target_var, opts.con_type, opts.n_alphas)
        print(msg)
    else:
        msg = "Experiment to predict untransformed %s from %s contrast maps with %d alphas" % (
            opts.target_var, opts.con_type, opts.n_alphas)
        print(msg)

    data_dir = os.path.abspath(opts.data_dir)

    if Path(data_dir).exists() is False:
        raise print("input directory does not exist")

    # Load data
    print("Loading data...")
    X, y = load_data(data_dir, opts.con_type, opts.target_var)

    # Build classifier
    cv_outer = LeaveOneOut()

    print("Running experiment...")
    if opts.transform:
        y_pred, y_true, list_models = run_transform(X,
                                                    y,
                                                    opts.transform,
                                                    cv_outer=cv_outer,
                                                    n_alphas=opts.n_alphas)
    else:
        y_pred, y_true, list_models = run(X,
                                          y,
                                          cv_outer=cv_outer,
                                          n_alphas=opts.n_alphas)

    from sklearn.metrics import r2_score, mean_squared_error

    r = np.corrcoef(y_true, y_pred)[0, 1]
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    print("experiment gives r =%.3f, R2 = %.3f, MSE = %.3f" % (r, r2, mse))

    print("Saving results...")
    # Create output_directory for the given case (target->Input)
    if opts.transform:
        output_dir = opj(opts.output_dir,
                         opts.transform + "_" + opts.target_var, opts.con_type)
    else:
        output_dir = opj(opts.output_dir, opts.target_var, opts.con_type)

    output_dir = os.path.abspath(output_dir)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    save_data(output_dir, y_pred, y_true, list_models)
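The project's run and run_transform helpers are not shown here; a hedged, generic sketch of the nested pattern this script relies on (LeaveOneOut as the outer loop around an inner ElasticNetCV fit), using synthetic data and made-up sizes:

import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import LeaveOneOut

rng = np.random.RandomState(0)
X = rng.randn(30, 10)
y = 2.0 * X[:, 0] + 0.1 * rng.randn(30)

y_true, y_pred = [], []
for train_idx, test_idx in LeaveOneOut().split(X):
    model = ElasticNetCV(n_alphas=50, cv=5)   # inner CV selects alpha
    model.fit(X[train_idx], y[train_idx])
    y_pred.append(model.predict(X[test_idx])[0])
    y_true.append(y[test_idx][0])

print("r =", np.corrcoef(y_true, y_pred)[0, 1])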
Example #11
def compute_acc_conf(x,
                     y,
                     confounds,
                     verbose=False,
                     balanced=True,
                     loo=False,
                     nfolds=10,
                     gs_kfolds=5,
                     optimize=True,
                     C=.01):
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)

    if loo:
        cv = LeaveOneOut(len(y))
    else:
        cv = StratifiedKFold(y=encoder.transform(y), n_folds=nfolds)

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    total_test_score = []
    y_pred = []
    # clf_array = []
    bc_all = []

    prec = []
    recall = []

    if len(np.unique(y)) == 1:
        print('Unique class: 100%', np.sum(encoder.transform(y) == 0) / len(y))
        return (1., 0., len(y))

    for i, (train, test) in enumerate(cv):

        select_x = x.copy()

        # betacluster = bc.BetaCluster(crm.transform(confounds[train,:],select_x[train,:]),encoder.transform(y[train]),100,k_feature=200)
        # bc_all.append(betacluster)

        if balanced:
            clf = SVC(kernel='linear', class_weight='balanced', C=C)
        else:
            clf = SVC(kernel='linear', C=C)

        if len(confounds) == 0:
            xtrain = select_x[train, :]
            xtest = select_x[test, :]
        else:
            crm = ConfoundsRm(confounds[train, :], select_x[train, :])
            xtrain = crm.transform(confounds[train, :], select_x[train, :])
            xtest = crm.transform(confounds[test, :], select_x[test, :])

        ytrain = encoder.transform(y[train])
        ytest = encoder.transform(y[test])

        # clf.probability = True
        if optimize:
            clf, score = plib.grid_search(clf,
                                          xtrain,
                                          ytrain,
                                          n_folds=gs_kfolds,
                                          verbose=verbose)

        clf.fit(xtrain, ytrain)
        total_test_score.append(clf.score(xtest, ytest))
        # clf_array.append(clf)

        prec.append(metrics.precision_score(ytest, clf.predict(xtest)))
        recall.append(metrics.recall_score(ytest, clf.predict(xtest)))

        if loo:
            y_pred.append(clf.predict(xtest))
        if verbose:
            print('nSupport: ', clf.n_support_)
            print("Train:", clf.score(xtrain, ytrain))
            print("Test :", clf.score(xtest, ytest))
            print("Prediction :", clf.predict(xtest))
            print("Real Labels:", ytest)
            print('Precision:', prec[-1], 'Recall:', recall[-1])
    if loo:
        y_pred = np.array(y_pred)[:, 0]
        total_std_test_score = estimate_std(
            metrics.accuracy_score(encoder.transform(y), np.array(y_pred)),
            len(y))
        print('Mean:', np.mean(total_test_score), 'Std:', total_std_test_score,
              'AvgPrecision:', np.mean(prec), 'AvgRecall:', np.mean(recall))
        return [
            np.mean(total_test_score), total_std_test_score,
            len(y), y_pred
        ]
    else:
        print('Mean:', np.mean(total_test_score), 'Std:',
              np.std(total_test_score), 'AvgPrecision:', np.mean(prec),
              'AvgRecall:', np.mean(recall))
        return [np.mean(total_test_score), np.std(total_test_score), len(y)]
Example #12
    assert tokenize(cls(n_splits=3, random_state=0)) == tokenize(
        cls(n_splits=3, random_state=0))

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2))

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0))

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(), ),
                                 (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(cv)
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
Example #13
#     'min_samples_split': [2, 3, 4, 5, 10, 15],
#     'max_depth': [3, 4, 5, 10],
#     "criterion": ["gini"]
# }
# clf = GridSearchCV(
#     RandomForestClassifier(),
#     parameters,
#     cv=3,
#     n_jobs=-1
# )
# clf.fit(all_data, all_label)
# print(clf.best_estimator_)

# Leave-one-out method (L method)
print("--- Leave-one-out ---")
loo = LeaveOneOut()
count = [[0, 0], [0, 0]]
for train_index, test_index in loo.split(all_data):
    train_data = [all_data[i] for i in train_index]
    train_label = [all_label[i] for i in train_index]
    test_data = [all_data[i] for i in test_index]
    test_label = [all_label[i] for i in test_index]
    clf = RandomForestClassifier(bootstrap=True,
                                 ccp_alpha=0.0,
                                 class_weight=None,
                                 criterion='gini',
                                 max_depth=10,
                                 max_features=6,
                                 max_leaf_nodes=None,
                                 max_samples=None,
                                 min_impurity_decrease=0.0,
Example #14
def _print_train_results(classifier_name, classifier, regressors, response,
                         regressor_names, leave_one_out):
    """
        _print_train_results
            Performs validation tests of the model and prints the results
             
        :param classifier_name: Name of the classifier method 
        :param classifier: Classifier object
        :param regressors: numpy array with the regressors used to train the model
        :param response: numpy array with the response used to train the model
        :param regressor_names: List with the name of the regressors
        :param leave_one_out: Boolean, true to perform leave-one-out cross-validation, otherwise perform default cross 
            validation
        :return: None 
    """
    global MESSAGES
    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(
        classifier_name,
        str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier,
                                   regressors,
                                   response,
                                   cv=loo.split(regressors))
        end = timer()
        n_tests = len(response)
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response)
        end = timer()
        n_tests = 3
        MESSAGES.AddMessage("Score (3-Fold):" + str(cv_score.mean()))
    # Print validation time
    MESSAGES.AddMessage(
        "Testing time: {:.3f} seconds, {:.3f} seconds per test".format(
            end - start, (end - start) / n_tests))
    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")

    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have a decision_function attribute but provide predict_proba instead
    # TODO: Generalize to anything that does not have decision_function "Easier to ask for forgiveness than permission"
    if classifier_name in ["Random Forest"]:
        des_fun = classifier.predict_proba(
            regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(
        roc_auc_score(response, des_fun)))

    # Give the importance of the features if it is supported
    # TODO: Generalize to anything that does have feature_importances_ "Easier to ask for forgiveness than permission"
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        importances = [[name, val * 100] for name, val in zip(
            regressor_names, classifier.feature_importances_)]
        long_word = max([len(x) for x in regressor_names])
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))

    return
Example #15
    stop_ind = start_ind + test_size

    train_targets = followup_total_PANSS[train_index]
    test_targets = followup_total_PANSS[test_index]
    test_subjects = test_subjects + list(np.array(subjectids)[test_index])

    # do supervised site correction?
    if site_correction == 'comBat_supervised':

        # generate (outer) train and test data and metadata
        train_data = logm_connectivity_data[train_index, :]
        test_data = logm_connectivity_data[test_index, :]
        train_metadata = metadata.iloc[train_index, :]

        # generate loo training prediction
        loo = LeaveOneOut()
        inner_predictions = np.zeros((train_size, ))
        inner_train_indices, inner_test_indices = min_two_test_CV(
            site[train_index])
        #for j in range(len(inner_train_indices)) :
        #for inner_train_index, inner_test_index in loo.split(train_data) :
        skf = StratifiedKFold(n_splits=3)
        for inner_train_index, inner_test_index in skf.split(
                train_data, site[train_index]):

            #            print ('i = ' +str(i))
            #            print ('j = ' +str(j))
            #
            #            inner_train_index = inner_train_indices[j]
            #            inner_test_index = inner_test_indices[j]
            #
Example #16
def main(argv):
    filename = argv[0]
    t = float(argv[1])  # threshold for logistic regression (default=0.5)
    dup = int(argv[2])  # if 1, bad queries will be duplicated
    subset = 'cache'  # column title for precision of cache
    full = 'full'  # column title for precision of full db
    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis=1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        df = df.append(bads, ignore_index=True)
    X = df.drop(['label'], axis=1)
    y = df['label']
    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    ml_average_rare = 0
    ql_average_rare = 0
    best_average_rare = 0
    loo = LeaveOneOut()
    bad_counter = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)
        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                          X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        #ql_pred = X_test['ql_0_1'].iloc[0] < X_test['ql_rest_0_1'].iloc[0]
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100
        # learn the model
        sc = MinMaxScaler().fit(X_train)
        X_train = sc.transform(X_train)
        X_test = sc.transform(X_test)
        # print("\ttraining balanced LR..")
        lr = linear_model.LogisticRegression(class_weight='balanced')
        lr.fit(X_train, y_train)
        # print("\ttraining mean accuracy = %.2f" % lr.score(X_train, y_train))
        # print("\ttesting mean accuracy = %.2f" % lr.score(X_test, y_test))
        y_prob = lr.predict_proba(X_test)
        y_pred = y_prob[:, 1] > t
        y_pred = y_pred.astype('uint8')
        # print('\t t = %.2f results:' % t)
        # print_results(y_test, y_pred)
        # compute ML based effectiveness
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100
        p20_mean += [p12, p100, ml, ql, best, rnd]
        if is_bad:
            #bad_mean += [p12[0], p100[0], ml[0], ql[0], best[0], rnd[0]]
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1
    print('final results:')
    print('\t'.join(
        map(str, ['set', 'cache', 'db', 'ml', 'ql', 'best', 'rand'])))
    print('\t'.join(['bad'] +
                    list(map(str, np.round(bad_mean[0] / bad_counter, 2)))))
    print('\t'.join(['all'] +
                    list(map(str, np.round(p20_mean[0] / df.shape[0], 2)))))
Example #17
def HADA(sourceGraph, targetGraph, labels, settings):

    # initialisation
    subject = 150
    overallResult_PCC = np.zeros((subject, 32))
    overallResult_TSW = np.zeros((subject, 32))
    allSV = np.empty((0, sourceGraph.shape[1]), int)
    allTV = np.empty((0, targetGraph.shape[1]), int)
    allpredTV = np.empty((0, targetGraph.shape[1]), int)
    testlabel = []
    # Create training and testing sets
    loo = LeaveOneOut()
    loo.get_n_splits(sourceGraph)
    for train_index, test_index in loo.split(sourceGraph):
        rearrangedPredictorView = np.concatenate((np.transpose(
            sourceGraph[train_index]), np.transpose(sourceGraph[test_index])),
                                                 axis=1)
        rearrangedTargetView = np.concatenate((np.transpose(
            targetGraph[train_index]), np.transpose(targetGraph[test_index])),
                                              axis=1)

        ## Domain Alignment (DA) using ARGA and Similarity matrix learning using SIMLR
        simlr = SIMLR.SIMLR_LARGE(1, 50, 0)
        enc = Encoder(settings)

        ## STEP 1: Hierarchical Domain Alignment for training samples
        print("Hierarchical Domain Alignment for training samples")
        print("level 1")
        Simlarity2, _, _, _ = simlr.fit(targetGraph[train_index])
        encode_S_T = enc.erun(Simlarity2, sourceGraph[train_index])

        # H denotes the number of hierarchical levels
        H = 2
        temporary = encode_S_T
        for number in range(1, H):
            print("level ", H)
            encode_train__TV_A = enc.erun(Simlarity2, temporary)
            temporary = encode_train__TV_A

    ## STEP 2: Target Graph Prediction
    ## STEP 2.1: Source graph embedding of training and testing subjects
        test__train__SV = np.vstack(
            (sourceGraph[train_index], sourceGraph[test_index]))
        print("Source graph embedding of training and testing subjects...")
        Simlarity1, _, _, _ = simlr.fit(test__train__SV)
        encode_test__train__SV = enc.erun(Simlarity1, test__train__SV)

        ## STEP 2.2: Connectomic Manifold Learning using SIMLR
        print("SIMLR...")
        SALL, FALL, val, ind = simlr.fit(encode_test__train__SV)
        SY, FY, val, ind = simlr.fit(encode_train__TV_A)
        # number of neighbors for trust score
        TS_bestNb = 5
        # get the best neighbors in the learned manifold of the regularized source graph embeddings
        sall = SALL.todense()
        Index_ALL = np.argsort(-sall, axis=0)
        des = np.sort(-sall, axis=0)
        Bvalue_ALL = -des
        # get the best neighbors in the learned manifold of the hierarchically aligned source and target graph embeddings
        sy = SY.todense()
        Index_Y = np.argsort(-sy, axis=0)
        desy = np.sort(-sy, axis=0)
        Bvalue_Y = -desy

        # make prediction for each testing subject
        for testingSubject in range(1, 2):
            print "testing subject:", test_index
            # get this testing subject's rearranged index and original index
            tSubjectIndex = (sourceGraph[train_index].shape[0] -
                             2) + testingSubject
            tSubjectOriginalIndex = test_index
            # compute Tscore for each neighbor
            trustScore = np.ones((TS_bestNb, TS_bestNb))
            newWeight_TSW = np.ones(TS_bestNb)

            for neighbor in range(0, TS_bestNb):
                neighborIndex = Index_ALL[tSubjectIndex, neighbor]
                temp_counter = 0
                while (neighborIndex > sourceGraph[train_index].shape[0]):
                    # best neighbor is a testing data
                    temp_counter = temp_counter + 1
                    neighborIndex = Index_ALL[tSubjectIndex,
                                              (TS_bestNb + temp_counter)]

                if (temp_counter != 0):
                    neighborSequence = TS_bestNb + temp_counter
                else:
                    neighborSequence = neighbor

                    #print(neighborIndex)
                    # get top nb neighbors in mappedX
                    neighborListX = Index_ALL[neighborIndex, 0:TS_bestNb]
                    # get top nb neighbors in mappedY
                    neighborListY = Index_Y[neighborIndex, 0:TS_bestNb]
                    # calculate trust score
                    trustScore[TS_bestNb - 1, neighbor] = len(
                        np.intersect1d(np.array(neighborListX),
                                       np.array(neighborListY)))
                    # calculate new weight (TS * Similarity)
                    newWeight_TSW[neighbor] = exp(
                        trustScore[TS_bestNb - 1, neighbor] / TS_bestNb *
                        Bvalue_ALL[tSubjectIndex, neighborSequence])

            #reconstruct with Tscore and similarity weight
            innerPredict_TSW = np.zeros(
                sourceGraph[train_index].shape[1])[np.newaxis]
            #summing up the best neighbors
            for j1 in range(0, TS_bestNb):
                tr = (rearrangedTargetView[:, Index_ALL[tSubjectIndex,
                                                        j1]])[np.newaxis]
                if j1 == 0:
                    innerPredict_TSW = innerPredict_TSW.T + tr.T * newWeight_TSW[
                        j1]
                else:
                    innerPredict_TSW = innerPredict_TSW + tr.T * newWeight_TSW[
                        j1]

            # scale weight to 1
            Scale_TSW = sum(newWeight_TSW)
            innerPredict_TSW = np.divide(innerPredict_TSW, Scale_TSW)

            # calculate result (MAE)
            tr2 = (rearrangedTargetView[:, tSubjectIndex])[np.newaxis]
            resulttsw = abs(tr2.T - innerPredict_TSW)
            iMAE_TSW = mean_absolute_error(tr2.T, innerPredict_TSW)
            overallResult_TSW[tSubjectOriginalIndex,
                              TS_bestNb] = overallResult_TSW[
                                  tSubjectOriginalIndex, TS_bestNb] + iMAE_TSW

            allSV = np.append(allSV, sourceGraph[test_index], axis=0)
            testlabel.append(labels[test_index])
            allpredTV = np.append(allpredTV, innerPredict_TSW.T, axis=0)

            print(test_index)

    dataset_source_and_predicted_target = np.concatenate((allSV, allpredTV),
                                                         axis=1)

    print('END')

    mae = np.mean(overallResult_TSW, axis=0)
    print("Mean Absolute Error: ")
    print(mae[np.nonzero(mae)])

    return mae, dataset_source_and_predicted_target, testlabel
Example #18
0
mu, sigma = 0.22266368090882432, 0.027202072213276744  # mean and standard deviation
sourceGraph = np.random.normal(mu, sigma, (150, 595))
mu, sigma = 0.08308065685993601, 0.01338490182696101
targetGraph = np.random.normal(mu, sigma, (150, 595))
labels = np.concatenate((np.zeros((1, 75)), np.ones((1, 75))), axis=None)

## HADA execution
model = 'arga_ae'  #autoencoder/variational autoencoder
settings = settings.get_settings_new(model)
mae, dataset_source_and_predicted_target, testlabel = HADA(
    sourceGraph, targetGraph, labels, settings)

## STEP 3: Disease Classification using Random Forest
classes = testlabel
label = np.array(classes)
loo = LeaveOneOut()
actual_label = []
predicted_sv_predtv_label = []

RF = RandomForestClassifier(bootstrap=False,
                            class_weight=None,
                            criterion='gini',
                            max_depth=None,
                            max_features='auto',
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            min_impurity_split=None,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            min_weight_fraction_leaf=0.0,
                            n_estimators=400)
Example #19
0
header = data_table[0]
del data_table[0]

data = np.zeros((len(data_table), len(data_table[0])))

for i in range(0,len(data_table)):
    tmp = data_table[i]
    for j in range(0,len(tmp)):
        data[i,j] = float(tmp[j])

feature=data[:,[1,2,3,4]]
labels=data[:,[0]]


# Perform Leave One Out Validation on just the Decision Tree Classifier. 
LOO=LeaveOneOut()
number_of_iterations=LOO.get_n_splits(feature)
total_score = 0
d3=tree.DecisionTreeClassifier()
for train_index,test_index in LOO.split(feature):
    #print("TRAIN:", train_index, "TEST:", test_index)
    train_features, test_features = feature[train_index], feature[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    d3=tree.DecisionTreeClassifier()
    clf=d3.fit(train_features,train_labels)
    total_score+=clf.score(test_features,test_labels)
  
score = total_score / number_of_iterations
print("D3 + leave one cross validation:",score)

Example #20
0
    def findBestAlpha(data, x_cols, y_cols, parameters, alphas):
        """
        Best alpha value for MLP

        Arguments:
            data {array} -- Data
            x_cols {array} -- x columns
            y_cols {array} -- y columns
            parameters {namedTuple} -- parameters for the classifier
            alphas {array} -- array of alphas to test
        """

        best_alpha = 0
        best_accu = 0

        x = data.loc[:, x_cols]
        y = data.loc[:, y_cols]

        #Picking best alpha
        for a in alphas:
            loo = LeaveOneOut()
            loo.get_n_splits(data)
            n = loo.split(data)

            mlpClassifier = MLPClassifier(
                hidden_layer_sizes=parameters.hidden_layer_sizes,
                solver=parameters.solver,
                alpha=a,
                batch_size=parameters.batch_size,
                learning_rate=parameters.learning_rate,
                learning_rate_init=parameters.learning_rate_init,
                max_iter=parameters.max_iter,
                random_state=parameters.random_state,
                verbose=parameters.verbose,
                early_stopping=parameters.early_stopping,
                validation_fraction=parameters.validation_fraction)

            accuracy_a = []
            real_label = []
            pred_label = []

            for train_index, test_index in n:  #Each row is test data once
                xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
                ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

                mlpClassifier.fit(xtrain, ytrain.values.ravel())

                ypred = mlpClassifier.predict(xtest)
                pred_label.append(ypred)
                real_label.append(ytest)

                acc = accuracy_score(ytest, ypred)
                accuracy_a.append(acc)

            avg_acc = np.mean(accuracy_a)
            print(a, ": average accuracy ", avg_acc)

            if (avg_acc > best_accu):  #Updating best_alpha if accuracy is better
                best_accu = avg_acc
                best_alpha = a

        print("Best alpha=", best_alpha)
        print("Best accuracy=", best_accu)

        return (best_alpha)
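A hedged alternative to the manual loop: let GridSearchCV drive the leave-one-out evaluation of each alpha. The sketch mirrors the function above; x, y and alphas are assumed to be the same objects it builds, and max_iter=500 is an illustrative choice:

from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neural_network import MLPClassifier

def findBestAlphaGrid(x, y, alphas):
    # GridSearchCV refits the MLP for every (alpha, split) pair and averages
    # the per-sample accuracies, which matches the LOO accuracy computed above.
    search = GridSearchCV(MLPClassifier(max_iter=500),
                          param_grid={'alpha': alphas},
                          cv=LeaveOneOut(), scoring='accuracy')
    search.fit(x, y.values.ravel())
    print("Best alpha=", search.best_params_['alpha'])
    print("Best accuracy=", round(search.best_score_, 3))
    return search.best_params_['alpha']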
Example #21
0
    def testMlp(data, parameters, x_cols, y_cols, plots=False):
        """
        testing MLP classifier

        Arguments:
            data {array} -- Data
            x_cols {array} -- x columns
            y_cols {array} -- y columns

        Keyword Arguments:
            plots {bool} -- Used for plotting (default: {False})
        """

        x = data.loc[:, x_cols]
        y = data.loc[:, y_cols]

        loo = LeaveOneOut()
        loo.get_n_splits(data)
        n = loo.split(data)

        mlpClassifier = MLPClassifier(
            hidden_layer_sizes=parameters.hidden_layer_sizes,
            solver=parameters.solver,
            alpha=parameters.alpha,
            batch_size=parameters.batch_size,
            learning_rate=parameters.learning_rate,
            learning_rate_init=parameters.learning_rate_init,
            max_iter=parameters.max_iter,
            random_state=parameters.random_state,
            verbose=parameters.verbose,
            early_stopping=parameters.early_stopping,
            validation_fraction=parameters.validation_fraction)

        accuracy_a = []
        real_label = []
        pred_label = []

        for train_index, test_index in n:  #Each row is test data once
            xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

            mlpClassifier.fit(xtrain, ytrain.values.ravel())

            ypred = mlpClassifier.predict(xtest)
            pred_label.append(ypred)
            real_label.append(ytest.values)

            acc = accuracy_score(ytest, ypred)
            accuracy_a.append(acc)

        avg_acc = np.mean(accuracy_a)

        pred_label_df = pd.DataFrame(columns=["label"])
        real_label_df = pd.DataFrame(columns=["label"])

        #Forming the dataframes
        for row in range(0, len(pred_label)):
            label_str = pred_label[row][0]
            pred_label_df.loc[row] = label_str

        for row in range(0, len(real_label)):
            label_str = real_label[row][0][0]
            real_label_df.loc[row] = label_str

        if (plots):
            cm = confusion_matrix(real_label_df, pred_label_df)
            cm_df = pd.DataFrame(cm, ["Fall", "Normal"], ["Fall", "Normal"])

            sn.set(font_scale=1.5)
            sn.heatmap(cm_df, annot=True, annot_kws={"size": 32}, fmt='d')
            plt.savefig("../figs/svm_heatmap.png",
                        facecolor="w",
                        bbox_inches="tight")
            plt.show()

        #Checking accuracy
        print("SVM average accuracy: ", round(avg_acc, 2))  #2 decimals

        #More detailed report
        print(classification_report(real_label_df, pred_label_df))

        return (avg_acc, real_label_df, pred_label_df)
Example #22
0
def pca_graph_pvals_less_than():

    data = preproccessed_data.join(mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]])
    X = data.drop([
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ],
                  axis=1)

    y = data['DiagnosisGroup']

    for n_comp in range(2, 30):
        pcas.append(n_comp)

        loo = LeaveOneOut()

        y_pred_list = []
        auc = []
        auc_train = []
        for train_index, test_index in loo.split(X):
            train_index = list(train_index)
            # print("%s %s" % (train_index, test_index))
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            most_corelated_taxon = {}
            for i in range(X_train.shape[1]):
                p_val = scipy.stats.spearmanr(X_train.iloc[:, i], y_train)[1]
                if math.isnan(p_val):
                    most_corelated_taxon[X_train.columns[i]] = 1
                else:
                    most_corelated_taxon[X_train.columns[i]] = p_val
            sorted_taxon = sorted(most_corelated_taxon.items(),
                                  key=operator.itemgetter(1))
            most_corelated_taxon = [i for i in sorted_taxon if i[1] <= 0.01]
            bact = [i[0] for i in most_corelated_taxon if i[0] != 1]
            new_data = X[bact]

            otu_after_pca, _ = apply_pca(new_data, n_components=n_comp)

            new_data = otu_after_pca.join(data[[
                'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
                'DiagnosisGroup'
            ]],
                                          how='inner')

            X_new = new_data.drop(['DiagnosisGroup'], axis=1)
            y_new = new_data['DiagnosisGroup']
            regex = re.compile(r"\[|\]|<", re.IGNORECASE)
            X_new.columns = [
                regex.sub("_", col) if any(x in str(col)
                                           for x in set(('[', ']',
                                                         '<'))) else col
                for col in X_new.columns.values
            ]

            X_train, X_test = X_new.iloc[train_index], X_new.iloc[test_index]
            y_train, y_test = y_new[train_index], y_new[test_index]

            model = XGBClassifier(max_depth=4,
                                  n_estimators=150,
                                  learning_rate=15 / 100,
                                  objective='multi:softmax')
            #objective='binary:logistic',
            #scale_pos_weight=(np.sum(y_train == -1) / np.sum(y_train == 1)))
            model.fit(X_train, y_train)
            pred_train = model.predict(X_train)
            auc_train.append(metrics.accuracy_score(y_train, pred_train))
            y_pred = model.predict(X_test)
            y_pred_list.append(y_pred[0])
        try:
            auc = metrics.accuracy_score(y, y_pred_list)
        except:
            pass
        print('PCA components' + str(n_comp), round(auc, 2))
        scores = round(auc, 2)
        scores_train = round(np.array(auc_train).mean(), 2)
        train_accuracy.append(scores_train)
        test_accuracy.append(scores)
Example #23
0
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 16)

lasso_regressor.fit(X, y)

lasso_regressor.best_params_
lasso_regressor.best_score_
# performs worse compared to linear regression
##

#%% try with leave one out
import numpy as np
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()

#loo.get_n_splits(X)
loo.get_n_splits(np.array(subject_list[0:18])[mask])
# chose subject number
t1 = creatSubCor(subject_list, '1', 'speficTrial_0_trauma.npy')

df_clinical.describe()
X = np.array(subjectPosEdges[0:18])[mask].reshape(-1,1)
y = np.array(diffPCL)[mask]#.reshape(-1,1)
print(loo)

for train_index, test_index in loo.split(np.array(subject_list[0:18])[mask]):
    # should insert matrix thresholding for all picked subjects and compute
    # (each iteration) the sum of positive edges
    print(np.array(subject_list)[train_index])
    print("TRAIN:", train_index, "TEST:", test_index)
Example #24
0
def lda_project(spike_times,
                spike_clusters,
                event_times,
                event_groups,
                pre_time=0,
                post_time=0.5,
                cross_validation='kfold',
                num_splits=5,
                prob_left=None,
                custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors to the line that best separates
    the two groups. When cross-validation is used, the LDA projection is fitted on the training
    data after which the test data is projected to this projection.

    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)
    n_neurons : int
        Group size of number of neurons to be sub-selected

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector of each trial

    """

    # Check input
    assert cross_validation in [
        'none', 'kfold', 'leave-one-out', 'block', 'custom'
    ]
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(spike_times,
                                                       spike_clusters, times)
    pop_vector = pop_vector.T

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)

    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the splits into train and test
        for train_index, test_index in cv:

            # Find LDA projection on the training data
            lda.fit(pop_vector[train_index],
                    [event_groups[j] for j in train_index])

            # Project the held-out test data to projection
            lda_projection[test_index] = lda.transform(
                pop_vector[test_index]).T[0]

    return lda_projection
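Hypothetical usage sketch with synthetic data, assuming the module's own imports (LinearDiscriminantAnalysis, get_spike_counts_in_bins, etc.) are in place as in the original source:

import numpy as np

rng = np.random.RandomState(0)
spike_times = np.sort(rng.uniform(0, 100, 5000))        # seconds
spike_clusters = rng.randint(0, 20, spike_times.size)   # 20 fake units
event_times = np.linspace(1, 95, 60)                     # 60 fake trials
event_groups = rng.randint(0, 2, event_times.size)       # two conditions

projection = lda_project(spike_times, spike_clusters, event_times, event_groups,
                         cross_validation='leave-one-out')
print(projection.shape)  # one projected value per trial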
Example #25
0
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.datasets import load_iris
from sklearn.model_selection import LeaveOneOut

data, classifier = load_iris(return_X_y=True)

data = np.delete(data, 0, 1)
data = np.delete(data, 0, 1)

score = np.zeros([len(data)])
acc = np.zeros([len(data)])

loocv = LeaveOneOut()
for train_index, test_index in loocv.split(data):
    Xtrain, Xtest = data[train_index], data[test_index]
    Ytrain, Ytest = classifier[train_index], classifier[test_index]

    #holdout = 0.2;
    #Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, classifier, test_size=holdout)

    model = Sequential()
    model.add(Dense(30, activation='sigmoid', input_shape=(2, )))
    model.add(Dense(30, activation='sigmoid'))
    model.add(Dense(1))

    sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error',
                  optimizer=sgd,
Example #26
0
def allergies_distance_matrix(distance='spearman', clustering='spectral'):
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)
    bact_label1 = []
    bact_label0 = []
    bact_label = {0: [], 1: []}

    for i in range(0, df.shape[1]):
        if clustering.labels_[i] == 1:
            bact_label1.append(df.columns[i])
        else:
            bact_label0.append(df.columns[i])
    bact_label_name = {0: [], 1: []}
    bact_label_tmp = {0: [], 1: []}
    bact_level = level - 1
    for k in [0, 1]:
        for i in bact_label[k]:
            for key, value in dict_bact.items():
                for j in value:
                    if i == j:
                        bact_label_tmp[k].append(key)
        bact_label_tmp[k] = set(bact_label_tmp[k])
        for i in bact_label_tmp[k]:
            if i != 'else':
                for j in taxonomy:
                    try:
                        if j.split(';')[bact_level] == i:
                            bact_label_name[k].append(','.join(
                                j.split(';')[0:bact_level + 1]))
                            break
                    except:
                        continue
            else:
                bact_label_name[k].append('else')
        bact_label_name[k] = set(bact_label_name[k])
    df1 = df[bact_label1]
    df0 = df[bact_label0]
    pca = PCA(n_components=min(round(df0.shape[1] / 2) + 1, df0.shape[0]))
    pca.fit(df0)
    sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if sum <= 0.5:
            sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1

    otu_after_pca0, _ = apply_pca(df0, n_components=num_comp, print_data=False)
    merged_data0 = otu_after_pca0.join(mapping_file)
    X = merged_data0.drop(['disease'], axis=1)
    y = merged_data0['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)
    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))

    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    print('Precision: ' + str(round(precision_score(y, y_pred_list), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_list), 2)))

    cnf_matrix = metrics.confusion_matrix(y, y_pred_list)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    #
    pca = PCA(n_components=min(round(df1.shape[1] / 2) + 1, df1.shape[0]))
    pca.fit(df1)
    sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if sum <= 0.5:
            sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1

    otu_after_pca1, _ = apply_pca(df1, n_components=num_comp, print_data=False)
    merged_data1 = otu_after_pca1.join(mapping_file)
    X = merged_data1.drop(['disease'], axis=1)
    y = merged_data1['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)

    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))

    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    print('Precision: ' + str(round(precision_score(y, y_pred_list), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_list), 2)))

    cnf_matrix = metrics.confusion_matrix(y, y_pred_list)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
Example #27
0
    def fit(self, X, y):
        """Fit the model using X as training data and y as target values

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])

        y : {array-like, sparse matrix}
            Target values of shape = [n_samples]

        """
        X, y = check_X_y(X, y, enforce_univariate=True)
        y = np.asarray(y)
        X = np.array(
            [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]])
        check_classification_targets(y)

        # if internal cv is desired, the relevant flag forces a grid search
        # to evaluate the possible values,
        # find the best, and then set this classifier's params to match
        if self._cv_for_params:
            grid = GridSearchCV(estimator=KNeighborsTimeSeriesClassifier(
                metric=self.metric, n_neighbors=1, algorithm="brute"),
                                param_grid=self._param_matrix,
                                cv=LeaveOneOut(),
                                scoring='accuracy')
            grid.fit(X, y)
            self.metric_params = grid.best_params_['metric_params']

        if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
            if y.ndim != 1:
                warnings.warn(
                    "A column-vector y was passed when a 1d array "
                    "was expected. Please change the shape of y to "
                    "(n_samples, ), for example using ravel().",
                    DataConversionWarning,
                    stacklevel=2)

            self.outputs_2d_ = False
            y = y.reshape((-1, 1))
        else:
            self.outputs_2d_ = True

        self.classes_ = []
        self._y = np.empty(y.shape, dtype=np.int)
        for k in range(self._y.shape[1]):
            classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes)

        if not self.outputs_2d_:
            self.classes_ = self.classes_[0]
            self._y = self._y.ravel()

        temp = check_array.__wrapped__.__code__
        check_array.__wrapped__.__code__ = _check_array_ts.__code__
        fx = self._fit(X)
        check_array.__wrapped__.__code__ = temp

        self._is_fitted = True
        return fx
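The internal leave-one-out parameter search used above, shown in isolation on a plain KNeighborsClassifier (an illustrative sketch with scikit-learn's iris data, not the sktime code itself):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
# every candidate value of n_neighbors is scored by leave-one-out accuracy
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': [1, 3, 5, 7]},
                    cv=LeaveOneOut(), scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_)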
Example #28
0
    def testTreePredictions(data, parameters, x_cols, y_cols, plots=False):
        """
        Testing tree prediction accuracies.
        
        Arguments:
            data {array} -- Labeled data for classifier testing.
            x_cols {array} -- x columns
            y_cols {array} -- y columns
            parameters {namedtuple} -- Parameters for the tree classifier. Using named tuple to keep things tidy.
            
        Keyword Arguments:
            plots {bool} -- Used for plotting (default: {False})
        """
        x = data.loc[:, x_cols]
        y = data.loc[:, y_cols]

        loo = LeaveOneOut()
        loo.get_n_splits(data)
        n = loo.split(data)

        #Creating the classifier with the input parameters
        treeClassifier = tree.DecisionTreeClassifier(
            class_weight=parameters.class_weight,
            criterion=parameters.criterion,
            max_depth=parameters.max_depth,
            max_features=parameters.max_features,
            max_leaf_nodes=parameters.max_leaf_nodes,
            min_samples_leaf=parameters.min_samples_leaf,
            min_samples_split=parameters.min_samples_split,
            min_weight_fraction_leaf=parameters.min_weight_fraction_leaf,
            presort=parameters.presort,
            random_state=parameters.random_state,
            splitter=parameters.splitter)

        accuracy_a = []
        real_label = []
        pred_label = []

        for train_index, test_index in n:  #Each row is test data once
            xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

            #Fitting train data
            treeClassifier = treeClassifier.fit(xtrain, ytrain)
            #Predictions
            ypred = treeClassifier.predict(xtest)
            pred_label.append(ypred)
            real_label.append(ytest.values)
            #Accuracy
            acc = accuracy_score(ytest, ypred)
            accuracy_a.append(acc)

        pred_label_df = pd.DataFrame(columns=["label"])
        real_label_df = pd.DataFrame(columns=["label"])

        #Forming the dataframes
        for row in range(0, len(pred_label)):
            label_str = pred_label[row][0]
            pred_label_df.loc[row] = label_str

        for row in range(0, len(real_label)):
            label_str = real_label[row][0][0]
            real_label_df.loc[row] = label_str

        if (plots):  #Plotting tree and accuracy heatmap

            #not found in the library for some reason, currently using old version?
            #plt.figure(figsize=[12, 12])
            #tree.plot_tree(treeClassifier, filled=True)
            #plt.show()

            #Workaround attempt for tree plotting
            dot = io.StringIO()
            tree.export_graphviz(treeClassifier, out_file=dot)
            (graph, ) = pydot.graph_from_dot_data(dot.getvalue())
            graph.write_png("../figs/treeClassifier.png")

            cm = confusion_matrix(real_label_df, pred_label_df)
            cm_df = pd.DataFrame(cm, ["Fall", "Normal"], ["Fall", "Normal"])

            sn.set(font_scale=1.5)
            sn.heatmap(cm_df, annot=True, annot_kws={"size": 32}, fmt='d')
            plt.savefig("../figs/tree_heatmap.png",
                        facecolor="w",
                        bbox_inches="tight")
            plt.show()

        avg_acc = np.mean(accuracy_a)

        #Checking accuracy
        print("Tree average accuracy: ", round(avg_acc, 2))  #2 decimals

        #More detailed report
        print(classification_report(real_label_df, pred_label_df))

        return (avg_acc, real_label_df, pred_label_df)
Example #29
0
File: 3.4a.py Project: kelent/-
#Iris data cross-validation

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn import datasets
import numpy as np
import pandas as pd

#tmp=pd.read_csv('iris.data',sep=',')
#iris=np.loadtxt('iris.data', delimiter=',')
iris=datasets.load_iris()
x=iris['data'][0:149]
y=iris['target'][0:149]


log_model=LogisticRegression()
m=np.shape(x)[0]

y_pred=cross_val_predict(log_model,x,y,cv=10)
print(metrics.accuracy_score(y,y_pred))
#print(y_pred)

loo=LeaveOneOut()
accuracy=0
for train, test in loo.split(x):
    log_model.fit(x[train], y[train])
    y_pred1 = log_model.predict(x[test])
    if y_pred1 == y[test]:
        accuracy += 1
print(accuracy / m)
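The same leave-one-out accuracy can also be computed in a single call (a hedged equivalent of the loop above, reusing log_model, x and y):

from sklearn.model_selection import cross_val_score
# default scoring is accuracy; with LeaveOneOut the mean over folds is the
# fraction of correctly predicted held-out samples.
print(cross_val_score(log_model, x, y, cv=LeaveOneOut()).mean())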
Example #30
0
def _cost_fn(argd, X, y, EX_list, valid_size, n_folds, shuffle, random_state,
             use_partial_fit, info, timeout, _conn, loss_fn=None,
             continuous_loss_fn=False, best_loss=None, n_jobs=1):
    '''Calculate the loss function
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor
        # Set n_jobs parameter if available for given learner
        if hasattr(learner, 'n_jobs'):
            # https://github.com/hyperopt/hyperopt-sklearn/issues/82#issuecomment-430963445
            learner.n_jobs = n_jobs
        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator.
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                try:
                    cv_iter = LeaveOneOut().split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = LeaveOneOut(len(y))
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = StratifiedKFold(n_splits=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state
                                             ).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedKFold(y, n_folds=n_folds,
                                              shuffle=shuffle,
                                              random_state=random_state)
            else:
                info('Will use K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                try:
                    cv_iter = KFold(n_splits=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = KFold(len(y), n_folds=n_folds,
                                    shuffle=shuffle,
                                    random_state=random_state)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                valid_fold = np.ones(len(y), dtype=np.int)
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                try:
                    cv_iter = PredefinedSplit(valid_fold).split()
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = PredefinedSplit(valid_fold)
            elif is_classif:
                info('Will use stratified shuffle-and-split with validation \
                      portion:', valid_size)
                try:
                    cv_iter = StratifiedShuffleSplit(1, test_size=valid_size,
                                                     random_state=random_state
                                                    ).split(X, y)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = StratifiedShuffleSplit(y, 1, test_size=valid_size,
                                                     random_state=random_state)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                try:
                    cv_iter = ShuffleSplit(n_splits=1, test_size=valid_size,
                                           random_state=random_state).split(X)
                except TypeError:
                    # Older syntax before sklearn version 0.18
                    cv_iter = ShuffleSplit(len(y), 1, test_size=valid_size,
                                           random_state=random_state)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [ (EX[train_index], EX[valid_index])
                             for EX in EX_list ]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(
                Xfit, info, preprocessings, Xval,
                EXfit_list, ex_pps_list, EXval_list
            )
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(
                    learner, is_classif, XEXfit, yfit, info,
                    best_loss=best_loss, XEXval=XEXval, yval=yval,
                    timeout=timeout, t_start=t_start
                )
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            if continuous_loss_fn:
                cv_pred_pool = np.append(cv_pred_pool, learner.predict_proba(XEXval))
            else:
                cv_pred_pool = np.append(cv_pred_pool, learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' % (
                         100 * (1 - loss),
                         100 * np.sqrt(lossvar))
                    )
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                'iterations': (cv_n_iters.max()
                    if (hasattr(learner, "partial_fit") and use_partial_fit)
                    else None),
            }
            rtype = 'return'
        # The for loop exited via break: one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on \
                            all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature,) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError,) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError,) as exc:
        print('Failing due to k_means_ weirdness')
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
Example #31
0
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics in a cross-validation framework.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        Model must have fit and predict methods.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold cross-validation (not required for LOO)

    Returns
    -------
    fpr : np.ndarray
        Pre-specified vector of FPR thresholds for interpolation
        fpr = np.linspace(0, 1, 100)
    meanTPR : np.ndarray
        Mean true-positive rate in test fraction.
    auc : float
        Area under the mean ROC curve.
    acc : float
        Mean accuracy score in test fraction.
    results : returned by model.fit()
        Training model results object for each fold
    prob : pd.Series
        Mean predicted probabilities on test data with index from df
    success : bool
        An indicator of whether the cross-validation was completed."""

    if not isinstance(predVars, list):
        predVars = list(predVars)
    
    tmp = df[[outcomeVar] + predVars].dropna()
    X,y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(y)
        cv_iter = cv.split(y=y)
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)
    
    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []

    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)
        
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest, np.round(prob[:, class1Ind]), normalize=True)
        coefs.append(results.coef_[None,:])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))
    
    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    
    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    """Refit all the data for final model"""
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))
    
    outD = {'fpr':fpr,                      # (100, ) average FPR for ROC
            'tpr':meanTPR,                  # (100, ) average TPR for ROC
            'AUC':auc,                      # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,                # (1, ) AUC of the average ROC
            'mACC': np.mean(acc),
            'ACC':acc,                      # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,          # final fitted model with predict() exposed
            'prob':probS,                   # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs':np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars':predVars,
            'Yvar':outcomeVar,
            'nFolds':nFolds,
            'LOO':'Yes' if LOO else 'No',
            'N':tmp.shape[0]}                  
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
Example #32
0
import pandas as pd
from sklearn.model_selection import (train_test_split, KFold, LeaveOneOut,
                                     LeavePOut, ShuffleSplit, TimeSeriesSplit)

data = list(range(1, 11))
print(data)

print(train_test_split(data, train_size=.8))

kf = KFold(n_splits=5)
for train, validate in kf.split(data):
    print(train, validate)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train, validate in kf.split(data):
    print(train, validate)

loo = LeaveOneOut()
for train, validate in loo.split(data):
    print(train, validate)

lpo = LeavePOut(p=2)
for train, validate in lpo.split(data):
    print(train, validate)

ss = ShuffleSplit(n_splits=3, test_size=2, random_state=0)
for train, validate in ss.split(data):
    print(train, validate)

tscv = TimeSeriesSplit(n_splits=5)
for train, validate in tscv.split(data):
    print(train, validate)
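A small follow-up check worth knowing: LeaveOneOut is equivalent to KFold with n_splits equal to the number of samples (as the scikit-learn docs note); a quick verification on the same data:

from sklearn.model_selection import KFold, LeaveOneOut

same = all(list(tr1) == list(tr2) and list(te1) == list(te2)
           for (tr1, te1), (tr2, te2) in zip(LeaveOneOut().split(data),
                                             KFold(n_splits=len(data)).split(data)))
print(same)  # True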
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X = iris.data
y = iris.target

knn = KNeighborsClassifier()

# ==== Leave-one-out validation ====

from sklearn.model_selection import LeaveOneOut
# Instatiate `LeaveOneOut` class. See [here](http://scikit-learn.org/stable/modules/cross_validation.html#leave-one-out-loo)
# and [here](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html#sklearn.model_selection.LeaveOneOut)
# for more details.
loo = LeaveOneOut()
# Keep track of successful predictions
successes = []

# the `split` method generates indices to split data into training and test set.
for train_index, test_index in loo.split(X):
    # `fit` classifier on training indices
    knn.fit(X[train_index], y[train_index])
    # `score` classifier on testing indices; since there will be only one
    # test index, the score will be either 1 (for a correct prediction) or
    # 0 (for an incorrect prediction).
    successes.append(knn.score(X[test_index], y[test_index]))
# Divide `successes` by the sample size to get the percentage score.
print("Accuracy for iris dataset with Leave-One-Out validation is {}.\n".
      format(np.mean(successes)))
        'DiagnosisGroup'
    ]],
                          how='inner')
    new_df2 = new_df2.fillna(0)

    X = new_df2.drop(['DiagnosisGroup'], axis=1)
    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    X.columns = [
        regex.sub("_", col) if any(x in str(col)
                                   for x in set(('[', ']', '<'))) else col
        for col in X.columns.values
    ]

    y = new_df2['DiagnosisGroup']

    loo = LeaveOneOut()
    y_pred_list = []
    auc = []
    auc_train = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        # print("%s %s" % (train_index, test_index))
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(
            max_depth=4,
            n_estimators=150,
            learning_rate=15 / 100,
            objective='multi:softmax',
            reg_lambda=150
            #objective='binary:logistic',
            output_train = "{}({}: {}) ".format(output_train, i, data[i])

        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])
            
        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as test data.

The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods.
A bar displaying the current train-test split as well as the actual data points are displayed for each split.
In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))
Example #37
0
intensao = []
frases = []
f = open("intensoes.txt", "r")

for x in f:
    classe, texto = x.split(">>")
    intensao.append(classe)
    frases.append(texto.rstrip())  #rstrip removes the trailing \n

#Convert the sentences into a bag-of-words (BOW) representation
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             strip_accents='unicode')
intensaoBow = vectorizer.fit_transform(frases)

intensaoNumpy = np.array(intensao)
leaveOneOut = LeaveOneOut()
leaveOneOut.get_n_splits(intensaoBow)

result = []

for train_index, test_index in leaveOneOut.split(intensaoBow):
    X_train, X_test = intensaoBow[train_index], intensaoBow[test_index]
    y_train, y_test = intensaoNumpy[train_index], intensaoNumpy[test_index]

    #KNN
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(X_train, y_train)

    resultado = model.predict(X_test)[0]
    result.append(resultado)
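A natural follow-up (hedged, reusing the names above): score the pooled leave-one-out predictions against the true intent labels.

from sklearn.metrics import accuracy_score
print("LOO accuracy:", accuracy_score(intensaoNumpy, result))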
Example #38
0
def compare_Estimators_fscore():
    from sklearn.svm import LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import LeaveOneOut

    # list of estimators to compare (no hyper-parameter grids are searched here)
    classifiers = [
        (LinearSVC(random_state=RS, tol=1e-5, C=0.025)),
        (KNeighborsClassifier(3)),
        (GradientBoostingClassifier(n_estimators=200,
                                    random_state=RS,
                                    learning_rate=0.05)),
        GaussianNB(),
    ]
    names = [
        'Linear SVC', 'K-Nearest Neighbors', 'Gradient Boosting',
        'Gaussian Naive Bayes'
    ]

    columns = 4
    fig, axs = plt.subplots(1, columns, figsize=(16, 6))
    axs = axs.ravel()

    loo = LeaveOneOut()

    outputFile = exportDir / '{0}_compare_classifiers'.format(fname)

    # iterate over classifiers
    for i, clf in enumerate(classifiers):

        y_pred = np.zeros(n_samples)

        for train_index, test_index in loo.split(X_train):
            print("TRAIN:", train_index, "TEST:", test_index)

            clf.fit(X_train[train_index, :], y_train[train_index])
            y_pred[test_index] = clf.predict(X_train[test_index, :])

        precision, recall, fscore, support = precision_recall_fscore_support(
            y_train, y_pred)
        accuracy = accuracy_score(y_train, y_pred)

        index = np.arange(num_categories)
        bar_width = 0.3

        for c in range(num_categories):
            axs[i].bar(index[c],
                       precision[c],
                       bar_width,
                       alpha=1,
                       color=plt.cm.tab20(i))
            axs[i].bar(index[c] + bar_width,
                       recall[c],
                       bar_width,
                       alpha=0.6,
                       color=plt.cm.tab20(i))
            ##axs[i].bar( index[c]+bar_width+bar_width, fscore[c], bar_width, alpha=0.25, color=plt.cm.tab20(i), hatch="//", edgecolor=plt.cm.tab20(i) )

        axs[i].set_xticks(np.arange(num_categories) + bar_width)
        axs[i].set_xticklabels(categories, rotation=45, ha='right')

        axs[i].set_xlabel('')
        axs[i].set_ylabel('Score')

        axs[i].set_title(names[i] + r"$\bf{" + ' | Accuracy: ' +
                         str(round(accuracy, 2)) + "}$")

        plt.tight_layout()

        from matplotlib.patches import Patch

        legend_elements = [
            Patch(facecolor=plt.cm.tab20(i), label='Precision'),
            Patch(facecolor=plt.cm.tab20(i), alpha=0.6, label='Recall'),
        ]
        #Patch(facecolor=fc, alpha=0.25, label='F-score', hatch="//") ]
        plt.legend(handles=legend_elements, loc='lower right')

        df = pd.DataFrame([precision, recall],
                          columns=categories,
                          index=['Precision', 'Recall']).transpose()
        df.to_csv(Path(str(outputFile) + names[i] + ".csv"),
                  index=True,
                  header=True,
                  sep=',')

    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)

    #       np.savetxt(Path(str(outputFile)+ ".csv"), prob_loo, delimiter=",", header=",".join(categories ))
    #    df.to_csv(Path(str(outputFile)+ ".csv"), index=True, sep=',')

    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)
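The comment at the top of this function alludes to (estimator, param_grid) pairs evaluated with GridSearchCV, but no grids are searched in the code above. A minimal sketch of that pattern (hypothetical grids, chosen only for illustration) wraps each estimator in a GridSearchCV object, which can then stand in for clf in the leave-one-out loop:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Hypothetical (estimator, param_grid) pairs; each GridSearchCV behaves like a
# classifier and re-tunes its parameters on every leave-one-out training fold.
classifiers = [
    GridSearchCV(LinearSVC(tol=1e-5), {'C': [0.01, 0.1, 1.0]}, cv=3),
    GridSearchCV(KNeighborsClassifier(), {'n_neighbors': [1, 3, 5]}, cv=3),
]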
Example #39
0
def perform_plot_LOO():
    #######
    ### Leave-one-sample-out cross-validation model
    #####

    n_samples, n_features = X_train.shape

    y_pred = np.zeros(n_samples)
    class_probs = np.zeros(
        [n_samples, np.unique(y_train).size]
    )  # the probability of assigning each left out sample to each of the classes

    loo = LeaveOneOut()

    for train_index, test_index in loo.split(X_train):
        print("TRAIN:", train_index, "TEST:", test_index)

        clf_main.fit(X_train[train_index, :], y_train[train_index])
        y_pred[test_index] = clf_main.predict(X_train[test_index, :])
        try:
            class_probs[test_index, :] = clf_main.predict_proba(
                X_train[test_index, :])
        except Exception:
            pass

    # my_score = np.mean(y_pred==y_input)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_train, y_pred)
    accuracy = accuracy_score(y_train, y_pred)

    ## MAKE CLASS PROBABILITY PLOT

    plt.figure()

    arr1inds = y_train.argsort()

    labels_train_temp = labels_train.reset_index(drop=True)

    labels_train_sorted = labels_train_temp[arr1inds[::-1]]

    prob_loo = class_probs[arr1inds[::-1]]
    plt.imshow(prob_loo,
               cmap=plt.cm.coolwarm,
               interpolation='none',
               aspect='auto')

    plt.grid(True)

    plt.yticks(np.arange(n_samples),
               labels_train_sorted[0:n_samples],
               fontsize=2,
               rotation=0)
    plt.xticks(np.arange(num_categories),
               categories,
               fontsize=8,
               rotation=45,
               ha='right')
    ax = plt.gca()
    ax.grid(color='w', linestyle='-', linewidth=0)
    plt.colorbar()
    plt.tight_layout()

    outputFile = exportDir / '{0}_class_probs_leave_one_out'.format(fname)
    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)

    #    np.savetxt(Path(str(outputFile)+ ".csv"), prob_loo, delimiter=",", header=",".join(categories ))
    df = pd.DataFrame(prob_loo,
                      index=labels_train[0:n_samples],
                      columns=categories)
    df.to_csv(Path(str(outputFile) + ".csv"), index=True, header=True, sep=',')

    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)

    ## PRECISION RECALL PLOT
    plotPrecisionRecall(precision, recall, categories, accuracy)

    outputFile = exportDir / '{0}_precision_recall_training'.format(fname)
    plt.savefig(Path(str(outputFile) + ".png"), dpi=dpi_all)

    data = {'Precision': precision, 'Recall': recall}
    df = pd.DataFrame(data, index=categories)
    df.to_csv(Path(str(outputFile) + ".csv"), index=True, header=True, sep=',')

    if exportPDF:
        plt.savefig(Path(str(outputFile) + ".pdf"), dpi=dpi_all)
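For the class-probability matrix built in the loop above, cross_val_predict offers a more compact alternative (illustrative sketch; it assumes clf_main supports predict_proba, which the original code guards with try/except):

from sklearn.model_selection import LeaveOneOut, cross_val_predict

# One row of class probabilities per left-out sample, in the original sample order.
class_probs = cross_val_predict(clf_main, X_train, y_train,
                                cv=LeaveOneOut(), method='predict_proba')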
Example #40
0
def _print_train_results(classifier_name, classifier, regressors, response, regressor_names, leave_one_out):
    """
        _print_train_results
            Performs validation tests of the model and prints the results
             
        :param classifier_name: Name of the classifier method 
        :param classifier: Classifier object
        :param regressors: numpy array with the regressors used to train the model
        :param response: numpy array with the response used to train the model
        :param regressor_names: List with the name of the regressors
        :param leave_one_out: Boolean, True to perform leave-one-out cross-validation, otherwise perform the
            default cross-validation
        :return: None 
    """
    global MESSAGES
    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(classifier_name,
                                                                      str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
        end = timer()
        n_tests = len(response)
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response)
        end = timer()
        n_tests = 3
        MESSAGES.AddMessage("Score (3-Fold):" + str(cv_score.mean()))
    # Print validation time
    MESSAGES.AddMessage("Testing time: {:.3f} seconds, {:.3f} seconds per test".format(end - start,
                                                                                       (end - start) / n_tests))
    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")

    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have a decision_function attribute but provide predict_proba instead
    # TODO: Generalize to anything that does not have decision_function ("Easier to ask for forgiveness than permission")
    if classifier_name in ["Random Forest"]:
        des_fun = classifier.predict_proba(regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(roc_auc_score(response, des_fun)))

    # Report the feature importances if the classifier supports them
    # TODO: Generalize to anything that has feature_importances_ ("Easier to ask for forgiveness than permission")
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        importances = [[name, val*100] for name, val in zip(regressor_names, classifier.feature_importances_)]
        long_word = max([len(x) for x in regressor_names])
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))

    return
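Independent of the messaging wrapper above, the same switch between leave-one-out and the default k-fold scoring comes down to the cv argument of cross_val_score; a minimal self-contained sketch with toy data (names and values assumed, not taken from the tool) is:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Toy stand-ins for the regressors/response arrays used above.
rng = np.random.RandomState(0)
regressors = rng.rand(30, 4)
response = rng.randint(0, 2, size=30)

clf = RandomForestClassifier(n_estimators=50, random_state=0)

# Leave-one-out: len(response) splits, one held-out sample per split.
loo_scores = cross_val_score(clf, regressors, response, cv=LeaveOneOut())
print("LOO mean score:", loo_scores.mean())

# Default k-fold cross-validation.
kfold_scores = cross_val_score(clf, regressors, response)
print("Default CV mean score:", kfold_scores.mean())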
'''
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

# log-regression lib model
log_model = LogisticRegression()
m = np.shape(X)[0]

# 10-fold CV
y_pred = cross_val_predict(log_model, X, y, cv=10)
print(metrics.accuracy_score(y, y_pred))
    
# LOOCV
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X):
    log_model.fit(X[train], y[train])  # fitting
    y_p = log_model.predict(X[test])
    if y_p == y[test]: accuracy += 1
print(accuracy / np.shape(X)[0])

# m = np.shape(X)[0]
# scores_loo = cross_val_score(log_model, X, y, cv=m)
# print(scores_loo)
# # prediction using LOOCV (cv=m)
# y_pred_loo = cross_val_predict(log_model, X, y, cv=m)
# print(metrics.accuracy_score(y, y_pred_loo))

'''