Example #1
def computing_performance_LDA(in_path=None, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(0, n_times):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
Example #2
def main():
    """Read Train/test log."""
    df = pd.read_csv("train.csv")

    # train/test split using stratified sampling
    labels = df['label']
    df = df.drop(['label'], axis=1)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
    # note: only the last of the 10 splits is kept for the steps below
    for train_index, test_index in sss.split(df.values, labels):
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    # classification algorithm
    classification(x_train, y_train, x_test, y_test)

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    print(test_predictions)

    # Format DataFrame (the .tail() calls only display output in a notebook)
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.tail()
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)
    submission.reset_index()
    submission.tail()

    # Export Submission
    submission.to_csv('submission.csv', index=False)
    submission.tail()
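Since the loop above runs ten splits but the code after it uses only the last one, an equivalent and cheaper sketch takes a single stratified split (assuming the same df and labels as above):

from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=23)
train_index, test_index = next(sss.split(df.values, labels))
x_train, x_test = df.values[train_index], df.values[test_index]
y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]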
Example #3
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
class LinearDiscriminantAnalysiscls(object):
    """docstring for ClassName"""
    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
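A minimal usage sketch for the wrapper class above (iris data is illustrative; traceback must be imported at module level):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = LinearDiscriminantAnalysiscls()
model.train_model(X_tr, y_tr)
print(model.predict(X_te)[:5])      # predicted labels for the first five test rows
print(model.accuracy_score(y_te))   # accuracy on the test split (predict() must run first)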
def doLDA(x, digits, s):
    # fit LDA on the first s PCA components of the training images
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)
    # project the test images into the same s-dimensional PCA basis
    newtest = digits.test_Images - x.centers
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]), digits.test_Labels)
    return errors
Example #6
    def train_model(self):
        ### Train spectrum data
        # form training data and labels
        X = np.empty((0, self.freq_cutoff), int)
        y = np.empty((0, 1), int)

        data_dir = 'clap_data/claps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [1] * data.shape[0])

        data_dir = 'clap_data/noclaps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [0] * data.shape[0])

        # pca = PCA(n_components=200)
        # X_pca = pca.fit_transform(X)

        # fit the model
        # clf = LogisticRegression(penalty='l1')
        clf = LinearDiscriminantAnalysis()
        clf.fit(X, y)
        preds = clf.predict(X)
        # X_new = clf.transform(X)

        # clf2 = LinearDiscriminantAnalysis()
        # clf2.fit(X_new, y)
        # preds2 = clf2.predict(X_new)

        # print X.shape, X_pca.shape
        print(preds)
        print(np.sum(preds), preds.size)
        # print preds2, np.sum(preds2)

        # save model
        pickle.dump(clf, open(clap_model_dir + clap_classifier_fname, 'wb'))
        self.clap_clf = clf

        ### Train decay data
        X = np.empty((0, self.decay_samples // 10), int)

        data_dir = 'clap_data/claps/decay/'
        for fname in os.listdir(data_dir):
            if fname.endswith('npy'):
                data = np.load("%s%s"% (data_dir, fname))
                print(data.shape, X.shape)
                X = np.append(X, data, axis=0)

        print(X.shape)
        X_avg = np.mean(X, axis=0)
        plt.plot(X_avg)
        plt.show()

        # Average decay data
        np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
    def testEvaluateLDA(self, trCList, teCList):
        # LDA object
        clf = LinearDiscriminantAnalysis()
        # fit lda model using training chromosomes
        clf.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))

        predicted = clf.predict(teCList)

        self.confusionMatrix(testGroupings, predicted, 'lda_test')

        # return precision ([0]), recall ([1]) or f1 score ([2]); replace with clf.score(numpy.asarray(teCList), testGroupings) for accuracy
        return precision_recall_fscore_support(testGroupings, predicted, average='weighted')[2]  # fitness for test set
Example #8
    def train_DA(self, X, y, lda_comp, qda_reg):
        '''
        Input: 
            qda_reg - reg_param
            lda_comp - n_components
            X - data matrix (train_num, feat_num)
            y - target labels matrix (train_num, label_num)

        Output: 
            best_clf - best classifier trained (QDA/LDA)
            best_score - CV score of best classifier

        Find best DA classifier.
        '''
        n_samples, n_feat = X.shape
        cv_folds = 10
        kf = KFold(n_splits=cv_folds, shuffle=False)

        lda = LinearDiscriminantAnalysis(n_components=lda_comp)
        qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)
        score_total_lda = 0  # running total of metric score over all cv runs
        score_total_qda = 0  # running total of metric score over all cv runs
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            lda.fit(X_train, y_train)
            cv_pred_lda = lda.predict(X_test)
            score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
            score_total_lda += score_lda
            
            qda.fit(X_train,y_train)
            cv_pred_qda = qda.predict(X_test)
            score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
            score_total_qda += score_qda

        score_lda = score_total_lda/cv_folds
        score_qda = score_total_qda/cv_folds
        
        # We keep the best one
        if(score_qda > score_lda):
            qda.fit(X,y)
            return qda, score_qda
        else:
            lda.fit(X,y)
            return lda, score_lda
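The eval-based metric dispatch in train_DA works but is fragile and unsafe; a minimal alternative sketch (assuming self.metric names a function defined in the same module) resolves the function once with getattr instead:

import sys

def resolve_metric(name):
    # look the metric up in this module's namespace instead of building an eval() string
    return getattr(sys.modules[__name__], name)

# inside the CV loop, the two eval() calls would become:
# metric_fn = resolve_metric(self.metric)
# score_lda = metric_fn(y_test[:, None], cv_pred_lda[:, None], self.task)
# score_qda = metric_fn(y_test[:, None], cv_pred_qda[:, None], self.task)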
Example #9
def computing_precise_vs_imprecise(in_path=None, ell_optimal=0.1, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    n_time = len(seeds)
    lda_imp = LinearDiscriminant(init_matlab=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65_imp, mean_u80_imp, u_mean = 0, 0, 0
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        lda_imp.learn(X_train, y_train, ell=ell_optimal)
        lda.fit(X_train, y_train)
        sum_u65, sum_u80 = 0, 0
        u_precise, n_real_test = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            print("--TESTING-----", i)
            evaluate_imp, _ = lda_imp.evaluate(test)
            if len(evaluate_imp) > 1:
                n_real_test += 1
                if y_test[i] in evaluate_imp:
                    sum_u65 += u65(len(evaluate_imp))
                    sum_u80 += u80(len(evaluate_imp))
                evaluate = lda.predict([test])
                if y_test[i] in evaluate:
                    u_precise += u80(len(evaluate))
        mean_u65_imp += sum_u65 / n_real_test
        mean_u80_imp += sum_u80 / n_real_test
        u_mean += u_precise / n_real_test
        print("--time_k--u65-->", k, sum_u65 / n_real_test)
        print("--time_k--u80-->", k, sum_u80 / n_real_test)
        print("--time_k--precise-->", k, u_precise / n_real_test)
    print("--global--u65-->", mean_u65_imp / n_time)
    print("--global--u80-->", mean_u80_imp / n_time)
    print("--global--precise-->", u_mean / n_time)
def lda_pred(Xtrain, Xtest, Ytrain, Ytest):
    """ Simple Naive Implementation of the the LDA
    """
    # empty list for the predictions
    Ypred = []
    
    # loop through and perform classification
    for xtrain, xtest, ytrain, ytest in zip(Xtrain,Xtest,
                                            Ytrain, Ytest):
        # initialize the model                
        lda_model = LDA()
        
        # fit the model to the training data
        lda_model.fit(xtrain, ytrain.ravel())
        
        # save the results of the model predicting the testing data
        Ypred.append(lda_model.predict(xtest))
    
    # return this list    
    return Ypred    
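lda_pred expects parallel sequences of per-split arrays; a minimal usage sketch with two random iris splits (names below are illustrative):

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
splits = [train_test_split(X, y, random_state=s) for s in (0, 1)]
Xtrain, Xtest, Ytrain, Ytest = zip(*splits)
print([p[:5] for p in lda_pred(Xtrain, Xtest, Ytrain, Ytest)])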
    def classifyLDA(self, tCList, vCList):
        if self.mode == "cv":
            # LDA object
            clf = make_pipeline(preprocessing.StandardScaler(), LinearDiscriminantAnalysis())
            predicted = model_selection.cross_val_predict(clf, tCList, trainGroupings, cv=3)

            if self.cm:
                self.confusionMatrix(trainGroupings, predicted, 'lda_cv')
            
            return precision_recall_fscore_support(trainGroupings, predicted, average = 'weighted')[2]
            
        else:
            clf = LinearDiscriminantAnalysis()
            # fit lda model using training chromosomes
            clf.fit(numpy.asarray(tCList), numpy.asarray(trainGroupings))
            predicted = clf.predict(numpy.asarray(vCList))

            if self.cm:
                self.confusionMatrix(validGroupings, predicted, 'lda_valid')

            # return precision ([0]), recall ([1]) or f1 score ([2]); replace with clf.score(numpy.asarray(vCList), validGroupings) for accuracy
            return precision_recall_fscore_support(validGroupings, predicted, average='weighted')[2]  # fitness for validation set
def processTraining(cvtrainx, cvtrainy, cvevalx, prob=False):
    print(cvtrainx[0])
    #cvevalx=[' '.join(s) for s in cvevalx]
    print(cvevalx[0])
    tfv = TfidfVectorizer(min_df=10,  max_features=None,
        strip_accents='unicode', analyzer=mytokenlizer,
        ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')

    cvtrainx=tfv.fit_transform(cvtrainx)
    cvevalx=tfv.transform(cvevalx)
    tsvd=TruncatedSVD(n_components=600,random_state=2016)
    cvtrainx=tsvd.fit_transform(cvtrainx)
    cvevalx=tsvd.transform(cvevalx)
    print(len(tfv.get_feature_names()))
    print(tfv.get_feature_names()[0:10])
    clf=LinearDiscriminantAnalysis()
    clf.fit(cvtrainx,cvtrainy)
    if prob:
        predictValue=clf.predict_proba(cvevalx)
    else:
        predictValue=clf.predict(cvevalx)
    return predictValue
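scikit-learn's LinearDiscriminantAnalysis requires a dense matrix, which is why the sparse TF-IDF features are run through TruncatedSVD before the fit above. A minimal sketch of the same idea as a single pipeline (parameters are illustrative and the custom tokenizer is omitted):

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = make_pipeline(TfidfVectorizer(min_df=10, ngram_range=(1, 5), sublinear_tf=True),
                    TruncatedSVD(n_components=600, random_state=2016),
                    LinearDiscriminantAnalysis())
# clf.fit(train_texts, train_labels); clf.predict_proba(eval_texts)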
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
print "test"

DIM = 100

mel = load.loadMel()[0]
res_class = load.loadClass()[0]
print "loaded"
# print res[0]
# def myfunc(a):
# 	print a
# 	return a.tolist().index(1)

def save_to_file(X, filename='afterLDA'):
	with open(filename + str(DIM) + ".db", 'w') as f:
		ujson.dump(X.tolist(), f)
		
# vfunc = np.vectorize(myfunc)
# res_class = vfunc(res)
# note: scikit-learn limits LDA's n_components to at most min(n_classes - 1, n_features)
clf = LinearDiscriminantAnalysis(n_components=DIM)
print("train")
clf.fit(mel, res_class)
print("trained")
print(clf.predict(mel[:10]))
pred = clf.predict(mel)
print(res_class[:10])
print(np.mean(pred == res_class))
save_to_file(clf.transform(mel))
endt = time.time()
print(endt - startt)
Example #14
jj = 7

I = digits.target == ii
J = digits.target == jj

X = np.vstack((B[I, :], B[J, :]))
y = np.hstack((digits.target[I], digits.target[J]))

print(X.shape)
print(y.shape)

clf = LinearDiscriminantAnalysis()

clf.fit(X, y)

err = clf.predict(X) != y

for i in [ii, jj]:
    II = digits.target == i
    plt.plot(B[II, 0], B[II, 1], 'o', label=str(i))

plt.plot(X[err, 0], X[err, 1], 'ro')

f = 0
h = scipy.stats.kruskal(B[digits.target == ii, f], B[digits.target == jj, f])
print(h)

plt.figure()
plt.boxplot([B[digits.target == ii, f], B[digits.target == jj, f]])

#plt.legend()
Example #15
'''
LDA via sklearn
'''
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
import matplotlib.pyplot as plt
# generalization of train and test set
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.5, random_state=0)
# model fitting
#http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.
# LinearDiscriminantAnalysis.html#sklearn.discriminant_analysis.LinearDiscriminantAnalysis
lda_model = LinearDiscriminantAnalysis(solver='lsqr',
                                       shrinkage=None).fit(X_train, y_train)
# model validation
y_pred = lda_model.predict(X_test)
# summarize the fit of the model
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

f1 = plt.figure(1)
plt.title('watermelon_3a')
plt.xlabel('density')
plt.ylabel('ratio_sugar')
"""
plt.scatter(X[y == 0, 0], X[y == 0, 1], marker='o', color='b', s=100, label='bad')
"""
plt.scatter(goodData[:, 1],
            goodData[:, 2],
            marker='o',
            color='g',
Example #16
def acc(X1,Y1,X2,Y2,imputation_method):
    dim=len(X1[0])
    len_training=len(X1)
    len_testing=len(X2)

    if (imputation_method=='multiple_closest'):
        mean=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    mean[j]+=val
            mean[j]/=n

        #variance

        var=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    var[j]+=(val-mean[j])**2
            var[j]/=n


        #Actual Imputation
        Xaf=np.copy(X1)
        Xal=np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if (Xaf[i][j]==0):
                    min_dis1=dim+1
                    min_dis2=dim+1
                    closest_index1=0
                    closest_index2=0
                    for a in range(100):
                        idx = rd.randint(0, len_training-1)
                        if (Xaf[idx][j]!=0) and Y1[idx]== Y1[i]:
                            dis=distance(Xaf[i],Xaf[idx],var)
                            if (dis<min_dis1):
                                closest_index1=idx
                                min_dis1=dis
                            elif (dis<min_dis2):
                                closest_index2=idx
                                min_dis2=dis
                    Xaf[i][j]=Xaf[closest_index1][j]
                    Xal[i][j]=Xal[closest_index2][j]

        Xbf=np.copy(X2)
        Xbl=np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if (Xbf[i][j]==0):
                    min_dis1=dim+1
                    min_dis2=dim+1
                    closest_index1=0
                    closest_index2=0
                    for a in range(100):
                        idx = rd.randint(0, len_testing-1)
                        if (Xbf[idx][j]!=0):
                            dis=distance(Xbf[i],Xbf[idx],var)
                            if (dis<min_dis1):
                                closest_index1=idx
                                min_dis1=dis
                            elif (dis<min_dis2):
                                closest_index2=idx
                                min_dis2=dis
                    Xbf[i][j]=Xbf[closest_index1][j]
                    Xbl[i][j]=Xbl[closest_index2][j]

        predictions=[]

        for i in range(nb_multiple_imputation+1):
            Xa=(i*Xaf+((nb_multiple_imputation)-i)*Xal)/nb_multiple_imputation
            Xb=(i*Xbf+((nb_multiple_imputation)-i)*Xbl)/nb_multiple_imputation
            lda = LinearDiscriminantAnalysis()
            lda.fit(Xa, Y1)
            predictions.append(lda.predict(Xb))

        sol=maj_vote(predictions)

        from sklearn.metrics import accuracy_score
        return accuracy_score(Y2, sol)




    if (imputation_method=='no_imputation'):
        lda = LinearDiscriminantAnalysis()
        lda.fit(X1, Y1)
        return(lda.score(X2,Y2))

    if (imputation_method=='grand_mean'):
        N=0
        Mean=np.zeros(dim)
        for i in range(len_training):
            for j in range(dim):
                val=X1[i][j]
                if (val!=0):
                    N+=1
                    Mean[j]+=val
        Mean/=N

        #Imputing
        Xa=np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if Xa[i][j]==0:
                    Xa[i][j]=Mean[j]

        Xb=np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb[i][j]==0:
                    Xb[i][j]=Mean[j]

        lda = LinearDiscriminantAnalysis()
        lda.fit(Xa, Y1)
        return(lda.score(Xb,Y2))

    if (imputation_method=='conditional_mean'):
        N0=0
        Mean0=np.zeros(dim)
        N1=0
        Mean1=np.zeros(dim)
        for i in range(len_training):
            if (Y1[i]==0):
                for j in range(dim):
                    val=X1[i][j]
                    if (val!=0):
                        N0+=1
                        Mean0[j]+=val
            else :
                for j in range(dim):
                    val=X1[i][j]
                    if (val!=0):
                        N1+=1
                        Mean1[j]+=val
        for j in range(dim):
            Mean0[j]=Mean0[j]/N0
        for j in range(dim):
            Mean1[j]=Mean1[j]/N1

        #Imputing the training set
        Xa=np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if Xa[i][j]==0:
                    if Y1[i]==0:
                        Xa[i][j]=Mean0[j]
                    if Y1[i]==1:
                        Xa[i][j]=Mean1[j]

        #Imputing the testing sets

        Xb1=np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb1[i][j]==0:
                    Xb1[i][j]=Mean0[j]
        Xb2=np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if Xb2[i][j]==0:
                    Xb2[i][j]=Mean1[j]

        lda = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                      solver='eigen', store_covariance=False, tol=0.0001)
        lda.fit(Xa, Y1)
        # score() takes (X, y); average the scores of the two class-conditional imputations
        return (lda.score(Xb1, Y2) + lda.score(Xb2, Y2)) / 2

    if (imputation_method=='closest'):
        #Imputation of Training set
        #mean
        mean=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    mean[j]+=val
            mean[j]/=n

        #variance

        var=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    var[j]+=(val-mean[j])**2
            var[j]/=n


        #Actual Imputation
        Xa=np.copy(X1)
        for i in range(len_training):
            for j in range(dim):
                if (Xa[i][j]==0):
                    min_dis=dim+1
                    closest_index=0
                    for a in range(100):
                        idx = rd.randint(0, len_training-1)
                        if (Xa[idx][j]!=0) and Y1[idx]== Y1[i]:
                            dis=distance(Xa[i],Xa[idx],var)
                            if (dis<min_dis):
                                closest_index=idx
                                min_dis=dis
                    Xa[i][j]=Xa[closest_index][j]

        Xb=np.copy(X2)
        for i in range(len_testing):
            for j in range(dim):
                if (Xb[i][j]==0):
                    min_dis=dim+1
                    closest_index=0
                    for a in range(100):
                        idx = rd.randint(0, len_testing-1)
                        if (Xb[idx][j]!=0):
                            dis=distance(Xb[i],Xb[idx],var)
                            if (dis<min_dis):
                                closest_index=idx
                                min_dis=dis
                    Xb[i][j]=Xb[closest_index][j]


        lda = LinearDiscriminantAnalysis()
        lda.fit(Xa, Y1)
        return(lda.score(Xb,Y2))

    if (imputation_method=='regression'):
        Xtraindet=np.copy(X1)
        Xtestdet=np.copy(X2)
        for j in range(1,dim):
            Xs=[]
            Xn=[]
            for h in range(len_training):
                if (Xtraindet[h][j]!=0):
                    Xs.append(Xtraindet[h][0:j])
                    Xn.append(Xtraindet[h][j])
            reg = LinearRegression().fit(Xs, Xn)

            for h in range(len_training):
                if (Xtraindet[h][j]==0):
                    Xtraindet[h][j]=(reg.predict([Xtraindet[h][0:j]]))[0]
            for h in range(len_testing):
                if (Xtestdet[h][j]==0):
                    Xtestdet[h][j]=(reg.predict([Xtestdet[h][0:j]]))[0]

        lda = LinearDiscriminantAnalysis()
        lda.fit(Xtraindet, Y1)
        return(lda.score(Xtestdet, Y2))

    if (imputation_method=='multiple_regression'):
        mean=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    mean[j]+=val
            mean[j]/=n

        #variance

        var=np.zeros(dim)
        for j in range(dim):
            n=0
            for i in range(len_training):
                val=X1[i][j]
                if (val!=0):
                    n+=1
                    var[j]+=(val-mean[j])**2
            var[j]/=n

        results=[]
        for i in range(nb_multiple_imputation):
            Xtraindet=np.copy(X1)
            Xtestdet=np.copy(X2)
            for j in range(1,dim):
                Xs=[]
                Xn=[]
                for h in range(len_training):
                    if (Xtraindet[h][j]!=0):
                        Xs.append(Xtraindet[h][0:j])
                        Xn.append(Xtraindet[h][j])
                reg = LinearRegression().fit(Xs, Xn)

                for h in range(len_training):
                    if (Xtraindet[h][j]==0):
                        Xtraindet[h][j]=(reg.predict([Xtraindet[h][0:j]]))[0]+np.random.normal(0, np.sqrt(var[j]))
                for h in range(len_testing):
                    if (Xtestdet[h][j]==0):
                        Xtestdet[h][j]=(reg.predict([Xtestdet[h][0:j]]))[0]+np.random.normal(0, np.sqrt(var[j]))

            lda = LinearDiscriminantAnalysis()
            lda.fit(Xtraindet, Y1)
            results.append(lda.predict(Xtestdet))

        sol=maj_vote(results)

        from sklearn.metrics import accuracy_score
        return accuracy_score(Y2, sol)
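A minimal usage sketch for acc, assuming zeros mark missing entries; distance, maj_vote, rd and nb_multiple_imputation must be defined in the surrounding module for the closest/multiple methods, but the grand_mean call below needs none of them:

import numpy as np

rng = np.random.default_rng(0)
X1 = rng.normal(size=(100, 5)); X1[rng.random(X1.shape) < 0.1] = 0  # ~10% missing
X2 = rng.normal(size=(40, 5));  X2[rng.random(X2.shape) < 0.1] = 0
Y1 = rng.integers(0, 2, 100)
Y2 = rng.integers(0, 2, 40)
print(acc(X1, Y1, X2, Y2, 'grand_mean'))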
tests = ['mistakes', 'informative', 'presentation', 'quality']
for predData in tests:
    y = trainingSet.loc[:, predData]
    print(y.shape)

    # clf = SVC()
    # print(clf.fit(X, y))

    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)

    testSet = testSet.drop(predData, axis=1)
    XNew = testSet.loc[:, featureColumns]
    # print(XNew)

    newPredClass = clf.predict(XNew)

    surveyDataMistakes = list(surveyData[predData].astype(int))
    firstTwenty = surveyDataMistakes[:20]

    print(firstTwenty)
    print(newPredClass)

    print(mean_squared_error(newPredClass, firstTwenty))

    print(len([i for i, j in zip(newPredClass, firstTwenty) if i == j]))
    print(len(testSet))
    print(
        len([i for i, j in zip(newPredClass, firstTwenty) if i == j]) /
        len(testSet))
    print("\n")
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
# Compare Algorithms
#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#plt.boxplot(results)
#ax.set_xticklabels(names)
#plt.show()

model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
Example #19
class FBCSP(object):
	def __init__(self,
		     sample_rate,
		     feat_sel_proportion=0.8,
		     low_cut_hz = 4,
		     high_cut_hz = 36,
		     step = 4,
		     csp_components = 4
		    ):

		self.low_cut_hz = low_cut_hz
		self.high_cut_hz = high_cut_hz
		self.step = step
		self.sample_rate = sample_rate
		self.csp_component = csp_components
		self.feat_proportion = feat_sel_proportion
		self.csp_bank = dict()
		self.low = dict()
		self.high = dict()
		self.n_bank = (self.high_cut_hz - self.low_cut_hz)//self.step
		self.n_feat = int(self.n_bank*self.csp_component*self.feat_proportion)

		for i in range(self.n_bank):
			self.low[i]  = self.low_cut_hz+i*self.step
			self.high[i] = self.low_cut_hz+i*self.step+self.step
			if (self.high_cut_hz - self.high[i]) < self.step:
				self.high[i] = self.high_cut_hz

				
	def fit(self, data, label):
		data_bank = dict()
		for i in range(self.n_bank):
			# get each freq filter bank 
			data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
			# extract csp feature for each bank 
			self.csp_bank[i] = CSP(n_components=self.csp_component, reg=None, log=True, norm_trace=False)
			self.csp_bank[i].fit(data_bank[i], label)


	def transform(self, data):
		data_bank = dict()
		csp_feat = dict()
		for i in range(self.n_bank):
			# get each freq filter bank 
			data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
			# extract csp feature for each bank 
			csp_feat[i] = self.csp_bank[i].transform(data_bank[i])
			try:
				feature
			except NameError:
				feature = csp_feat[i]
			else:
				feature = np.hstack([feature, csp_feat[i]])
		return feature
	
	
	def fit_transform(self, data, label):
		data_bank = dict()
		csp_feat = dict()
		for i in range(self.n_bank):
			# get each freq filter bank 
			data_bank[i] = self.bank_filter(data, self.low[i], self.high[i], self.sample_rate)
			# extract csp feature for each bank 
			self.csp_bank[i] = CSP(n_components=self.csp_component, reg=None, log=True, norm_trace=False)
			self.csp_bank[i].fit(data_bank[i], label)
			csp_feat[i] = self.csp_bank[i].transform(data_bank[i])
			try:
				feature
			except NameError:
				feature = csp_feat[i]
			else:
				feature = np.hstack([feature, csp_feat[i]])
		return feature


	def bank_filter(self, data, low_cut_hz, high_cut_hz, sample_rate):
		n_trial		= data.shape[0]
		n_channel	= data.shape[1]
		n_length	= data.shape[2]
		data_bank	= []
		for i in range(n_trial):
			data_bank += [np.array([butter_bandpass_filter(data[i, j, :], low_cut_hz, high_cut_hz, sample_rate, pass_type = 'band', order=6) 
							for j in range(n_channel)])]
		return np.array(data_bank)


	def classifier_fit(self, feature, label):
		# feature selection
		self.MI_sel = SelectPercentile(mutual_info_classif, percentile=self.feat_proportion*100)
		self.MI_sel.fit(feature, label)
		new_feat = self.MI_sel.transform(feature)
		# classification
		self.clf = LinearDiscriminantAnalysis()
		self.clf.fit(new_feat, label)


	def classifier_transform(self, feature):
		# feature selection
		new_feat = self.MI_sel.transform(feature)
		# classification
		return self.clf.transform(new_feat)


	def evaluation(self, feature, label):
		# feature selection
		new_feat = self.MI_sel.transform(feature)
		# accuracy
		accuracy = self.clf.score(new_feat, label)
		# f1
		f1 = dict()
		pred = self.clf.predict(new_feat)
		f1["micro"] = f1_score(y_true = label, y_pred = pred, average='micro')
		f1["macro"] = f1_score(y_true = label, y_pred = pred, average='macro')
		# auc
		pred_posi = self.clf.decision_function(new_feat)
		lb = LabelBinarizer()
		test_y = lb.fit_transform(label)
		roc_auc = self.multiclass_roc_auc_score(y_true = test_y, y_score = pred_posi)
		return accuracy, f1, roc_auc


	def multiclass_roc_auc_score(self, y_true, y_score):
		assert y_true.shape == y_score.shape
		fpr = dict()
		tpr = dict()
		roc_auc = dict()
		n_classes = y_true.shape[1]
		# compute ROC curve and ROC area for each class
		for i in range(n_classes):
			fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
			roc_auc[i] = auc(fpr[i], tpr[i])
		# compute micro-average ROC curve and ROC area
		fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_score.ravel())
		roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
	
		# compute macro-average ROC curve and ROC area
		# First aggregate all false positive rates
		all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
		# Then interpolate all ROC curves at these points
		mean_tpr = np.zeros_like(all_fpr)
		for i in range(n_classes):
			mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
		# Finally average it and compute AUC
		mean_tpr /= n_classes
		fpr["macro"] = all_fpr
		tpr["macro"] = mean_tpr
		roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
		return roc_auc
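A minimal end-to-end sketch of the FBCSP class above, assuming mne's CSP and the module's butter_bandpass_filter are importable; the random arrays (n_trials, n_channels, n_samples) are purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
train_data = rng.randn(40, 8, 500)          # 40 trials, 8 channels, 2 s at 250 Hz
train_label = np.repeat([0, 1, 2, 3], 10)   # four classes so the ROC code sees a 2-D score matrix
test_data = rng.randn(8, 8, 500)
test_label = np.repeat([0, 1, 2, 3], 2)

fbcsp = FBCSP(sample_rate=250)
feature = fbcsp.fit_transform(train_data, train_label)
fbcsp.classifier_fit(feature, train_label)
accuracy, f1, roc_auc = fbcsp.evaluation(fbcsp.transform(test_data), test_label)
print(accuracy, f1["macro"], roc_auc["macro"])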
print(df.values)

colors = ("orange", "blue")
plt.scatter(df['x'],
            df['y'],
            s=300,
            c=df['label'],
            cmap=matplotlib.colors.ListedColormap(colors))
plt.show()

X = df[['x', 'y']].values
y = df['label'].values

train_X, test_X, train_y, test_y = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0,
                                                    shuffle=True)

lda = LinearDiscriminantAnalysis()
lda = lda.fit(train_X, train_y)

y_pred = lda.predict(test_X)
print("Predicted vs Expected")
print(y_pred)
print(test_y)

print(classification_report(test_y, y_pred, digits=3))

print(confusion_matrix(test_y, y_pred))
Example #21
def __lda(X, y, x, solver):
    lda = LDA(solver=solver, store_covariance=True)
    lda.fit(X, y)
    y_p = lda.predict(x)
    score = lda.predict_proba(x)
    return y_p, score
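A quick usage sketch for __lda on the iris data (assuming LDA is the usual sklearn alias, as in the snippet):

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

X, y = load_iris(return_X_y=True)
y_p, score = __lda(X, y, X[:3], solver="svd")
print(y_p, score.round(3))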
Example #22
    def crossTaskBDM(self,
                     sj,
                     cnds='all',
                     window=(-0.2, 0.8),
                     to_decode_tr='digit',
                     to_decode_te='T1',
                     gat_matrix=True):
        '''
        Decode across the localizer and AB tasks.
        '''

        # STEP 1: reading data from localizer task and AB task (EEG and behavior)
        locEEG = mne.read_epochs(
            self.FolderTracker(extension=['localizer', 'processed'],
                               filename='subject-{}_all-epo.fif'.format(sj)))
        abEEG = mne.read_epochs(
            self.FolderTracker(extension=['AB', 'processed'],
                               filename='subject-{}_all-epo.fif'.format(sj)))
        beh_loc = pickle.load(
            open(
                self.FolderTracker(
                    extension=['localizer', 'beh', 'processed'],
                    filename='subject-{}_all.pickle'.format(sj)), 'rb'))
        beh_ab = pickle.load(
            open(
                self.FolderTracker(
                    extension=['AB', 'beh', 'processed'],
                    filename='subject-{}_all.pickle'.format(sj)), 'rb'))

        # STEP 2: downsample data
        locEEG.resample(128)
        abEEG.resample(128)

        # set general parameters
        s_loc, e_loc = [np.argmin(abs(locEEG.times - t)) for t in window]
        s_ab, e_ab = [np.argmin(abs(abEEG.times - t)) for t in window]
        picks = mne.pick_types(
            abEEG.info, eeg=True,
            exclude='bads')  # 64 good electrodes in both tasks (interpolation)
        eegs_loc = locEEG._data[:, picks, s_loc:e_loc]
        eegs_ab = abEEG._data[:, picks, s_ab:e_ab]
        nr_time = eegs_loc.shape[-1]
        if gat_matrix:
            nr_test_time = eegs_loc.shape[-1]
        else:
            nr_test_time = 1

        # STEP 3: get training and test info
        identity_idx = np.where(beh_loc[to_decode_tr] > 0)[
            0]  # digits are 0 in case of letters and vice versa
        train_labels = beh_loc[to_decode_tr][
            identity_idx]  # select the labels used for training
        nr_tr_labels = np.unique(train_labels).size
        min_tr_labels = min(np.unique(train_labels, return_counts=True)[1])
        print('You are using {}s to train, with {} as unique labels'.format(
            to_decode_tr, np.unique(train_labels)))
        train_idx = np.sort(
            np.hstack([
                random.sample(
                    list(np.where(beh_loc[to_decode_tr] == l)[0]), min_tr_labels)
                for l in np.unique(train_labels)
            ]))

        # set test labels
        #test_idx = np.where(np.array(beh_ab['condition']) == cnd)[0] # number test labels is not yet counterbalanced
        test_idx = range(np.array(beh_ab[to_decode_te]).size)

        # STEP 4: do classification
        lda = LinearDiscriminantAnalysis()

        # set training and test labels
        Ytr = beh_loc[to_decode_tr][
            train_idx] % 10  # double check whether this also works for letters
        Yte = np.array(beh_ab[to_decode_te])  #[test_idx]

        class_acc = np.zeros((nr_time, nr_test_time))
        label_info = np.zeros((nr_time, nr_test_time, nr_tr_labels))

        for tr_t in range(nr_time):
            print(tr_t)
            for te_t in range(nr_test_time):
                if not gat_matrix:
                    te_t = tr_t

                Xtr = eegs_loc[train_idx, :, tr_t].reshape(-1, picks.size)
                Xte = eegs_ab[test_idx, :, te_t].reshape(-1, picks.size)

                lda.fit(Xtr, Ytr)
                predict = lda.predict(Xte)

                if not gat_matrix:
                    #class_acc[tr_t, :] = sum(predict == Yte)/float(Yte.size)
                    class_acc[tr_t, :] = np.mean([
                        sum(predict[Yte == y] == y) / float(sum(Yte == y))
                        for y in np.unique(Yte)
                    ])
                    label_info[tr_t, :] = [
                        sum(predict == l) for l in np.unique(Ytr)
                    ]
                else:
                    #class_acc[tr_t, te_t] = sum(predict == Yte)/float(Yte.size)
                    class_acc[tr_t, te_t] = np.mean([
                        sum(predict[Yte == y] == y) / float(sum(Yte == y))
                        for y in np.unique(Yte)
                    ])
                    label_info[tr_t, te_t] = [
                        sum(predict == l) for l in np.unique(Ytr)
                    ]

        pickle.dump(
            class_acc,
            open(
                self.FolderTracker(
                    extension=['cross_task', 'bdm'],
                    filename='subject-{}_bdm.pickle'.format(sj)), 'wb'))
        #print(selected_features_to_drop)

        temp_dataset = load_breast_cancer()
        temp_dataFrame = pd.DataFrame(temp_dataset.data,
                                      columns=temp_dataset.feature_names)
        temp_dataFrame = temp_dataFrame.drop(selected_features_to_drop, axis=1)
        temp_dataset = Bunch(data=temp_dataFrame.values,
                             target=wbcd_train.target,
                             target_names=wbcd_train.target_names,
                             feature_names=temp_dataFrame.columns)
        #print(temp_dataset.data.shape)

        #print("Training SVM model with RBF kernel function")
        model_LDA = LinearDiscriminantAnalysis().fit(temp_dataset.data,
                                                     temp_dataset.target)
        LDA_prediction = model_LDA.predict(temp_dataset.data)
        scr_acc = accuracy_score(temp_dataset.target, LDA_prediction)
        scr_pre = precision_score(temp_dataset.target,
                                  LDA_prediction,
                                  average='macro')
        #print("Accuracy : " + str(round(scr_acc, 3)))
        #print("Precision : " + str(round(scr_pre, 3)))
        LDA_perf[n_feat] = [round(scr_acc, 3), round(scr_pre, 3)]

    print(LDA_perf)
    print("Model : Naive Bayes")
    table = [["5", NB_perf[5][0], NB_perf[5][1]],
             ["10", NB_perf[10][0], NB_perf[10][1]],
             ["15", NB_perf[15][0], NB_perf[15][1]],
             ["20", NB_perf[20][0], NB_perf[20][1]],
             ["25", NB_perf[25][0], NB_perf[25][1]]]
Example #24
def classification_pipeline(classifier,X_train,y_train,X_test,y_test,data_all,\
                            width,height,num_classes,test_indexes,\
                            num_train_each_class, model_selection=True):
    Classifiers = [
        "KNN", "GaussNB", "LDA", "LR", "KSVM", "DT", "RF", "GB", "MLR"
    ]
    IsScale = [True, False, False, True, True, False, False, False, True]
    is_scale = IsScale[Classifiers.index(classifier)]
    if is_scale:
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        data_all = scaler.transform(data_all)

    if classifier == "KNN":
        start_time = time.time()
        if model_selection == True:
            Clf = KNeighborsClassifier()
            param_grid = {'n_neighbors': [3, 5, 7, 9]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("KNN----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            KNN = KNeighborsClassifier(
                n_neighbors=best_params['n_neighbors']).fit(X_train, y_train)
#        KNN = KNeighborsClassifier(n_neighbors=7).fit(X_train,y_train)
        if model_selection == False:
            n_neighbors = 5
            KNN = KNeighborsClassifier(n_neighbors=n_neighbors).fit(
                X_train, y_train)
        Cla_Map = KNN.predict(data_all).reshape(width,
                                                height).astype(int).transpose(
                                                    1, 0)
        predict_prob = KNN.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(KNN) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f (Time_cost=%.3f)'\
              % (KNN.score(X_train,y_train),KNN.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = KNN.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "GaussNB":
        start_time = time.time()
        GaussNB = GaussianNB().fit(X_train, y_train)
        Cla_Map = GaussNB.predict(data_all).reshape(
            width, height).astype(int).transpose(1, 0)
        predict_prob = GaussNB.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(GaussNB) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (GaussNB.score(X_train,y_train),GaussNB.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = GaussNB.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "LDA":
        start_time = time.time()
        LDA = LinearDiscriminantAnalysis().fit(X_train, y_train)
        Cla_Map = LDA.predict(data_all).reshape(width,
                                                height).astype(int).transpose(
                                                    1, 0)
        predict_prob = LDA.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(LDA) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (LDA.score(X_train,y_train),LDA.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = LDA.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "LR":
        start_time = time.time()
        if model_selection == True:
            Clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
            param_grid = {'C': [0.1, 1, 10, 20, 30, 50]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("LR----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            LR = LogisticRegression(multi_class='multinomial',
                                    solver='lbfgs',
                                    C=best_params['C']).fit(X_train, y_train)
        if model_selection == False:
            LR = LogisticRegression(multi_class='multinomial',
                                    solver='lbfgs',
                                    C=1).fit(X_train, y_train)
        Cla_Map = LR.predict(data_all).reshape(width,
                                               height).astype(int).transpose(
                                                   1, 0)
        predict_prob = LR.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(LR) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (LR.score(X_train,y_train),LR.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = LR.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "KSVM":
        start_time = time.time()
        if model_selection == True:
            Clf = SVC(probability=True)
            param_grid = {'C':[2**(-9),2**(-8),2**(-7),2**(-6),2**(-5),2**(-4),2**(-3),2**(-2),\
                           2**(-1),2**(0),2**(1),2**(2),2**(3),2**(4),2**(5),2**(6),2**(7),2**(8),2**(9)]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("KSVM----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            SVM = SVC(C=best_params['C'],
                      probability=True).fit(X_train, y_train)
        if model_selection == False:
            SVM = SVC(C=512, probability=True).fit(X_train, y_train)
        Cla_Map = SVM.predict(data_all).reshape(width,
                                                height).astype(int).transpose(
                                                    1, 0)
        predict_prob = SVM.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(Kernel SVM) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (SVM.score(X_train,y_train),SVM.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = SVM.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "DT":
        start_time = time.time()
        if model_selection == True:
            Clf = DecisionTreeClassifier()
            param_grid = {'max_depth': [5, 10, 20, 50, 100, 200, 300]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("DT----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            DTree = DecisionTreeClassifier(
                max_depth=best_params['max_depth']).fit(X_train, y_train)
        if model_selection == False:
            DTree = DecisionTreeClassifier(max_depth=200).fit(X_train, y_train)
        Cla_Map = DTree.predict(data_all).reshape(
            width, height).astype(int).transpose(1, 0)
        predict_prob = DTree.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(Decision Tree) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (DTree.score(X_train,y_train),DTree.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = DTree.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "RF":
        start_time = time.time()
        if model_selection == True:
            Clf = RandomForestClassifier()
            param_grid = {'n_estimators': [5, 10, 20, 50, 100, 200, 300]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("RF----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            RF = RandomForestClassifier(
                n_estimators=best_params['n_estimators']).fit(
                    X_train, y_train)
        if model_selection == False:
            RF = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
        Cla_Map = RF.predict(data_all).reshape(width,
                                               height).astype(int).transpose(
                                                   1, 0)
        predict_prob = RF.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(Random Forest) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (RF.score(X_train,y_train),RF.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = RF.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "GB":
        start_time = time.time()
        if model_selection == True:
            Clf = GradientBoostingClassifier()
            param_grid = {'n_estimators': [10, 50, 100, 200, 300]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("GB----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            GB = GradientBoostingClassifier(
                n_estimators=best_params['n_estimators']).fit(
                    X_train, y_train)
        if model_selection == False:
            GB = GradientBoostingClassifier(n_estimators=200).fit(
                X_train, y_train)
        Cla_Map = GB.predict(data_all).reshape(width,
                                               height).astype(int).transpose(
                                                   1, 0)
        predict_prob = GB.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(Gradient Boosting) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (GB.score(X_train,y_train),GB.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = GB.score(X_test, y_test)
#        time_cost = time.time()-start_time

    if classifier == "MLR":
        start_time = time.time()
        if model_selection == True:
            Clf = MLPClassifier()
            param_grid = {'hidden_layer_sizes':[[50,50],[50,100],[50,200],[100,100],\
                                            [100,200],[200,100],[200,200],[200,300],\
                                            [200,500],[300,300],[300,400],[300,500],[400,500],[500,500]]}
            if np.sum(num_train_each_class < 5) == len(num_train_each_class):
                nfolds = 3
            else:
                nfolds = 5
            best_params = param_selection(Clf, X_train, y_train, param_grid,
                                          nfolds)
            print("MLR----------------------")
            print("The parameter grid is:")
            print(param_grid)
            print("The best parameter is:")
            print(best_params)
            MLP = MLPClassifier(
                hidden_layer_sizes=best_params['hidden_layer_sizes']).fit(
                    X_train, y_train)
        if model_selection == False:
            MLP = MLPClassifier(hidden_layer_sizes=[300, 400]).fit(
                X_train, y_train)
        Cla_Map = MLP.predict(data_all).reshape(width,
                                                height).astype(int).transpose(
                                                    1, 0)
        predict_prob = MLP.predict_proba(data_all)
        # Post-processing using Graph-Cut
        Seg_Map, seg_accuracy, seg_accuracy_each = Post_Processing(predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
        print('(MLP) Train_Acc=%.3f, Test_Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
              % (MLP.score(X_train,y_train),MLP.score(X_test,y_test),\
                 seg_accuracy, (time.time()-start_time)))
        cla_accuracy = MLP.score(X_test, y_test)


#        time_cost = time.time()-start_time

    return Cla_Map, Seg_Map, cla_accuracy, seg_accuracy
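A hedged call sketch for classification_pipeline; every name below is illustrative, and param_selection and Post_Processing must exist in the module:

# Cla_Map, Seg_Map, cla_acc, seg_acc = classification_pipeline(
#     "LDA", X_train, y_train, X_test, y_test, data_all,
#     width, height, num_classes, test_indexes, num_train_each_class,
#     model_selection=False)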
Example #25
    # hide axis ticks
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
            labelbottom=True, left=False, right=False, labelleft=True)

    # remove axis spines
    ax.spines["top"].set_visible(False)  
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)    

    plt.grid()
    plt.tight_layout()
    plt.show()
    
plot_scikit_lda(X_train_lda_sklearn, y_train, title='Default LDA via scikit-learn')


# =============================================================================
# Prediction
# =============================================================================
# prior probability
print(sklearn_lda.priors_)

# confusion matrix
pred_test = sklearn_lda.predict(X_test)
print(confusion_matrix(y_test, pred_test))

# accuracy
print(np.mean(pred_test==y_test))

Example #26
def type(X,Y):  # note: shadows the builtin type()
    rfc = RandomForestClassifier()
    classifier =LogisticRegression()  # SVC(kernel="linear") #svm.SVC(kernel='rbf',C=1,gamma='auto')
    gnb =GaussianNB() #BernoulliNB()#MultinomialNB()#
    gnb2=BernoulliNB()
    gnb3=MultinomialNB()
    svc = LinearSVC(C=0.5)
    EXT =ExtraTreesClassifier(criterion='gini', bootstrap=True,n_estimators=80,oob_score=True)
    EXT2 = ExtraTreesClassifier(criterion='entropy', bootstrap=True,n_estimators=125,oob_score=True)
    bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)
    model = GradientBoostingClassifier()
    model2=AdaBoostClassifier()
    model3=GradientBoostingClassifier()
    model4=LinearDiscriminantAnalysis()
    model5=QuadraticDiscriminantAnalysis()

    Y=shuffle(Y)  # asymmetric shuffle: Y is shuffled independently of X
    X=shuffle(X)  # asymmetric shuffle: X is shuffled independently of Y

    bag.fit(X, Y)
    classifier.fit(X, Y)
    rfc.fit(X, Y)
    gnb.fit(X, Y)
    gnb2.fit(X, Y)
    gnb3.fit(X, Y)
    EXT2.fit(X, Y)
    EXT.fit(X, Y)
    svc.fit(X,Y)
    model.fit(X,Y)
    model2.fit(X,Y)
    model3.fit(X,Y)
    model4.fit(X,Y)
    model5.fit(X,Y)


    pred = EXT.predict(X).ravel()  # predict on the features, flattened to 1-D
    pred_2 = EXT2.predict(X).ravel()  # predict, flattened to 1-D
    pred2 = gnb.predict(X).ravel()  # predict, flattened to 1-D
    pred2_2 = gnb2.predict(X).ravel()  # predict, flattened to 1-D
    pred2_3 = gnb3.predict(X).ravel()  # predict, flattened to 1-D
    pred3 = svc.predict(X).ravel()
    pred4 = bag.predict(X).ravel()
    pred5 = classifier.predict(X).ravel()
    pred6 = rfc.predict(X).ravel()
    pred7 = model.predict(X).ravel()
    pred7_2 = model2.predict(X).ravel()
    pred7_3 = model3.predict(X).ravel()
    pred7_4 = model4.predict(X).ravel()
    pred7_5 = model5.predict(X).ravel()



    print("ExtraTreesClassifier_gini",pred)
    print("ExtraTreesClassifier_entropy",pred_2)
    print("GaussianNB",pred2)
    print("BernoulliNB", pred2_2)
    print("MultinomialNB",pred2_3)
    print("LinearSVC(C=0.5)",pred3)
    print("BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)", pred4)
    print("LogisticRegression", pred5)
    print("RandomForestClassifier", pred6)
    print('''model = GradientBoostingClassifier()  
    model2=AdaBoostClassifier()
    model3=GradientBoostingClassifier()
    model4=LinearDiscriminantAnalysis()
    model5=QuadraticDiscriminantAnalysis()''')
    print(pred7)
    print(pred7_2)
    print(pred7_3)
    print(pred7_4)
    print(pred7_5)
    print(model4.predict_log_proba(X).ravel())
    print(model4.predict_proba(X).ravel())
Exemple #27
0
    def run(self,person):
        print('starting on person ' + str(person))

        #load all features & keep them in memory
        X_cont, y_cont = self.personLoader.load(person)
        y_lbl = np.array( y_cont )
        y_lbl[ y_lbl <= 5 ] = 0
        y_lbl[ y_lbl >  5 ] = 1

        featNames = self.personLoader.featureExtractor.getFeatureNames()

        #split train / test
        #n_iter=1 => use the shuffle split to obtain a single static split instead of cross-validation
        sss = StratifiedShuffleSplit(y_lbl, n_iter=1, test_size=0.25, random_state=19)
        for train_set_index, test_set_index in sss:
            #labels
            X_train, y_train = X_cont[train_set_index], y_lbl[train_set_index]
            X_test , y_test  = X_cont[test_set_index] , y_lbl[test_set_index]

            #correlations are based on the continuous values
            y_train_cont = y_cont[train_set_index]
            y_test_cont  = y_cont[test_set_index]


        #get correlations
        featCorrelations = [] #list[person] = {feat_index => , feat_corr => , feat_name => }
        for index, feat in enumerate(featNames):
            corr = pearsonr(X_train[:, index], y_train_cont)

            featCorrelations.append( {
                'feat_index' : index,
                'feat_corr'  : corr[0],
                'feat_name'  : featNames[index]
            })

        #sort correlations
        featCorrelations.sort(key=lambda tup: tup['feat_corr'], reverse = True) #sort on correlation (in place)
        #sort X_train in same order
        X_train_sorted = []
        for index,video in enumerate(X_train):
            X_train_sorted.append([])
            for corr_entry in featCorrelations:
                X_train_sorted[index].append(video[corr_entry['feat_index']])
        X_train_sorted = np.array(X_train_sorted)

        X_test_sorted = []
        for index,video in enumerate(X_test):
            X_test_sorted.append([])
            for corr_entry in featCorrelations:
                X_test_sorted[index].append(video[corr_entry['feat_index']])
        X_test_sorted = np.array(X_test_sorted)


        #academic loop
        featAccuracies = []

        #get lda accuracy for 2 features
        k = 2
        lda = LinearDiscriminantAnalysis()


        #leave out one validation
        K_CV = KFold(n=len(X_train_sorted),
            n_folds=len(X_train_sorted),
            random_state=17, #fixed randomseed ensure that the sets are always the same
            shuffle=False
        )
        predictions, truths = [], []
        for train_index, CV_index in K_CV: #train index here is a part of the train set
            #train
            lda.fit(X_train_sorted[train_index, 0:k], y_train[train_index])

            #predict
            pred = lda.predict(X_train_sorted[CV_index, 0:k])

            #save for metric calculations
            predictions.extend(pred)
            truths.extend(y_train[CV_index])

        best_acc = self.optMetric(predictions,truths)
        best_k   = k
        featAccuracies.append(best_acc)
        print('[' + str(person) + '] k= ' + str(k) + ' acc= ' + str(round(best_acc,3)))



        #try to improve the results with additional metrics
        k += 1

        while ( k <= self.max_k ):
            lda = LinearDiscriminantAnalysis()

            #leave out one validation
            K_CV = KFold(n=len(X_train_sorted),
                n_folds=len(X_train_sorted),
                random_state=17, #fixed randomseed ensure that the sets are always the same
                shuffle=False
            )
            predictions, truths = [], []
            for train_index, CV_index in K_CV: #train index here is a part of the train set
                #train
                lda.fit(X_train_sorted[train_index, 0:k], y_train[train_index])

                #predict
                pred = lda.predict(X_train_sorted[CV_index, 0:k])

                #save for metric calculations
                predictions.extend(pred)
                truths.extend(y_train[CV_index])

            curr_acc = self.optMetric(predictions,truths)
            featAccuracies.append(curr_acc)

            print('[' + str(person) + '] k= ' + str(k) + ' acc= ' + str(round(curr_acc,3)))

            if curr_acc > best_acc :
                best_acc = curr_acc
                best_k   = k

            k += 1

        #the number of features is now optimized: the best accuracy is stored in best_acc and the corresponding feature count in best_k
        #the accuracies leading up to it are stored in featAccuracies

        #train the optimized model on all data
        lda = LinearDiscriminantAnalysis()
        #train
        lda.fit(X_train_sorted[:, 0:best_k], y_train)
        #predict
        pred = lda.predict(X_test_sorted[:, 0:best_k])

        #get test accuracy
        test_acc = self.optMetric(pred,y_test)


        return {
            'feat_corr'         : featCorrelations,
            'feat_acc'          : featAccuracies,
            'test_acc'          : test_acc,
            'train_acc'         : best_acc,
            'best_k'            : best_k,
            'feat_names'        : featNames,
            'max_k'             : self.max_k,
            'classificatorName'  : self.personLoader.classificator.name
        }
def Predict():
    url = "Book1.csv"
    columns = [
        "Name", "Quiz 1 (10)", "Quiz 2 (10)", "Quiz 3 (10)", "Quiz 4 (10)",
        "Quiz 5 (15)", "Average Quiz", "HW 1 (10)", "HW 2 (10)", "HW 3 (25)",
        "HW 4 (10)", 'HW 5 (10)', "Average HW", "Report (15)",
        "Presentation (15)", "Final (30)", "Total Marks"
    ]
    dataset = pd.read_csv(url, names=columns)

    #print(dataset)
    #print(dataset.groupby("Total Marks").size())
    # dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
    #plt.show()

    #dataset.hist()
    #plt.show() #Plot of data
    #scatter_matrix(dataset)
    #plt.show()  #Plot of the relations
    array = dataset.values
    X = array[:, 1:15]
    Y = array[:, 15]
    Y = Y.astype('int')
    # print(array)
    # print(X)
    # print(Y)
    validation_size = 0.2
    seed = 8
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    seed = 13
    # print(X_train)
    # print(Y_train)
    # # print(X_validation)
    # # print(Y_validation)
    # print(len(X_validation))
    # print(len(Y_validation))
    scoring = "accuracy"
    models = []
    models.append(
        ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC(gamma='auto')))
    ##evaluate each model in turn
    results = []
    scores = []
    names = []
    for name, model in models:
        kfold = model_selection.KFold(n_splits=20, shuffle=True, random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     X_train,
                                                     Y_train,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        scores.append(cv_results.mean())

    maxi = 0
    for i in range(len(scores)):
        if scores[maxi] < scores[i]:
            maxi = i
    # fig = plt.figure()
    # fig.suptitle('Algorithm Comparison')
    # ax = fig.add_subplot(111)
    # plt.boxplot(results)
    # ax.set_xticklabels(names)
    # plt.savefig("g.png")

    knn = KNeighborsClassifier()
    lr = LogisticRegression(solver='liblinear', multi_class='ovr')
    lda = LinearDiscriminantAnalysis()
    cart = DecisionTreeClassifier()
    nb = GaussianNB()
    svm = SVC(gamma='auto')

    if maxi == 0:
        knn.fit(X_train, Y_train)
        predictions = knn.predict(X_validation)
    elif maxi == 1:
        lr.fit(X_train, Y_train)
        predictions = lr.predict(X_validation)
    elif maxi == 2:
        lda.fit(X_train, Y_train)
        predictions = lda.predict(X_validation)
    elif maxi == 3:
        cart.fit(X_train, Y_train)
        predictions = cart.predict(X_validation)
    elif maxi == 4:
        nb.fit(X_train, Y_train)
        predictions = nb.predict(X_validation)
    elif maxi == 5:
        svm.fit(X_train, Y_train)
        predictions = svm.predict(X_validation)

    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    # model = KNeighborsClassifier()
    # model.fit(X_train, Y_train)
    # # save the model to disk
    # filename = 'finalized_model.sav'
    # joblib.dump(knn, filename)

    # # # some time later...

    # # # load the model from disk
    # loaded_model = joblib.load(filename)
    # result = loaded_model.score(X_validation, Y_validation)
    # print(result)
    data_to_predict = pd.read_csv("Book2.csv", names=columns)
    array1 = data_to_predict.values
    Name = txtnew.get()
    Names = array1[:, 0]
    NameInd = ""
    for i in range(len(Names)):
        if Names[i] == Name:
            NameInd = i
            break
    if NameInd == "":
        ynew = ["Name Not Found"]
    else:
        Xnew = [array1[NameInd][1:15]]
        if maxi == 0:
            ynew = knn.predict(Xnew)
        elif maxi == 1:
            ynew = lr.predict(Xnew)
        elif maxi == 2:
            ynew = lda.predict(Xnew)
        elif maxi == 3:
            ynew = cart.predict(Xnew)
        elif maxi == 4:
            ynew = nb.predict(Xnew)
        elif maxi == 5:
            ynew = svm.predict(Xnew)

        #print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))
    Answer = ynew[0]
    if Answer == "Name Not Found":
        messagebox.showinfo('Name not found', 'Please enter a valid name.')
    else:
        Input = tk.Tk()
        Input.title('Output')
        canvas4 = tk.Canvas(Input,
                            width=250,
                            height=150,
                            bg='light blue',
                            relief='raised')
        canvas4.pack()
        L1 = tk.Label(Input,
                      text="The predicted final marks are " + str(Answer))
        canvas4.create_window(125, 20, window=L1)
				p_pearson[i, j, k] = 1.0

# Move to linear discriminant analysis
lda_palatability = np.zeros((unique_lasers.shape[0], identity.shape[0]))
for i in range(unique_lasers.shape[0]):
	for j in range(identity.shape[0]):
		X = response[j, :, trials[i]] 
		Y = palatability[j, 0, trials[i]]
		# Use leave-one-out cross validation (LeavePOut with p=1): each trial is held out once
		test_results = []
		c_validator = LeavePOut(1)
		for train, test in c_validator.split(X, Y):
			model = LDA()
			model.fit(X[train, :], Y[train])
			# And test on the left out kth trial - compare to the actual class of the kth trial and store in test results
			test_results.append(np.mean(model.predict(X[test]) == Y[test]))
		lda_palatability[i, j] = np.mean(test_results)
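
# Aside (added sketch, not part of the original analysis): LeavePOut(1) is
# equivalent to leave-one-out CV -- one fold per held-out sample. Toy check:
import numpy as np
from sklearn.model_selection import LeavePOut, LeaveOneOut

X_toy = np.arange(8).reshape(4, 2)
assert len(list(LeavePOut(1).split(X_toy))) == len(X_toy)
assert len(list(LeaveOneOut().split(X_toy))) == len(X_toy)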

# Save these arrays to file
hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson)
hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson)
hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman)
hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman)
hf5.create_array('/ancillary_analysis', 'lda_palatability', lda_palatability)
hf5.flush()

# --------End palatability calculation----------------------------------------------------------------------------

#---------Isotonic (ordinal) regression of firing against palatability--------------------------------------------
r_isotonic = np.zeros((unique_lasers.shape[0], palatability.shape[0], palatability.shape[1]))
Exemple #30
0
disc = LinearDiscriminantAnalysis()
disc

# In[159]:

features = pd.concat((topics, nums.favorite_count), axis=1)
features

# In[160]:

disc = LinearDiscriminantAnalysis().fit(topics, nums.favorite_count >= 1)

# In[161]:

predicted_favorites = disc.predict(topics)
predicted_favorites

# In[162]:

np.sum(predicted_favorites)

# ## Wow!
# DiscriminantAnalysis is VERY discriminating!

# In[163]:

np.sum(nums.favorite_count >= 1)

# But not in a good way.
# 10x more true favorites than predicted.
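
# In[164]:

# Added sketch (not a cell from the original notebook): make the mismatch
# concrete by scoring the predictions against the true "favorited" labels.
from sklearn.metrics import confusion_matrix, classification_report

actual_favorites = nums.favorite_count >= 1
print(confusion_matrix(actual_favorites, predicted_favorites))
print(classification_report(actual_favorites, predicted_favorites))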
Exemple #31
0
    x = np.concatenate((x1, x2))  # put x1 and x2 in same x
    y = np.concatenate((y1, y2))  # put y1 and y2 in same y
    x = x.reshape(-1, 1)  # reshape x into a 2-D column vector (n_samples, 1)
    return x, y


err1 = []  #define an empty list for LDA
err2 = []  #define an empty list for logistic regression
lr = skl_lm.LogisticRegression(solver='newton-cg')
lda = LinearDiscriminantAnalysis(solver='svd')
for i in range(100):
    X_train, Y_train = generata_data(100, 1)
    X_test, Y_test = generata_data(100, 1)
    #LDA
    lda.fit(X_train, Y_train)  #use training data to fit the lda model
    test_error1 = sum(lda.predict(X_test) != Y_test)  #get the test error
    err1.append(test_error1)  #put the test error in err1 list
    #logistic regression
    lr.fit(
        X_train,
        Y_train)  #use the training dara to fit the logsitic regression model
    test_error2 = sum(lr.predict(X_test) != Y_test)  #get the test error
    err2.append(test_error2)  #put the test error in err2 list
df = {'LDA': err1, 'LR': err2}
df = pd.DataFrame(data=df)  #save them in data frame
#find the mean and variance
mean = df.mean()
var = df.var()
print("The mean of test error is", mean)
print("The variance of test error is", mean)
box_plot = df.plot.box()
Exemple #32
0
    def crossTimeDecoding(self, Xtr, Xte, Ytr, Yte, labels, gat_matrix=False):
        '''

		At the moment only supports linear classification as implemented in sklearn. Decoding is done 
		across all time points. 

		Arguments
		- - - - - 

		Xtr (array): 
		xte (array): 
		Ytr (array):
		Yte (array): 
		labels (array | list):
		gat_matrix (bool):
		
		Returns
		- - - -

		class_acc (array): classification accuracies (nr train time X nr test time). If decoding is only done across the diagonal, nr test time equals 1.
		label_info (array): shows how frequently each label is selected (nr train time X nr test time X nr unique labels).
		'''

        # set necessary parameters
        nr_labels = len(labels)
        N = self.nr_folds
        nr_elec, nr_time = Xtr.shape[-2], Xtr.shape[-1]
        if gat_matrix:
            nr_test_time = nr_time
        else:
            nr_test_time = 1

        # initiate linear classifier
        lda = LinearDiscriminantAnalysis()

        # initiate decoding arrays
        class_acc = np.zeros((N, nr_time, nr_test_time))
        label_info = np.zeros((N, nr_time, nr_test_time, nr_labels))

        for n in range(N):
            print('\r Fold {} out of {} folds'.format(n + 1, N), end='')
            Ytr_ = Ytr[n]
            Yte_ = Yte[n]

            for tr_t in range(nr_time):
                for te_t in range(nr_test_time):
                    if not gat_matrix:
                        te_t = tr_t

                    Xtr_ = Xtr[n, :, :, tr_t]
                    Xte_ = Xte[n, :, :, te_t]

                    # train model and predict
                    lda.fit(Xtr_, Ytr_)
                    scores = lda.predict_proba(
                        Xte_)  # get posterior probability estimates
                    predict = lda.predict(Xte_)
                    class_perf = self.computeClassPerf(scores, Yte_,
                                                       np.unique(Ytr_),
                                                       predict)  #

                    if not gat_matrix:
                        #class_acc[n,tr_t, :] = sum(predict == Yte_)/float(Yte_.size)
                        label_info[n, tr_t, :] = [
                            sum(predict == l) for l in labels
                        ]
                        class_acc[n, tr_t, :] = class_perf  #

                    else:
                        #class_acc[n,tr_t, te_t] = sum(predict == Yte_)/float(Yte_.size)
                        label_info[n, tr_t,
                                   te_t] = [sum(predict == l) for l in labels]
                        class_acc[n, tr_t, te_t] = class_perf

        class_acc = np.squeeze(np.mean(class_acc, axis=0))
        label_info = np.squeeze(np.mean(label_info, axis=0))

        return class_acc, label_info
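
# Added sketch (illustrative aside with assumed toy shapes; not the class method
# above): the core of generalization-across-time (GAT) decoding is to train a
# classifier at one time point and test it at every time point.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
n_trials, n_elec, n_time = 60, 16, 20
X_toy = rng.randn(n_trials, n_elec, n_time)
y_toy = rng.randint(0, 2, n_trials)

half = n_trials // 2
gat = np.zeros((n_time, n_time))
lda_toy = LinearDiscriminantAnalysis()
for tr_t in range(n_time):
    lda_toy.fit(X_toy[:half, :, tr_t], y_toy[:half])  # train at one time point
    for te_t in range(n_time):
        # test at every time point; the diagonal is standard decoding
        gat[tr_t, te_t] = lda_toy.score(X_toy[half:, :, te_t], y_toy[half:])
print(gat.shape)  # (train time, test time)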
import seaborn as sns

# Dataset
n_samples, n_features = 100, 2
mean0, mean1 = np.array([0, 0]), np.array([0, 2])
Cov = np.array([[1, .8],[.8, 1]])
np.random.seed(42)
X0 = np.random.multivariate_normal(mean0, Cov, n_samples)
X1 = np.random.multivariate_normal(mean1, Cov, n_samples)
X = np.vstack([X0, X1])
y = np.array([0] * X0.shape[0] + [1] * X1.shape[0])

# LDA with scikit-learn
lda = LDA()
proj = lda.fit(X, y).transform(X)
y_pred = lda.predict(X)

errors =  y_pred != y
print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred)))

# Use pandas & seaborn for convinience
data = pd.DataFrame(dict(x0=X[:, 0], x1=X[:, 1], y=["c"+str(v) for v in y]))
plt.figure()
g = sns.PairGrid(data, hue="y")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()



plt.figure()
Exemple #34
0
	d = np.zeros(K)
	for i in range(K):
		d[i] = Discriminant(Xt[j,:],Mu[i,:],Sigma,Prior[i])
	Yl[j] = C[np.argmax(d)]
	
#Computing misclassification percentage
Error = np.mean(1.0*(Yl != Yt))*100
Result = "The misclassification percentage error for LDA is %s %s." % (Error, '%')
print(Result)


#-------------------------------------------------------------------
#Trying out scikit-learn module
clf = LinearDiscriminantAnalysis()
clf.fit(X,Y)
Yls= clf.predict(Xt)
Error1s = np.mean(1.0*(Yls != Yt))*100
Result1s = "The misclassification percentage error for LDA using scikit is %s %s." % (Error1s, '%')

clf = QuadraticDiscriminantAnalysis()
clf.fit(X,Y)
Yqs = clf.predict(Xt)
Error2s = np.mean(1.0*(Yqs != Yt))*100
Result2s = "The misclassification percentage error for QDA using scikit is %s %s." % (Error2s, '%')

print(Result1s)
print(Result2s)
Exemple #35
0
def lindesc(train_X, train_Y, test_X, test_Y):
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_X, train_Y)
    pred = lda.predict(test_X)
    rate = checkpred(test_Y, pred)
    return (1 - rate)
Exemple #36
0
    def selfEvaluation(self):

        eval_start = time.perf_counter()

        print(colors.GOLD)
        print("--------------------------")
        print("Self Evaluation")

        # extract features from collected epochs by transforming with spatial filters
        print("Training...")
        self.X = self.extractFeatures(self.epochs, self.spatial_filters)
        lda = LinearDiscriminantAnalysis()
        lda = lda.fit(self.X, self.y)
        cross_validation_folds = 10
        xval = cross_val_score(lda, self.X, self.y, cv=cross_validation_folds)

        self.tuneSpatialFilters()

        # print cross-validation report on the trained LDA
        print()
        print(colors.BOLD_YELLOW)
        print("cross-validation with k=", cross_validation_folds, "folds")
        print(xval)
        print("mean:", xval.mean())

        print(colors.SILVER)
        print("--------------------------")
        print("Self Evaluation")
        print("Testing...")

        start = time.perf_counter()
        test_epochs, test_y = BCIFileToEpochs(
            filename=self.test_file,
            num_channels=self.num_channels,
            max_epochs_per_class=1000,  # self.calculation_threshold,
            filter_class_labels=[-1, 1],  # self.class_labels,
            epoch_size=self.epoch_size,
            include_electrodes=self.include_electrodes
        )
        end = time.perf_counter()
        print("loaded test file in", str(end - start), "seconds")

        # apply IIR filters to each channel row
        test_epochs = np.apply_along_axis(self.filterChannelData, axis=1, arr=test_epochs)

        test_X = self.extractFeatures(epochs=test_epochs, spatial_filters=self.spatial_filters)
        # chart_file_name="test_filters.pdf", y=test_y)

        print("-----------------------------------------------------------------")
        print("Metrics & Score")

        print(colors.ORANGE)
        predicted_y = lda.predict(test_X)
        cm = confusion_matrix(test_y, predicted_y)
        np.set_printoptions(precision=2)
        print('Confusion matrix, without normalization')
        print(cm)
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
        print(cm_normalized)

        print(colors.DARK_GREEN)
        print("test", self.test_file)
        print("bandpass filter", self.bandpass_filter_range)
        print("trained with", self.calculation_threshold, "epochs per class")
        print((self.calculation_threshold * 2 * self.epoch_size) / self.sampling_rate, "sec trained")
        print("epoch_size", self.epoch_size)
        print("CSP filters:", self.num_spatial_filters)

        print(colors.BOLD_GREEN)
        print("percent correct:", lda.score(test_X, test_y))
        print(colors.ENDC)

        end = time.perf_counter()
        print("evaluation stage completed in", str(end - eval_start), "seconds")

        print("########################################")
        print("########################################")
        print("########################################")
        print("########################################")

        print("EXITING NOW")
        os._exit(1)  # hard process exit; nothing below this line runs
        return True
Exemple #37
0
    def linearClassification(self,
                             X,
                             train_tr,
                             test_tr,
                             max_tr,
                             labels,
                             gat_matrix=False):
        ''' 

		Arguments
		- - - - - 

		X (array): eeg data (trials X electrodes X time)
		train_tr (array): trial indices per fold and unique label (folds X labels X trials)
		test_tr (array): trial indices per fold and unique label (folds X labels X trials)
		max_tr (int): max number unique labels
		labels (array): decoding labels 
		gat_matrix (bool): if True, return a train X test time decoding matrix; otherwise only
							return the diagonal of the matrix (standard decoding)

		Returns
		- - - -

		class_acc
		'''

        N = self.nr_folds
        nr_labels = np.unique(labels).size
        steps = int(max_tr / N)

        nr_elec, nr_time = X.shape[1], X.shape[2]
        if gat_matrix:
            nr_test_time = nr_time
        else:
            nr_test_time = 1

        lda = LinearDiscriminantAnalysis()

        # set training and test labels
        Ytr = np.hstack([[i] * (steps * (N - 1)) for i in np.unique(labels)])
        Yte = np.hstack([[i] * (steps) for i in np.unique(labels)])

        class_acc = np.zeros((N, nr_time, nr_test_time))
        label_info = np.zeros((N, nr_time, nr_test_time, nr_labels))

        for n in range(N):
            print('\r Fold {} out of {} folds'.format(n + 1, N), end='')

            for tr_t in range(nr_time):
                for te_t in range(nr_test_time):
                    if not gat_matrix:
                        te_t = tr_t

                    Xtr = np.array([
                        X[train_tr[n, l, :], :, tr_t] for l in range(nr_labels)
                    ]).reshape(-1, nr_elec)
                    Xte = np.vstack([
                        X[test_tr[n, l, :], :, te_t].reshape(-1, nr_elec)
                        for l, lbl in enumerate(np.unique(labels))
                    ])

                    lda.fit(Xtr, Ytr)
                    predict = lda.predict(Xte)

                    if not gat_matrix:
                        class_acc[n, tr_t, :] = sum(predict == Yte) / float(
                            Yte.size)
                        label_info[n, tr_t, :] = [
                            sum(predict == l) for l in np.unique(labels)
                        ]
                    else:
                        class_acc[n, tr_t,
                                  te_t] = sum(predict == Yte) / float(Yte.size)
                        label_info[n, tr_t, te_t] = [
                            sum(predict == l) for l in np.unique(labels)
                        ]
                        #class_acc[n,t] = clf.fit(X = Xtr, y = Ytr).score(Xte,Yte)

        class_acc = np.squeeze(np.mean(class_acc, axis=0))
        label_info = np.squeeze(np.mean(label_info, axis=0))

        return class_acc, label_info
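
# Added sketch (toy values assumed): how the balanced train/test label vectors
# above are built with np.hstack -- e.g. labels {1, 2}, N=4 folds, steps=3
# trials per label per fold.
import numpy as np

labels_toy = np.array([1, 1, 2, 2])
steps, N = 3, 4
Ytr_toy = np.hstack([[i] * (steps * (N - 1)) for i in np.unique(labels_toy)])
Yte_toy = np.hstack([[i] * steps for i in np.unique(labels_toy)])
print(Ytr_toy)  # nine 1s followed by nine 2s
print(Yte_toy)  # [1 1 1 2 2 2]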
Exemple #38
0
print(newdf_test['label'].value_counts())

features = newdf[final_columns].astype(float)
features1 = newdf_test[final_columns].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
clf = LDA()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds.".format(round(tt, 3)))
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, lab1)
print("Accuracy is {}.".format(round(acc, 4)))
print(
    pd.crosstab(lab1,
                pred,
                rownames=['Actual attacks'],
                colnames=['Predicted attacks']))

#Classifier trained in 5.718 seconds.
#Predicted in 0.02 seconds
#Accuracy is 0.999.
#Predicted attacks  U2R  non-U2R


labels=ConfusionMatrix(y_test,pred,'original')

NB_CM=confusion_matrix(y_test,pred)
heatmap(NB_CM,xticklabels=labels.index,yticklabels=labels.index)

pd.DataFrame(NB_CM/NB_CM.sum(axis=1).astype(float)).to_csv('percent.csv')
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda=LinearDiscriminantAnalysis(solver='svd')
lda.fit(X_train.toarray(),y_train)

pred_lda=lda.predict(X_test.toarray())

lda_cm=confusion_matrix(y_test,pred_lda)
heatmap(lda_cm,xticklabels=labels.index,yticklabels=labels.index)

pred_ldacount=counter(pred_lda)
y_vc=y_test.value_counts()
y_vclda=pd.DataFrame(y_vc)
y_vclda.columns=['actual']
y_vclda['predicted']=pred_ldacount
y_vclda.plot(kind='bar')
   



# In[8]:

print("X Data type: ", X.dtype)
print("Y Data type: ", y.dtype)

# In[9]:

print("X_train Data type: ", X_train.dtype)
print("X_test Data type: ", X_test.dtype)
print("y_train Data type: ", y_train.dtype)
print("y_test Data type: ", y_test.dtype)

# In[10]:

print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)
print("y_train Shape: ", y_train.shape)
print("y_test Shape: ", y_train.shape)

# ## Creating & Testing Model

# In[11]:

model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)
predict = model.predict(X_test)
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))
print "generating data"
arr1 = []
arr2 = []
test = []
for i in range(0, 20):
    arr1.append(randomarr())
    arr2.append(fitness())
    print i
print "done generating"

X = np.array(arr1)
y = np.array(arr2)
del arr1
del arr2
print "done deleting arr1 arr2"
start = timer()
clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
end = timer()

del X
del y

print "Time it took to train:"
print(end - start)
print "Time it took to Predict:"
test.append(randomarr())
start = timer()
print(clf.predict(test))
end = timer()
print(end - start)
Exemple #42
0
def classification(filename, prepro, threads):
    set_option("display.max_rows", 10)
    pd.options.mode.chained_assignment = None

    schemes = [
        "complementary", "DAX", "EIIP", "enthalpy", "Galois4", "kmers", "pc"
    ]
    # evaluate each model in turn

    for scheme in schemes:

        training_data = pd.read_csv(filename + '.' + scheme, index_col=False)
        print(training_data)

        # basic statistics
        training_data.describe()
        label_vectors = training_data['Label'].values
        feature_vectors = training_data.drop(['Label'], axis=1).values
        print(label_vectors)
        print(feature_vectors)

        x_data = []
        y_data = []

        if prepro == 1:
            x_data = feature_vectors
            y_data = label_vectors
            print("### Any")
        elif prepro == 2:
            # standard scaling
            scaler = preprocessing.StandardScaler().fit(feature_vectors)
            x_data = scaler.transform(feature_vectors)
            y_data = label_vectors
            print("### Scaling")
        elif prepro == 3:
            # PCA without scaling
            pca = decomposition.PCA(n_components=0.96,
                                    svd_solver='full',
                                    tol=1e-4)
            pca.fit(feature_vectors)
            x_data = pca.transform(feature_vectors)
            y_data = label_vectors

            #releasing memory
            pca = None
            label_vectors = None
            print("### PCA")
            print('X_PCA:', x_data.shape)
        elif prepro == 4:
            # standard scaling
            scaler = preprocessing.StandardScaler().fit(feature_vectors)
            feature_vectors_scaler = scaler.transform(feature_vectors)

            # PCA with scaling
            pca = decomposition.PCA(n_components=0.9,
                                    svd_solver='full',
                                    tol=1e-4)
            pca.fit(feature_vectors_scaler)
            x_data = pca.transform(feature_vectors_scaler)
            y_data = label_vectors

            # releasing memory
            feature_vectors_scaler = None
            pca = None
            scaler = None
            print("### PCA + Scaling")
            print('X_PCA:', x_data.shape)

        # train/validation split
        validation_size = 0.2
        seed = 7
        X_train, X_validation, Y_train, Y_validation = train_test_split(
            x_data, list(y_data), test_size=validation_size, random_state=seed)
        x_data = None
        y_data = None
        training_data = None
        label_vectors = None
        feature_vectors = None

        # Logistic Regression
        # Testing different values of C
        limit = 1
        step = 0.1
        x = [0 for x in range(0, int(limit / step))]
        yValidation = [0 for x in range(0, int(limit / step))]
        ytrain = [0 for x in range(0, int(limit / step))]
        i = step
        index = 0
        while i < limit:
            lr = LogisticRegression(C=i, n_jobs=threads)

            lr.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  lr.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       lr.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('ite:', i)

            x[index] = i
            i += step
            index += 1

        plt.close('all')
        fig = plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.xlabel('C')
        plt.ylabel('F1-Score')
        plt.ylim((0, 1.1))
        plt.title('C vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('LR-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### LG ### The best score with data validation: ',
              max(yValidation), 'with C: ',
              x[yValidation.index(max(yValidation))])

        lr = LogisticRegression(C=x[yValidation.index(max(yValidation))])
        lr.fit(X_train, Y_train)
        predictions = lr.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        lr = None

        # LDA
        # Testing different values of tolerance
        limit = 0.001
        step = 0.0001
        x = [0 for x in range(0, int(limit / step))]
        yValidation = [0 for x in range(0, int(limit / step))]
        ytrain = [0 for x in range(0, int(limit / step))]
        i = step
        index = 0
        while i <= limit:
            LDA = LinearDiscriminantAnalysis(tol=i)
            LDA.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  LDA.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       LDA.predict(X_validation),
                                       average='macro')
            ytrain[index] = trainScore
            yValidation[index] = validationScore
            print('ite:', i)

            x[index] = i
            i = round(i + step, 4)
            index += 1

        plt.close('all')
        fig = plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.ylim((0, 1.1))
        plt.xlabel('Tolerance')
        plt.ylabel('F1-Score')
        plt.title('Tolerance vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('LDA-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### LDA ### The best score with data validation: ',
              max(yValidation), 'with tol: ',
              x[yValidation.index(max(yValidation))])

        LDA = LinearDiscriminantAnalysis(
            tol=x[yValidation.index(max(yValidation))])
        LDA.fit(X_train, Y_train)
        predictions = LDA.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        LDA = None

        # KNN algorithm
        # Testing different quantities of neighbors
        limit = 100
        x = [x for x in range(1, limit, 10)]
        yValidation = [0 for x in range(1, limit, 10)]
        ytrain = [0 for x in range(1, limit, 10)]
        index = 0
        for i in range(1, limit, 10):
            KNN = KNeighborsClassifier(n_neighbors=i, n_jobs=threads)

            KNN.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  KNN.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       KNN.predict(X_validation),
                                       average='macro')

            print('KNN Score:', i)
            ytrain[index] = trainScore
            yValidation[index] = validationScore
            index += 1

        plt.close('all')
        fig = plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, label='Train')
        plt.ylim((0, 1.1))
        plt.plot(x, yValidation, label='Validation')
        plt.xlabel('n-Neighbors')
        plt.ylabel('F1-Score')
        plt.title('Neighbors vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('KNN-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### KNN ### The best score with data validation: ',
              max(yValidation), 'with Neighbors: ',
              x[yValidation.index(max(yValidation))])

        KNN = KNeighborsClassifier(
            n_neighbors=x[yValidation.index(max(yValidation))])
        KNN.fit(X_train, Y_train)
        predictions = KNN.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        KNN = None

        # MLP
        limit = 500
        step = 50
        x = [x for x in range(0, int(limit / step) - 1)]
        yValidation = [0 for x in range(0, int(limit / step) - 1)]
        ytrain = [0 for x in range(0, int(limit / step) - 1)]

        i = step
        index = 0
        while i < limit:
            MLP = MLPClassifier(solver='lbfgs',
                                alpha=.5,
                                hidden_layer_sizes=(i,))
            MLP.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  MLP.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       MLP.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('it:', i)
            x[index] = i
            i += step
            index += 1

        plt.close('all')
        fig = plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.ylim((0, 1.1))
        plt.xlabel('Neurons')
        plt.ylabel('F1-Score')
        plt.title('Neurons vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('MLP-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### MLP ### The best score with data validation: ',
              max(yValidation), 'with Neurons: ',
              x[yValidation.index(max(yValidation))])
        MLP = MLPClassifier(solver='lbfgs',
                            alpha=.5,
                            hidden_layer_sizes=x[yValidation.index(
                                max(yValidation))])
        MLP.fit(X_train, Y_train)
        predictions = MLP.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        MLP = None

        # RF
        limit = 100
        step = 10
        x = [x for x in range(0, int(limit / step) - 1)]
        yValidation = [0 for x in range(0, int(limit / step) - 1)]
        ytrain = [0 for x in range(0, int(limit / step) - 1)]

        i = step
        index = 0
        while i < limit:
            RF = RandomForestClassifier(n_estimators=i, n_jobs=threads)
            RF.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  RF.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       RF.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('n_estimators:', i)
            x[index] = i
            i += step
            index += 1

        plt.close('all')
        plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.ylim((0, 1.1))
        plt.xlabel('Trees')
        plt.ylabel('F1-Score')
        plt.title('Trees vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('RF-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### RF ### The best score with data validation: ',
              max(yValidation), 'with n_estimators: ',
              x[yValidation.index(max(yValidation))])
        RF = RandomForestClassifier(
            n_estimators=x[yValidation.index(max(yValidation))])
        RF.fit(X_train, Y_train)
        predictions = RF.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        RF = None

        # DT
        limit = 10
        step = 1
        x = [x for x in range(0, int(limit / step) - 1)]
        yValidation = [0 for x in range(0, int(limit / step) - 1)]
        ytrain = [0 for x in range(0, int(limit / step) - 1)]

        i = step
        index = 0
        while i < limit:
            DT = DecisionTreeClassifier(max_depth=i)
            DT.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  DT.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       DT.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('max_depth:', i)
            x[index] = i
            i += step
            index += 1

        plt.close('all')
        plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.ylim((0, 1.1))
        plt.xlabel('Max Depth')
        plt.ylabel('F1-Score')
        plt.title('Max Depth vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('DT-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### DT ### The best score with data validation: ',
              max(yValidation), 'with max_depth: ',
              x[yValidation.index(max(yValidation))])
        DT = DecisionTreeClassifier(
            max_depth=x[yValidation.index(max(yValidation))])
        DT.fit(X_train, Y_train)
        predictions = DT.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        DT = None

        # SVC
        print("Before begin SVC")
        #Testing different values of C
        limit = 100
        step = 10
        x = [x for x in range(0, int(limit / step) - 1)]
        yValidation = [0 for x in range(0, int(limit / step) - 1)]
        ytrain = [0 for x in range(0, int(limit / step) - 1)]

        i = step
        index = 0
        while i < limit:
            svc = OneVsRestClassifier(SVC(C=i, gamma=1e-6), n_jobs=threads)

            svc.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  svc.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       svc.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('ite:', i)

            x[index] = i
            i += step
            index += 1

        plt.close('all')
        plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')
        plt.ylim((0, 1.1))
        plt.xlabel('C')
        plt.ylabel('F1-Score')
        plt.title('C vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('SVC-Algorithm_' + scheme + '.png', dpi=100)
        plt.show()
        print('### ' + scheme)
        print('### SVM ### The best score with data validation: ',
              max(yValidation), 'with C: ',
              x[yValidation.index(max(yValidation))])

        svc = SVC(C=x[yValidation.index(max(yValidation))], gamma=1e-6)
        svc.fit(X_train, Y_train)
        predictions = svc.predict(X_validation)
        metrics(Y_validation, predictions)
        #releasing memory
        svc = None

        # Bayesian Classifier
        values = [
            1e-1, 1e-3, 1e-5, 1e-7, 1e-9, 1e-11, 1e-13, 1e-15, 1e-17, 1e-19
        ]
        x = [0 for x in range(0, len(values))]
        yValidation = [0 for x in range(0, len(values))]
        ytrain = [0 for x in range(0, len(values))]
        for i, index in zip(values, range(len(values))):
            NB = GaussianNB(var_smoothing=i)

            NB.fit(X_train, Y_train)
            trainScore = f1_score(Y_train,
                                  NB.predict(X_train),
                                  average='macro')
            validationScore = f1_score(Y_validation,
                                       NB.predict(X_validation),
                                       average='macro')

            ytrain[index] = trainScore
            yValidation[index] = validationScore

            print('ite:', i)

            x[index] = i

        plt.close('all')
        fig = plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, ytrain, '-', label='Train')
        plt.plot(x, yValidation, '-', label='Validation')

        plt.xlabel('var_smoothing')
        plt.ylabel('F1-Score')
        plt.ylim((0, 1.1))
        plt.title('var_smoothing vs F1-Score (' + scheme + ')')
        plt.legend()
        plt.savefig('NB-Algorithm_' + scheme + '.png', dpi=100)
        print('### ' + scheme)
        print('### NB ### The best score with data validation: ',
              max(yValidation), 'with var_smoothing: ',
              x[yValidation.index(max(yValidation))])

        NB = GaussianNB(var_smoothing=x[yValidation.index(max(yValidation))])
        NB.fit(X_train, Y_train)
        predictions = NB.predict(X_validation)
        metrics(Y_validation, predictions)
        #free memory
        NB = None
Exemple #43
0
class P300EasyClassifier(object):
    '''Easy and modular P300 classifier.
    Attributes:
    fname - classifier save filename
    epoch_buffor - buffer of epochs collected so far
    max_avr - maximum number of epochs to average
    decision_buffor - buffer of recent decisions; when it fills with
    identical decisions, the final decision is made
    clf - core classifier from sklearn
    feature_s - feature length'''
    
    def __init__(self, fname='./class.joblib.pkl', max_avr=10,
                    decision_stop=3, targetFs=30, clf=None,
                    feature_reduction = None):
        '''fname - file path used to save or load the classifier on disk.
        While classifying, a decision is produced after max_avr epochs are
        averaged, or earlier after decision_stop consecutive identical decisions.
        targetFs - downsample to this rate (Hz) during feature extraction
        clf - sklearn-style classifier to use as the core
        feature_reduction - 'auto', int or None. If 'auto', features are
        reduced to those with a statistically significant (p<0.05)
        difference between target and nontarget; if int, use the
        feature_reduction most significant features; if None, don't
        use reduction.
        '''
        self.targetFs = targetFs
        self.fname = fname
        self.epoch_buffor = []
        self.max_avr = max_avr
        self.decision_buffor = deque([], decision_stop)
        self.feature_reduction = feature_reduction
        if clf is None:
            clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
        self.clf = clf
        
    def load_classifier(self, fname=None):
        '''loads classifier from disk; fname - path to a joblib pickle
        with the classifier, defaults to the path given at init'''
        if fname is None:
            fname = self.fname
        self.clf = joblib.load(fname)
        
    
    def calibrate(self, targets, nontargets, bas=-0.1, window=0.4, Fs=None):
        '''targets, nontargets - 3D arrays (epoch x channel x time)
        or list of OBCI smart tags
        if arrays - need to provide Fs (sampling frequency) in Hz
        bas - baseline in seconds(negative), in other words start offset'''
    
        if Fs is None:
            Fs = float(targets[0].get_param('sampling_frequency'))
            target_data = _tags_to_array(targets)
            nontarget_data = _tags_to_array(nontargets)
        else:
            target_data, nontarget_data = targets, nontargets
        data = np.vstack((target_data, nontarget_data))
        self.epoch_l = data.shape[2]
        labels = np.zeros(len(data))
        labels[:len(target_data)] = 1
        data, labels = _remove_artifact_epochs(data, labels)
        features = _feature_extraction(data, Fs, bas, window, self.targetFs)
        
        if self.feature_reduction:
            mask = _feature_reduction_mask(features, labels, self.feature_reduction)
            self.feature_reduction_mask = mask
            features = features[:, mask]
        
        
        self.feature_s = features.shape[1]
        self.bas = bas
        self.window = window
        

        self.clf.fit(features, labels)
        joblib.dump(self.clf, self.fname, compress=9)
        return self.clf.score(features, labels)
        
    
        
        
    def run(self, epoch, Fs=None):
        '''epoch - array (channels x time) or smarttag/readmanager object,
        Fs - sampling frequency in Hz; leave None if epoch is a smart tag.
        Baseline (bas) and window are taken from calibrate().
        Returns decision - 1 for target, 0 for nontarget,
        None - for no decision yet'''
        bas = self.bas
        window = self.window
        if Fs is None:
            Fs = float(epoch.get_param('sampling_frequency'))
            epoch = epoch.get_samples()[:,:self.epoch_l]
        if len(self.epoch_buffor)< self.max_avr:
            self.epoch_buffor.append(epoch)
            avr_epoch = np.mean(self.epoch_buffor, axis=0)
        
        features = _feature_extraction_singular(avr_epoch,
                                               Fs, bas, window, self.targetFs)[None, :]
        if self.feature_reduction:
            mask = self.feature_reduction_mask 
            features = features[:, mask]
        decision = self.clf.predict(features)[0]
        self.decision_buffor.append(decision)
        if len(self.decision_buffor) == self.decision_buffor.maxlen:
            if len(set(self.decision_buffor))==1:
                self.decision_buffor.clear()
                self.epoch_buffor = []
                return decision
        if len(self.epoch_buffor) == self.max_avr:
            self.decision_buffor.clear()
            self.epoch_buffor = []
            return decision
        return None
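
# Added usage sketch (hypothetical: epoch shapes and sampling rate are assumed,
# and the module-private helpers used by calibrate()/run() must be importable):
import numpy as np

rng = np.random.RandomState(0)
n_epochs, n_channels, n_samples = 40, 8, 256
targets_toy = rng.randn(n_epochs, n_channels, n_samples)     # assumed target epochs
nontargets_toy = rng.randn(n_epochs, n_channels, n_samples)  # assumed nontarget epochs

p300 = P300EasyClassifier(max_avr=10, decision_stop=3)
print('train score:', p300.calibrate(targets_toy, nontargets_toy, Fs=256.0))

# feed single epochs until the classifier commits to a decision (or returns None)
for epoch in rng.randn(12, n_channels, n_samples):
    decision = p300.run(epoch, Fs=256.0)
    if decision is not None:
        print('decision:', decision)
        break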
Exemple #44
0
models.append(('svm', SVC()))

resultss = []
names = []

for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(model,
                                 x_train,
                                 y_train,
                                 cv=kfold,
                                 scoring='accuracy')
    resultss.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

#LinearDiscriminantAnalysis was found to be most efficient.

plt.boxplot(resultss, labels=names)
plt.title('Algorithm Comparison')
plt.show()

model = LinearDiscriminantAnalysis()
model.fit(x_train, y_train)
pred = model.predict(x_test)
accuracy = model.score(x_test, y_test)
#evaluate our prediction
print(accuracy)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print(validate.sum()/validate.size)
#0.621557828482


import sklearn.metrics as metrics
metrics.accuracy_score(test.label,pred)
#0.6097560975609756
metrics.roc_auc_score(test.label,pred)
#0.60621768080159055


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
clf=clf.fit(train_data[0:,1].reshape(-1,1), train_data[0:,0])
pred = clf.predict(test_data[0:,1].reshape(-1,1))
print("lda: label ~ count   accuracy:")
clf.score(test_data[0:,1].reshape(-1,1),test.label)
#0.57513768686073963
clf = LinearDiscriminantAnalysis()
clf=clf.fit(train_data[0:,[1,19,20]], train_data[0:,0])
pred = clf.predict(test_data[0:,[1,19,20]])
print("lda: label ~ count + callcount + crimecount   accuracy:")
clf.score(test_data[0:,[1,19,20]],test.label)
#0.75735590487706572





Exemple #46
0
"""
argv = int(sys.argv[1])
feature = argv
"""

feature = 41
lda = LinearDiscriminantAnalysis(n_components=41)
print(train_data.shape)
print(train_label.shape)
print(train_label)
input()

lda.fit(train_data, train_label)
print("lda done")

out = lda.predict(eval_data)
print(np.sum(out == eval_label) / float(eval_label.shape[0]))
input()

matrix = np.ndarray([SIZE, feature])
for i in range(data.shape[0]):
        data_T = np.reshape(data[i], [1, -1])
        matrix[i] = lda.transform(data_T)
        print(matrix[i])
        input()

data_length = data.shape[0]
f = open(FILENAME, mode="w+")

for x in range(data_length):
	info = []
datatrain.extend(temp[0:k2-3])




datatest = data_p[-3:]
datatest.extend(temp[-3:])

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(datatrain, labels)

# fitting echoes the estimator's parameters:
# LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
#                            solver='svd', store_covariance=False, tol=0.0001)

print(clf.predict(datatest))


#should clear that one file


'''
    1. some blank files
    2. some files missing features
    3. some files too short, less than 1000 breath
    4. not using the min_f feature... the important one


    1. find out the collinear variables -- rank 70 of matrix 76,
    2. use other partial data for testing - need to verify validity
    3. pull out the classification plot
'''
X_stim = np.array(X_stim)

X_successful = np.vstack([X_successful_reg, X_successful_stress])
y_successful = np.append(y_successful_reg,y_successful_stress)

X_all = np.vstack([X_reg, X_stress])
y_all = np.append(y_reg, y_stress)


clf_all = LinearDiscriminantAnalysis()
clf_all.fit(X_successful, y_successful)
scores = cross_val_score(LinearDiscriminantAnalysis(),X_successful,y_successful,scoring='accuracy',cv=10)
print "CV (10-fold) scores:", scores
print "Avg CV score:", scores.mean()

predict_stress = clf_all.predict(X_successful_stress)
print "Fraction of stress trials classified as stress:", np.sum(predict_stress)/len(predict_stress)

predict_stim = clf_all.predict(X_stim)
print "Fraction of all stimulation trials classified as stress:", np.sum(predict_stim)/len(predict_stim)

predict_stim = clf_all.predict(X_successful_stim)
print "Fraction of all successful stimulation trials classified as stress:", np.sum(predict_stim)/len(predict_stim)

"""
Decision boundary given by:
np.dot(clf.coef_, x) - clf.intercept_ = 0 according to 
http://stackoverflow.com/questions/36745480/how-to-get-the-equation-of-the-boundary-line-in-linear-discriminant-analysis-wit
"""

#LDAforFeatureSelection(X_successful,y_successful,filename,block_num)
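
# Added sketch (assumed toy 2-D binary data, not the study's variables): recover
# the separating line w0*x0 + w1*x1 + b = 0 from a fitted LDA.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(1)
X_toy = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(2, 1, (50, 2))])
y_toy = np.array([0] * 50 + [1] * 50)

clf_toy = LinearDiscriminantAnalysis().fit(X_toy, y_toy)
w, b = clf_toy.coef_[0], clf_toy.intercept_[0]
# on the boundary: w[0]*x0 + w[1]*x1 + b == 0  =>  x1 = -(w[0]*x0 + b) / w[1]
x0 = np.linspace(X_toy[:, 0].min(), X_toy[:, 0].max(), 5)
x1 = -(w[0] * x0 + b) / w[1]
print(np.c_[x0, x1])  # sample points on the decision boundary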
    #print("Precision : " + str(round(scr_pre, 3)))
    perf['NB'] = [round(scr_acc, 3), round(scr_pre, 3)]
    #print("Training SVM model with RBF kernel function")
    model_SVM = SVC(kernel='rbf').fit(wbcd_train.data[:400],
                                      wbcd_train.target[:400])
    SVM_prediction = model_SVM.predict(wbcd_test.data[201:])
    scr_acc = accuracy_score(wbcd_test.target[201:], SVM_prediction)
    scr_pre = precision_score(wbcd_test.target[201:],
                              SVM_prediction,
                              average='macro')
    #print("Accuracy : " + str(round(scr_acc, 3)))
    #print("Precision : " + str(round(scr_pre, 3)))
    perf['SVM'] = [round(scr_acc, 3), round(scr_pre, 3)]
    #print("Training LDA model")
    model_LDA = LinearDiscriminantAnalysis().fit(wbcd_train.data,
                                                 wbcd_train.target)
    LDA_prediction = model_LDA.predict(wbcd_test.data)
    scr_acc = accuracy_score(wbcd_test.target, LDA_prediction)
    scr_pre = precision_score(wbcd_test.target,
                              LDA_prediction,
                              average='macro')
    #print("Accuracy : " + str(round(scr_acc, 3)))
    #print("Precision : " + str(round(scr_pre, 3)))
    perf['LDA'] = [round(scr_acc, 3), round(scr_pre, 3)]

    table = [["Naive Bayes", perf['NB'][0], perf['NB'][1]],
             ["SVM", perf['SVM'][0], perf['SVM'][1]],
             ["LDA", perf['LDA'][0], perf['LDA'][1]]]
    heads = ["Models", "Accuracy", "Precision"]
    print(tabulate(table, heads, tablefmt="grid"))
Exemple #51
0
label_e = ["ell"]*ell.shape[0]
label_v = ["vox"]*vox.shape[0]
label_w = ["wtr"]*wtr.shape[0]
label_r = ["rig"]*rig.shape[0]
label_c = ["con"]*(ell.shape[0]+vox.shape[0])


print()
print("CONTINUOUS VS. RIGID")
print("Training data: ellipse/voxel vs rigid...")
trainingSet = np.vstack((ell, vox, rig)).tolist()
labels = label_c + label_r
clf = LinearDiscriminantAnalysis()
clf.fit(trainingSet, labels)
print("Testing on wild type...")
predictions = clf.predict(wtr.tolist())
count = 0
for prediction in predictions:
    if prediction == "con":
        count += 1
print("Number of continuous predictions: "+str(count)+"/"+str(wtr.shape[0]))

print()
print("ELLIPSE VS. RIGID")
print("Training data: ellipse vs. rigid...")
trainingSet = np.vstack((ell, rig)).tolist()
labels = label_e + label_r
clf = LinearDiscriminantAnalysis()
clf.fit(trainingSet, labels)
print("Testing on voxels...")
predictions = clf.predict(vox.tolist())
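# The scrape cuts this example off here; by symmetry with the block above, the
# natural continuation (a sketch, not the original author's code) would tally
# the "ell" predictions:
count = 0
for prediction in predictions:
    if prediction == "ell":
        count += 1
print("Number of ellipse predictions: " + str(count) + "/" + str(vox.shape[0]))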
Exemple #52
0
train.reset_index(level=0, inplace=True)
tr_target = train.iloc[:, 0]
tr_input = train.iloc[:, 1:].values  # as_matrix() was removed from pandas; .values is the replacement


# import test data (DataFrame.from_csv and as_matrix were removed from pandas;
# read_csv keeps the default index, so the reset_index step is no longer needed)
test = pd.read_csv("dataset/test.csv")
te_input = test.values


# linear discriminant analysis classifier
classifier = LinearDiscriminantAnalysis()
classifier.fit(tr_input, tr_target)

predicted = classifier.predict(te_input)

# compress input and predicted values
images_and_predictions = list(zip(te_input, predicted))

# show result
s_r = 5
s_c = 3
s_p = s_r * s_c
s_o = 0
for index, (image, prediction) in enumerate(images_and_predictions[s_o:s_o + s_p]):
    im = np.reshape(image, [28, 28])
    plt.subplot(s_r, s_c, index+1)
    plt.axis('off')
    plt.imshow(im, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
devtest='./exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev='./exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train='./exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'



trainy,trainx=imdb_bag_of_word_libs.loadFeatsText(train)
trainy=imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly,evalx=imdb_bag_of_word_libs.loadFeatsText(dev)
evaly=imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2,evalx2=imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2=imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)


from sklearn.preprocessing import RobustScaler  # import assumed; the scrape starts mid-file

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(evalx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)
predictValue = clf.predict(evalx)

print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEV)

evalx2=robust_scaler.transform(evalx2)
predictValue=clf.predict(evalx2)


print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEVTEST)
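# The scale-then-classify pattern above is what sklearn's Pipeline bundles; a
# minimal sketch using this snippet's trainx/trainy/evalx (everything else is
# standard sklearn):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

pipe = make_pipeline(RobustScaler(), LinearDiscriminantAnalysis())
pipe.fit(trainx, trainy)
predictValue = pipe.predict(evalx)  # the scaler is applied automatically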
Exemple #54
0
    print()

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
pca = PCA(n_components=n_components, whiten=True)
pca.fit(X)
X_pca = pca.transform(X)
for name, model in models:
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)

    cv_scores = cross_val_score(model, X_pca, target, cv=kfold)
    print("{} mean cross-validation score: {:.2f}".format(name, cv_scores.mean()))

lda = LinearDiscriminantAnalysis()
lda.fit(X_train_pca, y_train)
y_pred = lda.predict(X_test_pca)
print("Accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, y_pred)))

cm=metrics.confusion_matrix(y_test, y_pred)

plt.subplots(1, figsize=(12,12))
sns.heatmap(cm)

print("Classification Results:\n{}".format(metrics.classification_report(y_test, y_pred)))

# More validated results: Leave-One-Out cross-validation
from sklearn.model_selection import LeaveOneOut
loo_cv = LeaveOneOut()
clf = LogisticRegression()
# the call below was truncated in the source; completing it by analogy with the
# KFold call above
cv_scores = cross_val_score(clf, X_pca, target, cv=loo_cv)
print("LOO mean score: {:.2f}".format(cv_scores.mean()))
'''
print X_train.shape
print Y_train.shape
print X_test.shape
print Y_test.shape
'''

# the source echoed a repr with non-default parameters after a default
# constructor; folding those parameters into the constructor so the two agree
clf = LinearDiscriminantAnalysis(n_components=None,
                                 priors=None,
                                 shrinkage='auto',
                                 solver='lsqr',
                                 store_covariance=False,
                                 tol=0.01)
clf.fit(X_train, Y_train)
#print clf.predict(temp[temp.shape[0]/2:temp.shape[0],0:temp.shape[1]-1])
Y_predict = clf.predict(X_test)
#print Y_predict.shape
count = np.sum(Y_predict == Y_test)
print "Accuracy=", (count * 100.) / Y_test.shape[0]
#print "accuracy=",count/(temp.shape[0]/2)

#for i in range(train_start,train_end):
#	print Y_predict[i]
#print temp.shape
#print X.shape
Exemple #56
0
import pandas as pd
import numpy as np
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection replaces it
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()  # importing iris data set

X = iris.data  # storing feature matrix
y = iris.target  # storing response(target) vector

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)  # splitting the data into training and testing sets

model = LinearDiscriminantAnalysis(
    n_components=2)  # iris has 3 classes, so at most min(n_classes - 1, n_features) = 2 components
model.fit(X_train, y_train)  # fitting the training data to the model

y_pred = model.predict(X_test)  # predicting labels for the testing data
print("The accuracy of linear discriminant analysis with a test size of 20%:")
print(metrics.accuracy_score(y_test, y_pred))
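# n_components only matters when LDA is used for dimensionality reduction
# (predict ignores it), so a short sketch of the transform side, reusing the
# fitted `model` from above:
X_train_2d = model.transform(X_train)   # project onto the 2 discriminant axes
print(X_train_2d.shape)                 # (n_samples, 2)
print(model.explained_variance_ratio_)  # discriminative variance per axis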
Exemple #57
0
    # Single-trial fitting and feature extraction
    features = np.zeros((len(triggers), 32))
    for t in range(len(triggers)):
        print('Fold {:2d}/{:2d}, trial: {:d}   '.format(fold, nfolds, t),
              end='\r')
        ws.set_data(data[t, :, :])
        ws.fit_var()

        con = ws.get_connectivity('ffPDC')

        alpha = np.mean(con[:, :, np.logical_and(7 < freq, freq < 13)], axis=2)
        beta = np.mean(con[:, :, np.logical_and(15 < freq, freq < 25)], axis=2)

        features[t, :] = np.array([alpha, beta]).flatten()

    lda.fit(features[train, :], classids[train])

    acc_train = lda.score(features[train, :], classids[train])
    acc_test = lda.score(features[test, :], classids[test])

    print('Fold {:2d}/{:2d}, '
          'acc train: {:.3f}, '
          'acc test: {:.3f}'.format(fold, nfolds, acc_train, acc_test))

    pred = lda.predict(features[test, :])
    cm += confusion_matrix(classids[test], pred)

print('\nConfusion Matrix:\n', cm)
print('\nTotal Accuracy: {:.3f}'.format(np.sum(np.diag(cm))/np.sum(cm)))
Exemple #58
0
def classify_diff_intervals(fnames, electrodes, preprocess, filter_apply, fc,
                            apply_pca):
    LDAClassifier = LinearDiscriminantAnalysis()
    cols = make_columns_drawing(electrodes + ["CVprob"])
    newCols = make_columns_drawing(electrodes + ["CVprob"])
    electrodes_len = len(electrodes)
    fi = 0
    n = 0
    for f in fnames:
        print(f)
        for i in np.arange(12, 180, 12):
            pos, neg = upload_pos_neg(f)
            #all_cols = pos.columns
            if filter_apply:
                posElec, negElec = apply_filter(
                    pos.drop(
                        pos.columns[pos.columns.str.startswith('CVprob.')],
                        axis=1),
                    neg.drop(
                        neg.columns[neg.columns.str.startswith('CVprob.')],  # was pos.columns: a copy-paste bug
                        axis=1), fc)
                pos = pd.concat([
                    posElec,
                    pos[pos.columns[pos.columns.str.startswith('CVprob.')]]
                ],
                                axis=1)
                neg = pd.concat([
                    negElec,
                    neg[neg.columns[neg.columns.str.startswith('CVprob.')]]
                ],
                                axis=1)
            #pos, neg = select_electrodes(electrodes, pos, neg)
            dropCols = get_drop_cols(i, electrodes_len, newCols)
            pos = pos.drop(dropCols, axis=1)
            neg = neg.drop(dropCols, axis=1)
            mean_features_pos, mean_features_neg = time_intervals_features(
                pos, neg, i, int(i * 6 / 12), electrodes_len, cols)
            mean_features_pos['y'] = 1
            mean_features_neg['y'] = 0
            #pos['y'] = 1
            #neg['y'] = 0
            X_train, X_test, y_train, y_test = train_test_split_data(
                mean_features_pos, mean_features_neg, test_size=0.3)
            #X_train, X_test, y_train, y_test = train_test_split_data(pos, neg, test_size=0.3)
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)
            if (apply_pca):
                #scaler = MinMaxScaler(feature_range=[0, 1])
                #X_train = scaler.fit_transform(X_train)
                #X_test = scaler.fit_transform(X_test)
                #n += 100
                print(X_train.shape[1])
                #print(n)
                for k in np.arange(10, 168, 10):  # was arange(168, 10, 10), an empty range
                    print("PCA:" + str(k))
                    pca = PCA(n_components=k)  # was hard-coded to 168, ignoring k
                    # keep the full-dimensional X_train/X_test intact across iterations
                    X_train_k = pca.fit_transform(X_train)
                    X_test_k = pca.transform(X_test)
                    if preprocess:
                        X_train_k = preprocessing.scale(X_train_k)
                        X_test_k = preprocessing.scale(X_test_k)
                    LDAClassifier.fit(X_train_k, y_train)
                    y_predict = LDAClassifier.predict(X_test_k)
                    #probs = LDAClassifier.predict_proba(X_test)
                    #print(y_predict.shape)
                    #print(probs.shape)
                    #for i in range(len(probs)):
                    #    print('{0:.10f}'.format(probs[i][0]))
                    #    print('{0:.10f}'.format(probs[i][1]))
                    print(str(i * 7.8125) + "ms")
                    print(classification_report(y_test, y_predict))
                    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict)
                    acc_auc = metrics.auc(fpr, tpr)
                    print("AUC: " + str(acc_auc))
                    print("-----------------------")
            if (preprocess):
                X_train = preprocessing.scale(X_train)
                X_test = preprocessing.scale(X_test)
            LDAClassifier.fit(X_train, y_train)
            y_predict = LDAClassifier.predict(X_test)
            #print(y_predict)
            probs = LDAClassifier.predict_proba(X_test)
            #print(probs)
            #print(y_predict.shape)
            #print(probs.shape)
            #for i in range(len(probs)):
            #    print('{0:.10f}'.format(probs[i][0]))
            #    print('{0:.10f}'.format(probs[i][1]))
            print(str(i * 7.8125) + "ms")
            print(classification_report(y_test, y_predict))
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict)
            acc_auc = metrics.auc(fpr, tpr)
            print("AUC: " + str(acc_auc))
            print("-----------------------")
        fi += 1
Exemple #59
0
def main():
    print "Using MNE", mne.__version__

    opts = parse_args()
    verbose = opts.debug

    # variables (parameters)
    opts.bandpass = (8.0, 30.0)  # bandpass filter envelope (min, max)
    opts.num_spatial_filters = 44  # max num spatial filters to try
    opts.epoch_full_tmin = -0.5
    opts.epoch_full_tmax = 3.5
    opts.epoch_trim_tmin = 0.0
    opts.epoch_trim_tmax = 0.0

    # constants
    sfreq = 100.0
    opts.event_labels = {"left": 2, "right": 3}

    # files
    train_fname = "data/custom/bci4/train/ds1b.txt"
    test_fname = "data/custom/bci4/test/ds1b.txt"

    # top ten scores
    ranked_scores = list()
    ranked_scores_opts = list()
    ranked_scores_lda = list()

    #################
    # get data from files
    eval_start = time.clock()
    [train_nparray, train_info] = file_to_nparray(train_fname, sfreq=sfreq, verbose=verbose)
    end = time.clock()
    print "train dataset loaded in ", str(end - eval_start), "seconds"

    eval_start = time.clock()
    [test_nparray, test_info] = file_to_nparray(test_fname, sfreq=sfreq, verbose=verbose)
    end = time.clock()
    print "test dataset loaded in ", str(end - eval_start), "seconds"

    ###
    # create a set of many bandpass filter range combinations
    bandpass_combinations = get_bandpass_ranges()
    window_ranges = get_window_ranges()

    # vars to store cumulative performance
    best_score = 0
    best_opts = None
    total_start = time.clock()

    for epoch_window in window_ranges:
        loop1_opts = copy.deepcopy(opts)
        loop1_opts.epoch_trim_tmin = epoch_window[0]
        loop1_opts.epoch_trim_tmax = epoch_window[1]

        for bp in bandpass_combinations:
            eval_start = time.clock()
            current_opts = copy.deepcopy(loop1_opts)
            current_opts.bandpass = bp

            print "trying this permutation:"
            print "bp", bp, "window", epoch_window

            # bandpass filter coefficients
            current_opts.b, current_opts.a = butter(
                5, np.array([current_opts.bandpass[0], current_opts.bandpass[1]]) / (sfreq / 2.0), "bandpass"
            )

            # [test_X, test_y] = extract_X_and_y(test_nparray, test_info, current_opts, verbose=verbose)

            # only train and score against the train set;
            # scoring against the test data here would be looking ahead,
            # as well as overfitting
            [train_X, train_y] = extract_X_and_y(train_nparray, train_info, current_opts, verbose=verbose)
            [practice_train_X, practice_test_X, practice_train_y, practice_test_y] = train_test_split(
                train_X, train_y, test_size=0.5
            )
            [num_trials, num_channels, num_samples] = train_X.shape

            # CLASSIFIER with score for brute force parameter tuning
            [score, best_num_filters] = eval_classification(
                num_channels, practice_train_X, practice_train_y, practice_test_X, practice_test_y, verbose=verbose
            )
            current_opts.best_num_filters = best_num_filters
            print "this score was", score

            # put in ranked order Top 10 list
            idx = bisect(ranked_scores, score)
            ranked_scores.insert(idx, score)
            ranked_scores_opts.insert(idx, current_opts)

            # timer
            print round(time.clock() - eval_start, 1), "sec"

            print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
            print "          H A L L    O F    F A M E"
            print

            print "score,filters,bandpass_low,bandpass_high,window_min,window_max"
            j = 1
            for i in xrange(len(ranked_scores) - 1, -1, -1):
                print len(ranked_scores) - i, ",", round(ranked_scores[i], 4), ",",
                print_opts(ranked_scores_opts[i])
                j += 1
                if j > 10:
                    break

            if score > best_score:
                best_score = score
                best_opts = copy.deepcopy(current_opts)

    print "<-----&--@--------<<"
    print "best score of all permutations"
    print best_score,
    print_opts(best_opts)

    print "actual score"
    print
    print "rank,score,filters,bandpass_low,bandpass_high,window_min,window_max"
    # CLASSIFIER

    # now apply the top-ranked trained models to the test data as an ensemble
    print "actual score: top trained models applied to test data"
    test_y = None

    predictions = None
    num_ensembles = 6
    for i in xrange(1, num_ensembles + 1):
        best_opts = ranked_scores_opts[len(ranked_scores) - i]

        [train_feat, train_y, test_feat, test_y] = train_transform(
            train_nparray, train_info, test_nparray, test_info, best_opts, verbose=verbose
        )

        # train LDA
        lda = LinearDiscriminantAnalysis()
        prediction_score = lda.fit(train_feat, train_y).score(test_feat, test_y)
        prediction = lda.predict(test_feat)
        if predictions is None:
            predictions = np.zeros((num_ensembles, len(test_y)))  # initialize on first pass

        # nb = GaussianNB()
        # nb_score = nb.fit(train_feat, train_y).score(test_feat, test_y)
        # print "NB:", nb_score

        # save this model's prediction
        predictions[i - 1, :] = prediction

        print prediction_score,
        print_opts(best_opts)

        # print "real answer:", test_y

    # use the ensemble to "vote" on each prediction (a vectorized version of
    # this tally is sketched after main())
    num_correct = 0
    for i in xrange(len(test_y)):
        # print "sum", predictions[:,i].sum(),

        if predictions[:, i].sum() >= float(num_ensembles) / float(2):
            guess = 1
            # print "guessing 1",
        else:
            guess = 0
            # print "guessing 0",

        if guess == test_y[i]:
            num_correct += 1

            # print "correct so far::",float(num_correct)/float(i+1)

    print "using ensemble:"
    print "percentage correct", num_correct, "out of", len(test_y), "=", float(num_correct) / float(len(test_y))

    print
    print "total run time", round(time.clock() - total_start, 1), "sec"
    print
    print

    exit()
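# A vectorized equivalent of the voting loop in main(), as a sketch: it assumes
# the same `predictions` array of shape (num_ensembles, n_trials) holding 0/1
# labels, and like the loop it breaks ties in favor of class 1.
import numpy as np

def majority_vote(predictions):
    votes = predictions.sum(axis=0)  # members voting for class 1, per trial
    return (votes >= predictions.shape[0] / 2.0).astype(int)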
y = A.values[:, 2]

I = y == 1
J = [not x for x in I]

#LDA
m1 = np.mean(X[I,:],axis=0)
m2 = np.mean(X[J,:],axis=0)

n1 = X[I, :].shape[0]
n2 = X[J, :].shape[0]

# within-class sample covariances; the original divided by 1/(n-1) (i.e.
# multiplied by n-1) and reused m1 in the second class's scatter
s1 = np.dot((X[I, :] - m1).T, X[I, :] - m1) / (n1 - 1)
s2 = np.dot((X[J, :] - m2).T, X[J, :] - m2) / (n2 - 1)

sw = s1 + s2
w = np.dot(np.linalg.inv(sw), (m2 - m1))

print("The coeffs are:", w)

clf = LDA()
clf.fit(X, y)

print(np.vstack((clf.predict(X), y)).T)
plt.plot(X[I,0], X[I,1], '.')
plt.plot(X[J,0], X[J,1], '.')
plt.grid()
plt.show()
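# Sanity check (a sketch, not from the original): for two-class LDA, sklearn's
# clf.coef_ is proportional to the Fisher direction Sw^{-1}(m2 - m1) computed
# above, so the elementwise ratio should be roughly constant (its sign depends
# on the class ordering).
print("coef_/w ratio per feature:", clf.coef_[0] / w)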