Beispiel #1
0
 def lda_on(train_x,
            train_y,
            test_x,
            test_y,
            feats_name='all_features'):
     """Fit an LDA classifier, print train/test/validation accuracy,
     pickle the fitted model, and dump confusion matrices for the test
     and validation predictions to text files.

     NOTE(review): relies on module-level names `dataset_name`, `LDA`,
     `cPickle`, `cross_validation`, `confusion_matrix` and `np` whose
     imports are not visible in this chunk (Python 2 code).
     """
     lda = LDA()
     # store_covariance is passed through fit() in older scikit-learn APIs.
     lda.fit(train_x, train_y, store_covariance=True)
     print feats_name, "(train):", lda.score(train_x, train_y)
     print feats_name, "(test):", lda.score(test_x, test_y)
     # Persist the fitted classifier for later reuse.
     with open(dataset_name + '_lda_classif_' + feats_name + '.pickle',
               'w') as w_f:
         cPickle.dump(lda, w_f)
     y_pred = lda.predict(test_x)
     # Hold out 20% of the training data to compute a validation score.
     X_train, X_validate, y_train, y_validate = cross_validation\
             .train_test_split(train_x, train_y, test_size=0.2,
                     random_state=0)
     lda.fit(X_train, y_train)
     print feats_name, "(validation):", lda.score(
         X_validate, y_validate)
     y_pred_valid = lda.predict(X_validate)
     cm_test = confusion_matrix(test_y, y_pred)
     cm_valid = confusion_matrix(y_validate, y_pred_valid)
     # threshold='nan' disables print truncation on older numpy versions
     # (newer numpy expects a number here).
     np.set_printoptions(threshold='nan')
     with open("cm_test" + feats_name + ".txt", 'w') as w_f:
         print >> w_f, cm_test
     with open("cm_valid" + feats_name + ".txt", 'w') as w_f:
         print >> w_f, cm_valid
Beispiel #2
0
def get_performance(test_df, X_std, y):
    Xtest = test_df.ix[:, 'x.1':'x.10'].values
    ytest = test_df.ix[:, 'y'].values

    X_std_test = StandardScaler().fit_transform(Xtest)

    lda_model = LDA()
    lda_model.fit(X_std, y)

    qda_model = QDA()
    qda_model.fit(X_std, y)

    knn_model = KNeighborsClassifier(n_neighbors=10)
    knn_model.fit(X_std, y)

    print "KNN SCORE"
    print knn_model.score(X_std_test, ytest)
    print "LDA SCORE"
    print lda_model.score(X_std_test, ytest)
    print "QDA SCORE"
    print qda_model.score(X_std_test, ytest)

    knn_scores_training = []
    knn_scores_test = []

    for i in range(1, 12):
        knn_model = KNeighborsClassifier(n_neighbors=i)
        knn_model.fit(X_std, y)
        knn_scores_training.append(knn_model.score(X_std_test, ytest))
        knn_scores_test.append(knn_model.score(X_std, y))

    plt.plot(range(11), knn_scores_training, 'r--')
    plt.plot(range(11), knn_scores_test, 'b--')
    plt.axis([0, 10, 0.3, 1.1])
    plt.show()
def get_LDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit an LDA classifier on the training split, print train/test
    accuracy as percentages, and return the fitted model."""
    model = LDA()
    model.fit(Xtrain, Ytrain)
    acc = np.empty((4))
    acc[0] = model.score(Xtrain, Ytrain)
    acc[1] = model.score(Xtest, Ytest)
    for split, value in (('train', acc[0]), ('test', acc[1])):
        print('LDA, {0}: {1:.02f}% '.format(split, value * 100))
    return model
Beispiel #4
0
def get_LDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit LDA on the training split, print train/test accuracy
    percentages, and return the fitted classifier."""
    clf = LDA()
    clf.fit(Xtrain, Ytrain)
    results = np.empty((4))
    results[0] = clf.score(Xtrain, Ytrain)
    results[1] = clf.score(Xtest, Ytest)
    print('LDA, train: {0:.02f}% '.format(results[0] * 100))
    print('LDA, test: {0:.02f}% '.format(results[1] * 100))
    return clf
Beispiel #5
0
def get_LDA_performance(test_df, X_std, y):
    """For LDA projections of dimension d = 1..10, fit LDA/QDA/KNN
    classifiers on the projected training data and plot their train/test
    error rates (1 - accuracy) against d.

    NOTE(review): relies on module-level imports (LDA, QDA,
    KNeighborsClassifier, StandardScaler, plt) not visible in this
    chunk; `DataFrame.ix` is a long-deprecated pandas indexer.
    """
    X_test = test_df.ix[:, 'x.1':'x.10'].values
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.ix[:, 'y'].values

    lda_scores_training = []
    lda_scores_test = []

    qda_scores_training = []
    qda_scores_test = []

    knn_scores_training = []
    knn_scores_test = []

    for d in range(1, 11):
        # Reduce dimensionality with LDA, then train each classifier on
        # the reduced representation.
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)

        lda_model = LDA()
        lda_model.fit(Xred_lda_training, y)

        qda_model = QDA()
        qda_model.fit(Xred_lda_training, y)

        knn_model = KNeighborsClassifier(n_neighbors=10)
        knn_model.fit(Xred_lda_training, y)

        # Store error rates (1 - accuracy) for train and test sets.
        lda_scores_training.append(1 - lda_model.score(Xred_lda_training, y))
        lda_scores_test.append(1 - lda_model.score(Xred_lda_test, y_test))

        qda_scores_training.append(1 - qda_model.score(Xred_lda_training, y))
        qda_scores_test.append(1 - qda_model.score(Xred_lda_test, y_test))

        knn_scores_training.append(1 - knn_model.score(Xred_lda_training, y))
        knn_scores_test.append(1 - knn_model.score(Xred_lda_test, y_test))

    # NOTE(review): label= kwargs are passed but plt.legend() is never
    # called, so the legends do not actually appear.
    plt.plot(range(10), lda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), lda_scores_test, 'b--', label="Test data")
    plt.title("LDA vs LDA")
    plt.xlabel('k')
    plt.ylabel('Score')
    plt.show()

    plt.plot(range(10), qda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), qda_scores_test, 'b--', label="Test data")
    plt.title("QDA vs LDA")
    plt.show()

    plt.plot(range(10), knn_scores_training, 'r--', label="Train data")
    plt.plot(range(10), knn_scores_test, 'b--', label="Test data")
    plt.title("KNN vs LDA")
    plt.show()
def plot_lda_projection(marker, flname):
	# Fit LDA on the marker's "individuals" features vs its
	# "population_labels", print the self-classification accuracy, and
	# save a scatter plot of the LDA projection against the labels.
	lda = LDA()
	lda.fit(marker["individuals"], marker["population_labels"])
	print lda.score(marker["individuals"], marker["population_labels"])
	proj = lda.transform(marker["individuals"])
	n_samples, n_components = proj.shape

	plt.scatter(proj, marker["population_labels"])
	plt.xlabel("Component 0", fontsize=18)
	plt.ylabel("Population Labels", fontsize=18)

	# NOTE(review): matplotlib's savefig expects `dpi`; the uppercase
	# `DPI` kwarg may be ignored depending on version — confirm.
	plt.savefig(flname, DPI=200)
Beispiel #7
0
def classify(Xtrain,Xtest,Ytrain,Ytest):
    '''
    Fit five classifiers (logistic regression, LDA, Gaussian naive
    Bayes, linear SVM, RBF SVM) on the training split and return their
    test-set accuracies in that order.
    '''
    estimators = (
        LogisticRegression(),
        LDA(),
        GaussianNB(),
        LinearSVC( C = 1),
        SVC(kernel='rbf', C = 1000),
    )
    results = np.zeros((5,))
    for idx, est in enumerate(estimators):
        est.fit(Xtrain,Ytrain)
        results[idx] = est.score(Xtest,Ytest)
    return results
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """Return the mean cross-validated LDA accuracy (in percent).

    :param X: classifier input matrix, n_samples * n_parameters
        (n_parameters >= 2, n_samples > 0); data assumed suitable for
        LDA classification.
    :param Y: label vector of length n_samples.
    :param n_folds: number of KFold splits, > 1.
    :param dim_reduction: 0 -> no reduction; -1 -> reduce to the "best"
        dimension (chosen by best_dimension); any positive value -> PCA
        reduction to that many dimensions.
    :return: mean cross-validation score * 100 (also printed);
        returns -1 when n_samples <= n_folds.
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)
    if dim_reduction == -1:
        dim_reduction = best_dimension(X)
        print "Best dimension : " + str(dim_reduction)
        X = dim_reduction_PCA(X, dim_reduction)

    if X.shape[0] > n_folds:
        # Cross-validation to estimate the performance of an LDA classifier.
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=True, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))

        print "Score moyen : ", np.mean(np.array(scores))
        return 100.0 * np.mean(np.array(scores))
    else:
        return -1
def pca_lda(X_train, X_test, y_train, y_test):
    """Fit PCA (500 components) on the training data, project it via a
    plain dot product with the principal axes, fit LDA on that
    projection, and return the LDA *training* accuracy.

    NOTE(review): X_test / y_test are accepted but never used, and the
    projection skips the mean-centering that pca.transform would apply —
    confirm both are intentional.
    """
    pca = PCA(n_components=500)
    pca.fit(X_train)
    projected = np.dot(X_train, pca.components_.T)
    model = LDA()
    model.fit(projected, y_train)
    return model.score(projected, y_train, sample_weight=None)
def pca_lda(X_train,X_test,y_train,y_test):
    # Project training data onto the top 500 principal components (plain
    # dot product with the components, i.e. no mean centering as
    # pca.transform would do), fit LDA on the projection and return the
    # *training* accuracy.
    # NOTE(review): X_test / y_test are accepted but never used — confirm
    # whether a test score was intended here.
    pca = PCA(n_components=500)
    lda = LDA()
    pca.fit(X_train)
    scores = np.dot(X_train,np.transpose(pca.components_))
    lda.fit(scores, y_train)
    return lda.score(scores, y_train, sample_weight=None)
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """Return the mean cross-validated LDA accuracy (in percent).

    :param X: classifier input matrix, n_samples * n_parameters
        (n_parameters >= 2, n_samples > 0); data assumed suitable for
        LDA classification.
    :param Y: label vector of length n_samples.
    :param n_folds: number of KFold splits, > 1.
    :param dim_reduction: if <= 0, no reduction; otherwise, when the
        number of parameters exceeds dim_reduction, PCA-reduce to that
        many dimensions.
    :return: mean cross-validation score * 100 (also printed);
        returns -1 when n_samples <= n_folds.
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)

    if (X.shape[0] > n_folds):
        # Cross-validation to estimate the performance of an LDA classifier.
        # NOTE(review): unlike the other LDAmeanScore variant in this
        # file, this one uses shuffle=False.
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=False, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))

        print 'Score moyen : ', np.mean(np.array(scores))
        return 100. * np.mean(np.array(scores))
    else:
        return -1
Beispiel #12
0
def LDA(data, label, pred_data, pred_last):
    '''Fit an LDA classifier (imported locally, shadowing this function's
    own name) on (data, label), print the training accuracy, predict
    labels for pred_data, print the mislabel count against pred_last and
    the prediction accuracy, and return the predicted labels.

    Original note (translated from Chinese): "not good, normalization is
    not needed".
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    # Local import so the sklearn LDA class shadows this function only
    # inside this body.
    from sklearn.lda import LDA
    gnb = LDA()
    gnb.fit(data, label)

    print gnb.score(data, label)
    pred_result = gnb.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print gnb.score(pred_data, pred_last)
    return pred_result
Beispiel #13
0
def acc_image(training_data, tarining_label, test_data, test_label):
    """Plot test accuracy of shrinkage LDA vs plain LDA as a function of
    the n_features / n_samples ratio.

    NOTE(review): `tarining_label` is a typo for "training_label" kept
    for interface compatibility.
    """
    n_train = training_data.shape[0]  # samples for training
    n_test = test_data.shape[0]       # samples for testing (unused)
    n_averages = 50                   # how often to repeat classification
    n_features_max = 5                # maximum number of features
    step = 1                          # step size for the calculation

    feature_counts = range(1, n_features_max + 1, step)
    shrink_acc, plain_acc = [], []
    for k in feature_counts:
        total_shrink = 0
        total_plain = 0
        for _ in range(n_averages):
            Xtr, ytr = training_data[:, 0:k], tarining_label

            shrink_clf = LDA(solver='lsqr', shrinkage='auto').fit(Xtr, ytr)
            plain_clf = LDA(solver='lsqr', shrinkage=None).fit(Xtr, ytr)

            Xte, yte = test_data[:, 0:k], test_label
            total_shrink += shrink_clf.score(Xte, yte)
            total_plain += plain_clf.score(Xte, yte)

        shrink_acc.append(total_shrink / n_averages)
        plain_acc.append(total_plain / n_averages)

    ratio = np.array(feature_counts) / n_train

    plt.plot(ratio, shrink_acc, linewidth=2,
             label="LDA with shrinkage", color='r')
    plt.plot(ratio, plain_acc, linewidth=2,
             label="LDA", color='g')

    plt.xlabel('n_features / n_samples')
    plt.ylabel('Classification accuracy')

    plt.legend(loc=1, prop={'size': 12})
    plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)')
    plt.show()
Beispiel #14
0
 def lda_on(train_x, train_y, test_x, test_y, feats_name='all_features'):
     """Fit LDA on the training data, print train/test/validation
     accuracies, pickle the fitted model, and dump confusion matrices
     for the test and validation predictions to text files.

     NOTE(review): depends on module-level `dataset_name`, `LDA`,
     `cPickle`, `cross_validation`, `confusion_matrix` and `np` (imports
     not visible in this chunk; Python 2 code).
     """
     lda = LDA()
     # store_covariance is passed through fit() in older scikit-learn APIs.
     lda.fit(train_x, train_y, store_covariance=True)
     print feats_name, "(train):", lda.score(train_x, train_y)
     print feats_name, "(test):", lda.score(test_x, test_y)
     # Persist the fitted classifier for later reuse.
     with open(dataset_name + '_lda_classif_' + feats_name + '.pickle', 'w') as f:
         cPickle.dump(lda, f)
     y_pred = lda.predict(test_x)
     # Hold out 20% of the training data for a validation score.
     X_train, X_validate, y_train, y_validate = cross_validation.train_test_split(train_x, train_y, test_size=0.2, random_state=0)
     lda.fit(X_train, y_train)
     print feats_name, "(validation):", lda.score(X_validate, y_validate)
     y_pred_valid = lda.predict(X_validate)
     cm_test = confusion_matrix(test_y, y_pred)
     cm_valid = confusion_matrix(y_validate, y_pred_valid)
     # threshold='nan' disables print truncation on older numpy versions.
     np.set_printoptions(threshold='nan')
     with open("cm_test" + feats_name + ".txt", 'w') as wf:
         print >> wf, cm_test
     with open("cm_valid" + feats_name + ".txt", 'w') as wf:
         print >> wf, cm_valid
Beispiel #15
0
def LDA_select_cv(X, Y, num_features):
    """Estimate LDA accuracy with 10-fold stratified cross-validation,
    selecting the top `num_features` random-forest-ranked features
    inside each fold, and return the mean fold score."""
    fold_scores = []
    for train_idx, test_idx in cross_validation.StratifiedKFold(Y, n_folds=10):
        X_tr, X_te = X[train_idx], X[test_idx]
        y_tr, y_te = Y[train_idx], Y[test_idx]
        # Rank features with a random forest fitted on the training fold.
        XRF_tr, imp, order, std = fitRF(X_tr, y_tr, est=2000)
        # Apply the same feature ordering to the test fold.
        XRF_te = X_te[:, order]
        model = LDA()
        model.fit(XRF_tr[:, 0:num_features], y_tr)
        fold_scores.append(model.score(XRF_te[:, 0:num_features], y_te))
    return np.mean(fold_scores)
Beispiel #16
0
File: ch4.py  Project: syting/esl
def table_4_1():
    """Reproduces table 4.1 in ESLii showing the training and test error rates
    for classifying vowels using different classification techniques. The
    sklearn implementation of logistic regression uses OvA instead of a true
    multinomial which likely accounts for the worse results
    """
    vowels_train = eslii.read_vowel_data()
    # First column is the response 'y'; the rest are features.
    train_X = vowels_train[vowels_train.columns[1:]]
    train_y = vowels_train['y']
    vowels_test = eslii.read_vowel_data(train=False)
    test_X = vowels_test[vowels_test.columns[1:]]
    test_y = vowels_test['y']

    # Each line prints "train_error test_error" (error = 1 - accuracy).
    lda = LDA().fit(train_X, train_y)
    print "Linear discriminant analysis:  {:.2f} {:.2f}".format(
        1 - lda.score(train_X, train_y), 1 - lda.score(test_X, test_y))
    qda = QDA().fit(train_X, train_y)
    print "Quadratic discriminant analysis:  {:.2f} {:.2f}".format(
        1 - qda.score(train_X, train_y), 1 - qda.score(test_X, test_y))
    # C=1e30 effectively disables regularization.
    lr = LogisticRegression(C=1e30).fit(train_X, train_y)
    print "Logistic regression:  {:.2f} {:.2f}".format(
        1 - lr.score(train_X, train_y), 1 - lr.score(test_X, test_y))
Beispiel #17
0
def yj():
    """Draw random class parameters into the module-level `params` dict,
    build normalised train/test splits via get_data, fit sklearn's LDA,
    and print and return the test error (1 - accuracy).

    Note: the four random draws below must stay in this exact order so
    the RNG stream is consumed identically.
    """
    params['mu0'] = np.random.randn() * 0.2
    params['mu1'] = np.random.randn() * 0.2
    params['sigma0'] = di.invgamma.rvs(3)
    params['sigma1'] = di.invgamma.rvs(3)
    sel, rawdata, normdata = get_data(data_yj, params)
    train_feats = normdata.loc[sel['trn'], sel['feats']]
    test_feats = normdata.loc[sel['tst'], sel['feats']]

    clf = LDA()
    clf.fit(train_feats, sel['trnl'])
    err = (1 - clf.score(test_feats, sel['tstl']))
    print("skLDA error: %f" % err)
    return err
Beispiel #18
0
def acc_image(training_data, tarining_label, test_data, test_label):
    """Plot test accuracy of shrinkage LDA vs plain LDA as a function of
    the n_features / n_samples ratio.

    NOTE(review): `tarining_label` is a typo for "training_label" kept
    for interface compatibility; `n_test` is unused.
    """
    n_train = training_data.shape[0]  # samples for training
    n_test = test_data.shape[0]       # samples for testing
    n_averages = 50                   # how often to repeat classification
    n_features_max = 5  # maximum number of features
    step = 1  # step size for the calculation

    acc_clf1, acc_clf2 = [], []
    n_features_range = range(1, n_features_max + 1, step)
    for n_features in n_features_range:
        score_clf1, score_clf2 = 0, 0
        for _ in range(n_averages):
            # Train both variants on the first n_features columns.
            X, y = training_data[:,0:n_features], tarining_label

            clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
            clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)

            # Evaluate on the corresponding test columns.
            X, y = test_data[:,0:n_features], test_label
            score_clf1 += clf1.score(X, y)
            score_clf2 += clf2.score(X, y)

        acc_clf1.append(score_clf1 / n_averages)
        acc_clf2.append(score_clf2 / n_averages)

    features_samples_ratio = np.array(n_features_range) / n_train

    plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
             label="LDA with shrinkage", color='r')
    plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
             label="LDA", color='g')

    plt.xlabel('n_features / n_samples')
    plt.ylabel('Classification accuracy')

    plt.legend(loc=1, prop={'size': 12})
    plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)')
    plt.show()
Beispiel #19
0
def predict_scores(markers, threshold=0.05):
    """Fit an LDA per marker, score it on its own data, and return the
    top `threshold` fraction of (score, index) pairs, best first.

    Markers whose fit fails for any reason contribute a best-effort
    score of 0.0.
    """
    scores = []
    for i, marker in enumerate(markers):
        try:
            lda = LDA()
            lda.fit(marker["individuals"], marker["population_labels"])
            scores.append((lda.score(marker["individuals"], marker["population_labels"]), i))
        except Exception:
            # BUG FIX: a bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; narrow it while keeping the 0.0 fallback.
            scores.append((0.0, i))
    # Descending order; tuples are unique by index so ordering is total.
    scores.sort(reverse=True)

    cutoff_idx = int(threshold * len(scores))

    return scores[:cutoff_idx]
    def optimize(self, X, y):
        """Estimate LDA accuracy and per-fold scoring time with 10-fold
        stratified cross-validation, storing aggregate statistics on the
        instance (_mean_score, _score_std, _mean_train_time,
        _train_time_std).

        NOTE(review): `_score_std` / `_train_time_std` actually hold the
        *variance* (np.var), not the standard deviation — confirm naming.
        NOTE(review): the timer brackets clf.score(), so
        `_mean_train_time` measures scoring time, not training time.
        """
        clf = LDA()
        scores = []
        train_times = []
        for train, test in StratifiedKFold(y, 10):
            X_train, X_test, y_train, y_test = (X[train], X[test],
                    y[train], y[test])
            # X is presumably a sparse matrix — densified before use.
            clf.fit(X_train.toarray(), y_train)
            t0 = self._timer()
            scores.append(clf.score(X_test.toarray(), y_test))
            train_times.append(self._timer() - t0)

        self._mean_score = np.mean(scores)
        self._score_std = np.var(scores)
        self._mean_train_time = np.mean(train_times)
        self._train_time_std = np.var(train_times)
Beispiel #21
0
def plot_scores(markers, flname):
    """Fit an LDA per marker, collect self-classification accuracies
    (0.0 when fitting fails), and save a histogram of the scores to
    `flname`.
    """
    plt.clf()
    scores = []
    for i, marker in enumerate(markers):
        try:
            lda = LDA()
            lda.fit(marker["individuals"], marker["population_labels"])
            scores.append(lda.score(marker["individuals"], marker["population_labels"]))
        except Exception:
            # BUG FIX: a bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; narrow it while keeping the best-effort
            # 0.0 fallback for markers that fail to fit.
            scores.append(0.0)

    plt.hist(scores, bins=np.arange(0.0, 1.0, 0.01))

    plt.xlabel("Score", fontsize=18)
    plt.ylabel("Occurrences", fontsize=18)

    plt.savefig(flname, DPI=200)
Beispiel #22
0
def main():
    """Train an LDA classifier on BBC frames, evaluate on CNN frames,
    and print train/test timings plus the test score.
    """
    X_BBC, y_BBC = get_data('BBC')
    X_CNN, y_CNN = get_data('CNN')
    print('# of BBC frames = ' + str(X_BBC.shape[0]))
    print('# of CNN frames = ' + str(X_CNN.shape[0]))

    # BUG FIX: time.clock() was removed in Python 3.8; prefer
    # time.perf_counter() and fall back to time.clock on old interpreters.
    timer = getattr(time, 'perf_counter', None) or time.clock

    clf = LDA()
    print('Training...')

    t0 = timer()
    # Feature matrices are densified before fitting (toarray()).
    clf.fit(X_BBC.toarray(), y_BBC)
    trainTime = timer() - t0

    print('Training time: ' + str(trainTime) + 's\n')
    print('Testing...')

    t0 = timer()
    score = clf.score(X_CNN.toarray(), y_CNN)
    testTime = timer() - t0

    print('Testing time: ' + str(testTime) + 's\n')
    print('Total time: ' + str(trainTime + testTime) + 's\n')
    print('score = ' + str(score))
def main():
    """Train an LDA classifier on BBC frames, evaluate on CNN frames,
    and print train/test timings plus the test score.

    NOTE(review): time.clock() was removed in Python 3.8 — this script
    targets older interpreters.
    """
    X_BBC, y_BBC = get_data('BBC')
    X_CNN, y_CNN = get_data('CNN')
    print('# of BBC frames = ' + str(X_BBC.shape[0]))
    print('# of CNN frames = ' + str(X_CNN.shape[0]))

    clf = LDA()
    print('Training...')

    t0 = time.clock()
    # Feature matrices are densified before fitting (toarray()).
    clf.fit(X_BBC.toarray(), y_BBC)
    trainTime = time.clock() - t0

    print('Training time: ' + str(trainTime) + 's\n')
    print('Testing...')

    t0 = time.clock()
    score = clf.score(X_CNN.toarray(), y_CNN)
    testTime = time.clock() - t0

    print('Testing time: ' + str(testTime) + 's\n')
    print('Total time: ' + str(trainTime + testTime) + 's\n')
    print('score = ' + str(score))
Beispiel #24
0
# Restrict the train/test matrices to the 20 best-ranked features.
X2=X[best[0:20]]
X_test2=X_test[best[0:20]]

#Building a loop to find best model and feature selection (results are lda with the 23 best features)
model=[]
score=[]
for i in range(10,len(best)):
    # Use the i best-ranked features for this round.
    X2=X[best[0:i]]
    X_test2=X_test[best[0:i]]

    #running the train and test data in LDA (this typically gives the best model)
    model.append(['lda',i])
    lda= LDA(n_components=2)
    lda_x_axis = lda.fit(X2, y).transform(X2)
    score.append(lda.score(X_test2, y_test, sample_weight=None))

    #Look at Decision Tree Accuracy
    model.append(['dt',i])
    dt = DecisionTreeClassifier(class_weight='balanced')
    dt.fit(X2,y)
    score.append(dt.score(X_test2,y_test))

    #Look at Random Forest Accuracy
    model.append(['rf',i])
    rf = RandomForestClassifier(class_weight='balanced')
    rf.fit(X2,y)
    score.append(rf.score(X_test2,y_test))

    #Extra Trees Accuracy
    # NOTE(review): the extra-trees section continues past this chunk.
    model.append(['et',i])
Beispiel #25
0
import pickle
from sklearn.lda import LDA
from sklearn.model_selection import train_test_split
import random

def loadXY():
	"""Load a pickled list of (features, label) pairs, shuffle it in
	place, and return the unzipped feature and label tuples."""
	#zippedXY = pickle.load(open("../Feature_reduction/zippedXY_wff_fs_2k.p","rb"))
	#zippedXY = pickle.load(open("../CNN_features/zippedXY_cnn_wff_2k_gap4.p","rb"))
	#zippedXY = pickle.load(open("../Vectorizer/zippedXY_wff_2k.p","rb"))
	#zippedXY = pickle.load(open("../CNN_features/zippedXY_cnn_te.p","rb"))
	zippedXY = pickle.load(open("../Feature_reduction/zippedXY_te_fs.p","rb"))
	random.shuffle(zippedXY)
	X,Y = zip(*zippedXY)
	return X,Y



if __name__ == "__main__":

	X,Y = loadXY()
	print "X and Y loaded"
	X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, random_state=0)
	print Y
	lda_model = LDA()
	lda_model.fit(X_train,Y_train)
	predictedY = lda_model.predict(X_test)
	for tt in range(len(Y_test)):
		print "Actual:",Y_test[tt],"  Predicted:",predictedY[tt]
	accuracy = lda_model.score(X_test,Y_test)
	print accuracy
Beispiel #26
0
import pandas as pd
import numpy as np
from sklearn.lda import LDA

## read files
train = pd.read_csv('data/spam_train.csv')
test = pd.read_csv('data/spam_test.csv')

## predictors are the first 57 columns, response is the last column
x = np.array(train.iloc[:, 0:57])
y = np.ravel(train.iloc[:, -1])

## separate the predictors and response in the test data set
x2 = np.array(test.iloc[:, 0:57])
y2 = np.ravel(test.iloc[:, -1])

## fit the model using lda
lda_cls = LDA()
lda_cls.fit(x, y)
print("(1): lda accuracy")
print(lda_cls.score(x, y))

## predict output on test data set with lda
predict = lda_cls.predict(x2)
print("(2): lda test accuracy")
print(lda_cls.score(x2, y2))
Beispiel #27
0
	delimiter=',', skiprows=1)
test = rawtest[:,feat_inds]
norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0,ddof=1))
N = test.shape[0]
D = data.shape[1]
#sys.exit()

trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) ))
tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) ))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_data, trn_labels)
skknn.fit(norm_data, trn_labels)
sksvm.fit(norm_data, trn_labels)
print("skLDA error: %f" % (1-sklda.score(norm_test, tst_labels)))
print("skKNN error: %f" % (1-skknn.score(norm_test, tst_labels)))
print("skSVM error: %f" % (1-sksvm.score(norm_test, tst_labels)))

labels = np.hstack((np.zeros(N/2), np.ones(N/2)))
n,gext,grid = get_grid_data(np.vstack(( norm_data0, norm_data1 )))

bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels))
gavg = gc.calc_gavg(grid).reshape(-1,n)
myplot(p.subplot(2,3,1),gavg,norm_data0, norm_data1)
Beispiel #28
0
norm_trn_data = normdata.loc[sel['trn'], sel['feats']]
norm_tst_data = normdata.loc[sel['tst'], sel['feats']]
tst_data = rawdata.loc[sel['tst'], sel['feats']]

t1 = time()
#################### CLASSIFICATION ################
########################################
########################################
########################################
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_trn_data, sel['trnl'])
skknn.fit(norm_trn_data, sel['trnl'])
sksvm.fit(norm_trn_data, sel['trnl'])
errors['lda'] = (1-sklda.score(norm_tst_data, sel['tstl']))
errors['knn'] = (1-skknn.score(norm_tst_data, sel['tstl']))
errors['svm'] = (1-sksvm.score(norm_tst_data, sel['tstl']))
print("skLDA error: %f" % errors['lda'])
print("skKNN error: %f" % errors['knn'])
print("skSVM error: %f" % errors['svm'])

bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa, 
        np.eye(num_feat)*(kappa-1-num_feat), 
        normdata.loc[sel['trn0'], sel['feats']])
bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa,
        np.eye(num_feat)*(kappa-1-num_feat), 
        normdata.loc[sel['trn1'], sel['feats']])

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
Beispiel #29
0
                     skiprows=1)
test = rawtest[:, feat_inds]
norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0, ddof=1))
N = test.shape[0]
D = data.shape[1]
#sys.exit()

trn_labels = np.hstack((np.zeros(Ntrn / 2), np.ones(Ntrn / 2)))
tst_labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_data, trn_labels)
skknn.fit(norm_data, trn_labels)
sksvm.fit(norm_data, trn_labels)
print("skLDA error: %f" % (1 - sklda.score(norm_test, tst_labels)))
print("skKNN error: %f" % (1 - skknn.score(norm_test, tst_labels)))
print("skSVM error: %f" % (1 - sksvm.score(norm_test, tst_labels)))

labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
n, gext, grid = get_grid_data(np.vstack((norm_data0, norm_data1)))

bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels))
gavg = gc.calc_gavg(grid).reshape(-1, n)
myplot(p.subplot(2, 3, 1), gavg, norm_data0, norm_data1)
Beispiel #30
0
	for c in classes:
		lda = LDA(ldasolver)
		lda.fit(features['train'],np.array(labels['train'])==c)
		#test classifier
		p = np.array(lda.predict_proba(features['test']))
		proba.append(p[:,1])
	proba=np.transpose(np.array(proba))
	prediction=np.argmax(proba,axis=1)+1
else:
	#train classifier
	lda = LDA(ldasolver)
	lda.fit(features['train'],labels['train'])

	#test classifier
	prediction = lda.predict(features['test'])
	proba = lda.predict_proba(features['test'])
	print('Accuracy %.2f%%' % lda.score(features['test'],labels['test']))

#output data
file = open(outputFile,'w')
file.write('labels ')
for c in classes:
	file.write(str(c)+' ')
file.write('\n')
for i in range(len(prediction)):
	l = prediction[i]
	file.write(str(l)+' ')
	for p in proba[i]:
		file.write(str(p)+' ')
	file.write('\n')
from sklearn import svm
from sklearn import cross_validation
from sklearn.lda import LDA

# Import training data
trainingData = loadtxt('Data/featureData.txt')
trainingLabels = loadtxt('Data/labels.txt')

# Find the sizes of the data
trainingSize = size(trainingLabels)

# Run PCA
# Reduce the feature matrix to its top 200 principal components.
pca = PCA(n_components=200)
trainingDataNew = pca.fit_transform(trainingData)

# Hold out 30% of the PCA-reduced data for testing.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(trainingDataNew, trainingLabels, test_size=0.3)

# Train SVM model for the training data
# NOTE(review): despite the names (clf_svm, "SVM"), the classifier used
# here is actually LDA.
clf_svm = LDA()
clf_svm.fit(X_train,y_train)

# Test the trained model in the test data (accuracy in percent)
print clf_svm.score(X_test, y_test) * 100

# Find the percentage error
# error = 0
# for i in range(0,trainingSize):
#     if predictedLabels[i] != trainingLabels[i]:
#         error = error+1

# print float(error)/float(trainingSize) * 100
Beispiel #32
0
    param_grid.update({'min_samples_split':np.arange(2,11)})
    gtree = GridSearchCV(DecisionTreeClassifier(),param_grid,scoring='precision',cv=StratifiedKFold(Ytrain, n_folds = 5),refit=True,n_jobs=-1)
    gtree.fit(Xtrain,Ytrain)
    scores = np.empty((6))
    scores[0] = gtree.score(Xtrain,Ytrain)
    scores[1] = gtree.score(Xtest,Ytest)
    print "---------------------Decission Tree Clasifier--------------"
    print('Decision Tree, train: {0:.02f}% '.format(scores[0]*100))
    print('Decision Tree, test: {0:.02f}% '.format(scores[1]*100))

if (LDA_cl == 1):
    from sklearn.lda import LDA
    lda = LDA()
    lda.fit(Xtrain,Ytrain)
    scores = np.empty((4))
    scores[0] = lda.score(Xtrain,Ytrain)
    scores[1] = lda.score(Xtest,Ytest)
    print "---------------------Linear Discriminant Analysis---------------------------"
    print('LDA, train: {0:.02f}% '.format(scores[0]*100))
    print('LDA, test: {0:.02f}% '.format(scores[1]*100))


if (GNB_cl == 1):
    nb = GaussianNB()
    nb.fit(Xtrain,Ytrain)
    scores = np.empty((4))
    scores[0] = nb.score(Xtrain,Ytrain)
    scores[1] = nb.score(Xtest,Ytest)

    print "---------------------Naive Bayes Classifier------------------"
#    print "Prediction time:", t1-t0, "s"
Beispiel #33
0
N = tst_data.shape[0]
D = trn_data.shape[1]
norm_tst_data0 = norm_tst_data[:N / 2, :]
norm_tst_data1 = norm_tst_data[N / 2:, :]

trn_labels = np.hstack((np.zeros(Ntrn / 2), np.ones(Ntrn / 2)))
tst_labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_trn_data, trn_labels)
skknn.fit(norm_trn_data, trn_labels)
sksvm.fit(norm_trn_data, trn_labels)

output = {}
output['ldaerr'] = (1 - sklda.score(norm_tst_data, tst_labels))
output['knnerr'] = (1 - skknn.score(norm_tst_data, tst_labels))
output['svmerr'] = (1 - sksvm.score(norm_tst_data, tst_labels))

print("skLDA error: %f" % output['ldaerr'])
print("skKNN error: %f" % output['knnerr'])
print("skSVM error: %f" % output['svmerr'])

# Gaussian Analytic
bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_trn_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_trn_data1)
gc = GaussianCls(bayes0, bayes1)

output['gausserr'] = gc.approx_error_data(norm_tst_data, tst_labels)
print("Gaussian Analytic error: %f" % output['gausserr'])
import pandas as pd
import numpy as np
from sklearn.lda import LDA

## read files
train = pd.read_csv('data/spam_train.csv')
test = pd.read_csv('data/spam_test.csv')

## predictors are the first 57 columns, response is the last column
x = np.array(train.iloc[:, 0:57])
y = np.ravel(train.iloc[:, -1])

## separate the predictors and response in the test data set
x2 = np.array(test.iloc[:, 0:57])
y2 = np.ravel(test.iloc[:, -1])



## fit the model using lda
lda_cls = LDA()
lda_cls.fit(x,y)
print("(1): lda accuracy")
print(lda_cls.score(x, y))

## predict output on test data set with lda
predict = lda_cls.predict(x2)
print("(2): lda test accuracy")
print(lda_cls.score(x2, y2))
from sklearn import svm
# Linear SVM with squared hinge loss; the trailing bare expression is
# the test error rate (1 - accuracy), notebook-cell style (value is
# displayed by the notebook, not printed).
lin_clf = svm.LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
lin_clf.fit(X_train, y_train) 
1-lin_clf.score(X_test, y_test)


# In[29]:

from sklearn.lda import LDA 

# LDA test error rate.
clf3 =LDA()
clf3.fit(X_train, y_train)
1-clf3.score(X_test, y_test)


# In[27]:

# RBF-kernel SVM test error rate.
clfrbf = svm.SVC(kernel='rbf')
clfrbf.fit(X_train, y_train) 

1-clfrbf.score(X_test, y_test)


# In[18]:

from sklearn.naive_bayes import GaussianNB

clf5 = GaussianNB()
Beispiel #36
0
    XX = []
    for i in xrange(len(Y)):
        if Y[i] == value:
            XX.append(X[i])
    return XX


out = open(sys.argv[1], "r")
model = LDA(solver='lsqr')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sel.fit_transform(X), Y)
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " +
            str(model.score(sel.transform(X), Y)))
    if len(sys.argv) > 2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))

    if len(sys.argv) > 3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: " +
                str(model.score(sel.transform(X), Y)))

if len(sys.argv) > 4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X))
    Z = (Z - ref.mean(axis=0)[np.newaxis, :]) / ref.std(axis=0)[np.newaxis, :]
    for i in xrange(len(Y)):
Beispiel #37
0
# -*- coding: utf-8 -*-

__author__ = 'PC-LiNing'

import gensim
from lda import load_data
import numpy as np
from sklearn.lda import LDA

# Build a TF-IDF model over the corpus and re-weight each document.
corpus,dic,labels = load_data.load_corpus()
tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic)
corpus_tfidf = [tfidf[doc] for doc in corpus]

# Convert the TF-IDF corpus to a matrix (project helper) and split it
# into train/test partitions.
matrix = load_data.convert_to_matrix(corpus_tfidf)
train_data,train_label,test_data,test_label = load_data.get_train_test(matrix,labels)

# Fit LDA (SVD solver, keeping the covariance matrix) and print the
# test-set accuracy.
lda = LDA(solver='svd',store_covariance=True)
lda.fit(train_data,train_label)
score = lda.score(test_data,test_label)
print(score)




def main():
    """Classify crawled web pages by page type.

    Pipeline: fetch labelled Cdiscount-maison pages from the ARBOCRAWL_RESULTS
    table, train several classifiers on numeric page features, compare their
    accuracies on a held-out split, then use the random forest to predict the
    page type of every page from the other sites and write the predictions
    back to the database.
    """
    #Define our connection string
    conn_string = "host='localhost' dbname='CRAWL4J' user='******' password='******'"
    # print the connection string we will use to connect
    print "Connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # fetching training data from Cdiscount-maison
    cdiscount_maison_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where page_type !='Unknown' and concurrent_name = 'Cdiscount-maison' "; 
    catPred=["PAGE DEPTH AT SITE LEVEL","NUMBER OF OUTGOING LINKS","NUMBER OF INCOMING LINKS","NUMBER OF ITEMTYPE http://data-vocabulary.org/Breadcrumb","NUMBER OF ITEMPROP aggregateRating","NUMBER OF ITEMPROP ratingValue","NUMBER OF ITEMPROP price","NUMBER OF ITEMPROP availability","NUMBER OF ITEMPROP review","NUMBER OF ITEMPROP reviewCount","NUMBER OF ITEMPROP image","NUMBER OF OCCURENCES FOUND IN URL of search + recherche + Recherche + Search","NUMBER OF OCCURENCES FOUND IN PAGE TEXT ajout + ajouter + Ajout + Ajouter","NUMBER OF OCCURENCES FOUND IN PAGE TEXT filtre + facette + Filtre + Facette + filtré + filtrés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Ma recherche + Votre recherche + résultats pour + résultats associés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT guide d""achat + Guide d""achat","NUMBER OF OCCURENCES FOUND IN PAGE TEXT caractéristique + Caractéristique + descriptif + Descriptif +information + Information","NUMBER OF OCCURENCES FOUND IN PAGE TEXT livraison + Livraison + frais de port + Frais de port","NUMBER OF OCCURENCES FOUND IN PAGE TEXT garantie + Garantie +assurance + Assurance","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Produits Similaires + produits similaires + Meilleures Ventes + meilleures ventes +Meilleures ventes + Nouveautés + nouveautés + Nouveauté + nouveauté","NUMBER OF HTML TAG img IN THE PAGE","AVERAGE WIDTH OF HTML TAG img IN THE PAGE","AVERAGE HEIGHT OF HTML TAG img IN THE PAGE"];
    semPred =["PAGE TEXT", "PAGE TITLE", "PAGE H1", "PAGE SHORT DESCRIPTION","TEN BEST TF/IDF HITS FOR THE PAGE","TITLE TF/IDF","PAGE INCOMING LINKS ANCHOR SEMANTIC"];

    print "Executing the following request to fetch data for Cdiscount-maison from the ARBOCRAWL_RESULTS table : " + cdiscount_maison_request
    print"Page-type predictors : "+ ', '.join(catPred)
    print"Semantic predictors : " + ', '.join(semPred)

    df = pd.read_sql(cdiscount_maison_request, conn)


    # NOTE(review): url_list and semantic_predictors are computed but never
    # used in the visible code.
    url_list = df.url.values
    semantic_columns = ["url","title","h1","short_description","semantic_hits", "semantic_title", "inlinks_semantic"];
    semantic_predictors = df[list(semantic_columns)].values;

    # Numeric features used for classification; the target is page_type.
    classifying_columns = ["depth", "outlinks_size", "inlinks_size", "nb_breadcrumbs", "nb_aggregated_ratings", "nb_ratings_values", "nb_prices", "nb_availabilities", "nb_reviews", "nb_reviews_count", "nb_images", "nb_search_in_url", "nb_add_in_text", "nb_filter_in_text", "nb_search_in_text", "nb_guide_achat_in_text", "nb_product_info_in_text", "nb_livraison_in_text", "nb_garanties_in_text", "nb_produits_similaires_in_text", "nb_images_text", "width_average","height_average"]
    classifying_predictors = df[list(classifying_columns)].values;
    X= np.asanyarray(classifying_predictors);
    y = df.page_type.values;

    print type(X)
    print X.shape
    print type(y)
    print y.shape

    # fetching the data to predict
    to_predict_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count  from arbocrawl_results  where concurrent_name != 'Cdiscount-maison' "; 
    df_to_predict = pd.read_sql(to_predict_request, conn)
    # df_to_predict.dropna()
    # df_to_predict.replace([np.inf, -np.inf], np.nan).dropna(subset=list(classifying_columns), how="all")
    # df_to_predict.dropna(subset=list(classifying_columns), how="all", with_inf=True)
    # indexnan = sum(np.isnan(Xval))
    # indexinfinite = np.isfinite(Xval)
    classifying_predictors_to_predict = df_to_predict[list(classifying_columns)].values;
    Xval= np.asanyarray(classifying_predictors_to_predict);
    print type(Xval)
    print Xval.shape

    url_val_list = df_to_predict.url.values
    print type(url_val_list)
    print url_val_list.shape

    # we must here filter the NaN / Infinity in Xval values
    #print np.isnan(Xval)
    #Xval = Xval[~np.isnan(Xval)]
    #print Xval.shape
 
    # transforming the predictors / rescaling the predictors
    # we don't need to do that
    #X = StandardScaler().fit_transform(X)
    #Xval = StandardScaler().fit_transform(Xval)

    # 60/40 train/test split for the classifier comparison below.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    single_tree = DecisionTreeClassifier(max_depth=5)
    single_tree.fit(X_train, y_train)
    single_tree_score = single_tree.score(X_test, y_test)
    print "Single tree score " + str(single_tree_score)

    random_forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    random_forest.fit(X_train, y_train)
    random_forest_score = random_forest.score(X_test, y_test)
    print "Random forest score " + str(random_forest_score)

    kneighbors =  KNeighborsClassifier(3)
    kneighbors.fit(X_train, y_train)
    kneighbors_score = kneighbors.score(X_test, y_test)
    print "K-Neighbors score " + str(kneighbors_score)

    adaboost =  AdaBoostClassifier()
    adaboost.fit(X_train, y_train)
    adaboost_score = adaboost.score(X_test, y_test)
    print "Ada boost score " + str(adaboost_score)

    gaussian_nb =  GaussianNB()
    gaussian_nb.fit(X_train, y_train)
    gaussian_nb_score = gaussian_nb.score(X_test, y_test)
    print "gaussian mixtures score " + str(gaussian_nb_score)

    lda =  LDA()
    lda.fit(X_train, y_train)
    lda_nb_score = lda.score(X_test, y_test)
    print "linear discriminant score " + str(lda_nb_score)

    qda =  QDA()
    qda.fit(X_train, y_train)
    qda_nb_score = qda.score(X_test, y_test)
    print "quadratic discriminant score " + str(qda_nb_score)

    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),


    # we now predict the dataset from the other web sites with the best scoring trained classifier
    # NOTE(review): the random forest is hard-coded here; it is not selected
    # by comparing the scores printed above — confirm this is intended.
    y_val_predicted = random_forest.predict(Xval);
    print type(y_val_predicted)
    print y_val_predicted.shape

    print type(url_val_list)
    print url_val_list.shape

    url_validation_list = url_val_list.tolist()
    y_val_predicted_list = y_val_predicted.tolist()

#    displaying the classified data    
#    pprint.pprint(y_val_predicted_list)
#    pprint.pprint(url_validation_list)
    classified_values = zip(url_validation_list, y_val_predicted_list)
    print "Updating the database with the classification results"
    update_database_with_page_type(conn, classified_values)
    conn.close()
# Compare logistic regression, LDA and KNN on the same train/test split.
# Bare expressions such as `len(X_train)` and the `.score(...)` / 
# `confusion_matrix(...)` calls without `print` only display their value in
# a REPL/notebook; they are no-ops in a plain script.
len(X_train)

# fitting logistic regression

logit_1 = LogisticRegression()
logit_1 = logit_1.fit(X_train, y_train)
logit_1.score(X_train, y_train)
logitpred = logit_1.predict(X_test)
print logitpred
confusion_matrix(y_test, logitpred)
# Class-membership probabilities for the test set.
prob = logit_1.predict_proba(X_test)
print prob
print metrics.accuracy_score(y_test, logitpred)
# Linear Discriminant Analysis.
lda1 = LDA()
lda1 = lda1.fit(X_train, y_train)
lda1.score(X_train, y_train)
ldapredict = lda1.predict(X_test)
print ldapredict
confusion_matrix(y_test, ldapredict)
print metrics.accuracy_score(y_test, ldapredict)

# KNN
knn1 = KNeighborsClassifier(n_neighbors=2)
knn1 = knn1.fit(X_train, y_train)
knn1.score(X_train, y_train)
knnpredict = knn1.predict(X_test)
print knnpredict
confusion_matrix(y_test, knnpredict)
print metrics.accuracy_score(y_test, knnpredict)
# A second KNN with a larger neighbourhood (scored past this excerpt).
knn2 = KNeighborsClassifier(n_neighbors=10)
knn2 = knn2.fit(X_train, y_train)
Beispiel #40
0
        else:
            x.append(base[j][0:50])

# %%%%%%%%%%%%%%%%%%%%%%%%%%TREINANDO%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    clf = LDA()
    clf.fit(x, y)

    #%%%%%%%%%%%%%%%%%%%%%%CRIANDO O CONJUNTO DE TESTE%%%%%%%%%%%%%%%%%%%%%%
    xteste = []

    for i in ret:
        xteste.append(base[i][0:50])

#%%%%%%%%%%%%%%%%%%%%%%%%%%TESTANDO%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    a = clf.score(xteste, labels)
    b = clf.predict(xteste)
    cm = confusion_matrix(labels, b)
    cm = np.asarray(cm)
    matriz = matriz + cm
    grupo.append(a)

# Save the accumulated confusion matrix, plot it, and report the mean and
# standard deviation of the per-fold accuracies collected in `grupo`.
np.set_printoptions(precision=0)
np.savetxt("matriz102lda.txt", matriz)
plt.figure()
plot_confusion_matrix(matriz)
plt.show()
print('Acurácia media do grupo: ', np.mean(grupo))
print('Desvio padrão do grupo: ', np.std(grupo))
Beispiel #41
0
# Logistic regression on the Weekly dataset: fit, predict on the test
# period, and report coefficients, confusion matrix and accuracy.
log_reg.fit(train_weekly_x, train_weekly_y)
log_reg_weekly_y_preds = log_reg.predict(test_weekly_x)
score = log_reg.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, log_reg_weekly_y_preds)
print "\nLogistic Regression Coefficients [Lag2]: " + str(log_reg.coef_)
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(score)

#%% LDA using sklearn
from sklearn.lda import LDA

lda = LDA()
lda.fit(train_weekly_x, train_weekly_y)
lda_preds = lda.predict(test_weekly_x)
lda_score = lda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, lda_preds)

print "\nLDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(lda_score)

#%% QDA using sklearn
from sklearn.qda import QDA

# QDA results are computed here; their printing lies past this excerpt.
qda = QDA()
qda.fit(train_weekly_x, train_weekly_y)
qda_preds = qda.predict(test_weekly_x)
qda_score = qda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, qda_preds)
    if n_features > 1:
        X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
    return X, y

# Compare LDA with and without Ledoit-Wolf shrinkage as the number of
# (mostly noise) features grows, averaging accuracy over n_averages runs.
acc_clf1, acc_clf2 = [], []
n_features_range = range(1, n_features_max + 1, step)
for n_features in n_features_range:
    score_clf1, score_clf2 = 0, 0
    for _ in range(n_averages):
        X, y = generate_data(n_train, n_features)

        clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
        clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)

        # Score on a freshly generated test set of the same dimensionality.
        X, y = generate_data(n_test, n_features)
        score_clf1 += clf1.score(X, y)
        score_clf2 += clf2.score(X, y)

    acc_clf1.append(score_clf1 / n_averages)
    acc_clf2.append(score_clf2 / n_averages)

# NOTE(review): under Python 2 this `/ n_train` is integer division if
# n_train is an int — confirm a `from __future__ import division` exists.
features_samples_ratio = np.array(n_features_range) / n_train

plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
         label="LDA with shrinkage", color='r')
plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
         label="LDA", color='g')

plt.xlabel('n_features / n_samples')
plt.ylabel('Classification accuracy')
Beispiel #43
0
    return X, y

# Variant of the shrinkage-comparison example with the shrinkage branch
# commented out: only a plain LDA is evaluated across feature counts.
# acc_clf1 = []
acc_clf2 = []
n_features_range = list(range(1, n_features_max + 1, step))
for n_features in n_features_range:
    score_clf1, score_clf2 = 0, 0
    for _ in range(n_averages):
        X, y = generate_data(n_train, n_features)

        # clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
        clf2 = LDA().fit(X, y)

        # Score on a freshly generated test set of the same dimensionality.
        X, y = generate_data(n_test, n_features)
        # score_clf1 += clf1.score(X, y)
        score_clf2 += clf2.score(X, y)

    # acc_clf1.append(score_clf1 / n_averages)
    acc_clf2.append(score_clf2 / n_averages)

features_samples_ratio = np.array(n_features_range) / n_train

# plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
#          label="LDA with shrinkage", color='r')
plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
         label="LDA", color='g')

plt.xlabel('n_features / n_samples')
plt.ylabel('Classification accuracy')

plt.legend(loc=1, prop={'size': 12})
Beispiel #44
0
# Normalise the train/test data, split each by class, then record the test
# error (1 - accuracy) of LDA, KNN and SVM in the `errors` dict (defined
# earlier, outside this excerpt), followed by a Gaussian Bayes baseline.
norm_trn_data = norm(trn_data)
norm_tst_data = norm(tst_data)

# Per-class views of the normalised and raw data.
norm_trn_data0, norm_trn_data1 = split(norm_trn_data)
norm_tst_data0, norm_tst_data1 = split(norm_tst_data)
trn_data0, trn_data1 = split(trn_data)
tst_data0, tst_data1 = split(tst_data)

#################### CLASSIFICATION ################
sklda = LDA()
skknn = KNN(3)
sksvm = SVC()
sklda.fit(norm_trn_data, trn_labels)
skknn.fit(norm_trn_data, trn_labels)
sksvm.fit(norm_trn_data, trn_labels)
errors['lda'] = (1-sklda.score(norm_tst_data, tst_labels))
errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels))
errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels))

# One Gaussian Bayes model per class, fit on the class-specific data.
bayes0 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat)*3, norm_trn_data0)
bayes1 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat)*3, norm_trn_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels)

# MPM Model
#d0 = np.asarray(mquantiles(trn_data0, 0.75, axis=1)).reshape(-1)
#d1 = np.asarray(mquantiles(trn_data1, 0.75, axis=1)).reshape(-1)
#dist0 = MPMDist(trn_data0,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d0)
#dist1 = MPMDist(trn_data1,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d1)
Beispiel #45
0
		probas[7]=probas[7]+1
	if row['y']==9:
		probas[8]=probas[8]+1

# Normalise the class counts in `probas` into frequencies and report the
# a-priori most likely class (+1 because classes are 1-indexed).
# NOTE(review): under Python 2, `probas[i]/528` is integer division when
# the counts are ints, which would zero out every entry — confirm the
# counts are floats or that `from __future__ import division` is in effect.
for i in range(0,9): 
	probas[i]=probas[i]/528

yhat_apriori = np.argmax(probas) + 1

print "Clase: %d"%yhat_apriori

######## Pregunta (g) ############################################################

# Fit LDA, QDA and KNN on the standardised training data and report
# train/test accuracy for each.
lda_model = LDA()
lda_model.fit(X_std,y)
print "Score LDA train: %f"%lda_model.score(X_std,y)
print "Score LDA test: %f"%lda_model.score(X_std_test,ytest)
qda_model = QDA()
qda_model.fit(X_std,y)
print "Score QDA train: %f"%qda_model.score(X_std,y)
print "Score QDA test: %f"%qda_model.score(X_std_test,ytest)
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_std,y)
print "Score KNN train: %f"%knn_model.score(X_std,y)
print "Score KNN test: %f"%knn_model.score(X_std_test,ytest)
for i in range(1, 12):
	knn_model = KNeighborsClassifier(n_neighbors=i)
	knn_model.fit(X_std,y)
# Possible solution
#
# Fit a 2-component LDA projection of the iris data and visualise the three
# classes in the projected plane, then report training accuracy.

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.lda import LDA

iris = datasets.load_iris()
X = iris.data; y = iris.target

target_names = iris.target_names


# Now, invoke the LDA method to compute and fit the model:

lda_classifier = LDA(n_components=2)
lda_x_axis = lda_classifier.fit(X, y).transform(X)

# Now output a simple visualization of the model result:

color_scheme = ['r', 'g', 'b']

for c, i, target_name in zip(color_scheme, [0, 1, 2], target_names):
    # BUG FIX: label each class with its own name (`target_name`); the
    # original passed the whole `target_names` array to every scatter call.
    plt.scatter(lda_x_axis[y == i, 0], lda_x_axis[y == i, 1], c = c, label = target_name)

# BUG FIX: without a legend() call the per-class labels were never shown.
plt.legend(loc='best')
plt.xlabel('First LDA'); plt.ylabel('Second LDA')
plt.show()


# We have a score associated with the classifier's performance
# (mean training accuracy; the bare expression only displays in a notebook).
lda_classifier.score(X, y, sample_weight=None)
Beispiel #47
0
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'],
               loc='lower right')
    plt.grid(True)

# Disabled experiment (`if (0):` keeps it from running): score an LDA
# classifier on increasing prefixes of the kPCA-transformed features.
if (0):

    # Calculate classification scores for each component
    # NOTE(review): np.linspace yields floats, so `:nComponents[i]` would
    # fail as a slice bound if this block were re-enabled — cast to int first.
    nComponents = np.linspace(500, 1500, 100, endpoint=True)
    kpcaldaScores = np.zeros((np.alen(nComponents), 1))
    lda = LDA()

    for i in range(len(nComponents)):
        lda.fit(XtrainT[:, :nComponents[i]], labelsTrain)
        kpcaldaScores[i] = lda.score(XtestT[:, :nComponents[i]], labelsTest)

#    %% Plot accuracies for kPCA
    plt.figure()
    plt.plot(nComponents, kpcaldaScores, lw=3)

    plt.xlim(1, np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.xlim([500, 1500])
    plt.legend(['LDA'], loc='lower right')
    plt.grid(True)

if (0):
    # K-PCA second round
Beispiel #48
0

# X_std : standardised training features
# y     : training labels

# Build the standardised test set from the test dataframe.
Xtest = test_df.ix[:,'x.1':'x.10'].values
ytest = test_df.ix[:,'y'].values
X_std_test = StandardScaler().fit_transform(Xtest)


############ LDA #####################
# Build and fit the LDA model.
lda_model = LDA()
lda_model.fit(X_std,y)
# Accuracy on the training and test sets.
print lda_model.score(X_std,y)
print lda_model.score(X_std_test,ytest)


############ QDA #####################
# Build and fit the QDA model.
qda_model = QDA()
qda_model.fit(X_std,y)
# Accuracy on the training and test sets.
print qda_model.score(X_std,y)
print qda_model.score(X_std_test,ytest)

# ############ KNN #####################
# #Construcción y Fit del modelo KNN
# knn_model = KNeighborsClassifier(n_neighbors=10)
# knn_model.fit(X_std,y)
Beispiel #49
0
#     for particle_features in particles_features:
#         y = 1 if particle_features['truth']==1 else -1
#         norm_particle_features = [float(particle_features[features[k]])/norm[features[k]] for k in range(len(features))]
# #        pt = particle_features['pt']
#         test_data.append(norm_particle_features)
#         test_truth.append(y)
# 
# test_data = np.array(test_data)
# test_truth = np.array(test_truth)
# 
# np.save("../Data/particle_features_tjets/particle_features_"+str(numParticles)+"_upto_"+str(2*numParticles)+".npy", test_data)
# np.save("../Data/particle_features_tjets/particle_features_"+str(numParticles)+"_upto_"+str(2*numParticles)+"_truth.npy", test_truth)

#print train_data.shape, train_truth.shape
#print test_data.shape, test_truth.shape
# Overall accuracy on train and test sets.
print(clf.score(train_data,train_truth))
print(clf.score(test_data,test_truth))

# Index masks for the two classes: -1 = pileup, +1 = hard scatter.
p = np.where(train_truth == -1)[0]
p_truth = np.where(test_truth == -1)[0]
hs = np.where(train_truth == 1)[0]
hs_truth = np.where(test_truth == 1)[0]
# Per-class accuracy (train then test) for each class.
print("Pileup")
print(clf.score(train_data[p],train_truth[p]))
print(clf.score(test_data[p_truth],test_truth[p_truth]))
print("Hard Scatter")
print(clf.score(train_data[hs],train_truth[hs]))
print(clf.score(test_data[hs_truth],test_truth[hs_truth]))

print '\n'
# Cross-validated sliding-window classification: for each CV fold, fit CSP
# spatial filters and an SVM on the training epochs, then score the SVM on
# each time window of the test epochs; finally plot mean score over time.
scores_windows = []

for train_idx, test_idx in cv:
    y_train, y_test = labels[train_idx], labels[test_idx]

    # CSP filters are fit on the training fold only to avoid leakage.
    X_train = csp.fit_transform(epochs_data_train[train_idx], y_train)
    X_test = csp.transform(epochs_data_train[test_idx])

    # fit classifier
    svc.fit(X_train, y_train)

    # running classifier: test classifier on sliding window
    score_this_window = []
    for n in w_start:
        X_test = csp.transform(epochs_data[test_idx][:, :, n:(n + w_length)])
        score_this_window.append(svc.score(X_test, y_test))
    scores_windows.append(score_this_window)

# Plot scores over time
# Window centres in seconds, relative to the epoch start.
w_times = (w_start + w_length / 2.) / sfreq + epochs.tmin

plt.figure()
plt.plot(w_times, np.mean(scores_windows, 0), label='Score')
plt.axvline(0, linestyle='--', color='k', label='Onset')
plt.axhline(0.5, linestyle='-', color='k', label='Chance')
plt.xlabel('time (s)')
plt.ylabel('classification accuracy')
plt.title('Classification score over time')
plt.legend(loc='lower right')
plt.show()
Beispiel #51
0
def sample_data(X, Y, value=0):
    """Return the rows of X whose corresponding label in Y equals `value`.

    Args:
        X: sequence of samples (e.g. feature rows).
        Y: sequence of labels; only the first len(Y) rows of X are considered.
        value: label to select (default 0).

    Returns:
        list of the selected rows of X, in their original order.
    """
    # Idiomatic rewrite of the original index loop; zip pairs each sample
    # with its label and stops at the shorter of the two sequences.
    return [x for x, label in zip(X, Y) if label == value]

# CLI driver: train an LDA classifier on the feature file given as argv[1]
# (after removing zero-variance features) and report accuracy on optional
# cv (argv[2]) and dev (argv[3]) feature files.
# NOTE(review): `out` is opened but never used or closed in the visible code.
out=open(sys.argv[1],"r")
model=LDA()
X, Y = read_fea(sys.argv[1])
# Drop features that are constant across the training set.
sel = VarianceThreshold(threshold=0)
model.fit(sel.fit_transform(X), Y)
warning("useful features dim: "+str(len(sel.get_support(True))))
if hasattr(model,'score'):
    warning("accuracy on training set: "+str(model.score(sel.transform(X), Y)))
    if len(sys.argv)>2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: "+str(model.score(sel.transform(X), Y)))

    if len(sys.argv)>3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: "+str(model.score(sel.transform(X), Y)))

if len(sys.argv)>4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X))
    Z = (Z-ref.mean(axis=0)[np.newaxis,:])/ref.std(axis=0)[np.newaxis,:]
    for i in xrange(len(Y)):
        ZZ=np.array(Z[i][1:])
Beispiel #52
0
    #ws.var_.xvschema = scot.xvschema.singletrial
    #ws.optimize_var()
    ws.var_.delta = 1

    # Single-Trial Fitting and feature extraction
    features = np.zeros((len(triggers), 32))
    for t in range(len(triggers)):
        print('Fold %d/%d, Trial: %d   ' %(fold, nfolds, t), end='\r')
        ws.set_data(data[:, :, t])
        ws.fit_var()

        con = ws.get_connectivity('ffPDC')

        alpha = np.mean(con[:, :, np.logical_and(7 < freq, freq < 13)], axis=2)
        beta = np.mean(con[:, :, np.logical_and(15 < freq, freq < 25)], axis=2)

        features[t, :] = np.array([alpha, beta]).flatten()

    lda.fit(features[train, :], classids[train])

    acc_train = lda.score(features[train, :], classids[train])
    acc_test = lda.score(features[test, :], classids[test])

    print('Fold %d/%d, Acc Train: %.4f, Acc Test: %.4f' %(fold, nfolds, acc_train, acc_test))

    pred = lda.predict(features[test, :])
    cm += confusion_matrix(classids[test], pred)
# Report the accumulated cross-validation confusion matrix and the total
# accuracy (trace over sum of all entries).
print('Confusion Matrix:\n', cm)

print('Total Accuracy: %.4f'%(np.sum(np.diag(cm))/np.sum(cm)))
Beispiel #53
0
    if n_features > 1:
        X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
    return X, y


# Compare LDA with and without Ledoit-Wolf shrinkage as the number of
# (mostly noise) features grows, averaging accuracy over n_averages runs.
acc_clf1, acc_clf2 = [], []
n_features_range = range(1, n_features_max + 1, step)
for n_features in n_features_range:
    score_clf1, score_clf2 = 0, 0
    for _ in range(n_averages):
        X, y = generate_data(n_train, n_features)
        clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
        clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)

        # Score on a freshly generated test set of the same dimensionality.
        X, y = generate_data(n_test, n_features)
        score_clf1 += clf1.score(X, y)
        score_clf2 += clf2.score(X, y)

    acc_clf1.append(score_clf1 / n_averages)
    acc_clf2.append(score_clf2 / n_averages)

features_samples_ratio = np.array(n_features_range) / n_train
plt.plot(features_samples_ratio,
         acc_clf1,
         linewidth=2,
         label='LDA with shrinkage',
         color='r')
plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label='LDA', color='g')

plt.xlabel('n_features / n_samples')
plt.ylabel('Classification accuracy')