Example #1
def classify_using_lda(feat1, feat2, num_comp=2):

    n_plus = len(feat1)
    n_minus = len(feat2)

    X = np.concatenate((feat1, feat2), axis=0)
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1

    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    lda = LDA(n_components=num_comp)
    lda.fit(X, y)

    # Note: scikit-learn's LDA yields at most min(n_classes - 1, n_features) components,
    # so a two-class problem produces a single discriminant axis regardless of num_comp.
    X_tr = lda.transform(X)

    print(X_tr.shape, lda.score(X, y))

    # Only one LDA component exists for two classes, so pad a constant second
    # coordinate (0 for one class, 1 for the other) purely for plotting.
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)

    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')

    plt.ylim(-1, 3)
    plt.show()
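
A minimal sketch with made-up two-class data, illustrating the note above: scikit-learn's LDA produces at most min(n_classes - 1, n_features) discriminant axes, so a binary problem always transforms to a single column.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

rng = np.random.RandomState(0)
feat1 = rng.normal(0.0, 1.0, size=(50, 5))  # hypothetical class "+" features
feat2 = rng.normal(3.0, 1.0, size=(50, 5))  # hypothetical class "-" features
X = np.concatenate((feat1, feat2), axis=0)
y = np.concatenate((np.zeros(50), np.ones(50)))

lda = LDA(n_components=None)  # resolves to min(n_classes - 1, n_features) = 1
print(lda.fit_transform(X, y).shape)  # (100, 1)
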
def test_lda_orthogonality():
    # arrange four classes with their means in a kite-shaped pattern
    # the longer distance should be transformed to the first component, and
    # the shorter distance to the second component.
    means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])

    # We construct perfectly symmetric distributions, so the LDA can estimate
    # precise means.
    scatter = np.array([[0.1, 0, 0], [-0.1, 0, 0], [0, 0.1, 0], [0, -0.1, 0],
                        [0, 0, 0.1], [0, 0, -0.1]])

    X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
    y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])

    # Fit LDA and transform the means
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
    means_transformed = clf.transform(means)

    d1 = means_transformed[3] - means_transformed[0]
    d2 = means_transformed[2] - means_transformed[1]
    d1 /= np.sqrt(np.sum(d1 ** 2))
    d2 /= np.sqrt(np.sum(d2 ** 2))

    # the transformed within-class covariance should be the identity matrix
    assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))

    # the means of classes 0 and 3 should lie on the first component
    assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)

    # the means of classes 1 and 2 should lie on the second component
    assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)
Example #3
def lda(X, y, n):
	'''
		Returns optimal projection of the data
		LDA with n components
	'''
	selector = LinearDiscriminantAnalysis(n_components=n)
	selector.fit(X, y)
	return selector.transform(X), y
Example #4
def _dimReduce(df, method='pca', n_components=2, labels=None, standardize=False, smatFunc=None, ldaShrinkage='auto'):
    if method == 'kpca':
        """By using KernelPCA for dimensionality reduction we don't need to impute missing values"""
        if smatFunc is None:
            smatFunc = corrTSmatFunc
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        smat = smatFunc(df).values
        xy = pca.fit_transform(smat)
        pca.components_ = pca.alphas_
        pca.explained_variance_ratio_ = pca.lambdas_ / pca.lambdas_.sum()
        return xy, pca
    elif method == 'pca':
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pca = PCA(n_components=n_components)
        xy = pca.fit_transform(normed)
        return xy, pca
    elif method == 'lda':
        if labels is None:
            raise ValueError('labels needed to perform LDA')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        
        if df.shape[1] > df.shape[0]:
            """Pre-PCA step: LDA needs fewer features than samples, so reduce dimensionality first"""
            ppca = PCA(n_components=int(df.shape[0] / 1.5))
            normed = ppca.fit_transform(normed)

        lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=ldaShrinkage, n_components=n_components)
        lda.fit(normed, labels.values)
        lda.explained_variance_ratio_ = np.abs(lda.explained_variance_ratio_) / np.abs(lda.explained_variance_ratio_).sum()
        xy = lda.transform(normed)
        return xy, lda
    elif method == 'pls':
        if labels is None:
            raise ValueError('labels needed to perform PLS')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        
        pls = PLSRegression(n_components=n_components)
        pls.fit(normed, labels)
        
        pls.explained_variance_ratio_ = np.zeros(n_components)
        xy = pls.x_scores_
        return xy, pls
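
A hedged usage sketch of _dimReduce, assuming the 'lda' branch returns (xy, lda) as fixed above; the DataFrame and labels here are synthetic placeholders, not data from the original project.

import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.randn(30, 6), columns=['f%d' % i for i in range(6)])
labels_demo = pd.Series(np.repeat([0, 1, 2], 10))

xy_pca, pca_model = _dimReduce(df_demo, method='pca', n_components=2, standardize=True)
xy_lda, lda_model = _dimReduce(df_demo, method='lda', n_components=2, labels=labels_demo)
print(xy_pca.shape, xy_lda.shape)  # (30, 2) (30, 2)
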
def transformLDA(X,y,xTest):
    
    originalSize = np.size(X,1)
    print("Learning LDA \nProjecting {} features to 1 component".format(originalSize))
    priors = [0.5,0.5]

    clf = LinearDiscriminantAnalysis(solver='svd', n_components=1, priors=priors)
    print(X.shape)
    X = clf.fit_transform(X,y)
    print("True size of X : ", X.shape)

    if len(xTest) > 0:
        xTest = clf.transform(xTest)
    return X,xTest
def plot_sklearn_lda_with_lr(X_train, X_test, y_train, y_test):
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)

    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)

    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()

    X_test_lda = lda.transform(X_test)

    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
def do_LDA2D_KNN(digits,p,q):
    l,r = LDA2D.iterative2DLDA(digits.train_Images, digits.train_Labels, p, q, 28, 28)

    new_train = np.zeros((digits.train_Images.shape[0],p*q))
    for i in range(digits.train_Images.shape[0]):
        new_train[i] = (np.transpose(l)@digits.train_Images[i].reshape(28,28)@r).reshape(p*q)
    new_test = np.zeros((digits.test_Images.shape[0],p*q))
    for i in range(digits.test_Images.shape[0]):
        new_test[i] = (np.transpose(l)@digits.test_Images[i].reshape(28,28)@r).reshape(p*q)
    myLDA = LDA()
    x = center_matrix_SVD(new_train)
    new_new_train = myLDA.fit_transform(new_train-x.centers,digits.train_Labels)
    new_new_test = myLDA.transform(new_test-x.centers)
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'euclidean')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_EU.p','wb'))
    #pickle.dump(nearest, open('NLDA2DFDA'+ str(p) + 'x' + str(q) + '_EU.p','wb'))
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'cityblock')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_CB.p','wb'))
    #pickle.dump(nearest, open('NLDA2DFDA'+ str(p) + 'x' + str(q) + '_CB.p','wb'))
    labels, nearest = KNN(new_new_train,digits.train_Labels,new_new_test,10,'cosine')
    pickle.dump(labels, open('LDA2DFDA'+ str(p) + 'x' + str(q) + '_CO.p','wb'))
def main():
    digits = mnist() # Creates a class with our mnist images and labels
    if not os.path.exists('Training SVD Data'): # Check if the file exists; create it if it doesn't (assumes `import os`)
        x = center_matrix_SVD(digits.train_Images) # Creates a class with our svd and associated info
        pickle.dump(x,open('Training SVD Data','wb'))
    else:
        x = pickle.load(open('Training SVD Data','rb'))  # If we already have the file just load it
    if 1: # if this is zero skip
        test_Images_Center = np.subtract(digits.test_Images,np.repeat(x.centers,digits.test_Images.shape[0],0))
        tic()
        myLDA = LDA()  # Create a new instance of the LDA class
        new_train = myLDA.fit_transform(x.PCA[:,:154],digits.train_Labels)  # It will fit based on x.PCA
        new_test = myLDA.transform(test_Images_Center@np.transpose(x.V[:154,:])) # get my transformed test dataset
        Knn_labels = local_kmeans_class(new_train,digits.train_Labels,new_test,10) # Run kNN on the new data
        toc()
        pickle.dump(Knn_labels,open('Loc_kmeans_fda_lab','wb'))

    fda = pickle.load(open('Loc_kmeans_fda_lab','rb'))
    labels_Full = pickle.load(open('KNN_Full','rb'))
    loc_full = pickle.load(open('Loc_kmeans_Full_lab','rb'))
    errors_fda,ind_fda = class_error_rate(np.transpose(fda),digits.test_labels)
    errors_near,ind_near = class_error_rate(labels_Full,digits.test_labels)
    errors_full,ind_full = class_error_rate(np.transpose(loc_full),digits.test_labels)
    labels_50 = pickle.load(open('KNN_50','rb'))
    errors_50,ind_50 = class_error_rate(labels_50,digits.test_labels)
    print(errors_full)
    plt.figure()
    plt.plot(np.arange(10)+1, errors_fda, color='Green', marker='o', markersize=10, label='fda Kmeans')  #plots the 82.5%
    plt.plot(np.arange(10)+1, errors_near, color='Blue', marker='o', markersize=10, label='kNN')
    plt.plot(np.arange(10)+1, errors_full, color='Yellow', marker='o', markersize=10, label='Full Kmeans')
    plt.plot(np.arange(10)+1, errors_50, color='Red', marker='o', markersize=10, label='kNN 50')
    axes = plt.gca()
    axes.set_ylim([0.015,0.12])
    plt.grid(1) # Turns the grid on
    plt.title('Plot of Local Kmeans with FDA Error rates')
    plt.legend(loc='upper right')  # Puts a legend on the plot
    plt.show()
    project_back(x,digits)
 def dimension_reduce(self,mode='L'):
     
     print('Reduce Dimensions...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     
     raw_train=self.train.copy()
     train=self.train.copy()
     train_label=self.train_label['label'].values.copy()
     train_label=train_label.reshape((train_label.shape[0]))
         
     test=self.test.copy()
     test_label=self.test_label['label'].values.copy()
     test_label=test_label.reshape((test_label.shape[0]))
     
     flist=train.columns
     
     if mode.upper()=='L':
         lda=LinearDiscriminantAnalysis()
         X_new=lda.fit_transform(train.values,train_label)
         self.train=pd.DataFrame(X_new,columns=['DR'])
         self.test=pd.DataFrame(lda.transform(test[flist].values),columns=['DR'])
         
         tt=lda.coef_[0]
         ind=np.argsort(tt)
         features=raw_train.columns[ind[-100:]]
         feas=pd.DataFrame()
         feas['feature']=features
         feas['values']=tt[ind[-100:]]
         return feas
         
     elif mode.upper()=='P':
         pca = PCA(n_components=100)
         X_new=pca.fit_transform(train.values,train_label)
         self.train=pd.DataFrame(X_new)
         self.test=pd.DataFrame(pca.transform(test[flist].values))
         
     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
Example #10
 def best_lda_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     lda = LinearDiscriminantAnalysis(n_components=2)
     X_train_transformed = lda.fit_transform(X_train_scl, y_train)
     X_test_transformed = lda.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/nba_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def main():
    digits = mnist() # Creates a class with our mnist images and labels
    if not os.path.exists('Training SVD Data'): # Check if the file exists; create it if it doesn't (assumes `import os`)
        print("im here")   # Just wanted to check if it was going in here
        x = center_matrix_SVD(digits.train_Images) # Creates a class with our svd and associated info
        pickle.dump(x,open('Training SVD Data','wb'))
    else:
        x = pickle.load(open('Training SVD Data','rb'))  # If we already have the file just load it
    if 0: # if this is zero skip
        test_Images_Center = np.subtract(digits.test_Images,np.repeat(x.centers,digits.test_Images.shape[0],0))
        tic()
        myLDA = LDA()  # Create a new instance of the LDA class
        new_train = myLDA.fit_transform(x.PCA[:,:154],digits.train_Labels)  # It will fit based on x.PCA
        new_test = myLDA.transform(test_Images_Center@np.transpose(x.V[:154,:])) # get my transformed test dataset
        Knn_labels, nearest = KNN(new_train,digits.train_Labels,new_test,10) # Run kNN on the new data
        toc()
        pickle.dump(Knn_labels,open('FDAKNN_Lables','wb'))
        pickle.dump(nearest,open('FDAKNN_neastest','wb'))
    fda = pickle.load(open('FDAKNN_Lables','rb'))
    labels_Full = pickle.load(open('KNN_Full','rb'))
    labels_50 = pickle.load(open('KNN_50','rb'))
    errors_fda,ind_fda = class_error_rate(fda,digits.test_labels)
    errors_near,ind_near = class_error_rate(labels_Full,digits.test_labels)
    errors_50,ind_50 = class_error_rate(labels_50,digits.test_labels)
    plt.figure()
    plt.plot(np.arange(10)+1, errors_fda, color='Green', marker='o', markersize=10, label='fda')  #plots the 82.5%
    plt.plot(np.arange(10)+1, errors_near, color='Blue', marker='o', markersize=10, label='kNN')
    plt.plot(np.arange(10)+1, errors_50, color='Yellow', marker='o', markersize=10, label='kNN 50')
    plt.grid(1) # Turns the grid on
    plt.title('Plot of Knn with FDA Error rates')
    plt.legend(loc='upper right')  # Puts a legend on the plot
    plt.show()
    print(confusion_matrix(digits.test_labels,labels_Full[5]))
    print(confusion_matrix(digits.test_labels,fda[5]))
    print(confusion_matrix(digits.test_labels,labels_50[5]))
    """
Example #12
Xtrain_sc = sc.fit_transform(Xtrain)
Xtest_sc = sc.transform(Xtest)

#Principal components
from sklearn.decomposition import PCA
##since I only have 2 variables, I choose 1 component
pca = PCA(n_components=1)
Xtrain_pca = pca.fit_transform(Xtrain_sc)
Xtest_pca = pca.transform(Xtest_sc)

#Linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=1)
Xtrain_lda = lda.fit_transform(Xtrain_sc, ytrain)
Xtest_lda = lda.transform(Xtest_sc)

#Kernel PCA
from sklearn.decomposition import KernelPCA

kpca = KernelPCA(n_components=1, kernel='rbf')
Xtrain_kpca = kpca.fit_transform(Xtrain_sc)
Xtest_kpca = kpca.transform(Xtest_sc)

#Logistic regression
#######################
from sklearn.linear_model import LogisticRegression
##logistic regression is an iterative algorithm, so we use random_state
##to get approximately the same results on each run
logistic = LogisticRegression(random_state=4)
Example #13
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_task = pca.transform(X_task)

# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=32, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_task = kpca.transform(X_task)

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=32)
X_train = lda.fit_transform(X_train, y_target)
X_task = lda.transform(X_task)

# Training the K-NN model on the Training set
#minkowski with p=2 is equivalent to the standard Euclidean metric (these are the defaults)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)

# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=42, max_iter=1000)

# Training the Naive Bayes model on the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# Training the Decision Tree Classification model on the Training set
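
The snippet is cut off here; following the pattern of the classifiers above, the decision-tree step would presumably be constructed along these lines (a sketch, not the original code):

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)
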
Example #14
# The shape of the arrays should be (#_of_samples, #_of_features)
# The number of features is the total number of data points in a 30-second sample
X = np.array(X)
y = np.array(y)

# Format data for CNN only
# The shape of the arrays should be (#_of_samples, #_of_features, 1)
# The number of features is the total number of data points in a 30-second sample
X = np.dstack(X)
X = X.transpose()
y = to_categorical(y)

# Perform LDA transform (using previously fitted LDA) for kNN and SVM only. Provide path to LDA. Do not use for CNN.
# After this, the shape of the arrays should be (#_of_samples, #_of_outputs - 1)
lda = joblib.load(PATH_LDA)
X = lda.transform(X)

# Load trained model by providing its path.
model = joblib.load(PATH_MODEL)

___________________________________________________________________________________________________

# **DISPLAYING RESULTS**

# For training results, X = trainX and y = trainy
# For validation results, X = testX and y = testy
# For testing results, leave X and y as is

# Output results for kNN and SVM only

# Store predictions
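
A minimal sketch of the prediction and metrics step described above, assuming `model` is the kNN or SVM loaded via joblib, X the LDA-transformed features, and y the corresponding (not one-hot encoded) labels:

from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = model.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))
print(confusion_matrix(y, y_pred))
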
Example #15
def connect_windows(windows, label_windows, reassign=True):
    G = nx.DiGraph()

    for i in range(len(windows) - 1):
        # Compare window i and i + 1
        print("Comparing window {}/{}".format(i, len(windows)), end="\r")

        # Create links step
        for c1 in np.unique(label_windows[i]):
            for c2 in np.unique(label_windows[i + 1]):
                if c1 == -1 or c2 == -1:
                    continue

                pts1 = windows[i][1][label_windows[i] == c1]
                pts2 = windows[i + 1][1][label_windows[i + 1] == c2]
                if len(pts1) < 10 or len(pts2) < 10:
                    continue

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    temp_pts1 = pts1[LocalOutlierFactor(n_neighbors=10, contamination=0.1).fit_predict(pts1) == True]
                    temp_pts2 = pts2[LocalOutlierFactor(n_neighbors=10, contamination=0.1).fit_predict(pts2) == True]
                    if len(temp_pts1) >= 5:
                        pts1 = temp_pts1
                    if len(temp_pts2) >= 5:
                        pts2 = temp_pts2

                if len(pts1) < 5 or len(pts2) < 5:
                    continue

                all_pts = np.concatenate([pts1, pts2])
                labels_ = np.concatenate([
                    np.zeros(len(pts1)),
                    np.ones(len(pts2))
                ])
                
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    lda = LDA(n_components=1).fit(
                        all_pts, labels_
                    )

                k, p = scipy.stats.ks_2samp(lda.transform(pts1).flatten(), lda.transform(pts2).flatten())
                
                if 1 - k > 0.5:
                    G.add_edge((i, c1), (i + 1, c2))

        if reassign:
            # Reassignment Step
            # (Splits up nodes that have multiple nodes coming into it)
            for node in [n for n in G.nodes if n[0] == i + 1]:
                in_edges = G.in_edges(node)
                n_in_edges = len(in_edges)
                if n_in_edges > 1:
                    labels = [edge[0][1] for edge in in_edges]
                    prev_node = node[0] - 1
                    curr_node = node[0]
                    selector = np.where(label_windows[curr_node] == node[1])[0]
                    discriminator = LDA(n_components=n_in_edges - 1).fit(
                        windows[prev_node][1][np.isin(label_windows[prev_node], labels)],
                        label_windows[prev_node][np.isin(label_windows[prev_node], labels)]
                    )
                    X = windows[curr_node][1][selector]

                    new_labels = guided_reassignment(X, discriminator.predict(X), force_clusters=n_in_edges)

                    for label in np.unique(new_labels):
                        if label == 0:
                            continue
                        selector_ = np.zeros_like(label_windows[curr_node])
                        for k in selector[new_labels == label]:
                            selector_[k] = 1
                        np.place(label_windows[curr_node], selector_, np.max(label_windows[curr_node]) + 1)

    return label_windows, graph_to_labels(label_windows, G), G if not reassign else None
class LDA(object):
    def __init__(self,
                 solver="svd",
                 shrinkage=None,
                 priors=None,
                 n_components=None,
                 store_covariance=False,
                 tol=1e-4):
        """
        :param solver: string, 可选项,"svd","lsqr", "eigen"。 默认使用svd, 不计算协方差矩阵,适用于大量特征
        的数据, 最小二乘 lsqr, 结合shrinkage 使用。 eigen 特征值分解, 集合shrinkage  使用
        :param shrinkage: str/float 可选项,概率值,默认为None, "auto", 自动收缩, 0到1内的float, 固定的收缩参数
        :param priors: array, optional, shape (n_classes,) 分类优先
        :param n_components:  # 分量数, 默认None, int, 可选项
        :param store_covariance:  bool, 可选项, 只用于”svd“ 额外计算分类协方差矩阵
        :param tol: 浮点型,默认1e-4, 在svd 中,用于排序评估的阈值
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver,
            shrinkage=shrinkage,
            priors=priors,
            n_components=n_components,
            store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        self.model.set_params(**params)

    def decision_function(self, x):
        return self.model.decision_function(X=x)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):  # attributes are only available after the model has been fitted
        coef = self.model.coef_  # weight vector
        intercept = self.model.intercept_  # intercept term
        covariance = self.model.covariance_  # covariance matrix
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_
        priors = self.model.priors_  # class prior probabilities, sum to 1, shape (n_classes,)
        scalings = self.model.scalings_  # scaling, shape (rank, n_classes - 1)
        xbar = self.model.xbar_  # overall mean
        classes = self.model.classes_  # class labels

        return coef, intercept, covariance, explained_variance_ratio, means, priors, scalings, xbar, classes
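
A minimal usage sketch of the wrapper above on a toy dataset; store_covariance=True is passed so that get_attributes() can read covariance_ when the 'svd' solver is used.

from sklearn.datasets import load_iris

X_demo, y_demo = load_iris(return_X_y=True)
model = LDA(solver="svd", n_components=2, store_covariance=True)
Z = model.fit_transform(X_demo, y_demo)
print(Z.shape)                    # (150, 2)
print(model.predict(X_demo[:5]))  # predicted class labels
print(model.score(X_demo, y_demo, sample_weight=None))
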
# Run the model
y_pred_full = model.predict(X_test)
y_prob_full = model.predict_proba(X_test)[:, 1]

#metrics and Prediction
metrics_lda_full = calculate_metrics(y_test, y_pred_full, y_prob_full, w_test)
pov_lda_full = predict_poverty_rate(TRAIN_PATH, TEST_PATH, model)

#results
conf_mat(metrics_lda_full)
metrics_table(metrics_lda_full, 'lda_full')
pov_table(pov_lda_full, 'lda_full')

#Transform LDA RESULTS
X_lda = model.transform(X_train)

mask = (y_train == 1)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].scatter(X_lda[mask],
                y_train[mask],
                color='b',
                marker='+',
                label='poor')
axes[0].scatter(X_lda[~mask],
                y_train[~mask],
                color='r',
                marker='o',
                label='non-poor')
axes[0].set_title('LDA Projected Data')
axes[0].set_xlabel('Transformed axis')
Example #18
"""
Applying LDA

"""

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


# Create class object
lda = LDA(n_components = 2)

# Fit this LDA to the training set
X_train = lda.fit_transform(X_train, y_train)

# We use transform method to transform test set
X_test = lda.transform(X_test)





"""
Fitting Logistic Regression Model

"""


# Fitting Logistic Regression to the Training set

from sklearn.linear_model import LogisticRegression
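
The example stops at the import; a hedged sketch of the fitting step it describes, reusing the LDA-reduced X_train/X_test from above:

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
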
pca_features= pca.fit_transform(feature) 

scores = cross_val_score(d3, pca_features, labels, cv=10)
scores = scores.mean()
print("PCA Scores + 10 fold cross_validation:",scores)

# Perform Leave One Out validation for the LDA - Decision Tree Classifier

total_score=0
for train_index,test_index in LOO.split(feature):
    train_features, test_features = feature[train_index], feature[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

    lda  = LDA()
    lda=lda.fit(train_features,train_labels.ravel())
    lda_train_set = lda.transform(train_features)
    lda_test_set = lda.transform(test_features)
    
    clf_lda=d3.fit(lda_train_set,train_labels)
    prediction_lda=clf_lda.predict(lda_test_set)
    total_score+=accuracy_score(test_labels,prediction_lda) 
mean_score=(total_score/number_of_iterations)
score = mean_score
print("LDA Scores + leave one cross_validation:",score)


# Perform Cross Validation for 10 folds for the LDA-Decision Tree Classifier

lda = LDA()
lda_features=lda.fit_transform(feature,labels.ravel()) 
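
The 10-fold step is cut off above; mirroring the PCA block earlier in this example, it would presumably look like the sketch below (note that fitting LDA on all of `feature` before cross-validating the classifier leaks label information into the folds):

scores = cross_val_score(d3, lda_features, labels.ravel(), cv=10)
print("LDA Scores + 10 fold cross_validation:", scores.mean())
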
Example #20
X_train_std = sc.fit_transform(train_imgs)
X_test_std = sc.transform(test_imgs)  # transform only; the scaler was fitted on the training images
y_train = np.array(train_labels)
y_test = np.array(test_labels)

#apply PCA first to avoid excessively high dimensionality
pca = PCA(n_components=80)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)


accs = []
for i in range(3,70):
    lda = LinearDiscriminantAnalysis(n_components=i)
    lda.fit(X_train_pca, y_train)
    X_train_lda = lda.transform(X_train_pca)
    X_test_lda = lda.transform(X_test_pca)
    KNN = KNeighborsClassifier(n_neighbors=1)
    KNN.fit(X_train_lda, y_train)
    accuracy = KNN.score(X_test_lda, y_test)
    accs.append(accuracy)
    
plt.plot(accs)
df = pd.DataFrame(accs, columns=['LDA_sk'])
df.to_csv('./LDA_sk_' + str(split_num) + '.csv', index=False)





Example #21
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)


#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

#apply LDA
"require y_train as it is supervised in contrast pca only require x_test"
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components = 2)
x_train = lda.fit_transform(x_train, y_train)
x_test = lda.transform(x_test)

#logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)


"kernel PCA"
#import libraries
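
The kernel PCA part is cut off; a minimal sketch following the same pattern as the LDA block above (in the flow above x_train has already been replaced by its LDA projection, so kernel PCA would normally be applied to the scaled features instead):

from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel='rbf')
x_train_kpca = kpca.fit_transform(x_train)
x_test_kpca = kpca.transform(x_test)
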
Example #22
class LDA(CtrlNode):
    """Linear Discriminant Analysis, uses sklearn"""
    nodeName = "LDA"
    uiTemplate = [('train_data', 'list_widget', {
        'selection_mode': QtWidgets.QAbstractItemView.ExtendedSelection,
        'toolTip': 'Column containing the training data'
    }),
                  ('train_labels', 'combo', {
                      'toolTip': 'Column containing training labels'
                  }), ('solver', 'combo', {
                      'items': ['svd', 'lsqr', 'eigen']
                  }),
                  ('shrinkage', 'combo', {
                      'items': ['None', 'auto', 'value']
                  }),
                  ('shrinkage_val', 'doubleSpin', {
                      'min': 0.0,
                      'max': 1.0,
                      'step': 0.1,
                      'value': 0.5
                  }),
                  ('n_components', 'intSpin', {
                      'min': 2,
                      'max': 1000,
                      'step': 1,
                      'value': 2
                  }),
                  ('tol', 'intSpin', {
                      'min': -50,
                      'max': 0,
                      'step': 1,
                      'value': -4
                  }), ('score', 'lineEdit', {}),
                  ('predict_on', 'list_widget', {
                      'selection_mode':
                      QtWidgets.QAbstractItemView.ExtendedSelection,
                      'toolTip':
                      'Data column of the input "predict" Transmission\n'
                      'that is used for predicting from the model'
                  }), ('Apply', 'check', {
                      'applyBox': True,
                      'checked': False
                  })]

    def __init__(self, name, **kwargs):
        CtrlNode.__init__(self,
                          name,
                          terminals={
                              'train': {
                                  'io': 'in'
                              },
                              'predict': {
                                  'io': 'in'
                              },
                              'T': {
                                  'io': 'out'
                              },
                              'coef': {
                                  'io': 'out'
                              },
                              'means': {
                                  'io': 'out'
                              },
                              'predicted': {
                                  'io': 'out'
                              }
                          },
                          **kwargs)
        self.ctrls['score'].setReadOnly(True)

    def process(self, **kwargs):
        return self.processData(**kwargs)

    def processData(self, train: Transmission, predict: Transmission):
        self.t = train.copy(
        )  #: Transmission instance containing the training data with the labels
        if predict is not None:
            self.to_predict = predict.copy(
            )  #: Transmission instance containing the data to predict after fitting on the the training data

        dcols, ccols, ucols = organize_dataframe_columns(self.t.df.columns)

        self.ctrls['train_data'].setItems(dcols)
        self.ctrls['train_labels'].setItems(ccols)

        if predict is not None:
            pdcols, ccols, ucols = organize_dataframe_columns(
                self.to_predict.df.columns)
            self.ctrls['predict_on'].setItems(pdcols)

        if not self.apply_checked():
            return

        train_columns = self.ctrls['train_data'].getSelectedItems()
        labels = self.ctrls['train_labels'].currentText()

        solver = self.ctrls['solver'].currentText()

        shrinkage = self.ctrls['shrinkage'].currentText()
        if shrinkage == 'value':
            shrinkage = self.ctrls['shrinkage_val'].value()
        elif shrinkage == 'None':
            shrinkage = None

        n_components = self.ctrls['n_components'].value()
        tol = 10**self.ctrls['tol'].value()

        store_covariance = True if solver == 'svd' else False

        params = {
            'train_data': train_columns,
            'train_labels': labels,
            'solver': solver,
            'shrinkage': shrinkage,
            'n_components': n_components,
            'tol': tol,
            'store_covariance': store_covariance
        }

        kwargs = params.copy()
        kwargs.pop('train_data')
        kwargs.pop('train_labels')
        self.lda = LinearDiscriminantAnalysis(**kwargs)

        # Make an array of all the data from the selected columns
        self.X = np.hstack([
            np.vstack(self.t.df[train_column])
            for train_column in train_columns
        ])
        self.y = self.t.df[labels]

        self.X_ = self.lda.fit_transform(self.X, self.y)

        self.t.df['_LDA_TRANSFORM'] = self.X_.tolist()
        self.t.df['_LDA_TRANSFORM'] = self.t.df['_LDA_TRANSFORM'].apply(
            np.array)

        params.update({
            'score': self.lda.score(self.X, self.y),
            'classes': self.lda.classes_.tolist()
        })

        self.ctrls['score'].setText(f"{params['score']:.4f}")

        self.t.history_trace.add_operation('all', 'lda', params)

        self.t.df['_LDA_DFUNC'] = self.lda.decision_function(self.X).tolist()

        coef_df = pd.DataFrame({
            'classes': self.lda.classes_,
            '_COEF': self.lda.coef_.tolist()
        })
        t_coef = Transmission(df=coef_df, history_trace=self.t.history_trace)

        means_df = pd.DataFrame({
            'classes': self.lda.classes_,
            '_MEANS': self.lda.means_.tolist()
        })
        t_means = Transmission(df=means_df, history_trace=self.t.history_trace)

        out = {
            'T': self.t,
            'coef': t_coef,
            'means': t_means,
            'predicted': None
        }

        # Predict using the trained model
        predict_columns = self.ctrls['predict_on'].getSelectedItems()

        if not predict_columns:
            return out

        if predict_columns != train_columns:
            QtWidgets.QMessageBox.warning(
                None,
                'Predict and Train columns do not match',
                'The selected train and predict columns are different')

        predict_data = np.hstack([
            np.vstack(self.to_predict.df[predict_column])
            for predict_column in predict_columns
        ])
        self.to_predict.df['LDA_PREDICTED_LABELS'] = self.lda.predict(
            predict_data)
        self.to_predict.df['_LDA_TRANSFORM'] = self.lda.transform(
            predict_data).tolist()
        self.to_predict.df['_LDA_TRANSFORM'] = self.to_predict.df[
            '_LDA_TRANSFORM'].apply(np.array)

        params_predict = params.copy()
        params_predict.update({'predict_columns': predict_columns})

        self.to_predict.history_trace.add_operation('all', 'lda-predict',
                                                    params_predict)

        out.update({'predicted': self.to_predict})

        return out
Example #23
if (args.pca):
    print('Performing PCA on the samples')
    pca = PCA(n_components=0.9)
    pca.fit(Xtr)
    print('Number of components used: {}'.format(pca.n_components_))
    Xtr = pca.transform(Xtr)
    Xte = pca.transform(Xte)

if (args.lda):
    print('Performing LDA on the samples')
    lda = LDA()
    lda.fit(Xtr, Ytr)
    print('Number of components used: {}'.format(
        lda.explained_variance_ratio_.shape))
    Xtr = lda.transform(Xtr)
    Xte = lda.transform(Xte)

# Small development set for quick hyperparameter search
num_dev_samples = 5000
np.random.seed(28)
mask = np.random.choice(num_train_samples, num_dev_samples, replace=False)
Xtr_dev = Xtr[mask]
Ytr_dev = Ytr[mask]

np.random.seed(28)
mask = np.random.choice(num_test_samples,
                        int(num_dev_samples / 5),
                        replace=False)
Xte_dev = Xte[mask]
Yte_dev = Yte[mask]
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#applying the dimensionality reduction

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
#number of features to extract from all 13 features
#here we keep the 2 linear discriminants that best separate the classes

lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)  # y_train is needed since LDA is a supervised algorithm
X_test = lda.transform(X_test)  # the model is already fitted, so y_test is not needed here

# Fitting classifier to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
Example #25
def main():
    if os.path.exists(__TRAINED_DATA_SET):
        df = pd.read_csv(__TRAINED_DATA_SET)
    else:
        df = train()

    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    # Encoding the Dependent Variable
    labelencoder_y = LabelEncoder()
    y = labelencoder_y.fit_transform(y)

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0)

    # Feature Scaling
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    lda = LDA(n_components=None)
    x_train = lda.fit_transform(x_train, y_train)
    x_test = lda.transform(x_test)
    explained_variance = lda.explained_variance_ratio_

    # Fitting Logistic Regression to the Training set
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting K-NN to the Training set
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting SVM to the Training set
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Naive Bayes to the Training set
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Decision Tree Classification to the Training set
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators=10,
                                        criterion='entropy',
                                        random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    parameters = [{
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    }, {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    }]
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10,
                               n_jobs=-1)
    grid_search = grid_search.fit(x_train, y_train)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_

    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
Example #26
def lda_project(spike_times,
                spike_clusters,
                event_times,
                event_groups,
                pre_time=0,
                post_time=0.5,
                cross_validation='kfold',
                num_splits=5,
                prob_left=None,
                custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors to the line that best separates
    the two groups. When cross-validation is used, the LDA projection is fitted on the training
    data after which the test data is projected to this projection.

    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each spike in `spike_times`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector of each trial

    """

    # Check input
    assert cross_validation in [
        'none', 'kfold', 'leave-one-out', 'block', 'custom'
    ]
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(spike_times,
                                                       spike_clusters, times)
    pop_vector = pop_vector.T

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)

    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the splits into train and test
        for train_index, test_index in cv:

            # Find LDA projection on the training data
            lda.fit(pop_vector[train_index],
                    [event_groups[j] for j in train_index])

            # Project the held-out test data to projection
            lda_projection[test_index] = lda.transform(
                pop_vector[test_index]).T[0]

    return lda_projection
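
A hedged, synthetic usage sketch of lda_project; every value below is made up, and get_spike_counts_in_bins is assumed to be importable from the same module as the function above.

import numpy as np

rng = np.random.RandomState(0)
spike_times = np.sort(rng.uniform(0, 100, 20000))  # seconds
spike_clusters = rng.randint(0, 30, 20000)         # 30 fake units
event_times = np.sort(rng.uniform(1, 99, 40))      # 40 fake trials
event_groups = rng.randint(0, 2, 40)               # two conditions

projection = lda_project(spike_times, spike_clusters, event_times, event_groups,
                         cross_validation='kfold', num_splits=5)
print(projection.shape)  # (40,)
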
def run(train_pyramid_descriptors, D, test_pyramid_descriptors,
        feat_des_options):

    train_images_filenames = cPickle.load(
        open('train_images_filenames.dat', 'rb'))
    test_images_filenames = cPickle.load(
        open('test_images_filenames.dat', 'rb'))
    train_labels = cPickle.load(open('train_labels.dat', 'rb'))
    test_labels = cPickle.load(open('test_labels.dat', 'rb'))

    k = feat_des_options['k']
    codebook = MiniBatchKMeans(n_clusters=k,
                               verbose=False,
                               batch_size=k * 20,
                               compute_labels=False,
                               reassignment_ratio=10**-4,
                               random_state=42)
    codebook.fit(D)

    visual_words_pyramid = np.zeros((len(train_pyramid_descriptors),
                                     k * len(train_pyramid_descriptors[0])),
                                    dtype=np.float32)
    for i in range(len(train_pyramid_descriptors)):
        visual_words_pyramid[i, :] = spatial_pyramid_histograms(
            train_pyramid_descriptors[i], codebook, k)

    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knn.fit(visual_words_pyramid, train_labels)

    # logreg = LogisticRegression(random_state=0,max_iter=300).fit(visual_words_pyramid, train_labels)
    # scores = cross_validate(logreg, visual_words_pyramid, train_labels,scoring = ['precision_macro', 'recall_macro','f1_macro'], cv=5,return_estimator=True)

    scores = cross_validate(
        knn,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy = scores['test_accuracy'].mean()
    cross_val_precision = scores['test_precision_macro'].mean()
    cross_val_recall = scores['test_recall_macro'].mean()
    cross_val_f1 = scores['test_f1_macro'].mean()
    # print("%0.2f precision with a std dev of %0.2f" % (cross_val_precision, scores['test_precision_macro'].std()))
    # print("%0.2f recall with a std dev of %0.2f" % (cross_val_recall, scores['test_recall_macro'].std()))
    # print("%0.2f F1-score with a std dev of %0.2f" % (cross_val_f1, scores['test_f1_macro'].std()))

    visual_words_test = np.zeros(
        (len(test_images_filenames), visual_words_pyramid.shape[1]),
        dtype=np.float32)
    for i in range(len(test_images_filenames)):
        visual_words_test[i, :] = spatial_pyramid_histograms(
            test_pyramid_descriptors[i], codebook, k)

    test_accuracy = 100 * knn.score(visual_words_test, test_labels)
    # print("Test accuracy: %0.2f" % (test_accuracy))

    test_prediction = knn.predict(visual_words_test)
    # test_prediction = logreg.predict(visual_words_test)
    test_precision, test_recall, test_fscore, _ = precision_recall_fscore_support(
        test_labels, test_prediction, average='macro')
    # print("%0.2f precision" % (test_precision))
    # print("%0.2f recall" % (test_recall))
    # print("%0.2f F1-score" % (test_fscore))

    # pca = PCA(n_components=64)
    pca = PCA(n_components=feat_des_options['pca_perc'], svd_solver='full')
    VWpca = pca.fit_transform(visual_words_pyramid)
    knnpca = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnpca.fit(VWpca, train_labels)
    vwtestpca = pca.transform(visual_words_test)
    pca_test_accuracy = 100 * knnpca.score(vwtestpca, test_labels)
    # print("PCA Test accuracy: %0.2f" % (pca_test_accuracy))
    scores_pca = cross_validate(
        knnpca,
        visual_words_pyramid,
        train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8,
        return_estimator=True)
    cross_val_accuracy_pca = scores_pca['test_accuracy'].mean()
    cross_val_precision_pca = scores_pca['test_precision_macro'].mean()
    cross_val_recall_pca = scores_pca['test_recall_macro'].mean()
    cross_val_f1_pca = scores_pca['test_f1_macro'].mean()

    lda = LinearDiscriminantAnalysis(n_components=7)
    VWlda = lda.fit_transform(visual_words_pyramid, train_labels)
    knnlda = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnlda.fit(VWlda, train_labels)
    vwtestlda = lda.transform(visual_words_test)
    lda_test_accuracy = 100 * knnlda.score(vwtestlda, test_labels)
    # print("LDA Test accuracy: %0.2f" % (lda_test_accuracy))

    return [
        cross_val_accuracy, cross_val_precision, cross_val_recall,
        cross_val_f1, test_precision, test_recall, test_fscore, test_accuracy,
        pca_test_accuracy, cross_val_accuracy_pca, cross_val_precision_pca,
        cross_val_recall_pca, cross_val_f1_pca, lda_test_accuracy
    ]
Example #28
X = onehotencoder.fit_transform(X).toarray()
# Encoding the Dependent Variable
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

#Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, Y_train)
X_validation = lda.transform(X_validation)

#Test options and evaluation metrics
seed = 7
scoring = 'accuracy'

#Spot check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)))
models.append(('DT', DecisionTreeClassifier(criterion="entropy")))
models.append(
    ('RF', RandomForestClassifier(n_estimators=10, criterion='entropy')))
models.append(('NB', GaussianNB()))
models.append(('KSVM', SVC(kernel='rbf')))
Example #29
test_pred_lr_pca = lr.predict(X_test_pca)
print(accuracy_score(y_train, train_pred_lr_pca))
print(accuracy_score(y_test, test_pred_lr_pca))

# Part 3: PCA then SVM
svm = SVC(kernel='linear', random_state=42, C=0.5)
svm.fit(X_train_pca, y_train)
train_pred_svm_pca = svm.predict(X_train_pca)
test_pred_svm_pca = svm.predict(X_test_pca)
print(accuracy_score(y_train, train_pred_svm_pca))
print(accuracy_score(y_test, test_pred_svm_pca))

# Part 4: LDA then logistic regression
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
lr.fit(X_train_lda, y_train)
train_pred_lr_lda = lr.predict(X_train_lda)
test_pred_lr_lda = lr.predict(X_test_lda)
print(accuracy_score(y_train, train_pred_lr_lda))
print(accuracy_score(y_test, test_pred_lr_lda))

# Part 4: LDA then SVM
svm = SVC(kernel='linear', random_state=42, C=0.5)
svm.fit(X_train_lda, y_train)
train_pred_svm_lda = svm.predict(X_train_lda)
test_pred_svm_lda = svm.predict(X_test_lda)
print(accuracy_score(y_train, train_pred_svm_lda))
print(accuracy_score(y_test, test_pred_svm_lda))

# Part 5: KPCA
Example #30
def run_16(X_train, X_test, y_train, y_test, dataset):
    LOGGER.info('running 16...')

    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
        },
    }

    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]

    pca = PCA(n_components=settings[dataset]['pca'])
    pca.fit(X_train)
    ica = FastICA(n_components=settings[dataset]['ica'])
    ica.fit(X_train)
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'])
    rp.fit(X_train)
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    lda.fit(X_train, y_train)

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(pca.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_pca_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(pca.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans PCA {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(ica.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_ica_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(ica.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans ICA {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(rp.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_rp_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(rp.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans RP {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(lda.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_lda_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(lda.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans LDA {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(pca.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            pca.transform(X_train), predY)
    LOGGER.info('gmm pca max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/gm_pca_' + dataset + '.png')
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(pca.transform(X_test)))
    LOGGER.info('GMM PCA {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(ica.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            ica.transform(X_train), predY)
    LOGGER.info('gmm ica max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/gm_ica_' + dataset + '.png')
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(ica.transform(X_test)))
    LOGGER.info('GMM ICA {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(rp.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            rp.transform(X_train), predY)
    LOGGER.info('gmm rp max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/gm_rp_' + dataset + '.png')
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(rp.transform(X_test)))
    LOGGER.info('GMM RP {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(lda.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            lda.transform(X_train), predY)
    LOGGER.info('gmm lda max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/gm_lda_' + dataset + '.png')
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(lda.transform(X_test)))
    LOGGER.info('GMM LDA {}: \n{}'.format(dataset, cluster_validation_df))
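
# The KMeans blocks above (and the analogous GMM ones) repeat the same pattern
# for every projection: run an elbow sweep, fit with the chosen k, then score
# the predicted clusters against the test labels. A minimal sketch of the
# KMeans half of that pattern as a helper (hypothetical name; the elbow plot is
# omitted for brevity, and `settings` / `score_fns` / `LOGGER` are the same
# objects used above, passed in explicitly here):
def kmeans_validate_sketch(reducer, tag, dataset, X_train, X_test, y_test,
                           settings, score_fns):
    Xtr = reducer.transform(X_train)
    Xte = reducer.transform(X_test)
    km = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0).fit(Xtr)
    df = pd.DataFrame()
    for score in score_fns:
        # external validation: compare predicted clusters to the true labels
        df.loc[score.__name__, 'score'] = score(y_test[y_test.columns[0]],
                                                km.predict(Xte))
    LOGGER.info('KMeans {} {}: \n{}'.format(tag, dataset, df))
    return df
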
# PCA Scatter Plot
plt.legend(digits.target_names,
           bbox_to_anchor=(1.05, 1),
           loc=2,
           borderaxespad=0.)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title("Regular PCA Scatter Plot")
plt.show()

# Create a regular LDA model
lda = LDA(n_components=2).fit(digits.data, digits.target)

# Transform the data with the fitted model
reduced_data_lda = lda.transform(digits.data)

# Don't change the code in this block
colors = [
    'black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan',
    'orange', 'gray'
]

for i in range(len(colors)):
    x = reduced_data_lda[:, 0][digits.target == i]
    y = reduced_data_lda[:, 1][digits.target == i]
    plt.scatter(x, y, marker='o', s=20, facecolors=colors[i], edgecolors='k')

# LDA Scatter Plot
plt.legend(digits.target_names,
           bbox_to_anchor=(1.05, 1),
           loc=2,
           borderaxespad=0.)
plt.xlabel('First Linear Discriminant')
plt.ylabel('Second Linear Discriminant')
plt.title("LDA Scatter Plot")
plt.show()

def PCAPlot():
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
    tagset = set([])
    tags = ["i2vtags", "mstags", "gotags"]
    #    tags = ["gotags"]
    for tag in tags:
        for item in anime:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)
        for item in jpop:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)
    idtag = list(tagset)
    idtag.sort()
    idtag = ["anime/jpop"] + idtag
    tagid = {}
    for id, tag in enumerate(idtag):
        tagid[tag] = id
    feature = np.zeros((len(jpop) * 2, len(idtag) - 1))
    cnt = 0
    for item in anime[:len(jpop)]:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1
    for item in jpop:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1
    from sklearn.decomposition import PCA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    pca = PCA(n_components=2)
    xtr = pca.fit_transform(feature)
    plt.scatter(xtr[:len(jpop), 0],
                xtr[:len(jpop), 1],
                color="red",
                label="anime")
    plt.scatter(xtr[len(jpop):, 0],
                xtr[len(jpop):, 1],
                color="blue",
                label="jpop")
    plt.legend()
    plt.savefig("pca.png")
    plt.show()
    target = [0] * len(jpop) + [1] * len(jpop)
    xtr1, xte1, ytr1, yte1 = train_test_split(feature[:len(jpop)],
                                              [0] * len(jpop),
                                              test_size=0.2)
    xtr2, xte2, ytr2, yte2 = train_test_split(feature[len(jpop):],
                                              [1] * len(jpop),
                                              test_size=0.2)
    xtr = list(xtr1) + list(xtr2)
    xte = list(xte1) + list(xte2)
    ytr = list(ytr1) + list(ytr2)
    yte = list(yte1) + list(yte2)
    lda = LinearDiscriminantAnalysis()
    ytrp = lda.fit_transform(xtr, ytr)
    ytep = lda.transform(xte)
    print(lda.score(xtr, ytr), lda.score(xte, yte))
    plt.subplot(2, 1, 1)
    plt.hist(ytrp[:len(ytrp) // 2],
             density=True,
             bins=50,
             alpha=0.3,
             label="anime",
             color="red")
    plt.hist(ytrp[len(ytrp) // 2:],
             density=True,
             bins=50,
             alpha=0.3,
             label="jpop",
             color="blue")
    plt.xlabel("train")
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.hist(ytep[:len(ytep) // 2],
             density=True,
             bins=50,
             range=(-20, 20),
             alpha=0.3,
             label="anime",
             color="red")
    plt.hist(ytep[len(ytep) // 2:],
             density=True,
             bins=50,
             range=(-20, 20),
             alpha=0.3,
             label="jpop",
             color="blue")
    plt.xlabel("test")
    plt.legend()
    plt.savefig("lda.png")
    plt.show()
Exemple #33
0
class VectorNormalizer(BaseEstimator, TransformerMixin):
    """ Perform of sequence of normalization as following
    -> Centering: Substract sample mean
    -> Whitening: using within-class-covariance-normalization
    -> Applying LDA (optional)
    -> Length normalization

  Parameters
  ----------
  centering : bool (default: True)
    mean normalized the vectors
  wccn : bool (default: True)
    within class covariance normalization
  lda : bool (default: True)
    Linear Discriminant Analysis
  concat : bool (default: False)
    concatenate original vector to the normalized vector

  Return
  ------
  [nb_samples, feat_dim] if `lda=False`
  [nb_samples, nb_classes - 1] if `lda=True` and `concat=False`
  [nb_samples, feat_dim + nb_classes - 1] if `lda=True` and `concat=True`

  """
    def __init__(self,
                 centering=True,
                 wccn=False,
                 unit_length=True,
                 lda=False,
                 concat=False):
        super(VectorNormalizer, self).__init__()
        self._centering = bool(centering)
        self._unit_length = bool(unit_length)
        self._wccn = bool(wccn)
        self._lda = LinearDiscriminantAnalysis() if bool(lda) else None
        self._feat_dim = None
        self._concat = bool(concat)

    # ==================== properties ==================== #
    @property
    def feat_dim(self):
        return self._feat_dim

    @property
    def is_initialized(self):
        return self._feat_dim is not None

    @property
    def is_fitted(self):
        return hasattr(self, '_W')

    @property
    def enroll_vecs(self):
        return self._enroll_vecs

    @property
    def mean(self):
        """ global mean vector """
        return self._mean

    @property
    def vmin(self):
        return self._vmin

    @property
    def vmax(self):
        return self._vmax

    @property
    def W(self):
        return self._W

    @property
    def lda(self):
        return self._lda

    # ==================== sklearn ==================== #
    def _initialize(self, X, y):
        if not self.is_initialized:
            self._feat_dim = X.shape[1]
        assert self._feat_dim == X.shape[1]
        if isinstance(y, (tuple, list)):
            y = np.asarray(y)
        if y.ndim == 2:
            y = np.argmax(y, axis=-1)
        return y, np.unique(y)

    def normalize(self, X, concat=None):
        """
    Parameters
    ----------
    X : array [nb_samples, feat_dim]
    concat : {None, True, False}
      if not None, override the default `concat` attribute of
      this `VectorNormalizer`
    """
        if not self.is_fitted:
            raise RuntimeError("VectorNormalizer has not been fitted.")
        if concat is None:
            concat = self._concat
        if concat:
            X_org = X[:] if not isinstance(X, np.ndarray) else X
        else:
            X_org = None
        # ====== normalizing ====== #
        if self._centering:
            X = X - self._mean
        if self._wccn:
            X = np.dot(X, self.W)
        # ====== LDA ====== #
        if self._lda is not None:
            X_lda = self._lda.transform(X)  # [nb_samples, nb_classes - 1]
            # concat if necessary
            if concat:
                X = np.concatenate((X_lda, X_org), axis=-1)
            else:
                X = X_lda
        # ====== unit length normalization ====== #
        if self._unit_length:
            X = length_norm(X, axis=-1, ord=2)
        return X

    def fit(self, X, y):
        y, classes = self._initialize(X, y)
        # ====== compute classes' average ====== #
        enroll = compute_class_avg(X, y, classes, sorting=True)
        M = X.mean(axis=0).reshape(1, -1)
        self._mean = M
        if self._centering:
            X = X - M
        # ====== WCCN ====== #
        if self._wccn:
            W = compute_wccn(X, y, classes=None,
                             class_avg=enroll)  # [feat_dim, feat_dim]
        else:
            W = 1
        self._W = W
        # ====== preprocess ====== #
        # whitening the data
        if self._wccn:
            X = np.dot(X, W)
        # length normalization
        if self._unit_length:
            X = length_norm(X, axis=-1)
        # linear discriminant analysis
        if self._lda is not None:
            self._lda.fit(X, y)  # transform maps to [nb_samples, nb_classes - 1]
        # ====== enroll vecs ====== #
        self._enroll_vecs = self.normalize(enroll, concat=False)
        # ====== max min ====== #
        if self._lda is not None:
            X = self._lda.transform(X)
            X = length_norm(X, axis=-1, ord=2)
        vmin = X.min(0, keepdims=True)
        vmax = X.max(0, keepdims=True)
        self._vmin, self._vmax = vmin, vmax
        return self

    def transform(self, X):
        return self.normalize(X)
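
# A minimal usage sketch for VectorNormalizer on synthetic data, assuming the
# helpers it relies on (length_norm, compute_class_avg, compute_wccn) are
# importable; the data below is made up purely for illustration.
_rng = np.random.RandomState(0)
X_demo = _rng.randn(200, 64)              # 200 vectors, 64-dim features
y_demo = _rng.randint(0, 4, size=200)     # 4 classes

normalizer = VectorNormalizer(centering=True, wccn=True, lda=True)
normalizer.fit(X_demo, y_demo)
X_norm = normalizer.transform(X_demo)
# with lda=True and concat=False the output has nb_classes - 1 columns
print(X_norm.shape)                       # expected: (200, 3)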
Exemple #34
0
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)

print('Eigenvalues in decreasing order:\n')
for ev in eigen_pairs:
    print(ev[0])
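
# The eigen_vals / eigen_vecs used above are not defined in this fragment; in
# the usual manual LDA derivation they come from the eigendecomposition of
# inv(S_W) @ S_B, the within- and between-class scatter matrices. A minimal
# sketch, assuming the standardized training data X_train_std and labels
# y_train that the sklearn code below also uses:
d = X_train_std.shape[1]
mean_overall = X_train_std.mean(axis=0)
S_W = np.zeros((d, d))
S_B = np.zeros((d, d))
for c in np.unique(y_train):
    X_c = X_train_std[y_train == c]
    mean_c = X_c.mean(axis=0)
    S_W += np.cov(X_c.T)                          # within-class scatter
    diff = (mean_c - mean_overall).reshape(d, 1)
    S_B += X_c.shape[0] * diff @ diff.T           # between-class scatter
eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)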

# LDA in sklearn
lda = LDA(n_components = 2)
X_train_lda = lda.fit_transform(X_train_std, y_train)

lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier = lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc = 'lower left')
plt.show()

# On test set:
X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier = lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc = 'lower left')
plt.show()
Exemple #35
0
def run_nn_2(X_train, X_test, y_train, y_test, dataset):
    LOGGER.info('running NN...')

    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
            'nn': {
                'iter': 200,
                'hls': 1000,
                'alpha': .0001,
            },
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'nn': {
                'iter': 200,
                'hls': 800,
                'alpha': .1,
            },
        },
    }

    LOGGER.info('NN OG...')
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train, X_test, y_train, y_test, nn, 'OG')
    nn_epochs(X_train.to_numpy(), X_test.to_numpy(), y_train, y_test, nn, 'OG')

    LOGGER.info('NN PCA...')
    pca = PCA(n_components=settings[dataset]['pca'], random_state=0)
    X_train_transformed = pca.fit_transform(X_train)
    X_test_transformed = pca.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'PCA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'PCA')

    LOGGER.info('NN ICA...')
    ica = FastICA(n_components=settings[dataset]['ica'], random_state=0)
    X_train_transformed = ica.fit_transform(X_train)
    X_test_transformed = ica.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'ICA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'ICA')

    LOGGER.info('NN RP...')
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'],
                                random_state=0)
    X_train_transformed = rp.fit_transform(X_train)
    X_test_transformed = rp.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'RP')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'RP')

    LOGGER.info('NN LDA...')
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    X_train_transformed = lda.fit_transform(X_train, y_train)
    X_test_transformed = lda.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'LDA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'LDA')

    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    X_train_transformed = kmeans.fit_transform(X_train)
    X_test_transformed = kmeans.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'KMEANS')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'KMEANS')

    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(X_train)
    X_train_transformed = gmm.predict_proba(X_train)
    X_test_transformed = gmm.predict_proba(X_test)
    # X_train_transformed = gmm.predict(X_train)
    # X_test_transformed = gmm.predict(X_test)
    # print(X_train_transformed)
    # print(X_test_transformed)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'GMM')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'GMM')
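
# nn_check and nn_epochs are project helpers defined elsewhere and not shown
# here. A hypothetical sketch of what nn_check might look like, purely for
# illustration: fit the classifier on the (possibly transformed) training data
# and log train/test accuracy under the given tag.
def nn_check_sketch(X_train, X_test, y_train, y_test, nn, tag):
    nn.fit(X_train, y_train)
    LOGGER.info('NN {} train acc: {:.3f}, test acc: {:.3f}'.format(
        tag, nn.score(X_train, y_train), nn.score(X_test, y_test)))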
Exemple #36
0
def main():
    dataset = pd.read_csv('../../data.csv', header=0)
    X = dataset.iloc[:, 2:18].values
    y = dataset.iloc[:, 18].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1,
                                                        test_size=0.3)
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    lda = LDA(n_components=2)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)
    print("---------Perceptron---------")
    per = Perceptron(max_iter=50, eta0=.1, random_state=1)
    per.fit(X_train, y_train)
    y_pred1 = per.predict(X_test)
    accuracy1 = ((y_test == y_pred1).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy1)
    print("---------Decision Tree---------")
    clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                         random_state=1,
                                         max_depth=5,
                                         min_samples_leaf=3)
    clf_entropy.fit(X_train, y_train)
    y_pred2 = clf_entropy.predict(X_test)
    accuracy2 = ((y_test == y_pred2).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy2)
    print("---------KNN---------")
    knnn = KNeighborsClassifier(n_neighbors=9,
                                metric='euclidean')  # how did i choose
    #print(len(X_test))
    knnn.fit(X_train, y_train)
    y_pred3 = knnn.predict(X_test)
    accuracy3 = ((y_test == y_pred3).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy3)
    print("---------Logestic Refression---------")
    logreg = LogisticRegression(multi_class='auto')
    logreg.fit(X_train, y_train)
    y_pred4 = logreg.predict(X_test)
    #y_pred_lr_prob = logreg.predict_log_proba(X_test)
    #print(y_pred_lr_prob.shape)
    #print(y_pred_lr_prob)
    accuracy4 = ((y_test == y_pred4).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy4)
    print("---------SVM Linear---------")
    clf1 = svm.SVC(kernel="linear", random_state=1, C=1)
    clf1.fit(X_train, y_train)
    y_pred5 = clf1.predict(X_test)
    accuracy5 = ((y_test == y_pred5).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy5)
    print("---------SVM non-Linear---------")
    #clf = svm.SVC(kernel='rbf', random_state=1, gamma='auto', C=20.0)
    clf2 = svm.SVC(gamma='scale', C=1.0)
    clf2.fit(X_train, y_train)
    y_pred6 = clf2.predict(X_test)
    accuracy6 = ((y_test == y_pred6).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy6)

    print("---------SGD---------")
    sgd = linear_model.SGDClassifier(max_iter=100, tol=1e-3)
    sgd.fit(X_train, y_train)
    y_pred7 = sgd.predict(X_test)
    accuracy7 = ((y_test == y_pred7).sum() / len(y_test) * 100)
    print('accuracy %.2f' % accuracy7)
Exemple #37
0
print(train_y.shape,test_y.shape)
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.compose import ColumnTransformer

le=LabelEncoder()
train_y=le.fit_transform(train_y).reshape(-1,1)

le2=LabelEncoder()
test_y=le2.fit_transform(test_y).reshape(-1,1)
# # ---------------------------------------------------------------------------------

# DIMENSIONALITY REDUCTION USING LDA FOR VISUALIZATION
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda=LDA(n_components=2)
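# Note: scikit-learn expects a 1-D label array here; the column vector produced
# by reshape(-1, 1) above still works but triggers a DataConversionWarning, so
# train_y.ravel() would be the cleaner input.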
train_x=lda.fit_transform(train_x,train_y)
test_x=lda.transform(test_x)
# -----------------------------------------------------------------------------------
# pdb.set_trace()

# COMPARING MODEL
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC

models = []  # --- models will contain tuples whose first element is the name and the second is the model
Exemple #38
0
def _lda(load_file, shrinkage, dim, xv_fold, lbl2idx, idx2lbl, rng,
         shuffle_labels, verbose):
    lda_dict = {}
    results_dictlist = []
    h5_file = h5py.File(load_file, "r")
    pbar = tqdm(h5_file, dynamic_ncols=True, disable=not verbose)
    for name in pbar:
        msg = "shuffled, {:d}d, {}" if shuffle_labels else "{:d}d, {}"
        pbar.set_description(msg.format(dim, name))
        behavior = h5_file[name]["behavior"]
        trial_info_grp = behavior["trial_info"]

        good_cells = np.array(behavior["good_cells"], dtype=int)
        dff = np.array(behavior["dff"], dtype=float)[..., good_cells]
        nt, ntrials, _ = dff.shape

        trial_info = {}
        for k, v in trial_info_grp.items():
            trial_info[k] = np.array(v, dtype=int)

        if not set(lbl2idx.keys()).issubset(set(trial_info.keys())):
            if verbose:
                print("missing some trial types, skipping {} . . . ".format(
                    name))
            continue

        lbls = []
        dff_combined = []
        for trial in lbl2idx:
            cond = trial_info[trial] == 1
            lbls.extend([trial] * cond.sum())
            dff_combined.append(dff[:, cond, :])

        y = np.array([lbl2idx[k] for k in lbls])
        dff_combined = np.concatenate(dff_combined, axis=1)

        vld_indxs = []
        for i in idx2lbl:
            idxs = np.where(y == i)[0]
            nb_vld = int(np.ceil(len(idxs) / xv_fold))
            vld_indxs.extend(random.sample(list(idxs), nb_vld))
        trn_indxs = np.delete(range(len(y)), vld_indxs)
        assert set(trn_indxs).isdisjoint(set(vld_indxs))

        y_trn, y_vld = y[trn_indxs], y[vld_indxs]

        num_samples = np.array(
            [len(np.where(y_trn == i)[0]) for i in idx2lbl.keys()])
        if any(num_samples < 2):
            if verbose:
                print("not enough samples, skipping {} . . .".format(name))
            continue

        performance = np.zeros(nt)
        embedded = np.zeros((nt, len(vld_indxs), dim))

        _clfs = {}
        for t in tqdm(range(nt), leave=False, disable=not verbose):
            x_trn, x_vld = dff_combined[t][trn_indxs], dff_combined[t][
                vld_indxs]
            if shuffle_labels:
                while True:
                    y_shuffled = dc(y_trn)
                    rng.shuffle(y_shuffled)
                    if not np.all(y_shuffled == y_trn):
                        break
                y_trn = y_shuffled
            clf = LinearDiscriminantAnalysis(
                n_components=dim,
                solver='eigen',
                shrinkage=shrinkage,
            ).fit(x_trn, y_trn)
            z = clf.transform(x_vld)
            embedded[t] = z
            _clfs[t] = clf

            y_pred = clf.predict(x_vld)
            performance[t] = matthews_corrcoef(y_vld, y_pred)

        embedded_dict = {
            lbl: embedded[:, y_vld == idx, :]
            for lbl, idx in lbl2idx.items()
        }

        mu0 = embedded.mean(1)
        mu_dict = {lbl: z.mean(1) for lbl, z in embedded_dict.items()}
        scatter_between = {
            lbl: z.shape[1] * norm(
                mu_dict[lbl] - mu0,
                axis=-1,
                keepdims=True,
            )
            for lbl, z in embedded_dict.items()
        }
        scatter_within = {
            lbl: z.shape[1] * np.concatenate(
                tuple(
                    norm(
                        z[:, i, :] - mu_dict[lbl],
                        axis=-1,
                        keepdims=True,
                    ) for i in range(z.shape[1])),
                axis=-1,
            ).mean(-1, keepdims=True)
            for lbl, z in embedded_dict.items()
        }
        sb = np.concatenate(list(scatter_between.values()), axis=-1).sum(-1)
        sw = np.concatenate(list(scatter_within.values()), axis=-1).sum(-1)

        com_distances_dict = {
            lbl: np.concatenate(
                tuple(
                    norm(
                        mu - mu_prime,
                        axis=-1,
                        keepdims=True,
                    ) for mu_prime in mu_dict.values()),
                axis=-1,
            ).sum(-1)
            for lbl, mu in mu_dict.items()
        }
        d = np.concatenate(
            list(
                np.expand_dims(item, axis=-1)
                for item in com_distances_dict.values()),
            axis=-1,
        ).sum(-1)

        data_dict = {
            'name': [name] * nt,
            'timepoint': range(nt),
            'performance': performance,
            'distance': d,
            'sb': sb,
            'sw': sw,
            'J': sb / np.maximum(sw, 1e-8),
        }
        results_dictlist.append(data_dict)
        lda_dict[name] = LDA(name, dff_combined, y, embedded_dict, _clfs)

    # merge all results together, can be used to get df
    results = merge_dicts(results_dictlist)
    results = pd.DataFrame.from_dict(results)
    results = _compute_best_t(results)

    return results, lda_dict
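
# merge_dicts and _compute_best_t are project helpers not shown in this
# fragment. A plausible sketch of merge_dicts, assuming it simply concatenates
# the same-keyed lists from each per-session dict (hypothetical name):
from collections import defaultdict

def merge_dicts_sketch(dictlist):
    merged = defaultdict(list)
    for d in dictlist:
        for key, values in d.items():
            merged[key].extend(values)   # values are lists, ranges, or arrays
    return dict(merged)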
Exemple #39
0
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
a_test_NN = sc.transform(a_test_NN)
unknown_img_NN = sc.transform(unknown_img_NN)

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
a_test_NN = lda.transform(a_test_NN)
unknown_img_NN = lda.transform(unknown_img_NN)

# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
Exemple #40
0
def sklearn_lda(x, y, nComponent=None):
    lda = LinearDiscriminantAnalysis(n_components=nComponent)
    lda.fit(x, y)
    newx = lda.transform(x)
    data_plot2d(newx, y)
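
# data_plot2d is not defined in this fragment; a hypothetical sketch of the
# 2-D class-colored scatter it might produce (assumes the projection has at
# least two columns):
def data_plot2d_sketch(X2d, y):
    for label in np.unique(y):
        mask = (y == label)
        plt.scatter(X2d[mask, 0], X2d[mask, 1], s=20, label=str(label))
    plt.legend(loc='best')
    plt.show()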
Exemple #41
0
    X = data

    for n_cluster in range(2, 80):
        kmeans = KMeans(n_clusters=n_cluster).fit(X)
        label = kmeans.labels_
        sil_coeff = silhouette_score(X, label, metric='euclidean')
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            n_cluster, sil_coeff))


X, names, y = load_data()
fig = plt.figure()
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
X_new = lda.transform(X)
show_result_sc(X_new)
# 2D
# plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y)

# 3D
# ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=20)
# plt.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o', c=y)

# for label, x, y in zip(names, X_new[:, 0], X_new[:, 1]):
#     plt.annotate(
#         label,
#         xy=(x, y),
#         xytext=(-20, 20),
#         textcoords='offset points',
#         ha='right',
Exemple #42
0
##### CREDIT DATASET ######

### CHANGE THE FILEPATH TO YOUR FILE ###
data = pd.read_csv('../datasets/credit.csv')

### CHANGE 'hand' TO YOUR TARGET FEATURE
X = data.drop('default', axis=1)
y = data.default

numOfFeatures = 25

model = LDA(n_components=numOfFeatures, store_covariance=True)
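# Note: scikit-learn's LDA keeps at most min(n_classes - 1, n_features)
# components, so if 'default' is a binary target the 25 requested here cannot
# be honored (newer sklearn versions raise an error in that case).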

model.fit(X, y)

LDAComponents = model.transform(X)

# var = np.cumsum(np.round(model.explained_variance_ratio_, decimals=3) * 100)

cov = model.covariance_

eigvals, eigvecs = np.linalg.eig(cov)
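# For a symmetric covariance matrix, np.linalg.eigh is the more robust choice:
# it guarantees real eigenvalues, whereas np.linalg.eig can return values with
# tiny imaginary parts due to floating-point asymmetry.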

o = eigvals / float(sum(eigvals)) * 100

o2 = []

for each in o:
    each = round(each, 2)

    o2.append(each)