Example #1
 def __init__(self, **kwargs):
     super(LDA, self).__init__()
     super(LDA, self).SetModel(LinearDiscriminantAnalysis(**kwargs))
    target = D[:, target_list]
    # print(target)                  # target class labels

    Sample = D[:, :target_list]

    # (LeavePOut would create all the possible training/test sets by removing
    # p samples from the complete set; ShuffleSplit below instead draws a fixed
    # number of random train/test splits.)
    SSlit = ShuffleSplit(n_splits=5, test_size=0.3)
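    # Hedged sketch (not in the original fragment): each call to split() on the
    # ShuffleSplit above yields random train/test index arrays over the rows of
    # Sample, which could be consumed like this:
    # for train_idx, test_idx in SSlit.split(Sample, target):
    #     X_tr, X_te = Sample[train_idx], Sample[test_idx]
    #     y_tr, y_te = target[train_idx], target[test_idx]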

    # clf = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto')      # SVM classification model

    # Classification models
    clf_svm1 = svm.SVC(kernel='rbf', gamma='scale')
    clf_svm2 = svm.SVC(kernel='linear', gamma='scale')
    clf_tree = tree.DecisionTreeClassifier(criterion="gini")
    clf_lda = LinearDiscriminantAnalysis(solver="svd",
                                         n_components=ldaNum,
                                         store_covariance=True,
                                         tol=1.0e-4)
    clf_knn = neighbors.KNeighborsClassifier(n_neighbors=5,
                                             weights='uniform',
                                             algorithm='auto',
                                             leaf_size=1,
                                             p=2,
                                             metric='minkowski',
                                             metric_params=None)
    clf_NN = MLPClassifier(hidden_layer_sizes=(10, ),
                           activation='logistic',
                           solver='lbfgs',
                           alpha=0.0001,
                           batch_size='auto',
                           learning_rate='adaptive',
                           max_iter=200,
Example #3
    else:
        stability_idx = np.load(stability_idx_path)
        data = data[:, stability_idx]

#%% Decoding Main Part

decoding_method = 'nn'
#======================== For Sklearn Classifier =======================
if decoding_method == 'sklearn':
    info = pd.DataFrame(columns=['single', 'mean'])
    param_grid = gen_param_grid('lda')

    # make pipeline
    if voxel_selection_method == 'stability':
        pipe = Pipeline([('classifier',
                          LinearDiscriminantAnalysis(solver='lsqr',
                                                     shrinkage=0.9))])
    elif voxel_selection_method == 'discrim':
        pipe = Pipeline([('feature_selection',
                          SelectPercentile(percentile=25)),
                         ('classifier',
                          LinearDiscriminantAnalysis(solver='lsqr',
                                                     shrinkage=0.9))])
    # model = LogisticRegression(C=0.001, max_iter=8000, solver='liblinear')
    # selector = RFE(model, n_features_to_select=0.25)

    ### best params after grid searching ###
    # LogisticRegression(C=0.001, max_iter=8000, solver='liblinear')
    # MLPClassifier(hidden_layer_sizes=100, alpha=0.01)
    # SVC(max_iter=8000, C=0.001, kernel='linear', decision_function_shape='ovo')
    # RandomForestClassifier(n_estimators=500)
    # Lasso(alpha=0.01)
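
    ### hedged sketch: how such "best params" could be found with the pipeline above ###
    # (assumption, not in the original script; the shrinkage grid is illustrative
    #  and `data` / `labels` stand in for this script's feature matrix and labels)
    # from sklearn.model_selection import GridSearchCV
    # search = GridSearchCV(pipe,
    #                       param_grid={'classifier__shrinkage': [None, 'auto', 0.5, 0.9]},
    #                       cv=5, scoring='accuracy')
    # search.fit(data, labels)
    # print(search.best_params_, search.best_score_)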
Example #4
def sklearn_lda(x, y, nComponent=None):
    lda = LinearDiscriminantAnalysis(n_components=nComponent)
    lda.fit(x, y)
    newx = lda.transform(x)
    data_plot2d(newx, y)
#%%
# CART(classification and regression trees) Classification
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = DecisionTreeClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%
#Gaussian Naive Bayes
model = GaussianNB()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%
#SVM
model = SVC()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%
#LDA
model = LinearDiscriminantAnalysis()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%
#K-Nearest Neighbor
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
Example #6
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)

for train_index, test_index in sss.split(train.values, labels):
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel='rbf', C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("=" * 30)
    print(name)

    print('****Result****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
Example #7
# method 1: Feature Selection (wrapper approaches; a hedged sketch follows this list)
* Backward Elimination
* Forward Selection
* Bidirectional Elimination
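
# Hedged sketch of method 1: recursive feature elimination (a backward-style
# wrapper) with a linear SVM. The estimator choice, the feature count of 10,
# and the names X_train_fs / X_test_fs are illustrative assumptions, not from
# the original notes.
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

rfe = RFE(LinearSVC(dual=False, max_iter=5000), n_features_to_select=10)
X_train_fs = rfe.fit_transform(X_train, y_train)
X_test_fs = rfe.transform(X_test)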

# method 2: Feature Extraction

################################# PCA Reduction ################
from sklearn.decomposition import PCA  # linear dimensionality reduction
pca = PCA(n_components=None)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_


################################# LDA Reduction ################
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

################################# applying kernel_pca ################
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)
Example #8
def run_16(X_train, X_test, y_train, y_test, dataset):
    LOGGER.info('running 16...')

    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
        },
    }

    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]

    pca = PCA(n_components=settings[dataset]['pca'])
    pca.fit(X_train)
    ica = FastICA(n_components=settings[dataset]['ica'])
    ica.fit(X_train)
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'])
    rp.fit(X_train)
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    lda.fit(X_train, y_train)
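
    # KMeans on each reduced representation (PCA, ICA, RP, LDA): elbow analysis
    # with KElbowVisualizer, then external cluster-validity scores on the test labels.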

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(pca.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_pca_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(pca.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans PCA {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(ica.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_ica_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(ica.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans ICA {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(rp.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_rp_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(rp.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans RP {}: \n{}'.format(dataset, cluster_validation_df))

    plt.clf()
    visualizer = KElbowVisualizer(KMeans(),
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(lda.transform(X_train))
    # visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/p16/km_lda_' + dataset + '.png')
    # visualizer.poof()
    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    kmeans.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], kmeans.predict(lda.transform(X_test)))
    # print(cluster_validation_df)
    LOGGER.info('KMeans LDA {}: \n{}'.format(dataset, cluster_validation_df))
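
    # GaussianMixture (EM) on each reduced representation: sweep k with the
    # Calinski-Harabasz score, plot the curve, then score the chosen model
    # against the test labels.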

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(pca.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            pca.transform(X_train), predY)
    LOGGER.info('gmm pca max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'pca', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(pca.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(pca.transform(X_test)))
    LOGGER.info('GMM PCA {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(ica.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            ica.transform(X_train), predY)
    LOGGER.info('gmm ica max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'ica', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(ica.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(ica.transform(X_test)))
    LOGGER.info('GMM ICA {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(rp.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            rp.transform(X_train), predY)
    LOGGER.info('gmm rp max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'rp', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(rp.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(rp.transform(X_test)))
    LOGGER.info('GMM RP {}: \n{}'.format(dataset, cluster_validation_df))

    gmm = GaussianMixture(random_state=0)
    score_df = pd.DataFrame()
    k_max = 100
    for k in range(2, k_max):
        gmm.set_params(n_components=k)
        predY = gmm.fit_predict(lda.transform(X_train))
        score_df.loc[k, 'score'] = calinski_harabasz_score(
            lda.transform(X_train), predY)
    LOGGER.info('gmm lda max score on {}: k={}'.format(
        dataset,
        score_df.idxmax(axis=0)['score']))
    plt.clf()
    plt.title("calinski_harabasz_Expectation_Maximization")
    plt.xlabel('k')
    plt.ylabel('score')
    plt.plot(score_df.reset_index()['index'],
             score_df['score'],
             label='calinski_harabasz_score')
    plt.legend(loc="best")
    plt.savefig('plots/p16/' + '_'.join(['gm', 'lda', dataset, '.png']))
    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(lda.transform(X_train))
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], gmm.predict(lda.transform(X_test)))
    LOGGER.info('GMM LDA {}: \n{}'.format(dataset, cluster_validation_df))
import numpy as np
import pandas as pd
import csv

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

tt = pd.read_csv("dm-hw-m-train.txt", header=None)

index = tt.values[:, 0]
X = tt.values[:, 1:4]
y = tt.values[:, 4]

# Hold-out validation (a k-fold cross-validated check is sketched further below)
clf = LinearDiscriminantAnalysis()

X_train, X_test, y_train, y_test = train_test_split(X, y)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
score = (tp + tn) / (tn + fp + fn + tp)  # accuracy score

print('LDA accuracy score: %f' % score)
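
# Hedged sketch (not in the original script): a k-fold cross-validated accuracy
# for the same LDA model, instead of the single hold-out split used above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=5, scoring='accuracy')
print('LDA 5-fold CV accuracy: %f' % cv_scores.mean())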

# Applying on real test data
clf = LinearDiscriminantAnalysis()

clf.fit(X, y)
def twodim(d):
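    # note: relies on a scaler `sc` and a label vector `y` defined in the enclosing scope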
    lda = LinearDiscriminantAnalysis(n_components=2)
    d = sc.fit_transform(d)
    lda_object = lda.fit(d, y)
    d = lda_object.transform(d)
    return d
Example #11
def run_nn_2(X_train, X_test, y_train, y_test, dataset):
    LOGGER.info('running NN...')

    settings = {
        'wage': {
            'pca': 65,
            'ica': 92,
            'rp': 105,
            'lda': 1,
            'kmeans': 2,
            'gmm': 2,
            'kmeans_ica': 83,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'gmm_ica': 83,
            'nn': {
                'iter': 200,
                'hls': 1000,
                'alpha': .0001,
            },
        },
        'wine': {
            'pca': 12,
            'ica': 12,
            'rp': 13,
            'lda': 2,
            'kmeans': 3,
            'gmm': 3,
            'kmeans_lda': 99,
            'gmm_lda': 99,
            'nn': {
                'iter': 200,
                'hls': 800,
                'alpha': .1,
            },
        },
    }

    LOGGER.info('NN OG...')
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train, X_test, y_train, y_test, nn, 'OG')
    nn_epochs(X_train.to_numpy(), X_test.to_numpy(), y_train, y_test, nn, 'OG')

    LOGGER.info('NN PCA...')
    pca = PCA(n_components=settings[dataset]['pca'], random_state=0)
    X_train_transformed = pca.fit_transform(X_train)
    X_test_transformed = pca.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'PCA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'PCA')

    LOGGER.info('NN ICA...')
    ica = FastICA(n_components=settings[dataset]['ica'], random_state=0)
    X_train_transformed = ica.fit_transform(X_train)
    X_test_transformed = ica.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'ICA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'ICA')

    LOGGER.info('NN RP...')
    rp = SparseRandomProjection(n_components=settings[dataset]['rp'],
                                random_state=0)
    X_train_transformed = rp.fit_transform(X_train)
    X_test_transformed = rp.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'RP')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'RP')

    LOGGER.info('NN LDA...')
    lda = LinearDiscriminantAnalysis(n_components=settings[dataset]['lda'])
    X_train_transformed = lda.fit_transform(X_train, y_train)
    X_test_transformed = lda.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'LDA')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'LDA')
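
    # NN on KMeans features: kmeans.transform() maps each sample to its
    # distances from the cluster centres, which become the classifier inputs.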

    kmeans = KMeans(n_clusters=settings[dataset]['kmeans'], random_state=0)
    X_train_transformed = kmeans.fit_transform(X_train)
    X_test_transformed = kmeans.transform(X_test)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'KMEANS')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'KMEANS')
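
    # NN on GMM features: predict_proba() maps each sample to its per-component
    # posterior probabilities, which become the classifier inputs.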

    gmm = GaussianMixture(n_components=settings[dataset]['gmm'],
                          random_state=0)
    gmm.fit(X_train)
    X_train_transformed = gmm.predict_proba(X_train)
    X_test_transformed = gmm.predict_proba(X_test)
    # X_train_transformed = gmm.predict(X_train)
    # X_test_transformed = gmm.predict(X_test)
    # print(X_train_transformed)
    # print(X_test_transformed)
    nn = MLPClassifier(max_iter=settings[dataset]['nn']['iter'],
                       hidden_layer_sizes=settings[dataset]['nn']['hls'],
                       alpha=settings[dataset]['nn']['alpha'])
    nn_check(X_train_transformed, X_test_transformed, y_train, y_test, nn,
             'GMM')
    nn_epochs(X_train_transformed, X_test_transformed, y_train, y_test, nn,
              'GMM')
Example #12
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = LinearDiscriminantAnalysis(solver='svd')
clf.fit(X, y)
print(clf.predict([[1, 3]]))
print(X.shape)

# A small GaussianNB sanity check on three 2-D points
k = np.array([[1, 1], [2, 2], [3, 3]])
b = np.array([1, 2, 3])
ab = GaussianNB().fit(k, b)
print(ab.predict([[4, 4]]))

cd = tree.DecisionTreeClassifier().fit(X, y)
print(cd.predict([[0, 0]]))
Example #13
    splot.set_xticks(())
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')

for i, (X, y) in enumerate([data_aud_dmn(), data_aud_sal(), data_dmn_sal()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
             'Analysis')
plt.show()
Example #14
    X, y = make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])

    # add non-discriminative features
    if n_features > 1:
        X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
    return X, y


acc_clf1, acc_clf2 = [], []
n_features_range = range(1, n_features_max + 1, step)
for n_features in n_features_range:
    score_clf1, score_clf2 = 0, 0
    for _ in range(n_averages):
        X, y = generate_data(n_train, n_features)

        clf1 = LinearDiscriminantAnalysis(solver='lsqr',
                                          shrinkage='auto').fit(X, y)
        clf2 = LinearDiscriminantAnalysis(solver='lsqr',
                                          shrinkage=None).fit(X, y)

        X, y = generate_data(n_test, n_features)
        score_clf1 += clf1.score(X, y)
        score_clf2 += clf2.score(X, y)

    acc_clf1.append(score_clf1 / n_averages)
    acc_clf2.append(score_clf2 / n_averages)

features_samples_ratio = np.array(n_features_range) / n_train

plt.plot(features_samples_ratio,
         acc_clf1,
         linewidth=2,
print(
    df_.groupby(['Predicted default status',
                 'True default status']).size().unstack('True default status'))
print(classification_report(y, y_pred))

# ### Sklearn

# In[22]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(len(X_test))
# Fit and predict using LDA
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X_train, y_train)

y_pred = lda.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

# The LDA and logistic regression predictions are almost identical, with an 83% accuracy score. The LDA output indicates that π̂1 = 0.84 and π̂2 = 0.44; in other words, 84% of the training observations correspond to credit scores that are not defaulting. It also provides the group means; these are the averages of each predictor within each class, and are used by LDA as estimates of μk.
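
# A small sketch (not in the original notebook) of where those quantities live
# on the fitted scikit-learn estimator: `priors_` holds the estimated class
# priors and `means_` the per-class averages of each predictor.
print('estimated priors:', lda.priors_)   # pi_k estimates
print('group means:\n', lda.means_)       # mu_k estimates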

# In[29]:

X = df[['balance', 'income']].values

y = df.default2.values
Example #16
        #Confusion matrix
        plot_confusion_matrix(y_test, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

    except:
        st.write("Preencha todos os parâmetros")  # i.e. "Fill in all the parameters"

########################################
# LINEAR DISCRIMINANT CLASSIFIER
########################################
if ML_option == "Linear Discriminant Analysis":
    # Fit the model and predict X_test. Show some analysis.
    try:
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train, y_train)
        pred = lda.predict(X_test)
        st.write("R2 Score: ", r2_score(y_test, pred))
        st.write('Mean Absolute Error (MAE):',
                 metrics.mean_absolute_error(y_test, pred))
        st.write('Mean Squared Error (MSE):',
                 metrics.mean_squared_error(y_test, pred))
        st.write('Root Mean Squared Error (RMSE):',
                 np.sqrt(metrics.mean_squared_error(y_test, pred)))
        st.write('Accuracy of Linear Discriminant Analysis on training set: ',
                 lda.score(X_train, y_train))
        st.write('Accuracy of Linear Discriminant Analysis on test set: ',
                 lda.score(X_test, y_test))

        st.subheader("Classification Report")
Example #17
# plt.show()

# scatter_matrix(dataset)
# plt.show()

array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
scoring = 'accuracy'
X_train, X_validation, Y_train, Y_validation = \
    model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Spot Checking
models = [('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()),
          ('NB', GaussianNB()), ('SVM', SVC())]

results = []
names = []

# Shows KNN as the most accurate model
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 Y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
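    names.append(name)
    # Hedged addition (not in the original fragment): report each model's
    # cross-validated accuracy as mean (std).
    print('%s: %.3f (%.3f)' % (name, cv_results.mean(), cv_results.std()))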
kfold = StratifiedKFold(n_splits=10)


# Classifiers (Building Classifier Array)

# In[56]:


random_state = 2
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                      random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(XGBClassifier(random_state=random_state))
cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(estimator=classifier, X=X_train, y=Y_train,
                                      cv=kfold, scoring='accuracy', n_jobs=-1))
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

# Cross Validation Results
cv_res = pd.DataFrame({'CrossValMeans': cv_means,
                       'CrossValErrors': cv_std,
                       'Algorithm': ['SVC', 'DTC', 'RFC', 'KNN', 'LR', 'LDA',
                                     'ADA', 'XT', 'GBC', 'XGB']})
Example #19
                        imus[2].resampled_euler_y[i:-number_of_points + i])
                ], 0)
        out = np.append(out_z_0, out_z_2, 0)
        out = np.append(out, out_x_0, 0)
        out = np.append(out, out_x_2, 0)
        out = np.append(out, out_y_0, 0)
        out = np.append(out, out_y_2, 0)
        out = np.append(out, [dz0[number_of_points:]], 0)
        out = np.append(out, [dz2[number_of_points:]], 0)
        out = np.append(out, [dx0[number_of_points:]], 0)
        out = np.append(out, [dx2[number_of_points:]], 0)
        out = np.append(out, [dy0[number_of_points:]], 0)
        out = np.append(out, [dy2[number_of_points:]], 0)
        out = list(out.T)

        classifier = LinearDiscriminantAnalysis()
        classifier.fit(X, y)
        predicted_values = classifier.predict(out)
        predicted_values = medfilt(predicted_values, filter_size)

        print('Evaluating...')
        evaluated_buttons_timestamp = []
        evaluated_buttons_values = []
        evaluated_predicted_time = []
        evaluated_predicted_values = []
        for i in range(len(buttons_timestamp)):
            if testing_lower_time < buttons_timestamp[i] < testing_upper_time:
                evaluated_buttons_timestamp.append(buttons_timestamp[i])
                evaluated_buttons_values.append(buttons_values[i])
        for i in range(len(t)):
            if testing_lower_time < t[i] < testing_upper_time:
Example #20
def classification(sub):
    temporal_size = 9
    import matplotlib.pyplot as plt
    plt.rcParams["font.family"] = "Times New Roman"
    import seaborn as sns
    sns.set()
    res_val = np.zeros((9, temporal_size))

    for i in range(1, 6):
        train_data = scipy.io.loadmat('competition/rev_3.5_0.5/' + sub + '_' +
                                      str(i) + '_train.mat')
        test_data = scipy.io.loadmat('competition/rev_3.5_0.5/' + sub + '_' +
                                     str(i) + '_test.mat')
        train_x = np.transpose(train_data['train'][0][0][0])
        train_y = np.transpose(train_data['train'][0][0][1])
        test_x = np.transpose(test_data['test'][0][0][0])
        test_y = np.transpose(test_data['test'][0][0][1])

        t_train_x = []
        t_test_x = []
        for k in range(0, 9):
            for j in range(0, temporal_size):
                t_train_x.append(arr_flatten(train_x[:, j, :, k]))
                t_test_x.append(arr_flatten(test_x[:, j, :, k]))

        import feature_selection as FS
        opt_idx = FS.lsvm_wrapper(np.array(t_train_x), train_y)

        cur_train_x = t_train_x[opt_idx]
        cur_test_x = t_test_x[opt_idx]
        lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
        lda.fit(cur_train_x, train_y.argmax(axis=1))
        y_predict = lda.predict(cur_test_x)
        coh = cohen_kappa_score(test_y.argmax(axis=1), y_predict)
        acc = accuracy_score(test_y.argmax(axis=1), y_predict)
        pre = precision_score(test_y.argmax(axis=1),
                              y_predict,
                              average='macro')
        rec = recall_score(test_y.argmax(axis=1), y_predict, average='macro')
        f1 = f1_score(test_y.argmax(axis=1), y_predict, average='macro')
        sen = str(coh) + ',' + str(acc) + ',' + str(pre) + ',' + str(
            rec) + ',' + str(f1)
        pen = open('LSVM_3.5_0.5.csv', 'a')
        pen.write('SVM,' + sub + ',' + str(i) + ',' + str(j) + ',' + sen +
                  '\n')
        pen.close()
    """
    for j in range(len(t_test_x)):
      cur_train_x = t_train_x[j]
      cur_test_x = t_test_x[j]
      lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
      lda.fit(cur_train_x, train_y.argmax(axis=1))
      y_predict = lda.predict(cur_test_x)
      coh = cohen_kappa_score(test_y.argmax(axis=1), y_predict)
      acc = accuracy_score(test_y.argmax(axis=1), y_predict)
      pre = precision_score(test_y.argmax(axis=1), y_predict, average='macro')
      rec = recall_score(test_y.argmax(axis=1), y_predict, average='macro')
      f1 = f1_score(test_y.argmax(axis=1), y_predict, average='macro')
      sen = str(coh) + ',' + str(acc) + ',' + str(pre) + ',' + str(rec) + ',' + str(f1)
      #pen = open('total_2_0.5.csv', 'a')
      #pen.write('SVM,' + sub + ',' + str(i) + ',' + str(j) + ',' + sen + '\n')
      #pen.close()
      y_val = j % temporal_size
      x_val = int(j / temporal_size)
      res_val[x_val, y_val] += coh
  res_val /= 5
  plt.rcParams["font.family"] = "Times New Roman"
  ax = sns.heatmap(res_val, cmap="BuGn", vmin=0.1, vmax=0.85, square=True, annot=True)
  plt.savefig('fig/4.5_0.5/' + sub + '.png', format='png', dpi=1000)
  plt.close()
  """
    print('abc')
# Apply random forest
cverror = []
for e in (10, 40, 80, 100):
    clf = RandomForestClassifier(n_estimators=e)
    scores = cross_validation.cross_val_score(clf,
                                              Xtr_p,
                                              ytr,
                                              cv=5,
                                              scoring='accuracy')
    cverror.append(np.mean(1 - scores))
print("Random Forest tree:")
print((10, 40, 80, 100)[cverror.index(min(cverror, key=float))])
print(min(cverror, key=float))

# Apply LDA
clf = LinearDiscriminantAnalysis()
scores = cross_validation.cross_val_score(clf, Xtr_p, ytr, cv=5)
error = np.mean(1 - scores)
print("LDA:")
print(error)

# Choose the three best methods and then run them on the test dataset
model1 = svm.SVC(C=0.01, kernel='linear', probability=True)
model2 = LogisticRegression(C=0.1)
model3 = LinearDiscriminantAnalysis()
model1.fit(Xtr_p, ytr)
model2.fit(Xtr_p, ytr)
model3.fit(Xtr_p, ytr)

print("Three best model to fit the test:")
print("model1:")
Example #22
def test_api_():
    import os
    os.chdir('E:/Richard/Competition/4c/')
    for i in range(1, 10):
        csp = scipy.io.loadmat('csp/A0' + str(i) + '.mat')['csp'][0][0]
        tdp = scipy.io.loadmat('tdp/A0' + str(i) + '.mat')['tdp'][0][0]
        psd = scipy.io.loadmat('psd/A0' + str(i) + '.mat')['psd'][0][0]
        for j in range(4):

            ctx = np.transpose(csp[0][j])
            cty = np.transpose(csp[1][j]).argmax(axis=1)
            cvx = np.transpose(csp[2][j])
            cvy = np.transpose(csp[3][j]).argmax(axis=1)

            ttx = np.transpose(tdp[0][j])
            tty = np.transpose(tdp[1][j]).argmax(axis=1)
            tvx = np.transpose(tdp[2][j])
            tvy = np.transpose(tdp[3][j]).argmax(axis=1)

            ptx = np.transpose(psd[0][j])
            pty = np.transpose(psd[1][j]).argmax(axis=1)
            pvx = np.transpose(psd[2][j])
            pvy = np.transpose(psd[3][j]).argmax(axis=1)

            from sklearn import svm, linear_model
            from sklearn import ensemble

            mode = ['lsvm', 'ksvm', 'gb', 'srlda']
            data = ['csp', 'tdp', 'psd']

            for cls in mode:
                for d in data:
                    if cls == 'lsvm': lda = svm.LinearSVC()
                    elif cls == 'ksvm': lda = svm.SVC(kernel='linear')
                    elif cls == 'gb':
                        lda = ensemble.GradientBoostingClassifier()
                    elif cls == 'srlda':
                        lda = LinearDiscriminantAnalysis(solver='lsqr',
                                                         shrinkage='auto')
                    if d == 'csp':
                        tx = ctx
                        ty = cty
                        vx = cvx
                        vy = cvy
                    elif d == 'tdp':
                        tx = ttx
                        ty = tty
                        vx = tvx
                        vy = tvy
                    elif d == 'psd':
                        tx = ptx
                        ty = pty
                        vx = pvx
                        vy = pvy
                    lda.fit(tx, ty)
                    y_predict = lda.predict(vx)
                    coh = cohen_kappa_score(vy, y_predict)
                    acc = accuracy_score(vy, y_predict)
                    pen = open('res/res_' + cls + '_' + d + '_f.csv', 'a')
                    pen.write(
                        str(i) + ',' + str(j) + ',' + str(coh) + ',' +
                        str(acc) + '\n')
                    pen.close()
Example #23
    X_train, X_test, y_train, y_test = train_test_split(
        public_data, public_labels, test_size=0.3, random_state=i*500)

    clf = TransformedTargetRegressor(regressor=SVR(kernel='poly'),
                                     transformer=MinMaxScaler())


    #LinearRegression
    steps = [('scaler', StandardScaler()), ('red_dim', PCA()), ('clf', clf)]

    pipeline = Pipeline(steps)
    n_features_to_test = np.arange(1, 11)

    parameteres = [{'scaler':[MinMaxScaler()], 'red_dim':[PCA()], 'red_dim__n_components':list(n_features_to_test),
                    'clf__regressor__C': list(C_range), 'clf__regressor__gamma':['auto', 'scale']+list(gamma_range), 'clf__regressor__degree':[2, 3]},
                    {'scaler':[MinMaxScaler()], 'red_dim':[LinearDiscriminantAnalysis()], 'red_dim__n_components':[2],
                    'clf__regressor__C': list(C_range), 'clf__regressor__gamma':['auto', 'scale']+list(gamma_range), 'clf__regressor__degree':[2, 3]},
                    {'scaler':[MinMaxScaler()], 'red_dim':[None],
                    'clf__regressor__C': list(C_range), 'clf__regressor__gamma':['auto', 'scale']+list(gamma_range), 'clf__regressor__degree':[2, 3]}]

    grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error')
    grid.fit(X_train, y_train)

    score_train = grid.score(X_train, y_train)
    score_test = grid.score(X_test, y_test)
    best_p = grid.best_params_

    bp = pd.DataFrame(best_p, index=[i])
    bp['MAE_train'] = -score_train
    bp['MAE_test'] = -score_test
    bp['random_state'] = i*500
Example #24
import csv
import sys
import nltk
# import nltk.tokenize.casual
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
import re
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from nltk.classify.util import apply_features, accuracy
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from nltk.classify.scikitlearn import SklearnClassifier

classif = SklearnClassifier(LinearDiscriminantAnalysis())

tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)

train_file = sys.argv[1]
test_file = sys.argv[2]

emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
Example #25
array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 Y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
Example #26
def ml(hursts,bt,pfds,hfd,targets,nof):

	data = np.zeros((nof,16))	
	for i in range (0,int(nof)):
		for y in range (0,4):
		
			data[i,y] = hursts[y,i]
			data[i,y+4] = bt[y,i]
			data[i,y+8]=pfds[y,i]
			data[i,y+12]=hfd[y,i]		
		
	#print(data)
	clf = svm.SVC(kernel='linear', C=100,class_weight={2:3}) #support v
	clf_lda = LinearDiscriminantAnalysis()
	#clf = joblib.load('classifier.pkl') 	
	
	targets2=np.zeros((len(targets)))
	data2=np.zeros((len(data)))
	
	for i in range(0,len(data)):	
		targets2[i] = int(targets[i])
#	print(targets2.ravel())
#	y = label_binarize(targets2.ravel(), classes=[1, 2])
#	print(y)
#	n_classes = y.shape[1]
	
#	X_train, X_test, y_train, y_test = train_test_split(data,y.ravel(), test_size=.5)
#	y_score = clf.fit(X_train, y_train).decision_function(X_test)
#	y_score2 = clf_lda.fit(X_train, y_train).decision_function(X_test)
	
#	fpr = dict()
#	tpr = dict()
#	roc_auc = dict()

	
	#	fpr, tpr, _ = roc_curve(y_test, y_score)
#	fpr2, tpr2, _ = roc_curve(y_test, y_score2)
#	roc_auc = auc(fpr, tpr)
#	roc_auc2 = auc(fpr2, tpr2)
	
		

	#plt.figure()
	#lw = 2
	#plt.plot(fpr, tpr, color='darkorange',
#			 lw=lw, label='ROC curve SVM (area = %0.2f)' % roc_auc)
#	plt.plot(fpr2, tpr2, color='green',
#			 lw=lw, label='ROC curve LDA(area = %0.2f)' % roc_auc2)
#	plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
#	plt.xlim([0.0, 1.0])
#	plt.ylim([0.0, 1.05])
#	plt.xlabel('False Positive Rate')
#	plt.ylabel('True Positive Rate')
#	plt.title('Receiver operating characteristic')
#	plt.legend(loc="lower right")	
#	plt.show()
	
	

	
	
		
	
	targets2 = np.reshape(targets2,(len(data),1))
	#print (targets2)
	
	
	for i in range (0,int(nof)):
		if(np.all(np.isfinite(data[i]))==False):
			for y in range (0,len(data[i])):
				if(np.isnan(data[i,y])):					
					data[i,y] = 0.4 
		
		
	#parameters = {'kernel': ('linear', 'rbf'), 'C': [50,60,70,80,90,100,110,120,130,140,150,300,400]}
	#svr = svm.SVC()
	#clf8 = grid_search.GridSearchCV(svr, parameters)
	
	c, r = targets2.shape
	targets2 = targets2.reshape(c,)
	#clf8.fit(data, targets2)
	#print(clf8.best_params_)
	#time.sleep(10)
		
	clf.fit(data, targets2)
	clf_lda.fit(data, targets2)
	
#	for i in range (0,len(data)):
		#print(data[i].reshape(1,-1))
#		a=clf.predict(data[i].reshape(1,-1))
#		b=clf_lda.predict(data[i].reshape(1,-1))
		
#		if(a==[1.]):
#			print('concentrated')
#		else:
#			print('distracted')
		
#		if(b==[1.]):
#			print('lda concentrated')
#		else:
#			print('lda distracted')
		
	joblib.dump(clf, 'classifier.pkl') 
	joblib.dump(clf_lda, 'classifier_lda.pkl') 
print("Accuracy of  PassiveAggressiveClassifier=", accuracy_score(y_test,pac_pred),"\n")
print("Classification of   PassiveAggressiveClassifier\n\n",classification_report(y_test,pac_pred),"\n")
print("Confusion matrix of  PassiveAggressiveClassifier\n\n\n",confusion_matrix(y_test,pac_pred))


# In[72]:


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# In[73]:


lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)


# In[74]:


lda_pred = lda.predict(X_test)


# In[75]:


print("Accuracy of LinearDiscriminantAnalysis =", accuracy_score(y_test,lda_pred),"\n")
print("Classification report of LinearDiscriminantAnalysis\n\n",classification_report(y_test,lda_pred),"\n")
print("Confusion matrix of LinearDiscriminantAnalysis\n\n\n",confusion_matrix(y_test,lda_pred))
                                                    ypred_BRF_ds,
                                                    average='weighted')
recall_score_wt_BRF_ds = metrics.recall_score(y_test,
                                              ypred_BRF_ds,
                                              average='weighted')

print('F1-score_micro = ', f1_score_micro_BRF_ds)
print('F1-score = ', f1_score_wt_BRF_ds)
print('Precision = ', precision_score_wt_BRF_ds)
print('Recall Score = ', recall_score_wt_BRF_ds)

# ###### Bagging with LDA

# In[34]:

clf = BaggingClassifier(LinearDiscriminantAnalysis())

clf.fit(X1, ds_ytrain)
ypred_BLDA_ds = clf.predict(X2)

# In[35]:

print(
    '********Bagging with LDA Classifier, Sampled, Standard Scaled, Variance threshold',
    '********')

f1_score_micro_BLDA_ds = metrics.f1_score(y_test,
                                          ypred_BLDA_ds,
                                          average='micro')
f1_score_wt_BLDA_ds = metrics.f1_score(y_test,
                                       ypred_BLDA_ds,
Example #29
#DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier

steps = [('scaler', StandardScaler()), ('red_dim', PCA()),
         ('clf', DecisionTreeClassifier())]

from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps)

n_features_to_test = np.arange(1, 11)

parameteres = [{
    'scaler': scalers_to_test,
    'red_dim': [LinearDiscriminantAnalysis()],
    'red_dim__n_components': [2],
    'clf__criterion': ['gini', 'entropy']
}, {
    'scaler': scalers_to_test,
    'red_dim': [PCA()],
    'red_dim__n_components': n_features_to_test,
    'clf__criterion': ['gini', 'entropy']
}]

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5, verbose=1)

grid.fit(X_train, y_train)
Example #30
 def __init__(self, configs: object):
     super().__init__(configs.model.model_name, configs.device)
     from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
     self.lda_cls = LinearDiscriminantAnalysis()