Ejemplo n.º 1
0
def vizualizeData(X_svm, y_svm, y_kmeans, y_dbscan, y_agglomerative):
    """Plot 3-component projections of the dataset and of each model's output.

    The full dataset is loaded twice through ``processData``: once with
    ``reduceDim=3`` (used as the PCA view, following the original naming)
    and once without, in which case it is embedded with t-SNE here.
    ``X_svm`` is projected separately with both PCA and t-SNE.

    Every figure is drawn with ``plotInteractive`` when ``isInteractive()``
    is truthy and with ``plotStationary`` otherwise.

    Args:
        X_svm: Samples belonging to the SVM predictions; projected with PCA
            and t-SNE before plotting.
        y_svm: Labels plotted against the ``X_svm`` projections.
        y_kmeans: Labels plotted against the full-dataset projections.
        y_dbscan: Labels plotted against the full-dataset projections.
        y_agglomerative: Labels plotted against the full-dataset projections.
    """
    interactive = isInteractive()

    # Full dataset, 3 dimensions requested from processData (PCA view),
    # together with the true labels.
    pcaPoints, y, classes = processData(encodeLabels=True, testSet=0, reduceDim=3)

    # Full dataset again without reduceDim, embedded with t-SNE.
    rawPoints, y, classes = processData(encodeLabels=True, testSet=0)
    tsnePoints = TSNE(n_components=3).fit_transform(rawPoints)

    # The SVM samples get their own projections.
    svmPcaPoints = PCA(n_components=3).fit_transform(X_svm)
    svmTsnePoints = TSNE(n_components=3).fit_transform(X_svm)

    # (points, labels, title) for every figure, in drawing order.
    figures = [
        (pcaPoints, y, 'PCA'),
        (tsnePoints, y, 'TSNE'),
        (svmPcaPoints, y_svm, 'Support Vector Machines PCA'),
        (svmTsnePoints, y_svm, 'Support Vector Machines TSNE'),
        (pcaPoints, y_kmeans, 'K-means Clustering PCA'),
        (tsnePoints, y_kmeans, 'K-means Clustering TSNE'),
        (pcaPoints, y_dbscan, 'DBSCAN Clustering PCA'),
        (tsnePoints, y_dbscan, 'DBSCAN Clustering TSNE'),
        (pcaPoints, y_agglomerative, 'Agglomerative Clustering PCA'),
        (tsnePoints, y_agglomerative, 'Agglomerative Clustering TSNE'),
    ]

    # Both backends take the same arguments, so choose one and reuse it.
    draw = plotInteractive if interactive else plotStationary
    for points, labels, title in figures:
        draw(points, labels, title)
def analyseData():
    """Plot the most frequent words in the dataset for each author.

    Loads the un-vectorized, un-encoded dataset through ``processData``,
    collects the documents of every author in ``classes``, extracts the ten
    most frequent words with ``get_top_n_words`` and hands them to
    ``plotDistribution`` together with the author.
    """
    # Number of documents taken per author (same constant as the original
    # inner loop).
    booksPerAuthor = 20

    X, y, classes = processData(encodeLabels=False,
                                vectorizerName='None',
                                testSet=0)
    # assert X.shape[0] == 400 and len(y) == 400 and len(classes) == 20
    for i, author in enumerate(classes):
        # The previous index, (i + 1) * (j + 1) - 1, produced the same
        # position for different authors (e.g. index 1 for both i=0, j=1 and
        # i=1, j=0), so authors shared documents. Use a per-author offset.
        # NOTE(review): assumes X stores each author's documents as one
        # contiguous run of `booksPerAuthor`, in the order of `classes` --
        # confirm against processData.
        firstBook = i * booksPerAuthor
        authorBooks = [X[firstBook + j] for j in range(booksPerAuthor)]
        freq = get_top_n_words(authorBooks, n=10)
        plotDistribution(freq, author)
from linearRegression import linearRegression
from kNNRegression import kNNRegression
from evaluateClassification import evaluateClassification
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from analyseData import analyseData
from plotDecisionBoundry import plotDecisionBoundry
from evaluateRegression import evaluateRegression
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData

# Load the pre-processed data with processData's default parameters.
# The call returns the feature split plus separate classification (yClass_*)
# and regression (yReg_*) targets.
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData(
)

# Logistic regression: predict on the test split and score the predictions.
yClass_lr = logisticRegression(X_train, X_test, yClass_train)
lrAc = accuracy_score(yClass_test, yClass_lr)
# Keep the test features used for these predictions; X_test is rebound below.
X_lr = X_test

# Reload the data pre-processed with different parameters
# (normalization='mms'). This rebinds every split variable, so the models
# below see a different X_train / X_test than logistic regression did.
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData(
    normalization='mms')

# Random forest on the 'mms'-normalized split.
yClass_rf = randomForest(X_train, X_test, yClass_train)
rfAc = accuracy_score(yClass_test, yClass_rf)
# Keep the test features belonging to the random-forest predictions.
X_rf = X_test

# k-nearest neighbours, trained and scored on the same 'mms'-normalized split.
yClass_knn = kNN(X_train, X_test, yClass_train)
knnAc = accuracy_score(yClass_test, yClass_knn)
def vizualizeData(X_lr, y_lr, X_rf, y_rf, X_knn, y_knn, X_linReg, y_linReg,
                  X_knnReg, y_knnReg, y_kmeans, y_dbscan, y_agglomerative):
    """Plot 3-component PCA and t-SNE views of the data and every model output.

    The full dataset is loaded with ``processData(trainTestSplit=2)`` and
    projected once with PCA and once with t-SNE; those two projections are
    plotted with the true classification and regression targets and with the
    three clustering results. Each supervised model's sample matrix is
    projected on its own and plotted with that model's predictions.

    Every figure is drawn with ``plotInteractive`` when ``isInteractive()``
    is truthy and with ``plotStationary`` otherwise.

    Args:
        X_lr, y_lr: Samples and labels for logistic regression.
        X_rf, y_rf: Samples and labels for random forests.
        X_knn, y_knn: Samples and labels for k-nearest neighbours.
        X_linReg, y_linReg: Samples and values for linear regression.
        X_knnReg, y_knnReg: Samples and values for kNN regression.
        y_kmeans: Labels plotted against the full-dataset projections.
        y_dbscan: Labels plotted against the full-dataset projections.
        y_agglomerative: Labels plotted against the full-dataset projections.
    """
    interactive = isInteractive()

    # Full dataset and its two targets, flattened to 1-D arrays.
    X, yClass, yReg = processData(trainTestSplit=2)
    classTargets = yClass.values.ravel()
    regTargets = yReg.values.ravel()
    fullPca = PCA(n_components=3).fit_transform(X)
    fullTsne = TSNE(n_components=3).fit_transform(X)

    # Figures that reuse the full-dataset projections, before the models.
    truthViews = [
        (classTargets, 'PCA Classification', 'TSNE Classification'),
        (regTargets, 'PCA Regression', 'TSNE Regression'),
    ]
    # Supervised models: each sample matrix gets its own projections.
    modelViews = [
        (X_lr, y_lr, 'Logistic Regression PCA', 'Logistic Regression TSNE'),
        (X_rf, y_rf, 'Random Forests PCA', 'Random Forests TSNE'),
        (X_knn, y_knn, 'K-nearest neighbors PCA', 'K-nearest neighbors TSNE'),
        (X_linReg, y_linReg, 'Linear Regression PCA', 'Linear Regression TSNE'),
        (X_knnReg, y_knnReg, 'KNN Regression PCA', 'KNN Regression TSNE'),
    ]
    # Clustering results are shown on the full-dataset projections.
    clusterViews = [
        (y_kmeans, 'K-means Clustering PCA', 'K-means Clustering TSNE'),
        (y_dbscan, 'DBSCAN Clustering PCA', 'DBSCAN Clustering TSNE'),
        (y_agglomerative, 'Agglomerative Clustering PCA',
         'Agglomerative Clustering TSNE'),
    ]

    # Build the (points, labels, title) list in drawing order. All
    # projections are computed here, before anything is drawn.
    figures = []
    for labels, pcaTitle, tsneTitle in truthViews:
        figures.append((fullPca, labels, pcaTitle))
        figures.append((fullTsne, labels, tsneTitle))
    for samples, labels, pcaTitle, tsneTitle in modelViews:
        samplesPca = PCA(n_components=3).fit_transform(samples)
        samplesTsne = TSNE(n_components=3).fit_transform(samples)
        figures.append((samplesPca, labels, pcaTitle))
        figures.append((samplesTsne, labels, tsneTitle))
    for labels, pcaTitle, tsneTitle in clusterViews:
        figures.append((fullPca, labels, pcaTitle))
        figures.append((fullTsne, labels, tsneTitle))

    # Both backends take the same arguments, so choose one and reuse it.
    draw = plotInteractive if interactive else plotStationary
    for points, labels, title in figures:
        draw(points, labels, title)
from dataPreprocessing import processData
from svm import svm
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClassification import evaluateClassification
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData
from analyseData import analyseData
"""
# loading pre-processed data
X_train, X_test, y_train, y_test, classes = processData()

y_svm = svm(X_train, X_test, y_train)
evaluateClassification(y_test, y_svm, 'Support Vector Machines', classes)
"""
# Load the whole dataset (testSet=0) with reduceDim=2000 for the clustering
# experiments below.
X, y, classes = processData(testSet=0, reduceDim=2000)

# K-means with random initialisation, evaluated against the true labels.
y_kmeans = kmeans(X, init='random')
evaluateClustering(X, y, y_kmeans, 'K-means Clustering', classes)

# DBSCAN with the wrapper's default parameters.
y_dbscan = dbscan(X)
evaluateClustering(X, y, y_dbscan, 'DBSCAN Clustering', classes)

# Agglomerative clustering with the wrapper's default parameters.
y_agglomerative = agglomerative(X)
evaluateClustering(X, y, y_agglomerative, 'Agglomerative Clustering', classes)

# Visualisation is disabled: X_test and y_svm are only produced by the SVM
# pipeline, which is currently switched off (wrapped in a string literal
# earlier in this script).
#vizualizeData(X_test, y_svm, y_kmeans, y_dbscan, y_agglomerative)

# Plot the per-author word-frequency distributions.
analyseData()