Example #1
0
def main():
    """Run every dimensionality-reduction experiment (PCA, ICA, RP, Select-K)
    on the wine-quality and credit-card-default data sets."""
    # --- Wine quality data set (11 features) ---
    wine_file = "data/winequality-red.csv"
    features, labels = extractData(wine_file)
    reducer = Decomposing()
    reducer.pca_dim_reduction(features, labels, 'wine', 11)
    reducer.pca_eval(features, labels, 'wine', 5, 'Rating')

    reducer.ica_dim_reduction(features, labels, 'wine', 11)
    reducer.ica_eval(features, labels, 'wine', 11, 'Rating')

    best_n = reducer.rp_dim_reduction(features, labels, 'wine', 11)
    print(f'Optimal n for RP(Wine data set) : {best_n} ')
    reducer.rp_eval(features, labels, 'wine', best_n, 'Rating')

    best_k = reducer.sk_dim_reduction(features, labels, 'wine', 11)
    # decompose.sk_eval(x, y, 'wine', k)

    # --- Credit-card default data set (24 features) ---
    default_file = "data/default_of_credit_card_clients.csv"
    features, labels = extractData(default_file)
    reducer = Decomposing()
    reducer.pca_dim_reduction(features, labels, 'default', 24)
    reducer.pca_eval(features, labels, 'default', 10, 'Defaulted')

    reducer.ica_dim_reduction(features, labels, 'default', 24)
    reducer.ica_eval(features, labels, 'default', 6, 'Defaulted')

    best_n = reducer.rp_dim_reduction(features, labels, 'default', 24)
    print(f'Optimal n for RP(CC Default data set) : {best_n} ')
    reducer.rp_eval(features, labels, 'default', best_n, 'Defaulted')

    best_k = reducer.sk_dim_reduction(features, labels, 'default', 24)
Example #2
0
    def classify(self, data_file, encode, label):
        """Grid-search, train and evaluate an MLP neural network on *data_file*.

        Parameters
        ----------
        data_file : str
            Path of the CSV data set loaded via extractData().
        encode : bool
            When True, label-encode non-numeric feature columns and the target.
        label : str
            Data-set label used in plot titles/filenames.
        """
        X, Y = extractData(data_file)

        enc = LabelEncoder()

        if encode:
            # BUG FIX: the old test `str(X.iloc[1][c]).isnumeric()` returns False
            # for floats ("7.4") and negatives ("-1"), so genuinely numeric
            # columns could get label-encoded by mistake. Probe with float()
            # instead, and use X.iloc[1, c] rather than the deprecated chained
            # indexing X.iloc[1][c].
            def _is_numeric(value):
                try:
                    float(value)
                    return True
                except (TypeError, ValueError):
                    return False

            cols = X.shape[1]
            for c in range(cols):
                if not _is_numeric(X.iloc[1, c]):
                    X.iloc[:, c] = enc.fit_transform(X.iloc[:, c])
            Y = enc.fit_transform(Y)

        train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3)

        nn = MLPClassifier()
        parameter_grid = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
                          'solver': ['lbfgs', 'sgd', 'adam'],
                          'hidden_layer_sizes': [
                              (5,), (10,), (15,), (20,)
                          ]
                          }
        start = timer()
        classifier, grid_search = getBestModel(nn, parameter_grid, train_x, train_y)
        classify_model = classifier.fit(train_x, train_y)
        pred_y = classify_model.predict(test_x)
        end = timer()
        print('Elapsed time of train and test : ' + str(end - start))
        # Held-out test-set accuracy, as a percentage.
        accuracy = metrics.accuracy_score(test_y, pred_y) * 100

        print('Accuracy of Neural Network = {:.2f}%'.format(accuracy))
        plotValidationCurve("Neural Network", label, grid_search, train_x, train_y, parameter_grid)
        plotLearningCurve("Neural Network", label, classifier, X, Y)
        plotPerformance(test_y, pred_y, label, 'Algorithm: Neural Network')
Example #3
0
def main():
    """Cluster the wine-quality and credit-card-default data sets with k-Means
    and Expectation Maximization, and tabulate adjusted mutual-info scores."""
    summary = pd.DataFrame(columns=[
        'Data Set', 'Cluster Algo.', '# Clusters', 'Mutual Info Score'
    ])
    clustering = Clustering()

    # --- Wine quality data set ---
    features, target = extractData("data/winequality-red.csv")
    features = pd.DataFrame(preprocessing.scale(features), columns=features.columns)

    clustering.kMeansCluster('wine', features, target, 6, 10)
    best_km = clustering.kmeansFitBestModel(6, 10, 'wine', features, 5)
    km_score = adjusted_mutual_info_score(best_km.labels_, target)
    summary = append_results(summary, ['Wine Quality', 'k-Means', 5, km_score])

    clustering.emCluster('wine', features, target, 6, 10)
    best_em = clustering.emFitBestModel(6, 10, 'wine', 5, features)
    em_score = adjusted_mutual_info_score(best_em.predict(features), target)
    summary = append_results(summary,
                             ['Wine Quality', 'Exp. Maximization', 5, em_score])

    # --- Credit-card default data set ---
    features, target = extractData("data/default_of_credit_card_clients.csv")
    features = pd.DataFrame(preprocessing.scale(features), columns=features.columns)
    bill_cols = [
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
        'BILL_AMT6'
    ]
    pmt_cols = [
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'
    ]
    num_cols = features.shape[1]
    # Add aggregate bill/payment features before clustering.
    features['total_bill'] = features.loc[:, bill_cols].sum(axis=1)
    features['total_pmt'] = features.loc[:, pmt_cols].sum(axis=1)

    clustering.kMeansCluster('default', features, target, num_cols, num_cols + 1)
    best_km = clustering.kmeansFitBestModel(num_cols, num_cols + 1, 'default', features, 10)
    km_score = adjusted_mutual_info_score(best_km.labels_, target)
    summary = append_results(summary, ['CC Default', 'k-Means', 10, km_score])

    clustering.emCluster('default', features, target, num_cols, num_cols + 1)
    best_em = clustering.emFitBestModel(num_cols, num_cols + 1, 'default', 15, features)
    em_score = adjusted_mutual_info_score(best_em.predict(features), target)
    summary = append_results(summary,
                             ['CC Default', 'Exp. Maximization', 15, em_score])

    print(summary)
    print(summary.to_latex())
Example #4
0
    def classify(self, data_file, encode, label):
        """Grid-search, train and evaluate a decision-tree classifier.

        Parameters
        ----------
        data_file : str
            Path of the CSV data set loaded via extractData().
        encode : bool
            When True, label-encode non-numeric feature columns and the target.
        label : str
            Data-set label used in plot titles/filenames.
        """
        X, Y = extractData(data_file)

        enc = LabelEncoder()

        if encode:
            # BUG FIX: `str(X.iloc[1][c]).isnumeric()` returns False for floats
            # and negatives, so numeric columns could be label-encoded by
            # mistake; probe with float() instead, and avoid the deprecated
            # chained indexing X.iloc[1][c].
            def _is_numeric(value):
                try:
                    float(value)
                    return True
                except (TypeError, ValueError):
                    return False

            cols = X.shape[1]
            for c in range(cols):
                if not _is_numeric(X.iloc[1, c]):
                    X.iloc[:, c] = enc.fit_transform(X.iloc[:, c])
            Y = enc.fit_transform(Y)

        train_x, test_x, train_y, test_y = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=123)

        # Find best model via grid search.
        dct = tree.DecisionTreeClassifier()
        parameter_grid = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': range(1, 10),
            'max_features': range(1, 5),
            'min_samples_leaf': range(1, 5)
        }
        start = timer()
        classifier, grid_search = getBestModel(dct, parameter_grid, train_x,
                                               train_y)
        classify_model = classifier.fit(train_x, train_y)
        pred_y = classify_model.predict(test_x)
        end = timer()
        print('Elapsed time of train and test : ' + str(end - start))
        # BUG FIX: previously reported grid_search.best_score_ (cross-validation
        # accuracy on training folds) as the model's accuracy even though the
        # held-out test set was already predicted; report test-set accuracy,
        # consistent with the other classifiers in this file.
        accuracy = classify_model.score(test_x, test_y) * 100

        print('Accuracy of Decision Tree(depth={}) = {:.2f}%'.format(
            classify_model.get_depth(), accuracy))

        plotValidationCurve("Decision Tree", label, grid_search, train_x,
                            train_y, parameter_grid)
        plotLearningCurve("Decision Tree", label, classifier, X, Y)
        plotPerformance(test_y, pred_y, label, 'Algorithm: Decision Tree')
Example #5
0
    def classifyWithBoost(self, data_file, encode, label):
        """Grid-search, train and evaluate a gradient-boosted tree classifier.

        Parameters
        ----------
        data_file : str
            Path of the CSV data set loaded via extractData().
        encode : bool
            When True, label-encode non-numeric feature columns and the target.
        label : str
            Data-set label used in plot titles/filenames.
        """
        X, Y = extractData(data_file)

        enc = LabelEncoder()

        if encode:
            # BUG FIX: `str(X.iloc[1][c]).isnumeric()` returns False for floats
            # and negatives, so numeric columns could be label-encoded by
            # mistake; probe with float() instead, and avoid the deprecated
            # chained indexing X.iloc[1][c].
            def _is_numeric(value):
                try:
                    float(value)
                    return True
                except (TypeError, ValueError):
                    return False

            cols = X.shape[1]
            for c in range(cols):
                if not _is_numeric(X.iloc[1, c]):
                    X.iloc[:, c] = enc.fit_transform(X.iloc[:, c])
            Y = enc.fit_transform(Y)

        train_x, test_x, train_y, test_y = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=123)
        dctb = GradientBoostingClassifier(random_state=123)
        parameter_grid = {
            'learning_rate': [0.2, 0.3, 0.5],
            'max_depth': [2, 3, 4, 5]
        }
        start = timer()
        classifier, grid_search = getBestModel(dctb, parameter_grid, train_x,
                                               train_y)
        plotValidationCurve("Decision Tree with Boost", label, grid_search,
                            train_x, train_y, parameter_grid)
        plotLearningCurve("Decision Tree with Boost", label, classifier, X, Y)
        # BUG FIX: the final model was fit on the FULL data set (X, Y) and then
        # scored on test_x, leaking the test rows into training and inflating
        # the reported accuracy. Fit on the training split only, as every other
        # classifier in this file does.
        classify_model = classifier.fit(train_x, train_y)
        pred_y = classify_model.predict(test_x)
        end = timer()
        print('Elapsed time of train and test : ' + str(end - start))
        accuracy = classify_model.score(test_x, test_y) * 100

        print('Accuracy of GradientBoostingClassifier = {:.2f}%'.format(
            accuracy))

        plotPerformance(test_y, pred_y, label,
                        'Algorithm: Decision Tree with Boost')
Example #6
0
    def classify(self, data_file, encode, label):
        """Grid-search, train and evaluate a k-nearest-neighbors classifier.

        Parameters
        ----------
        data_file : str
            Path of the CSV data set loaded via extractData().
        encode : bool
            When True, label-encode non-numeric feature columns and the target.
        label : str
            Data-set label used in plot titles/filenames.
        """
        X, Y = extractData(data_file)

        enc = LabelEncoder()

        if encode:
            # BUG FIX: `str(X.iloc[1][c]).isnumeric()` returns False for floats
            # and negatives, so numeric columns could be label-encoded by
            # mistake; probe with float() instead, and avoid the deprecated
            # chained indexing X.iloc[1][c].
            def _is_numeric(value):
                try:
                    float(value)
                    return True
                except (TypeError, ValueError):
                    return False

            cols = X.shape[1]
            for c in range(cols):
                if not _is_numeric(X.iloc[1, c]):
                    X.iloc[:, c] = enc.fit_transform(X.iloc[:, c])
            Y = enc.fit_transform(Y)

        train_x, test_x, train_y, test_y = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=123)

        knn = KNeighborsClassifier()
        parameter_grid = {'n_neighbors': range(1, 10)}
        start = timer()
        classifier, grid_search = getBestModel(knn, parameter_grid, train_x,
                                               train_y)
        classify_model = classifier.fit(train_x, train_y)
        pred_y = classify_model.predict(test_x)
        # Held-out test-set accuracy, as a percentage.
        accuracy = classify_model.score(test_x, test_y) * 100
        end = timer()
        print('Elapsed time of train and test : ' + str(end - start))
        print('Accuracy of KNN = {:.2f}%'.format(accuracy))

        plotValidationCurve("KNeighbours", label, grid_search, train_x,
                            train_y, parameter_grid)
        plotLearningCurve("KNeighbours", label, classifier, X, Y)

        plotPerformance(test_y, pred_y, label,
                        'Algorithm: k-Nearest Neighbors')
 def data_prep(self, test_size=0.3):
     """Load the credit-card-default data set and split it into train/test sets.

     Stores num_classes, train_x, test_x, train_y and test_y on the instance.
     """
     data_path = "data/default_of_credit_card_clients.csv"
     features, target = extractData(data_path)
     self.num_classes = len(np.unique(target))
     split = train_test_split(features, target, test_size=test_size)
     self.train_x, self.test_x, self.train_y, self.test_y = split
Example #8
0
def main():
    """Run the combined experiments: each dimensionality-reduction algorithm
    (PCA, ICA, RP, Select-K) followed by k-Means and EM clustering on both data
    sets, then compare NN accuracy on raw, reduced and cluster-augmented
    features."""
    exp_results = pd.DataFrame(columns=['Data Set', 'Dim. Red. Algo', 'Cluster Algo.', '# Clusters', 'Mutual Info Score'])
    data_file = "data/winequality-red.csv"
    x, y = extractData(data_file)
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)
    dimreduce = Decomposing()
    cluster = Clustering()

    # --- Wine: PCA ---
    pca_result = pd.DataFrame(dimreduce.pca_eval(x, y, 'experiment/wine/pca', 5, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/pca', pca_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : PCA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'PCA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/pca', 5, pca_result)
    score = adjusted_mutual_info_score(em.predict(pca_result), y)
    print('Wine : PCA - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'PCA', 'Exp. Maximization', 5, score])

    # --- Wine: ICA ---
    ica_result = pd.DataFrame(dimreduce.ica_eval(x, y, 'experiment/wine/ica', 11, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/ica', ica_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : ICA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'ICA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/ica', 5, ica_result)
    score = adjusted_mutual_info_score(em.predict(ica_result), y)
    print('Wine : ICA - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'ICA', 'Exp. Maximization', 5, score])

    # --- Wine: Randomized Projection ---
    rp_result = pd.DataFrame(dimreduce.rp_eval(x, y, 'experiment/wine/rp', 11, 'Rating')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/rp', rp_result.copy(), 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : RP - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'RP', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/rp', 5, rp_result)
    score = adjusted_mutual_info_score(em.predict(rp_result), y)
    print('Wine : RP - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'RP', 'Exp. Maximization', 5, score])

    # --- Wine: Select-K-Best ---
    sk_result = pd.DataFrame(dimreduce.sk_eval(x, y, 8)[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/wine/sk', sk_result.copy(), 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Wine : SK - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'Select-K', 'k-Means', 4, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/wine/sk', 5, sk_result)
    score = adjusted_mutual_info_score(em.predict(sk_result), y)
    print('Wine : SK - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['Wine Quality', 'Select-K', 'Exp. Maximization', 5, score])

    data_file = "data/default_of_credit_card_clients.csv"
    x, y = extractData(data_file)
    x = pd.DataFrame(preprocessing.scale(x), columns=x.columns)

    # --- Default: PCA ---
    pca_result = pd.DataFrame(dimreduce.pca_eval(x, y, 'experiment/default/pca', 10, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/pca', pca_result, 5)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : PCA - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'PCA', 'k-Means', 5, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/pca', 5, pca_result)
    score = adjusted_mutual_info_score(em.predict(pca_result), y)
    print('Default : PCA - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'PCA', 'Exp. Maximization', 5, score])

    # --- Default: ICA ---
    ica_result = pd.DataFrame(dimreduce.ica_eval(x, y, 'experiment/default/ica', 6, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/ica', ica_result, 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : ICA - Kmeans : Score = {}'.format(score))
    # BUG FIX: the k-Means model above was fit with 4 clusters, but this row
    # previously recorded 6 (every other section records the count it fit with).
    exp_results = append_results(exp_results, ['CC Default', 'ICA', 'k-Means', 4, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/ica', 4, ica_result)
    score = adjusted_mutual_info_score(em.predict(ica_result), y)
    print('Default : ICA - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'ICA', 'Exp. Maximization', 4, score])

    # --- Default: Randomized Projection ---
    rp_result = pd.DataFrame(dimreduce.rp_eval(x, y, 'experiment/default/rp', 24, 'Defaulted')[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/rp', rp_result, 10)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : RP - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'RP', 'k-Means', 10, score])
    em = cluster.emFitBestModel(1, 2, 'experiment/default/rp', 10, rp_result)
    score = adjusted_mutual_info_score(em.predict(rp_result), y)
    print('Default : RP - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'RP', 'Exp. Maximization', 10, score])

    # --- Default: Select-K-Best ---
    sk_result = pd.DataFrame(dimreduce.sk_eval(x, y, 6)[0])
    km = cluster.kmeansFitBestModel(1, 2, 'experiment/default/sk', sk_result, 4)
    score = adjusted_mutual_info_score(km.labels_, y)
    print('Default : SK - Kmeans : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'Select-K', 'k-Means', 4, score])
    # BUG FIX: this EM section previously fit and scored on rp_result (a
    # copy-paste from the RP section), so the reported Select-K EM score was
    # actually the RP score; use sk_result as in the wine Select-K section.
    em = cluster.emFitBestModel(1, 2, 'experiment/default/sk', 4, sk_result)
    score = adjusted_mutual_info_score(em.predict(sk_result), y)
    print('Default : SK - EM : Score = {}'.format(score))
    exp_results = append_results(exp_results, ['CC Default', 'Select-K', 'Exp. Maximization', 4, score])

    print(exp_results)
    print(exp_results.to_latex())

    ##### NN: compare accuracy on raw vs. dimension-reduced features
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)

    accuracy_raw = nn_classifier(train_x, train_y, test_x, test_y)

    pca_result, pca = dimreduce.pca_eval(train_x, train_y, 'nn', 10, 'Defaulted')
    test_data = pca.transform(test_x)
    accuracy_pca = nn_classifier(pca_result, train_y, test_data, test_y)

    ica_result, ica = dimreduce.ica_eval(train_x, train_y, 'nn', 6, 'Defaulted')
    test_data = ica.transform(test_x)
    accuracy_ica = nn_classifier(ica_result, train_y, test_data, test_y)

    rp_result, rp = dimreduce.rp_eval(train_x, train_y, 'nn', 24, 'Defaulted')
    test_data = rp.transform(test_x)
    accuracy_rp = nn_classifier(rp_result, train_y, test_data, test_y)

    sk_result, sk = dimreduce.sk_eval(train_x, train_y, 4)
    test_data = sk.transform(test_x)
    accuracy_sk = nn_classifier(sk_result, train_y, test_data, test_y)

    plot_results("Dimension Reduction Algo. Accuracy", 'dim_red_accuracy',
                 ['Original', 'PCA', 'ICA', 'RP', 'SK'],
                 [accuracy_raw, accuracy_pca, accuracy_ica, accuracy_rp, accuracy_sk], 'Dim. Red. Algorithms', 'Accuracy')

    # NN on cluster-derived features: k-Means distances ...
    km = cluster.kmeansFitBestModel(1, 2, 'nn', train_x, 10)
    km_data = km.fit_transform(train_x)
    test_data = km.transform(test_x)
    accuracy_km = nn_classifier(km_data, train_y, test_data, test_y)

    # ... and EM cluster labels appended as one-hot features.
    em = cluster.emFitBestModel(1, 2, 'nn', 15, train_x)
    em_train_labels = em.predict(train_x)
    em_train_ohc = one_hot_encode(em_train_labels, 15)
    em_train = np.concatenate((train_x, em_train_ohc), 1)
    # one hot encode cluster labels to val set
    em_test_labels = em.predict(test_x)
    em_test_ohc = one_hot_encode(em_test_labels, 15)
    em_test = np.concatenate((test_x, em_test_ohc), 1)
    # scale data
    scaler = preprocessing.StandardScaler().fit(em_train)
    em_data = scaler.transform(em_train)
    test_data = scaler.transform(em_test)

    accuracy_em = nn_classifier(em_data, train_y, test_data, test_y)

    plot_results("Clustering Accuracy", 'cluster_accuracy', ['Original', 'k-Means', 'Ex. Maximization'],
                 [accuracy_raw, accuracy_km, accuracy_em], 'Clustering Algorithms',
                 'Accuracy')