Example #1
import matplotlib.pyplot as plt
import numpy as np

import process_data
# run_k_means_elbow / calculate_k_means_clusters are assumed to live in the
# project's k_means module (Examples #3 and #4 call them as k_means.*)
from k_means import calculate_k_means_clusters, run_k_means_elbow


def main():
    np.random.seed(311)
    # Load the datasets
    x_census_data, y_census_data = process_data.process_census_data('./../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data('./../Datasets/Stocks', ["2016", "2017"])
    stock_params = {
        'n_clusters': [21],
        'elbow_graph': 'Stock Dataset Elbow Graph - K Means',
        'con_mat': 'Stock Dataset Confusion Matrix - K Means',
        'fig_size': (20, 5),
        'cluster_center_file': "stock_k_means_centers.txt"
    }

    census_params = {
        'n_clusters': [8],
        'elbow_graph': 'Census Dataset Elbow Graph - K Means',
        'con_mat': 'Census Dataset Confusion Matrix - K Means',
        'fig_size': (20, 5),
        'cluster_center_file': "census_k_means_centers.txt"
    }

    # Run the elbow technique on the stock dataset, then compute the clusters
    # using the ideal cluster size and see what those clusters look like
    run_k_means_elbow(stock_params, x_stock_data)
    calculate_k_means_clusters(stock_params, x_stock_data, y_stock_data)

    plt.figure()

    run_k_means_elbow(census_params, x_census_data)
    calculate_k_means_clusters(census_params, x_census_data, y_census_data)
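run_k_means_elbow and calculate_k_means_clusters are project helpers whose bodies are not shown here. As a hedged sketch, the elbow routine is probably close to the following, assuming scikit-learn's KMeans; the helper name and toy data below are illustrative stand-ins, not the project's actual implementation:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

def elbow_graph_sketch(x_data, max_k=25, title='Elbow Graph'):
    # Plot inertia (within-cluster sum of squares) against k; the "elbow"
    # where the curve flattens suggests a reasonable cluster count.
    ks = range(1, max_k + 1)
    inertias = [KMeans(n_clusters=k, random_state=311).fit(x_data).inertia_
                for k in ks]
    plt.plot(list(ks), inertias, marker='o')
    plt.xlabel('Number of clusters k')
    plt.ylabel('Inertia')
    plt.title(title)
    plt.savefig(title + '.png')

elbow_graph_sketch(np.random.rand(200, 5), max_k=10)  # toy stand-in data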
Example #2
import numpy as np

import k_means
import process_data
# neural_network_learning is assumed to be defined elsewhere in the project


def clusters_added_ann():
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')
    # Two ANNs with cluster labels added as a feature (one per clustering
    # algorithm), plus a third on the original data for comparison
    census_params_km = {
        'clusters': 8,
        "title": "Census Dataset - K Means Clusters Added as Feature",
        "path": "./ANN_Clustered_Data/",
        "file": "k_means_ann.txt",
        "cm_mat": "Census Dataset - K Means Clusters added Confusion Matrix"
    }
    census_params_em = {
        'clusters': 12,
        "title": "Census Dataset - EM Clusters Added as Feature",
        "path": "./ANN_Clustered_Data/",
        "file": "em_ann.txt",
        "cm_mat": "Census Dataset - EM Clusters added Confusion Matrix"
    }
    census_params_orig = {
        "title": "Census Dataset - Original",
        "path": "./ANN_Clustered_Data/",
        "file": "ann_orig.txt",
        "cm_mat": "Census Dataset - Original Confusion Matrix"
    }

    km_clusters = k_means.return_k_means_clusters(census_params_km,
                                                  x_census_data).reshape(
                                                      (len(x_census_data), 1))
    # Note: this reuses the K-Means helper with the EM parameter dict; the
    # intent is presumably to append EM cluster labels from the EM module
    em_clusters = k_means.return_k_means_clusters(census_params_em,
                                                  x_census_data).reshape(
                                                      (len(x_census_data), 1))

    x_census_data_km = np.hstack([x_census_data, km_clusters])
    x_census_data_em = np.hstack([x_census_data, em_clusters])

    neural_network_learning(x_census_data_km, y_census_data, census_params_km)
    neural_network_learning(x_census_data_em, y_census_data, census_params_em)
    neural_network_learning(x_census_data, y_census_data, census_params_orig)
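return_k_means_clusters is project code; assuming it wraps scikit-learn, a minimal stand-in that produces the label column being hstack-ed above might look like this (the function body is an assumption):

import numpy as np
from sklearn.cluster import KMeans

def return_k_means_clusters_sketch(params, x_data):
    # Fit K-Means with the requested cluster count and return one
    # integer label per sample.
    return KMeans(n_clusters=params['clusters'],
                  random_state=311).fit_predict(x_data)

# Usage mirroring clusters_added_ann() on toy data:
x = np.random.rand(100, 6)
labels = return_k_means_clusters_sketch({'clusters': 8}, x)
x_augmented = np.hstack([x, labels.reshape((len(x), 1))])  # features + cluster id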
Example #3
import numpy as np

import process_data
# determine_min_dim / create_random_guassian_projections are assumed to live
# in the project's random_projections module (see Example #4)
from random_projections import (create_random_guassian_projections,
                                determine_min_dim)


def main():
    np.random.seed(311)
    # Load the datasets
    x_census_data, y_census_data = process_data.process_census_data('./../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data('./../Datasets/Stocks', ["2016", "2017"])

    stock_params = {
        'min_dim_graph': "Stock Dataset - Minimum Dimension vs EPS",
        'projection_loss_graph': "Stock Dataset - Projection Losses for Data Random Projections",
        'components': 100,
        'num_retry': 10
    }

    census_params = {
        'min_dim_graph': "Census Dataset - Minimum Dimension vs EPS",
        'projection_loss_graph': "Census Dataset - Projection Losses for Data Random Projections",
        'components': 4,
        'num_retry': 10
    }

    determine_min_dim(stock_params, x_stock_data)
    determine_min_dim(census_params, x_census_data)
    create_random_guassian_projections(stock_params, x_stock_data)
    create_random_guassian_projections(census_params, x_census_data)
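determine_min_dim and create_random_guassian_projections are project helpers; assuming they wrap scikit-learn's random-projection utilities, the core calls are likely close to this sketch. The reconstruction-loss definition below is an assumption about what "projection loss" means here:

import numpy as np
from sklearn.random_projection import (GaussianRandomProjection,
                                       johnson_lindenstrauss_min_dim)

n_samples = 1000
# Minimum safe target dimension for each distortion tolerance eps
eps_values = np.linspace(0.1, 0.99, 10)
min_dims = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps_values)

x = np.random.rand(n_samples, 50)
grp = GaussianRandomProjection(n_components=10, random_state=311)
x_projected = grp.fit_transform(x)

# One plausible "projection loss": reconstruct via the pseudo-inverse of
# the projection matrix and measure mean squared error
x_reconstructed = x_projected @ np.linalg.pinv(grp.components_.T)
loss = np.mean((x - x_reconstructed) ** 2)
print(min_dims, loss)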
Example #4
import numpy as np

import expectation_maximization
import independant_component_analysis
import k_means
import principle_component_analysis
import process_data
import random_projections
import svd_projection


def main():
    np.random.seed(311)
    # Load the datasets
    x_census_data, y_census_data = process_data.process_census_data('./../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data('./../Datasets/Stocks', ["2016", "2017"])
    stock_params = {
        'n_components': 100,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 100
    }

    census_params = {
        'n_components': 4,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 4
    }

    x_stock_data_reduced = []
    x_census_data_reduced = []

    # Compute all the reduced-dimensionality datasets
    # PCA
    x_stock_data_reduced.append(principle_component_analysis.run_pca(stock_params, x_stock_data))
    x_census_data_reduced.append(principle_component_analysis.run_pca(census_params, x_census_data))

    # ICA
    x_stock_data_reduced.append(independant_component_analysis.run_ica(stock_params, x_stock_data))
    x_census_data_reduced.append(independant_component_analysis.run_ica(census_params, x_census_data))

    # Random Projections
    x_stock_data_reduced.append(random_projections.create_random_guassian_projections(stock_params, x_stock_data))
    x_census_data_reduced.append(random_projections.create_random_guassian_projections(census_params, x_census_data))

    # SVD
    x_stock_data_reduced.append(svd_projection.run_svd(stock_params, x_stock_data))
    x_census_data_reduced.append(svd_projection.run_svd(census_params, x_census_data))

    # Make some rough elbow graphs to figure out how many clusters to use
    km_stock_elbow_dict = {'elbow_graph': 'Stock Dataset K Means Elbow Graph Reduced Dimension',
                           'path': './Reduced_Data_Clustering/'}
    k_means.run_k_means_elbow(km_stock_elbow_dict, x_stock_data_reduced[0])
    km_census_elbow_dict = {'elbow_graph': 'Census Dataset K Means Elbow Graph Reduced Dimension',
                            'path': './Reduced_Data_Clustering/'}
    k_means.run_k_means_elbow(km_census_elbow_dict, x_census_data_reduced[0])


    # Run K-Means on each of the reduced datasets
    k_means_stock_params_km = []
    k_means_stock_params_km.append({'n_clusters': [23], 'con_mat': "Stock Dataset Confusion Matrix - K-Means PCA Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_stock_params_km.append({'n_clusters': [23], 'con_mat': "Stock Dataset Confusion Matrix - K-Means ICA Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_stock_params_km.append({'n_clusters': [23], 'con_mat': "Stock Dataset Confusion Matrix - K-Means Random Projections",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_stock_params_km.append({'n_clusters': [23], 'con_mat': "Stock Dataset Confusion Matrix - K-Means SVD Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})

    k_means_census_params_km = []
    k_means_census_params_km.append({'n_clusters': [8], 'con_mat': "Census Dataset Confusion Matrix - K-Means PCA Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_census_params_km.append({'n_clusters': [8], 'con_mat': "Census Dataset Confusion Matrix - K-Means ICA Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_census_params_km.append({'n_clusters': [8], 'con_mat': "Census Dataset Confusion Matrix - K-Means Random Projections",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    k_means_census_params_km.append({'n_clusters': [8], 'con_mat': "Census Dataset Confusion Matrix - K-Means SVD Reduction",
                                 'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})

    em_stock_params = []
    em_stock_params.append({'n_clusters': [8], 'con_mat': "Stock Dataset Confusion Matrix - EM PCA Reduction",
                            'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_stock_params.append({'n_clusters': [8], 'con_mat': "Stock Dataset Confusion Matrix - EM ICA Reduction",
                            'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_stock_params.append({'n_clusters': [8], 'con_mat': "Stock Dataset Confusion Matrix - EM Random Projections",
                            'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_stock_params.append({'n_clusters': [8], 'con_mat': "Stock Dataset Confusion Matrix - EM SVD Reduction",
                            'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})

    em_census_params = []
    em_census_params.append({'n_clusters': [12], 'con_mat': "Census Dataset Confusion Matrix - EM PCA Reduction",
                             'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_census_params.append({'n_clusters': [12], 'con_mat': "Census Dataset Confusion Matrix - EM ICA Reduction",
                             'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_census_params.append({'n_clusters': [12], 'con_mat': "Census Dataset Confusion Matrix - EM Random Projections",
                             'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})
    em_census_params.append({'n_clusters': [12], 'con_mat': "Census Dataset Confusion Matrix - EM SVD Reduction",
                             'path': './Reduced_Data_Clustering/', 'fig_size': (20, 5)})

    # loop through all the parameters and datasets to do the analysis
    # 4 sets of 4 analyses = 16 in total
    for i in range(4):
        k_means.calculate_k_means_clusters(k_means_stock_params_km[i], x_stock_data_reduced[i], y_stock_data)
        k_means.calculate_k_means_clusters(k_means_census_params_km[i], x_census_data_reduced[i], y_census_data)
        expectation_maximization.calculate_em_clusters(em_stock_params[i], x_stock_data_reduced[i], y_stock_data)
        expectation_maximization.calculate_em_clusters(em_census_params[i], x_census_data_reduced[i], y_census_data)
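The reduction and clustering modules above are project code; assuming each wraps the matching scikit-learn estimator, the sixteen analyses boil down to combinations of these core calls. The estimator choices below are assumptions, using the census component and cluster counts from the params above:

import numpy as np
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.random_projection import GaussianRandomProjection

x = np.random.rand(500, 20)  # toy stand-in for either dataset
n_components = 4

reduced = [
    PCA(n_components=n_components).fit_transform(x),
    FastICA(n_components=n_components, random_state=311).fit_transform(x),
    GaussianRandomProjection(n_components=n_components,
                             random_state=311).fit_transform(x),
    TruncatedSVD(n_components=n_components).fit_transform(x),
]

for x_r in reduced:
    # EM clustering is a Gaussian mixture; fit_predict gives hard labels
    em_labels = GaussianMixture(n_components=12,
                                random_state=311).fit_predict(x_r)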
Example #5
import independant_component_analysis
import principle_component_analysis
import process_data
import random_projections
import svd_projection
# neural_network_learning is assumed to be defined elsewhere in the project


def dimensionality_reduction_ann():
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')
    # Presumably consumed inside neural_network_learning
    train_size = 0.6
    random_state = 86
    x_census_data_reduced = []

    census_params = {
        'n_components': 4,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 4
    }
    # Compute all the reduced-dimensionality datasets
    # PCA
    x_census_data_reduced.append(
        principle_component_analysis.run_pca(census_params, x_census_data))

    # ICA
    x_census_data_reduced.append(
        independant_component_analysis.run_ica(census_params, x_census_data))

    # Random Projections
    x_census_data_reduced.append(
        random_projections.create_random_guassian_projections(
            census_params, x_census_data))

    # SVD
    x_census_data_reduced.append(
        svd_projection.run_svd(census_params, x_census_data))

    # Reuse the name for the per-ANN parameter dicts
    census_params = []
    census_params.append({
        "title": "Census Dataset - PCA Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'pca_reduced_ann.txt',
        'cm_mat': "Census Dataset - PCA Dimension Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - ICA Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'ica_reduced_ann.txt',
        'cm_mat': "Census Dataset - ICA Dimension Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - Random Projection Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'random_reduced_ann.txt',
        'cm_mat': "Census Dataset - Random Projection Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - SVD Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'svd_reduced_ann.txt',
        'cm_mat': "Census Dataset - SVD Dimension Reduction Confusion Matrix"
    })

    # Run 4 neural nets - one for each instance of the reduced data
    for i in range(4):
        neural_network_learning(x_census_data_reduced[i], y_census_data,
                                census_params[i])
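neural_network_learning is defined elsewhere in the project; a minimal hypothetical stand-in, assuming a scikit-learn MLP and the train_size/random_state values set at the top of this function, might look like this:

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def neural_network_learning_sketch(x_data, y_data, params,
                                   train_size=0.6, random_state=86):
    # Hold out a test split, fit the MLP, and report test performance;
    # params['cm_mat'] presumably names the saved confusion-matrix plot.
    x_tr, x_te, y_tr, y_te = train_test_split(
        x_data, y_data, train_size=train_size, random_state=random_state)
    clf = MLPClassifier(random_state=random_state,
                        max_iter=500).fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    print(params['title'], accuracy_score(y_te, y_pred))
    print(confusion_matrix(y_te, y_pred))

# Usage on toy data in place of the reduced census features:
x = np.random.rand(300, 4)
y = np.random.randint(0, 2, 300)
neural_network_learning_sketch(x, y, {'title': 'toy run'})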