Example 1
def run_dt_fi(dataset_name, X, y, verbose=False):
    dtclf = DecisionTreeClassifier(random_state=RANDOM_SEED)
    dtclf.fit(X, y)
    fi = dtclf.feature_importances_

    fi_df = pd.DataFrame(fi, index=X.columns, columns=['feature_importance'])
    fi_df = fi_df.sort_values('feature_importance', ascending=False)
    if verbose:
        print(fi_df)

    csv_path = 'tmp/dt_fi_' + dataset_name + '.csv'
    fi_df.to_csv(csv_path, header=True)

    # slice top n features
    if dataset_name == 'abalone':
        num_features = 3
    else:
        num_features = 25

    selected_features = fi_df.index[0:num_features].tolist()
    X_selected = X[selected_features]

    print("-------- DT_FI complete! --------\n")

    # run K-means
    clustering.run_k_means(dataset_name, X_selected, y, dim_reduction='dt_fi', verbose=verbose)

    # run EM
    clustering.run_expect_max(dataset_name, X_selected, y, dim_reduction='dt_fi', verbose=verbose)

    return X_selected
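
A minimal standalone sketch of the same select-top-k-by-importance step, on synthetic data; the column names, num_features value, and seeds here are illustrative assumptions, not values from the project above:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# toy data standing in for the project's datasets (assumption)
X_arr, y = make_classification(n_samples=200, n_features=10,
                               n_informative=3, random_state=0)
X = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(10)])

tree = DecisionTreeClassifier(random_state=0).fit(X, y)
fi_df = (pd.DataFrame(tree.feature_importances_, index=X.columns,
                      columns=['feature_importance'])
         .sort_values('feature_importance', ascending=False))

num_features = 3  # illustrative cutoff
X_selected = X[fi_df.index[:num_features].tolist()]
print(X_selected.columns.tolist())  # the three highest-importance features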
Example 2
def run_rp(dataset_name, X, y, verbose=False):
    # attempt RP for various dimensionality levels
    n_components_vals = np.arange(1, len(X.columns))
    iterations = np.arange(1, 15)
    recon_losses = []

    for n_components in n_components_vals:
        # see how reconstruction loss changes across iterations
        tmp_recon_losses = []
        for i in iterations:
            rp = GaussianRandomProjection(n_components=n_components, random_state=i)
            X_rp = rp.fit_transform(X)

            # invert the projection: since X_rp = X @ components_.T,
            # X can be recovered approximately as X_rp @ pinv(components_.T)
            X_comp_pinv = np.linalg.pinv(rp.components_.T)
            X_projection = np.dot(X_rp, X_comp_pinv)
            # per-feature squared error, averaged over samples and summed
            # over features to give a single scalar loss
            recon_loss = ((X - X_projection) ** 2).mean()
            if verbose:
                print(recon_loss.shape)
            tmp_recon_losses.append(np.sum(recon_loss))

        tmp_avg_recon_loss = np.mean(np.array(tmp_recon_losses))
        recon_losses.append(tmp_avg_recon_loss)

    # dimensionality chosen from the reconstruction-loss curve above
    if dataset_name == 'abalone':
        n_components = 3
    else:
        n_components = 25

    # plot reconstruction losses
    if verbose:
        print(recon_losses[0])
    recon_losses = np.array(recon_losses)
    plot_title = "RP for " + dataset_name + ": Reconstruction loss\n"
    plotting.plot_recon_loss(
        recon_losses, n_components_vals, title=plot_title)
    plt.savefig('graphs/rp_' + dataset_name + '_recon_loss.png')
    plt.clf()

    # re-fit at the chosen dimensionality and compute its reconstruction loss
    grp = GaussianRandomProjection(n_components=n_components, random_state=RANDOM_SEED)
    X_rp = grp.fit_transform(X)

    X_comp_pinv = np.linalg.pinv(grp.components_.T)
    X_projection = np.dot(X_rp, X_comp_pinv)
    recon_loss = ((X - X_projection) ** 2).mean()

    print(dataset_name, ": RP reconstruction loss for k =", n_components, ":", np.sum(recon_loss), '\n')
    X_rp = pd.DataFrame(X_rp)

    # run K-means
    clustering.run_k_means(dataset_name, X_rp, y, dim_reduction='rp', verbose=verbose)

    # run EM
    clustering.run_expect_max(dataset_name, X_rp, y, dim_reduction='rp', verbose=verbose)

    return X_rp
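
To see the pseudo-inverse reconstruction in isolation, here is a minimal sketch on random data; the shapes and n_components value are illustrative assumptions:

import numpy as np
from sklearn.random_projection import GaussianRandomProjection

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 20))  # 100 samples, 20 features (assumption)

rp = GaussianRandomProjection(n_components=5, random_state=0)
X_rp = rp.fit_transform(X)      # (100, 5); X_rp = X @ components_.T

# invert the projection: X ~= X_rp @ pinv(components_.T)
X_back = X_rp @ np.linalg.pinv(rp.components_.T)  # (100, 20)
recon_loss = ((X - X_back) ** 2).mean(axis=0).sum()
print(f"reconstruction loss: {recon_loss:.4f}")

Because the projection matrix is random, repeating this with several random_state values and averaging, as run_rp does, gives a more stable loss estimate.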
Example 3
def run_ica(dataset_name, X, y, verbose=False):
    # attempt ICA for various dimensionality levels
    n_components_vals = np.arange(1, len(X.columns))
    kurtosis_vals = []

    for n_components in n_components_vals:
        ica = FastICA(n_components=n_components, random_state=RANDOM_SEED)
        X_ica = ica.fit_transform(X)

        # measure the average kurtosis of the estimated components
        kurtosis_val = np.average(kurtosis(ica.components_, fisher=False))
        if verbose:
            print(kurtosis_val)
        kurtosis_vals.append(kurtosis_val)

    # plot kurtosis by number of components
    kurtosis_vals = np.array(kurtosis_vals)
    plot_title = "ICA for " + dataset_name + ": Kurtosis\n"
    plotting.plot_kurtosis(
        kurtosis_vals, n_components_vals, title=plot_title)
    plt.savefig('graphs/ica_' + dataset_name + '_kurtosis.png')
    plt.clf()

    # choose the optimal number of components based on the kurtosis curve
    if dataset_name == 'abalone':
        optimal_comp = 9
    else:
        optimal_comp = 56
    opt_ica = FastICA(n_components=optimal_comp, random_state=RANDOM_SEED)
    opt_X_ica = opt_ica.fit_transform(X)

    # calculate reconstruction loss
    X_projected = opt_ica.inverse_transform(opt_X_ica)
    recon_loss = ((X - X_projected) ** 2).mean()
    print(dataset_name, ": ICA reconstruction loss for k =", optimal_comp, ":", np.sum(recon_loss), '\n')
    opt_X_ica = pd.DataFrame(opt_X_ica)

    # run K-means
    clustering.run_k_means(dataset_name, opt_X_ica, y, dim_reduction='ica', verbose=verbose)

    # run EM
    clustering.run_expect_max(dataset_name, opt_X_ica, y, dim_reduction='ica', verbose=verbose)

    return opt_X_ica
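
Here is a self-contained sketch of the kurtosis sweep on synthetic mixed sources. One deliberate difference to note: it measures kurtosis of the recovered source signals (a common choice), whereas the code above applies it to ica.components_; the data and component range are illustrative assumptions:

import numpy as np
from scipy.stats import kurtosis
from sklearn.decomposition import FastICA

rng = np.random.default_rng(0)
S = rng.laplace(size=(500, 3))                # non-Gaussian sources
A = rng.normal(size=(3, 10))                  # random mixing matrix
X = S @ A + 0.1 * rng.normal(size=(500, 10))  # observed mixtures

for n in range(1, 6):
    ica = FastICA(n_components=n, random_state=0, max_iter=1000)
    X_ica = ica.fit_transform(X)
    # Pearson kurtosis (fisher=False): ~3 for Gaussian signals, higher
    # for heavy-tailed, well-separated independent components
    print(n, round(np.average(kurtosis(X_ica, fisher=False)), 2))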
Example 4
def run_pca(dataset_name, X, y, verbose=False):
    # attempt PCA for various dimensionality levels
    n_components_vals = np.arange(1, len(X.columns))
    cume_explained_variances = []

    for n_components in n_components_vals:
        pca = PCA(n_components=n_components, random_state=RANDOM_SEED)
        X_pca = pca.fit_transform(X)

        # calculate cumulative explained variance
        cume_explained_variance = np.sum(pca.explained_variance_)
        if verbose:
            print(cume_explained_variance)
        cume_explained_variances.append(cume_explained_variance)

    # plot cumulative explained variance
    cume_explained_variances = np.array(cume_explained_variances)
    plot_title = "PCA for " + dataset_name + ": Cume. explained variance\n"
    plotting.plot_cume_explained_variance(
        cume_explained_variances, n_components_vals, title=plot_title)
    plt.savefig('graphs/pca_' + dataset_name + '_cumevar.png')
    plt.clf()

    # choose the optimal number of components from the cumulative-explained-variance curve
    if dataset_name == 'abalone':
        optimal_comp = 3
    else:
        optimal_comp = 25
    opt_pca = PCA(n_components=optimal_comp, random_state=RANDOM_SEED)
    opt_X_pca = opt_pca.fit_transform(X)

    # calculate reconstruction loss
    X_projected = opt_pca.inverse_transform(opt_X_pca)
    recon_loss = ((X - X_projected) ** 2).mean()
    print(dataset_name, ": PCA reconstruction loss for k =", optimal_comp, ":", np.sum(recon_loss), '\n')
    opt_X_pca = pd.DataFrame(opt_X_pca)

    # run K-means
    clustering.run_k_means(dataset_name, opt_X_pca, y, dim_reduction='pca', verbose=verbose)

    # run EM
    clustering.run_expect_max(dataset_name, opt_X_pca, y, dim_reduction='pca', verbose=verbose)

    return opt_X_pca
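
A common alternative to eyeballing the curve is to pick the smallest dimensionality that retains a variance threshold. Here is a minimal sketch on sklearn's digits data, with the 95% threshold as an illustrative assumption:

import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

X = load_digits().data                   # (1797, 64)

pca = PCA().fit(X)                       # keep all components
cume = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.searchsorted(cume, 0.95)) + 1  # smallest k with >= 95%
print("components for 95% variance:", n_keep)

# reconstruction loss at that dimensionality, as in the code above
opt = PCA(n_components=n_keep)
X_back = opt.inverse_transform(opt.fit_transform(X))
print("reconstruction loss:", ((X - X_back) ** 2).mean(axis=0).sum())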
Example 5
def abalone_cluster(verbose=False, show_plots=False):
    X, y = data_proc.process_abalone(scaler='minmax', tt_split=False)

    # calculate baseline performance
    base_X_train, base_X_test, base_y_train, base_y_test = data_proc.process_abalone(
        tt_split=True)
    run_experiment(
        'abalone',
        'baseline',
        base_X_train,
        base_X_test,
        base_y_train,
        base_y_test,
        verbose=verbose,
        show_plots=show_plots,
    )
    if verbose:
        print("\nBaseline complete!\n", "------------------------------\n")

    # calculate K-means performance
    k_means_X_clusters = clustering.run_k_means('abalone',
                                                X,
                                                y,
                                                dim_reduction=None,
                                                verbose=verbose)
    k_means_X_train, k_means_X_test, k_means_y_train, k_means_y_test = data_proc.process_abalone_w_clusters(
        k_means_X_clusters, scaler='minmax')

    run_experiment(
        'abalone',
        'kmeans',
        k_means_X_train,
        k_means_X_test,
        k_means_y_train,
        k_means_y_test,
        verbose=verbose,
        show_plots=show_plots,
    )
    if verbose:
        print("\nK-means complete!\n", "------------------------------\n")

    # calculate Expectation Maximization performance
    em_X_clusters = clustering.run_expect_max('abalone',
                                              X,
                                              y,
                                              dim_reduction=None,
                                              verbose=verbose)
    em_X_train, em_X_test, em_y_train, em_y_test = data_proc.process_abalone_w_clusters(
        em_X_clusters, scaler='minmax')

    run_experiment(
        'abalone',
        'em',
        em_X_train,
        em_X_test,
        em_y_train,
        em_y_test,
        verbose=verbose,
        show_plots=show_plots,
    )
    if verbose:
        print("\nExpectation Maximization complete!\n",
              "------------------------------\n")