Esempio n. 1
0
def run_clustering(dY, aY, rdir, pdir):
    """Repeat the clustering pipeline on the projected datasets.

    Reads the comma-separated ``digits_projected.csv`` and
    ``abalone_projected.csv`` matrices from ``rdir``, then works inside the
    ``clustering`` subdirectories of ``rdir`` and ``pdir``: runs the
    clustering experiments, builds the 2D cluster data and generates the
    component, validation and cluster plots. Digits is always processed
    before abalone within each step.

    Args:
        dY (Numpy.Array): Labels for digits.
        aY (Numpy.Array): Labels for abalone.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    names = ('digits', 'abalone')
    labels = dict(zip(names, (dY, aY)))

    # resolve both input paths first, then load both feature matrices
    sources = [
        get_abspath(fname, rdir)
        for fname in ('digits_projected.csv', 'abalone_projected.csv')
    ]
    features = {
        name: np.loadtxt(src, delimiter=',')
        for name, src in zip(names, sources)
    }

    # everything below reads from / writes to the clustering subdirectories
    res_dir = rdir + '/clustering'
    plot_dir = pdir + '/clustering'

    # clustering experiments over a shared grid of cluster counts
    k_grid = [2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50]
    for name in names:
        clustering_experiment(
            features[name], labels[name], name, k_grid, rdir=res_dir
        )

    # 2D data for cluster visualization: (k-means k, GMM k) per dataset
    chosen_k = {'digits': (10, 10), 'abalone': (10, 5)}
    for name in names:
        km, gmm = chosen_k[name]
        get_cluster_data(
            features[name], labels[name], name,
            km_k=km, gmm_k=gmm, rdir=res_dir
        )

    # component plots (metrics to choose size of k)
    for name in names:
        generate_component_plots(name=name, rdir=res_dir, pdir=plot_dir)

    # validation plots (relative performance of clustering)
    for name in names:
        generate_validation_plots(name=name, rdir=res_dir, pdir=plot_dir)

    # cluster plots: read both 2D data files, then plot each of them
    frames = [
        pd.read_csv(get_abspath(fname, res_dir))
        for fname in ('digits_2D.csv', 'abalone_2D.csv')
    ]
    for frame, name in zip(frames, names):
        generate_cluster_plots(frame, name=name, pdir=plot_dir)
Esempio n. 2
0
def run_clustering(wY, sY, rdir, pdir):
    """Repeat the clustering pipeline on the projected datasets.

    Reads the comma-separated ``winequality_projected.csv`` and
    ``seismic-bumps_projected.csv`` matrices from ``rdir``, then works
    inside the ``clustering`` subdirectories of ``rdir`` and ``pdir``: runs
    the clustering experiments, builds the 2D cluster data and generates
    the component, validation and cluster plots. Winequality is always
    processed before seismic-bumps within each step.

    Args:
        wY (Numpy.Array): Labels for winequality.
        sY (Numpy.Array): Labels for seismic-bumps.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    # resolve both input paths before loading either matrix
    wine_src = get_abspath('winequality_projected.csv', rdir)
    seismic_src = get_abspath('seismic-bumps_projected.csv', rdir)
    wine_feats = np.loadtxt(wine_src, delimiter=',')
    seismic_feats = np.loadtxt(seismic_src, delimiter=',')

    # everything below reads from / writes to the clustering subdirectories
    res_dir = rdir + '/clustering'
    plot_dir = pdir + '/clustering'

    # per-dataset spec: (name, features, labels, k-means k, GMM k)
    specs = (
        ('winequality', wine_feats, wY, 15, 15),
        ('seismic-bumps', seismic_feats, sY, 20, 15),
    )

    # clustering experiments over a shared grid of cluster counts
    k_grid = [2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 18, 20, 25, 30, 45, 80, 120]
    for name, feats, targets, _, _ in specs:
        clustering_experiment(feats, targets, name, k_grid, rdir=res_dir)

    # 2D data for cluster visualization
    for name, feats, targets, km, gmm in specs:
        get_cluster_data(
            feats, targets, name, km_k=km, gmm_k=gmm, rdir=res_dir
        )

    # component plots (metrics to choose size of k)
    for name, _, _, _, _ in specs:
        generate_component_plots(name=name, rdir=res_dir, pdir=plot_dir)

    # validation plots (relative performance of clustering)
    for name, _, _, _, _ in specs:
        generate_validation_plots(name=name, rdir=res_dir, pdir=plot_dir)

    # cluster plots: read both 2D data files, then plot each of them
    wine_2d = pd.read_csv(get_abspath('winequality_2D.csv', res_dir))
    seismic_2d = pd.read_csv(get_abspath('seismic-bumps_2D.csv', res_dir))
    for frame, name in ((wine_2d, 'winequality'),
                        (seismic_2d, 'seismic-bumps')):
        generate_cluster_plots(frame, name=name, pdir=plot_dir)
Esempio n. 3
0
def run_clustering(digits_y, abalone_y, rdir, pdir, experiment=False):
    """Re-run clustering experiments on datasets after dimensionality
    reduction.

    Loads the comma-separated ``abalone_projected.csv`` and
    ``digits_projected.csv`` matrices from ``rdir`` and then works inside
    the ``clustering`` subdirectories of ``rdir`` and ``pdir``.

    Args:
        digits_y (Numpy.Array): Labels for digits.
        abalone_y (Numpy.Array): Labels for abalones.
        rdir (str): Input file directory.
        pdir (str): Output directory.
        experiment (bool): When True, run the clustering experiments for
            2 to 50 clusters and generate the component and validation
            plots, then return without producing cluster plots. When
            False (default), only build the 2D cluster data and generate
            the cluster plots.

    Returns:
        None

    """
    abalone_X = np.loadtxt(
        get_abspath('abalone_projected.csv', rdir),
        delimiter=','
    )

    digits_X = np.loadtxt(
        get_abspath('digits_projected.csv', rdir),
        delimiter=','
    )

    # everything below reads from / writes to the clustering subdirectories
    rdir = rdir + '/clustering'
    pdir = pdir + '/clustering'

    if experiment:
        # re-run clustering experiments on the projected data
        clusters = range(2, 51)
        clustering_experiment(abalone_X, abalone_y,
                              'abalone', clusters, rdir=rdir)
        clustering_experiment(digits_X, digits_y, 'digits',
                              clusters, rdir=rdir)

        # generate component plots (metrics to choose size of k)
        generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)
        generate_component_plots(name='digits', rdir=rdir, pdir=pdir)

        # generate validation plots (relative performance of clustering)
        generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)
        generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)

        # experiment mode stops here; cluster plots are a separate run
        return

    # generate 2D data for cluster visualization
    get_cluster_data(
        abalone_X, abalone_y, 'abalone',
        km_k=9, gmm_k=12, rdir=rdir, pdir=pdir,
    )
    get_cluster_data(
        digits_X, digits_y, 'digits',
        km_k=20, gmm_k=12, rdir=rdir, pdir=pdir,
    )

    # generate cluster plots from the 2D data files
    # (presumably written by get_cluster_data above -- confirm)
    generate_cluster_plots(
        pd.read_csv(get_abspath('abalone_2D.csv', rdir)),
        name='abalone',
        pdir=pdir
    )
    generate_cluster_plots(
        pd.read_csv(get_abspath('digits_2D.csv', rdir)),
        name='digits',
        pdir=pdir
    )
Esempio n. 4
0
def run_clustering(digits_y, abalone_y, rdir, pdir, experiment=False):
    """Re-run clustering experiments on datasets after dimensionality
    reduction.

    Loads the comma-separated ``digits_projected.csv`` and
    ``abalone_projected.csv`` matrices from ``rdir`` and then works inside
    the ``clustering`` subdirectories of ``rdir`` and ``pdir``. The elapsed
    wall-clock time is printed when the run finishes, in both modes.

    Args:
        digits_y (Numpy.Array): Labels for digits.
        abalone_y (Numpy.Array): Labels for abalones.
        rdir (str): Input file directory.
        pdir (str): Output directory.
        experiment (bool): When True, run the clustering experiments for
            2 to 50 clusters and generate the component and validation
            plots. When False (default), build the 2D cluster data and
            generate the cluster plots instead.

    Returns:
        None

    """
    print('Running base clustering experiments RP')
    start_time = timeit.default_timer()

    digits_path = get_abspath('digits_projected.csv', rdir)
    abalone_path = get_abspath('abalone_projected.csv', rdir)
    digits_X = np.loadtxt(digits_path, delimiter=',')
    abalone_X = np.loadtxt(abalone_path, delimiter=',')

    # everything below reads from / writes to the clustering subdirectories
    rdir = rdir + '/clustering'
    pdir = pdir + '/clustering'

    if experiment:
        # re-run clustering experiments on the projected data
        clusters = range(2, 51)
        clustering_experiment(digits_X,
                              digits_y,
                              'digits',
                              clusters,
                              rdir=rdir)
        clustering_experiment(abalone_X,
                              abalone_y,
                              'abalone',
                              clusters,
                              rdir=rdir)

        # generate component plots (metrics to choose size of k)
        generate_component_plots(name='digits', rdir=rdir, pdir=pdir)
        generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)

        # generate validation plots (relative performance of clustering)
        generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)
        generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)
    else:
        # generate 2D data for cluster visualization
        get_cluster_data(
            digits_X,
            digits_y,
            'digits',
            km_k=3,
            gmm_k=9,
            rdir=rdir,
            pdir=pdir,
        )
        get_cluster_data(abalone_X,
                         abalone_y,
                         'abalone',
                         km_k=5,
                         gmm_k=10,
                         rdir=rdir,
                         pdir=pdir)

        # generate cluster plots from the 2D data files
        # (presumably written by get_cluster_data above -- confirm)
        df_digits_2D = pd.read_csv(get_abspath('digits_2D.csv', rdir))
        generate_cluster_plots(df_digits_2D, name='digits', pdir=pdir)

        df_abalone_2D = pd.read_csv(get_abspath('abalone_2D.csv', rdir))
        generate_cluster_plots(df_abalone_2D, name='abalone', pdir=pdir)

    # Report the duration for both modes: the experiment branch used to
    # return early and never reached this message.
    elapsed = timeit.default_timer() - start_time
    print("Completed clustering experiments in {} seconds".format(elapsed))