def run_clustering(dY, aY, rdir, pdir):
    """Cluster the projected digits and abalone data and plot the results.

    Loads the two projected feature matrices from ``rdir``, then works inside
    the ``clustering`` subdirectories of ``rdir`` and ``pdir``: runs the
    clustering experiments over a fixed list of cluster counts, generates the
    2D data used for cluster visualization, and produces the component,
    validation and cluster plots for both data sets.

    Args:
        dY (Numpy.Array): Labels for digits.
        aY (Numpy.Array): Labels for abalone.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    # Resolve both input paths first, then load digits followed by abalone.
    projected_paths = [
        get_abspath(filename, rdir)
        for filename in ('digits_projected.csv', 'abalone_projected.csv')
    ]
    digits_features, abalone_features = (
        np.loadtxt(path, delimiter=',') for path in projected_paths
    )

    # Everything from here on uses the 'clustering' subdirectories.
    results_dir = rdir + '/clustering'
    plots_dir = pdir + '/clustering'

    # re-run the clustering experiments on the projected data
    k_values = [2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50]
    labelled = (
        ('digits', digits_features, dY),
        ('abalone', abalone_features, aY),
    )
    for name, features, labels in labelled:
        clustering_experiment(features, labels, name, k_values, rdir=results_dir)

    # generate 2D data for cluster visualization;
    # each entry is (name, features, labels, km_k, gmm_k)
    visualization_specs = (
        ('digits', digits_features, dY, 10, 10),
        ('abalone', abalone_features, aY, 10, 5),
    )
    for name, features, labels, km_k, gmm_k in visualization_specs:
        get_cluster_data(
            features, labels, name, km_k=km_k, gmm_k=gmm_k, rdir=results_dir
        )

    names = ('digits', 'abalone')

    # component plots (metrics to choose size of k)
    for name in names:
        generate_component_plots(name=name, rdir=results_dir, pdir=plots_dir)

    # validation plots (relative performance of clustering)
    for name in names:
        generate_validation_plots(name=name, rdir=results_dir, pdir=plots_dir)

    # cluster plots: read both 2D data files, then plot each of them
    frames = [
        pd.read_csv(get_abspath(filename, results_dir))
        for filename in ('digits_2D.csv', 'abalone_2D.csv')
    ]
    for name, frame in zip(names, frames):
        generate_cluster_plots(frame, name=name, pdir=plots_dir)
def run_clustering(wY, sY, rdir, pdir):
    """Cluster the projected winequality and seismic-bumps data and plot results.

    Loads the two projected feature matrices from ``rdir``, then works inside
    the ``clustering`` subdirectories of ``rdir`` and ``pdir``: runs the
    clustering experiments over a fixed list of cluster counts, generates the
    2D data used for cluster visualization, and produces the component,
    validation and cluster plots for both data sets.

    Args:
        wY (Numpy.Array): Labels for winequality.
        sY (Numpy.Array): Labels for seismic-bumps.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    # Resolve both input paths first, then load winequality followed by
    # seismic-bumps.
    projected_paths = [
        get_abspath(filename, rdir)
        for filename in (
            'winequality_projected.csv',
            'seismic-bumps_projected.csv',
        )
    ]
    wine_features, seismic_features = (
        np.loadtxt(path, delimiter=',') for path in projected_paths
    )

    # Everything from here on uses the 'clustering' subdirectories.
    results_dir = rdir + '/clustering'
    plots_dir = pdir + '/clustering'

    # re-run the clustering experiments on the projected data
    k_values = [2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 18, 20, 25, 30, 45, 80, 120]
    labelled = (
        ('winequality', wine_features, wY),
        ('seismic-bumps', seismic_features, sY),
    )
    for name, features, labels in labelled:
        clustering_experiment(features, labels, name, k_values, rdir=results_dir)

    # generate 2D data for cluster visualization;
    # each entry is (name, features, labels, km_k, gmm_k)
    visualization_specs = (
        ('winequality', wine_features, wY, 15, 15),
        ('seismic-bumps', seismic_features, sY, 20, 15),
    )
    for name, features, labels, km_k, gmm_k in visualization_specs:
        get_cluster_data(
            features, labels, name, km_k=km_k, gmm_k=gmm_k, rdir=results_dir
        )

    names = ('winequality', 'seismic-bumps')

    # component plots (metrics to choose size of k)
    for name in names:
        generate_component_plots(name=name, rdir=results_dir, pdir=plots_dir)

    # validation plots (relative performance of clustering)
    for name in names:
        generate_validation_plots(name=name, rdir=results_dir, pdir=plots_dir)

    # cluster plots: read both 2D data files, then plot each of them
    frames = [
        pd.read_csv(get_abspath(filename, results_dir))
        for filename in ('winequality_2D.csv', 'seismic-bumps_2D.csv')
    ]
    for name, frame in zip(names, frames):
        generate_cluster_plots(frame, name=name, pdir=plots_dir)
def run_clustering(digits_y, abalone_y, rdir, pdir, experiment=False):
    """Re-run clustering experiments on datasets after dimensionality reduction.

    Loads the projected abalone and digits feature matrices from ``rdir`` and
    then works inside the ``clustering`` subdirectories of ``rdir`` and
    ``pdir``. Depending on ``experiment`` it either runs the full clustering
    sweep with its component/validation plots, or generates the 2D cluster
    data and the cluster plots.

    Args:
        digits_y (Numpy.Array): Labels for digits.
        abalone_y (Numpy.Array): Labels for abalones.
        rdir (str): Input file directory.
        pdir (str): Output directory.
        experiment (bool): When True, run the clustering experiments for
            every cluster count in ``range(2, 51)``, generate the component
            and validation plots, and return without producing the cluster
            visualizations. When False (default), only generate the 2D
            cluster data and the cluster plots.

    """
    # Load the projected data while rdir still names the input directory.
    abalone_X = np.loadtxt(
        get_abspath('abalone_projected.csv', rdir), delimiter=','
    )
    digits_X = np.loadtxt(
        get_abspath('digits_projected.csv', rdir), delimiter=','
    )

    # Everything below reads from / writes to the 'clustering' subdirectories.
    rdir = rdir + '/clustering'
    pdir = pdir + '/clustering'

    if experiment:
        # re-run clustering experiments on the projected data
        clusters = range(2, 51)
        clustering_experiment(
            abalone_X, abalone_y, 'abalone', clusters, rdir=rdir
        )
        clustering_experiment(
            digits_X, digits_y, 'digits', clusters, rdir=rdir
        )

        # generate component plots (metrics to choose size of k)
        generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)
        generate_component_plots(name='digits', rdir=rdir, pdir=pdir)

        # generate validation plots (relative performance of clustering)
        generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)
        generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)
        return

    # generate 2D data for cluster visualization
    get_cluster_data(
        abalone_X,
        abalone_y,
        'abalone',
        km_k=9,
        gmm_k=12,
        rdir=rdir,
        pdir=pdir,
    )
    get_cluster_data(
        digits_X,
        digits_y,
        'digits',
        km_k=20,
        gmm_k=12,
        rdir=rdir,
        pdir=pdir,
    )

    # generate cluster plots from the 2D data files
    generate_cluster_plots(
        pd.read_csv(get_abspath('abalone_2D.csv', rdir)),
        name='abalone',
        pdir=pdir,
    )
    generate_cluster_plots(
        pd.read_csv(get_abspath('digits_2D.csv', rdir)),
        name='digits',
        pdir=pdir,
    )
def run_clustering(digits_y, abalone_y, rdir, pdir, experiment=False):
    """Re-run clustering experiments on datasets after dimensionality reduction.

    Loads the projected digits and abalone feature matrices from ``rdir`` and
    then works inside the ``clustering`` subdirectories of ``rdir`` and
    ``pdir``. Depending on ``experiment`` it either runs the full clustering
    sweep with its component/validation plots, or generates the 2D cluster
    data and the cluster plots. The elapsed wall-clock time is printed when
    either path completes.

    Args:
        digits_y (Numpy.Array): Labels for digits.
        abalone_y (Numpy.Array): Labels for abalones.
        rdir (str): Input file directory.
        pdir (str): Output directory.
        experiment (bool): When True, run the clustering experiments for
            every cluster count in ``range(2, 51)`` and generate the
            component and validation plots, skipping the cluster
            visualizations. When False (default), only generate the 2D
            cluster data and the cluster plots.

    """
    print('Running base clustering experiments RP')
    start_time = timeit.default_timer()

    # Load the projected data while rdir still names the input directory.
    digits_path = get_abspath('digits_projected.csv', rdir)
    abalone_path = get_abspath('abalone_projected.csv', rdir)
    digits_X = np.loadtxt(digits_path, delimiter=',')
    abalone_X = np.loadtxt(abalone_path, delimiter=',')

    # Everything below reads from / writes to the 'clustering' subdirectories.
    rdir = rdir + '/clustering'
    pdir = pdir + '/clustering'

    if experiment:
        # re-run clustering experiments on the projected data
        clusters = range(2, 51)
        clustering_experiment(
            digits_X, digits_y, 'digits', clusters, rdir=rdir
        )
        clustering_experiment(
            abalone_X, abalone_y, 'abalone', clusters, rdir=rdir
        )

        # generate component plots (metrics to choose size of k)
        generate_component_plots(name='digits', rdir=rdir, pdir=pdir)
        generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)

        # generate validation plots (relative performance of clustering)
        generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)
        generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)
    else:
        # generate 2D data for cluster visualization
        get_cluster_data(
            digits_X,
            digits_y,
            'digits',
            km_k=3,
            gmm_k=9,
            rdir=rdir,
            pdir=pdir,
        )
        get_cluster_data(
            abalone_X,
            abalone_y,
            'abalone',
            km_k=5,
            gmm_k=10,
            rdir=rdir,
            pdir=pdir,
        )

        # generate cluster plots from the 2D data files
        df_digits_2D = pd.read_csv(get_abspath('digits_2D.csv', rdir))
        generate_cluster_plots(df_digits_2D, name='digits', pdir=pdir)
        df_abalone_2D = pd.read_csv(get_abspath('abalone_2D.csv', rdir))
        generate_cluster_plots(df_abalone_2D, name='abalone', pdir=pdir)

    # Report the elapsed time for both paths. The experiment branch used to
    # return early, so the (longer) sweep was never timed.
    elapsed = timeit.default_timer() - start_time
    print("Completed clustering experiments in {} seconds".format(elapsed))