def compare_cluster_runtime(data, n_clusters, n_components):
    """Collect wall-clock timings for clustering and reduction steps.

    The steps run in this order: KMeans and GMM on ``data[0]``, the four
    reducers (``pca_eval``, ``rand_projections``, ``ica_eval``,
    ``trunk_svd``), then KMeans and GMM again on reduced data.

    Args:
        data: Indexable object; element 0 is clustered directly and the
            whole object is handed to each reducer's ``transform``.
        n_clusters: Passed as ``n_clusters`` to ``KMeans`` and as
            ``n_components`` to ``mixture.GMM``.
        n_components: Passed as ``n_components`` to each reducer.

    Returns:
        A pandas Series named ``'time'`` with eight entries, indexed by a
        label for each step.
    """
    # Clustering on the untouched features.
    began = time()
    original_features = data[0]
    KMeans(n_clusters=n_clusters).fit_transform(original_features)
    kmeans_original_secs = time() - began

    began = time()
    mixture.GMM(n_components=n_clusters).fit(original_features)
    gmm_original_secs = time() - began

    # Each reducer returns (reduced_data, second_value); the second value is
    # reported as that reducer's time (presumably its elapsed seconds).
    _, pca_secs = pca_eval.transform(data, n_components=n_components)
    _, rca_secs = rand_projections.transform(data, n_components=n_components)
    _, ica_secs = ica_eval.transform(data, n_components=n_components)
    # Only this last reduction is kept: the "reduced" clustering timings
    # below are measured on the trunk_svd output.
    reduced_features, lsa_secs = trunk_svd.transform(
        data, n_components=n_components)

    began = time()
    KMeans(n_clusters=n_clusters).fit_transform(reduced_features)
    kmeans_reduced_secs = time() - began

    began = time()
    mixture.GMM(n_components=n_clusters).fit(reduced_features)
    gmm_reduced_secs = time() - began

    timings = [
        ('original Kmeans clustering', kmeans_original_secs),
        ('original GMM clustering', gmm_original_secs),
        ('PCA', pca_secs),
        ('RCA', rca_secs),
        ('ICA', ica_secs),
        ('LSA', lsa_secs),
        ('reduced Kmeans clustering', kmeans_reduced_secs),
        ('reduced GMM clustering', gmm_reduced_secs),
    ]
    step_labels = [label for label, _ in timings]
    step_seconds = [seconds for _, seconds in timings]
    return pd.Series(step_seconds, index=step_labels, name='time')
def gmm_transform(higgs_data, n_clusters, n_components):
    """Reduce the data with ``pca_eval`` and append a GMM cluster column.

    Args:
        higgs_data: Indexable object passed whole to
            ``pca_eval.transform``; elements 1 and 2 are copied into the
            result as ``'weights'`` and ``'labels'``.
        n_clusters: Number of mixture components for ``mixture.GMM``.
        n_components: Passed as ``n_components`` to ``pca_eval.transform``.

    Returns:
        A ``(data, elapsed)`` tuple. ``data`` is a dict with keys
        ``'features'``, ``'weights'`` and ``'labels'``. ``elapsed`` is the
        wall-clock time covering both the reduction and the GMM
        fit/predict.
    """
    started_at = time()

    # The timing reported by the transform itself is not used; the value
    # returned to the caller spans the whole reduce-and-cluster sequence.
    reduced, _ = pca_eval.transform(higgs_data, n_components=n_components)

    model = mixture.GMM(n_components=n_clusters)
    fitted = model.fit(reduced)
    assignments = fitted.predict(reduced)

    total_secs = time() - started_at

    # NOTE(review): the column name is misspelled ("assighment"); it is kept
    # as-is because downstream code may look the column up by this key.
    reduced['cluster_assighment'] = assignments

    result = {
        'features': reduced,
        'weights': higgs_data[1],
        'labels': higgs_data[2],
    }
    return result, total_secs
def run_higg_dimensionality_reduction(higgs_data, n_components):
    """Run the four reducers on ``higgs_data`` and gather their outputs.

    NOTE(review): a second definition with this same name appears later in
    this file; if both live at module level, the later one is the one
    callers get.

    Args:
        higgs_data: Passed unchanged to each reducer's ``transform``.
        n_components: Passed as ``n_components`` to each reducer.

    Returns:
        A ``(transformed, transformation_time)`` tuple. ``transformed``
        maps ``'PCA'``, ``'RCA'``, ``'ICA'`` and ``'LSA'`` to the first
        value returned by the matching reducer. ``transformation_time`` is
        a pandas Series named ``'transformation_time'`` holding the second
        value returned by each reducer, under the same labels.
    """
    # Label -> reducer, listed in the order the reducers are executed.
    reducers = [
        ('PCA', pca_eval),
        ('RCA', rand_projections),
        ('ICA', ica_eval),
        ('LSA', trunk_svd),
    ]

    transformed = {}
    labels = []
    elapsed_values = []
    for label, reducer in reducers:
        output, elapsed = reducer.transform(
            higgs_data, n_components=n_components)
        transformed[label] = output
        labels.append(label)
        elapsed_values.append(elapsed)

    transformation_time = pd.Series(
        elapsed_values, index=labels, name='transformation_time')
    return transformed, transformation_time
def kmeans_transform(higgs_data, n_clusters, n_components, display=False):
    """Reduce the data with ``pca_eval`` and append KMeans-derived columns.

    Args:
        higgs_data: Indexable object passed whole to
            ``pca_eval.transform``; elements 1 and 2 are copied into the
            result as ``'weights'`` and ``'labels'``.
        n_clusters: Number of clusters for ``KMeans``.
        n_components: Passed as ``n_components`` to ``pca_eval.transform``.
        display: When truthy and ``n_clusters == 2``, also draw a scatter
            plot of the two KMeans output columns, split by label
            (``'s'`` vs ``'b'``).

    Returns:
        A ``(data, elapsed)`` tuple. ``data`` is a dict with keys
        ``'features'``, ``'weights'`` and ``'labels'``. ``elapsed`` is the
        wall-clock time covering the reduction plus the KMeans
        ``fit_transform``.
    """
    started_at = time()

    # The transform's own timing is not used; the returned time spans both
    # the reduction and the clustering.
    reduced, _ = pca_eval.transform(higgs_data, n_components=n_components)
    kmeans_output = KMeans(n_clusters=n_clusters).fit_transform(reduced)

    total_secs = time() - started_at

    # One extra feature column per column of the KMeans output.
    n_new_columns = kmeans_output.shape[1]
    for col in range(n_new_columns):
        reduced['new_feature' + str(col)] = kmeans_output[:, col]

    result = {
        'features': reduced,
        'weights': higgs_data[1],
        'labels': higgs_data[2],
    }

    if not (display and n_clusters == 2):
        return result, total_secs

    # Plot-only frame; note its column names use a trailing underscore,
    # unlike the columns added to the feature table above.
    plot_columns = ['new_feature_' + str(n) for n in range(n_clusters)]
    frame = pd.DataFrame.from_records(kmeans_output, columns=plot_columns)
    frame['label'] = higgs_data[2].values

    signal_rows = frame[frame['label'] == 's']
    background_rows = frame[frame['label'] == 'b']

    axes = signal_rows.plot(
        x='new_feature_0', y='new_feature_1', kind='scatter',
        color='darkgreen', label='signal')
    # Draw the background points on the same axes as the signal points.
    background_rows.plot(
        x='new_feature_0', y='new_feature_1', kind='scatter',
        color='darkred', ax=axes, label='background')

    return result, total_secs
def run_higg_dimensionality_reduction(higgs_data, n_components):
    """Run the four reducers on ``higgs_data`` and gather their outputs.

    NOTE(review): an earlier definition with this same name appears in this
    file; if both live at module level, this later one replaces it.

    Args:
        higgs_data: Passed unchanged to each reducer's ``transform``.
        n_components: Passed as ``n_components`` to each reducer.

    Returns:
        A ``(transformed, transformation_time)`` tuple. ``transformed``
        maps ``'PCA'``, ``'RCA'``, ``'ICA'`` and ``'LSA'`` to the first
        value returned by the matching reducer. ``transformation_time`` is
        a pandas Series named ``'transformation_time'`` holding the second
        value returned by each reducer, under the same labels.
    """
    # Label -> reducer, listed in the order the reducers are executed.
    reducers = [
        ('PCA', pca_eval),
        ('RCA', rand_projections),
        ('ICA', ica_eval),
        ('LSA', trunk_svd),
    ]

    transformed = {}
    labels = []
    elapsed_values = []
    for label, reducer in reducers:
        output, elapsed = reducer.transform(
            higgs_data, n_components=n_components)
        transformed[label] = output
        labels.append(label)
        elapsed_values.append(elapsed)

    transformation_time = pd.Series(
        elapsed_values, index=labels, name='transformation_time')
    return transformed, transformation_time