def _whitegridStyleName():
    """Return the seaborn whitegrid style name offered by this Matplotlib."""
    legacy = 'seaborn-whitegrid'
    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and newer releases no longer accept the old names, so use whichever
    # spelling the installed version actually lists.
    if legacy in plt.style.available:
        return legacy
    return 'seaborn-v0_8-whitegrid'


def _saveCurvePlot(ks, curves, yLabel, title, path):
    """Plot several score curves against *ks*, save the figure and close it.

    Args:
        ks: x values shared by every curve (the cluster counts).
        curves: sequence of ``(values, label, extraPlotKwargs)`` tuples,
            drawn in the given order.
        yLabel: label of the y-axis.
        title: figure title.
        path: file the figure is written to.
    """
    plt.style.use(_whitegridStyleName())
    for values, label, extra in curves:
        plt.plot(ks, values, label=label, marker='o', **extra)
    plt.ylabel(yLabel, fontsize=12)
    plt.xlabel('K', fontsize=12)
    plt.title(title, fontsize=12, y=1.03)
    plt.legend()
    plt.savefig(path)
    plt.close()


def getClusteringEvalPlots(dataset):
    """Save silhouette and label-agreement curves of KM and EM for K = 2..10.

    For every dataset a ``Clustering.KM`` and a ``Clustering.EM`` learner are
    fitted on ``ds.training_x`` for each cluster count in ``range(2, 11)``.
    Each cluster assignment is scored with the silhouette score (against the
    features) and with V-measure, adjusted mutual information and adjusted
    Rand score (against ``ds.training_y``).  Two figures per dataset are
    written below ``Figures/Clustering/``: one with the silhouette curves and
    one with the three label-agreement curves.

    Args:
        dataset: iterable of objects exposing ``name``, ``training_x`` and
            ``training_y``.

    Returns:
        None.
    """
    noOfClusters = range(2, 11, 1)
    solid = {}
    dashed = {'linestyle': '--'}  # EM curves are dashed to tell them from KM

    for ds in dataset:
        # curves[algorithm][metric] -> one value per entry of noOfClusters
        curves = {
            algo: {'silhouette': [], 'v_measure': [], 'ami': [], 'ars': []}
            for algo in ('KM', 'EM')
        }

        for cluster in noOfClusters:
            for algo in ('KM', 'EM'):
                if algo == 'KM':
                    learner = Clustering.KM(n_clusters=cluster)
                else:
                    learner = Clustering.EM(n_components=cluster)
                learner.getLearner().fit(ds.training_x)
                labels = learner.getLearner().predict(ds.training_x)

                # Only the V-measure of the triple is plotted.
                _, _, vMeasure = homogeneity_completeness_v_measure(
                    ds.training_y, labels)
                algoCurves = curves[algo]
                algoCurves['v_measure'].append(vMeasure)
                algoCurves['ami'].append(
                    adjusted_mutual_info_score(ds.training_y, labels))
                algoCurves['ars'].append(
                    adjusted_rand_score(ds.training_y, labels))
                algoCurves['silhouette'].append(
                    silhouette_score(ds.training_x, labels))

        km = curves['KM']
        em = curves['EM']

        _saveCurvePlot(
            noOfClusters,
            [
                (km['silhouette'], 'Silhouette Score, KM', solid),
                (em['silhouette'], 'Silhouette Score, EM', dashed),
            ],
            'Silhouette Score',
            'Silhouette Plot for ' + ds.name,
            'Figures/Clustering/Silhouette for ' + ds.name + '.png',
        )

        _saveCurvePlot(
            noOfClusters,
            [
                (km['v_measure'], 'V Measure, KM', solid),
                (km['ami'], 'Adj. Mutual Info, KM', solid),
                (km['ars'], 'Adj. Rand. Score, KM', solid),
                (em['v_measure'], 'V Measure, EM', dashed),
                (em['ami'], 'Adj. Mutual Info, EM', dashed),
                (em['ars'], 'Adj. Rand. Score, EM', dashed),
            ],
            'Score',
            'Score Plot for ' + ds.name,
            'Figures/Clustering/Score for ' + ds.name + '.png',
        )
# (family, default (KM, EM) counts, ordered (tag, (KM, EM)) overrides).
# Families are tried in order and the first one found in the name is used;
# within a family every tag is tested and the LAST matching tag wins.
_ADDED_CLUSTER_TABLE = (
    ('Income', (3, 2), (
        ('FA', (2, 2)),
        ('ICA', (2, 2)),
        ('PCA', (3, 3)),
        ('RP', (2, 3)),
    )),
    ('Wine', (2, 2), (
        ('FA', (4, 2)),
        ('ICA', (2, 2)),
        ('PCA', (2, 2)),
        ('RP', (3, 2)),
    )),
)


def _addedClusterCounts(name):
    """Return the ``(clusterKM, clusterEM)`` pair configured for *name*.

    Raises:
        ValueError: if *name* contains neither 'Income' nor 'Wine'.
    """
    for family, defaults, overrides in _ADDED_CLUSTER_TABLE:
        if family not in name:
            continue
        counts = defaults
        for tag, tagCounts in overrides:
            if tag in name:
                counts = tagCounts
        return counts
    raise ValueError(
        "No cluster counts configured for dataset %r: its name must contain "
        "'Income' or 'Wine'" % (name,))


def calcClusterAdded(iDataset):
    """Build copies of each dataset with the cluster label added as a feature.

    For every input dataset two new ``dataset`` objects are created, one using
    ``Clustering.KM`` and one using ``Clustering.EM``.  The learner is fitted
    on ``ds.training_x``; its predicted cluster label is appended as one extra
    column to the features, ``training_y`` is carried over unchanged, the name
    gets a ' with KM Clusters Added' / ' with EM Clusters Added' suffix and
    ``build_train_test_splitSecond()`` is called on the new object.

    Args:
        iDataset: iterable of objects exposing ``name``, ``training_x`` and
            ``training_y``.  Each name must contain 'Income' or 'Wine'.

    Returns:
        A 2-tuple of lists: the KM and EM variants of the first input
        dataset, and the KM and EM variants of the second one.  Datasets
        after the second are processed but not returned.

    Raises:
        ValueError: if a dataset name matches no configured family.
            Previously this case raised ``UnboundLocalError`` or silently
            reused the cluster counts of the preceding dataset.
    """
    retDSs = []
    for ds in iDataset:
        clusterKM, clusterEM = _addedClusterCounts(ds.name)

        for algo in ('KM', 'EM'):
            retDS = dataset()
            if algo == 'KM':
                learner = Clustering.KM(n_clusters=clusterKM)
            else:
                learner = Clustering.EM(n_components=clusterEM)
            learner.getLearner().fit(ds.training_x)
            clusterLabels = learner.getLearner().predict(ds.training_x)

            # Append the predicted cluster id as an additional last column.
            retDS.training_x = pd.concat(
                [pd.DataFrame(ds.training_x), pd.DataFrame(clusterLabels)],
                axis=1,
            ).to_numpy()
            retDS.training_y = ds.training_y
            retDS.name = ds.name + ' with ' + algo + ' Clusters Added'
            retDS.build_train_test_splitSecond()
            retDSs.append(retDS)

    return retDSs[0:2], retDSs[2:4]
# (family, default (KM, EM) counts, ordered (tag, (KM, EM)) overrides).
# Families are tried in order and the first one found in the name is used;
# within a family every tag is tested and the LAST matching tag wins.
_TSNE_CLUSTER_TABLE = (
    ('Income', (3, 2), (
        ('FA', (2, 2)),
        ('ICA', (2, 2)),
        ('PCA', (3, 3)),
        ('RP', (2, 3)),
    )),
    ('Wine', (2, 2), (
        ('FA', (4, 2)),
        ('ICA', (2, 2)),
        ('PCA', (2, 2)),
        ('RP', (3, 2)),
    )),
)

# Colour list used for a given number of clusters.
_TSNE_PALETTES = {
    2: ['red', 'blue'],
    3: ['red', 'green', 'blue'],
    4: ['red', 'green', 'blue', 'orange'],
}


def _tsneClusterCounts(name):
    """Return the ``(clusterKM, clusterEM)`` pair configured for *name*.

    Raises:
        ValueError: if *name* contains neither 'Income' nor 'Wine'.
    """
    for family, defaults, overrides in _TSNE_CLUSTER_TABLE:
        if family not in name:
            continue
        counts = defaults
        for tag, tagCounts in overrides:
            if tag in name:
                counts = tagCounts
        return counts
    raise ValueError(
        "No cluster counts configured for dataset %r: its name must contain "
        "'Income' or 'Wine'" % (name,))


def getTsnePlot(dataset):
    """Cluster each dataset with KM and EM, print scores and save plots.

    For every dataset a ``Clustering.KM`` and a ``Clustering.EM`` learner are
    fitted on ``ds.training_x`` with the cluster counts configured for the
    dataset name.  V-measure, adjusted mutual information, adjusted Rand score
    and silhouette score of both assignments are printed.  A 2-D t-SNE
    embedding of ``ds.training_x`` is then drawn once per algorithm, coloured
    by cluster, and saved below ``Figures/Clustering/``.  For datasets whose
    name contains both 'FA' and 'Income', the points with
    ``ds.training_y == 1`` are overdrawn with black '+' markers.  For 'Wine'
    datasets a parallel-coordinates plot per algorithm is saved as well.

    Args:
        dataset: iterable of objects exposing ``name``, ``training_x`` and
            ``training_y``.  Each name must contain 'Income' or 'Wine'.

    Returns:
        None.

    Raises:
        ValueError: if a dataset name matches no configured family.
            Previously this case raised ``UnboundLocalError`` or silently
            reused the settings of the preceding dataset.
    """
    for ds in dataset:
        clusterKM, clusterEM = _tsneClusterCounts(ds.name)
        paletteKM = _TSNE_PALETTES[clusterKM]
        paletteEM = _TSNE_PALETTES[clusterEM]

        kmLearner = Clustering.KM(n_clusters=clusterKM)
        kmLearner.getLearner().fit(ds.training_x)
        emLearner = Clustering.EM(n_components=clusterEM)
        emLearner.getLearner().fit(ds.training_x)
        clustringY_KM = kmLearner.getLearner().predict(ds.training_x)
        clustringY_EM = emLearner.getLearner().predict(ds.training_x)

        _, _, v_measureKM = homogeneity_completeness_v_measure(
            ds.training_y, clustringY_KM)
        AMISKM = adjusted_mutual_info_score(ds.training_y, clustringY_KM)
        ARSKM = adjusted_rand_score(ds.training_y, clustringY_KM)
        silhouetteKM = silhouette_score(ds.training_x, clustringY_KM)

        _, _, v_measureEM = homogeneity_completeness_v_measure(
            ds.training_y, clustringY_EM)
        AMISEM = adjusted_mutual_info_score(ds.training_y, clustringY_EM)
        ARSEM = adjusted_rand_score(ds.training_y, clustringY_EM)
        silhouetteEM = silhouette_score(ds.training_x, clustringY_EM)

        print('For dataset ' + ds.name + ' using KM, the v_measure, AMIS, ARS and silhouette are: ',
              v_measureKM, AMISKM, ARSKM, silhouetteKM)
        print('For dataset ' + ds.name + ' using EM, the v_measure, AMIS, ARS and silhouette are: ',
              v_measureEM, AMISEM, ARSEM, silhouetteEM)

        perAlgorithm = (
            ('KM', clustringY_KM, paletteKM),
            ('EM', clustringY_EM, paletteEM),
        )

        # The embedding depends only on the features and a fixed seed, so it
        # is computed once and shared by the KM and EM figures.
        tsne_obj = TSNE(n_components=2, random_state=0).fit_transform(ds.training_x)
        markPositives = 'FA' in ds.name and 'Income' in ds.name

        for algo, labels, palette in perAlgorithm:
            fig = plt.figure()
            ax = fig.add_subplot()
            ax.scatter(tsne_obj[:, 0], tsne_obj[:, 1], alpha=0.4,
                       c=[palette[x] for x in labels])
            if markPositives:
                plt.scatter(tsne_obj[ds.training_y == 1, 0],
                            tsne_obj[ds.training_y == 1, 1],
                            marker='+', c='k')
            plt.xlabel('X')
            # Was a second plt.xlabel call, which overwrote the x label with
            # 'Y' and left the y-axis unlabeled.
            plt.ylabel('Y')
            ax.set_title('t-SNE Plot for ' + ds.name + ' using ' + algo)
            plt.legend()
            plt.savefig('Figures/Clustering/TSNE for ' + ds.name + ' using ' + algo + '.png')
            plt.close()

        if 'Wine' in ds.name:
            # Matplotlib 3.6 renamed its bundled seaborn styles to
            # 'seaborn-v0_8-*' and newer releases no longer accept the old
            # names, so use whichever spelling is actually installed.
            whitegrid = 'seaborn-whitegrid'
            if whitegrid not in plt.style.available:
                whitegrid = 'seaborn-v0_8-whitegrid'

            for algo, labels, palette in perAlgorithm:
                plt.style.use(whitegrid)
                classDf = pd.DataFrame(labels)
                classDf.columns = ['Class']
                datasetDf = pd.concat((pd.DataFrame(ds.training_x), classDf), axis=1)
                parallel_coordinates(datasetDf, 'Class', color=palette, alpha=0.3)
                plt.xlabel('Features', fontsize=12)
                plt.title('Parallel Coordinates Plot for ' + ds.name + ' using ' + algo,
                          fontsize=12, y=1.03)
                plt.legend()
                plt.savefig('Figures/Clustering/Parallel Coord for ' + ds.name
                            + ' using ' + algo + '.png')
                plt.close()