def run_km_em(perm_x, perm_y, dname, clstr):
    """Run K-Means and EM (GMM) over a range of cluster counts and write
    per-k metrics to ./results/cluster_<dname>.csv.

    Args:
        perm_x (Numpy.Array): Attributes.
        perm_y (Numpy.Array): Labels.
        dname (str): Dataset name used in the output filename.
        clstr (list[int]): Cluster counts (k values) to evaluate.
    """
    SSE_km_perm = []
    ll_em_perm = []
    acc_km_perm = []
    acc_em_perm = []
    adjMI_km_perm = []
    adjMI_em_perm = []
    homo_km_perm = []
    homo_em_perm = []
    comp_km_perm = []
    comp_em_perm = []
    silhou_km_perm = []
    bic_em_perm = []
    clk_time = []
    for k in clstr:
        st = clock()
        km = KMeans(n_clusters=k, random_state=10)
        gmm = GMM(n_components=k, random_state=10)
        # Fit each model exactly once per k and reuse the labels. The
        # original called fit_predict again for every metric, refitting
        # both models repeatedly; results are identical here because
        # random_state is fixed, so fit_predict was deterministic.
        km_labels = km.fit_predict(perm_x)
        em_labels = gmm.fit_predict(perm_x)
        # SSE is the negated KMeans score (score returns -inertia-like value)
        SSE_km_perm.append(-km.score(perm_x))
        # average log-likelihood under the fitted mixture
        ll_em_perm.append(gmm.score(perm_x))
        acc_km_perm.append(cluster_acc(perm_y, km_labels))
        acc_em_perm.append(cluster_acc(perm_y, em_labels))
        adjMI_km_perm.append(ami(perm_y, km_labels))
        adjMI_em_perm.append(ami(perm_y, em_labels))
        homo_km_perm.append(metrics.homogeneity_score(perm_y, km_labels))
        homo_em_perm.append(metrics.homogeneity_score(perm_y, em_labels))
        comp_km_perm.append(metrics.completeness_score(perm_y, km_labels))
        comp_em_perm.append(metrics.completeness_score(perm_y, em_labels))
        silhou_km_perm.append(metrics.silhouette_score(perm_x, km_labels))
        bic_em_perm.append(gmm.bic(perm_x))
        clk_time.append(clock() - st)
        print(k, clock() - st)
    dbcluster = pd.DataFrame({
        'k': clstr,
        'SSE_km': SSE_km_perm,
        'll_em': ll_em_perm,
        'acc_km': acc_km_perm,
        'acc_em': acc_em_perm,
        'adjMI_km': adjMI_km_perm,
        'adjMI_em': adjMI_em_perm,
        'homo_km': homo_km_perm,
        'homo_em': homo_em_perm,
        'comp_km': comp_km_perm,
        'comp_em': comp_em_perm,
        'silhou_km': silhou_km_perm,
        'bic_em': bic_em_perm,
        'clk_time': clk_time
    })
    dbcluster.to_csv('./results/cluster_{}.csv'.format(dname), sep=',')
def generate_cluster_plots(df, name, pdir):
    """Visualizes clusters using pre-processed 2D dataset.

    Args:
        df (Pandas.DataFrame): Dataset containing attributes and labels.
        name (str): Dataset name.
        pdir (str): Output folder for plots.
    """
    # get cols
    x1 = df['x1']
    x2 = df['x2']
    km = df['km']
    gmm = df['gmm']
    c = df['class']
    # fix: original used Python 2 print statements in a Python 3 file
    print("Accuracy Score for KMeans- {}".format(cluster_acc(c, km)))
    print("Accuracy Score for EM- {}".format(cluster_acc(c, gmm)))
    # plot cluster scatter plots
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(12, 3))
    ax1.scatter(x1, x2, marker='x', s=20, c=km, cmap='gist_rainbow')
    ax1.set_title('K-Means Clusters ({})'.format(name))
    ax1.set_ylabel('x1')
    ax1.set_xlabel('x2')
    ax1.grid(color='grey', linestyle='dotted')
    ax2.scatter(x1, x2, marker='x', s=20, c=gmm, cmap='gist_rainbow')
    ax2.set_title('GMM Clusters ({})'.format(name))
    ax2.set_ylabel('x1')
    ax2.set_xlabel('x2')
    ax2.grid(color='grey', linestyle='dotted')
    # change color map depending on dataset
    if name == 'digits':
        cmap = 'hsv'
    else:
        cmap = 'summer'
    # fix: the dataset-specific cmap was computed but never used — the
    # original hard-coded "gist_rainbow" here, leaving cmap dead.
    ax3.scatter(x1, x2, marker='o', s=20, c=c, cmap=cmap)
    ax3.set_title('Class Labels ({})'.format(name))
    ax3.set_ylabel('x1')
    ax3.set_xlabel('x2')
    ax3.grid(color='grey', linestyle='dotted')
    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)
    # save figure
    plotdir = pdir
    plotpath = get_abspath('{}_clusters.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
def evaluate_kmeans(X, y, problem, out='./results/Clustering/'):
    """Evaluate K-Means and EM (GaussianMixture) clustering on an
    SMOTE-resampled dataset over k in [2, 6], writing SSE, log-likelihood,
    accuracy and adjusted-MI CSVs.

    Args:
        X: Attributes.
        y: Labels.
        problem (str): Dataset/problem name used in output filenames.
        out (str): Output directory for the CSV files.

    Returns:
        Tuple of (SSE DataFrame, log-likelihood DataFrame, accuracy
        DataFrame, adjusted-MI DataFrame, fitted KMeans, fitted GM).
    """
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X, y)
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = KMeans(random_state=5)
    gm = GM(random_state=5)
    st = clock()
    clusters = [2, 3, 4, 5, 6]
    for k in clusters:
        print('now doing k=' + str(k))
        km.set_params(n_clusters=k)
        gm.set_params(n_components=k)
        km.fit(X_res)
        gm.fit(X_res)
        SSE[k][problem] = km.score(X_res)
        ll[k][problem] = gm.score(X_res)
        print('km score:', SSE[k][problem])
        print('gm score:', ll[k][problem])
        # predict once per model and reuse (was recomputed for every metric)
        km_pred = km.predict(X_res)
        gm_pred = gm.predict(X_res)
        acc[k][problem]['Kmeans'] = cluster_acc(y_res, km_pred)
        acc[k][problem]['GM'] = cluster_acc(y_res, gm_pred)
        adjMI[k][problem]['Kmeans'] = metrics.adjusted_mutual_info_score(
            y_res, km_pred)
        adjMI[k][problem]['GM'] = metrics.adjusted_mutual_info_score(
            y_res, gm_pred)
        print(k, clock() - st)
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # pd.Panel and .ix were removed in pandas 1.0; build the per-k frames
    # directly (rows = k, columns = algorithm). Also drops the duplicate
    # acc/adjMI to_csv calls the original had.
    acc = pd.DataFrame({k: acc[k][problem] for k in acc}).T
    adjMI = pd.DataFrame({k: adjMI[k][problem] for k in adjMI}).T
    SSE.to_csv(out + problem + ' SSE.csv')
    ll.to_csv(out + problem + ' logliklihood.csv')
    acc.to_csv(out + problem + ' acc.csv')
    adjMI.to_csv(out + problem + ' adjMI.csv')
    return SSE, ll, acc, adjMI, km, gm
def cluster_scores(X, y, pred):
    """Summarize per-cluster statistics plus mean silhouette per cluster.

    Args:
        X: Attributes the clustering was computed on.
        y: True labels.
        pred: Predicted cluster labels, aligned with rows of X.

    Returns:
        DataFrame with Size/Pos/Neg/H/Acc from hlp.cluster_acc plus a
        'Silh' column (mean silhouette per cluster), sorted by Acc desc.
    """
    acc, H0, H1, cluster_summary = hlp.cluster_acc(y, pred)
    df = pd.DataFrame(cluster_summary)[['Size', 'Pos', 'Neg', 'H', 'Acc']]
    # fix: the original built a frame with silhouette values as the INDEX
    # and then assigned a whole DataFrame (groupby().mean()) into df['Silh'],
    # which is rejected by modern pandas. Build it explicitly and select
    # the Series instead.
    silh = pd.DataFrame({'label': pred, 'silh': silhouette_samples(X, pred)})
    df['Silh'] = silh.groupby('label')['silh'].mean()
    return df.sort_values(['Acc'], ascending=False)
gmm.fit(faultsX)
# Faults dataset
# predict once per model and reuse the labels; the original recomputed
# km.predict/gmm.predict for every metric below (deterministic after fit).
km_pred = km.predict(faultsX)
gmm_pred = gmm.predict(faultsX)
# Visual Measurements
# Sum of Squared Errors for K-means
SSE[k]['Faults'] = km.score(faultsX)
# Log-Likelihood for GMM
ll[k]['Faults'] = gmm.score(faultsX)
# Silhouette Score
# The best value is 1 and the worst value is -1. Silhouette analysis can be
# used to study the separation distance between the resulting clusters.
SS[k]['Faults']['Kmeans'] = ss(faultsX, km_pred)
SS[k]['Faults']['GMM'] = ss(faultsX, gmm_pred)
# Cluster Accuracy
acc[k]['Faults']['Kmeans'] = cluster_acc(faultsY, km_pred)
acc[k]['Faults']['GMM'] = cluster_acc(faultsY, gmm_pred)
# Adjusted Mutual Information
adjMI[k]['Faults']['Kmeans'] = ami(faultsY, km_pred)
adjMI[k]['Faults']['GMM'] = ami(faultsY, gmm_pred)
# Breast Cancer dataset
km.fit(bcX)
gmm.fit(bcX)
km_pred = km.predict(bcX)
gmm_pred = gmm.predict(bcX)
SSE[k]['BreastC'] = km.score(bcX)
ll[k]['BreastC'] = gmm.score(bcX)
SS[k]['BreastC']['Kmeans'] = ss(bcX, km_pred)
SS[k]['BreastC']['GMM'] = ss(bcX, gmm_pred)
acc[k]['BreastC']['Kmeans'] = cluster_acc(bcY, km_pred)
acc[k]['BreastC']['GMM'] = cluster_acc(bcY, gmm_pred)
# k values to sweep for both clustering algorithms on the PCA-reduced loans
# data; km/gmm and loansX_pca/loans_Y are defined earlier in the file.
clusters = [2, 3, 4, 5, 8, 12, 15, 18, 21, 25]
loans_km_acc = []
loans_gmm_acc = []
loans_km_score = []
loans_gmm_score = []
loans_km_ami = []
loans_gmm_ami = []
loans_km_silhouette = []
loans_gmm_silhouette = []
for k in clusters:
    # K-Means metrics for this k
    km.set_params(n_clusters=k)
    km.fit(loansX_pca)
    loans_km_acc.append(cluster_acc(loans_Y, km.predict(loansX_pca)))
    loans_km_score.append(km.score(loansX_pca))
    loans_km_ami.append(ami(loans_Y, km.predict(loansX_pca)))
    loans_km_silhouette.append(
        silhouette_score(loansX_pca, km.predict(loansX_pca)))
    # GMM (EM) metrics for this k
    gmm.set_params(n_components=k)
    gmm.fit(loansX_pca)
    loans_gmm_acc.append(cluster_acc(loans_Y, gmm.predict(loansX_pca)))
    loans_gmm_score.append(gmm.score(loansX_pca))
    loans_gmm_ami.append(ami(loans_Y, gmm.predict(loansX_pca)))
    loans_gmm_silhouette.append(
        silhouette_score(loansX_pca, gmm.predict(loansX_pca)))
# NOTE(review): the statement below is truncated in this chunk — the dict
# literal continues beyond the visible source.
loans_df= pd.DataFrame({'Kmeans acc': loans_km_acc, 'GMM acc': loans_gmm_acc,\
    'Kmeans score': loans_km_score, 'GMM score': loans_gmm_score,\
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.
    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)
    # start loop for given values of k
    for k in clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)
        # predict once per model and reuse (the original recomputed
        # predictions for every metric)
        km_pred = km.predict(X)
        gmm_pred = gmm.predict(X)
        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km_pred)
        acc[k][name]['gmm'] = cluster_acc(y, gmm_pred)
        adjmi[k][name]['km'] = ami(y, km_pred)
        adjmi[k][name]['gmm'] = ami(y, gmm_pred)
        # calculate silhouette score for K-Means
        silhouette[k][name] = silhouette_score(X, km_pred)
        # calculate BIC for EM
        bic[k][name] = gmm.bic(X)
    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    # pd.Panel was removed in pandas 1.0 — build per-k frames directly
    # (rows = k, columns = algorithm suffixed with the metric name)
    acc = pd.DataFrame({k: acc[k][name] for k in acc}).T
    acc = acc.rename(lambda x: '{}_acc'.format(x), axis='columns')
    adjmi = pd.DataFrame({k: adjmi[k][name] for k in adjmi}).T
    adjmi = adjmi.rename(lambda x: '{}_adjmi'.format(x), axis='columns')
    # concatenate all results; local renamed from `metrics` to `results`
    # to avoid shadowing the sklearn.metrics module name
    dfs = (sse, silhouette, logl, bic, acc, adjmi)
    results = pd.concat(dfs, axis=1)
    resfile = get_abspath('{}_metrics.csv'.format(name), rdir)
    results.to_csv(resfile, index_label='k')
# NOTE(review): SSE and ll are referenced below but defined earlier in the
# file (outside this chunk).
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = time.time()
print(len(clusters))
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(perm_x)
    gmm.fit(perm_x)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(perm_x)
    gmm_pred = gmm.predict(perm_x)
    SSE[k]['perm'] = km.score(perm_x)
    ll[k]['perm'] = gmm.score(perm_x)
    acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km_pred)
    acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm_pred)
    adjMI[k]['perm']['Kmeans'] = ami(perm_y, km_pred)
    adjMI[k]['perm']['GMM'] = ami(perm_y, gmm_pred)
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(housing_x)
    gmm.fit(housing_x)
    km_pred = km.predict(housing_x)
    gmm_pred = gmm.predict(housing_x)
    SSE[k]['housing'] = km.score(housing_x)
    ll[k]['housing'] = gmm.score(housing_x)
    acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km_pred)
    acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm_pred)
labels[k]['GMM'] = gmm_labels
sil[k]['Kmeans'] = sil_score(dataX, km_labels)
sil[k]['GMM'] = sil_score(dataX, gmm_labels)
# per-sample silhouette values, recorded with a running row counter j
# (j, labels, km_labels, gmm_labels are defined earlier in the file)
km_sil_samples = sil_samples(dataX, km_labels)
gmm_sil_samples = sil_samples(dataX, gmm_labels)
for i, x in enumerate(km_sil_samples):
    sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
    j += 1
for i, x in enumerate(gmm_sil_samples):
    sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
    j += 1
sse[k] = km.score(dataX)
ll[k] = gmm.score(dataX)
bic[k] = gmm.bic(dataX)
# predict once per model and reuse for both metrics (the original called
# predict twice per model; deterministic after fit, so results are equal)
km_pred = km.predict(dataX)
gmm_pred = gmm.predict(dataX)
acc[k]['Kmeans'] = cluster_acc(dataY, km_pred)
acc[k]['GMM'] = cluster_acc(dataY, gmm_pred)
adj_mi[k]['Kmeans'] = ami(dataY, km_pred)
adj_mi[k]['GMM'] = ami(dataY, gmm_pred)
# collect per-k label assignments into one frame per algorithm
gmm_clusters = pd.DataFrame()
kmeans_clusters = pd.DataFrame()
for i in clusters:
    gmm_clusters[i] = labels[i]['GMM']
    kmeans_clusters[i] = labels[i]['Kmeans']
bic = pd.DataFrame(bic, index=[0]).T
bic.index.name = 'k'
# Per-k metric accumulators for the Wine and Digit datasets; SSE/ll/acc and
# clusters are defined earlier in the file.
adjMI = defaultdict(lambda: defaultdict(dict))
adjRI = defaultdict(lambda: defaultdict(dict))
bic = defaultdict(lambda: defaultdict(dict))
silh = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # Wine dataset
    km.fit(wineX)
    gmm.fit(wineX)
    SSE[k]['Wine'] = km.score(wineX)
    ll[k]['Wine'] = gmm.score(wineX)
    acc[k]['Wine']['Kmeans'] = cluster_acc(wineY.ravel(), km.predict(wineX))
    acc[k]['Wine']['GMM'] = cluster_acc(wineY.ravel(), gmm.predict(wineX))
    adjMI[k]['Wine']['Kmeans'] = ami(wineY.ravel(), km.predict(wineX))
    adjMI[k]['Wine']['GMM'] = ami(wineY.ravel(), gmm.predict(wineX))
    adjRI[k]['Wine']['Kmeans'] = ari(wineY.ravel(), km.predict(wineX))
    adjRI[k]['Wine']['GMM'] = ari(wineY.ravel(), gmm.predict(wineX))
    # negated so K-Means BIC is on the same scale as gmm.bic — TODO confirm
    # against compute_bic's sign convention
    bic[k]['Wine']['Kmeans'] = -compute_bic(km, wineX)
    bic[k]['Wine']['GMM'] = gmm.bic(wineX)
    silh[k]['Wine']['Kmeans'] = silhouette_score(wineX, km.predict(wineX))
    silh[k]['Wine']['GMM'] = silhouette_score(wineX, gmm.predict(wineX))
    # Digit dataset
    km.fit(digitX)
    gmm.fit(digitX)
    SSE[k]['Digit'] = km.score(digitX)
    ll[k]['Digit'] = gmm.score(digitX)
    # NOTE(review): chunk is truncated here mid-call
    acc[k]['Digit']['Kmeans'] = cluster_acc(digitY.ravel(),
#%% Data for 1-3
# Notebook-style exploratory cells: sweep cluster counts, then poke at a
# single k=5 fit for the Titanic dataset.
clusters = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30, 35, 40, 50]
scores = hlp.explore_clustering(datasets, clusters)
# Print Charts
hlp.plot_CE([scores], clusters, 'DR using RCA (RP)')
# play around
km = KMeans(random_state=6, n_init=10)
gmm = GaussianMixture(random_state=6, n_init=1)
km.set_params(n_clusters=5)
km.fit(datasets['Titanic']['X_train'])
km.score(datasets['Titanic']['X_train'])
X_train = datasets['Titanic']['X_train']
y_train = datasets['Titanic']['y_train']
acc, H0, H1 = hlp.cluster_acc(y_train, km.predict(X_train))
H0 - H1  # NOTE(review): no-op outside a REPL/notebook cell
adjusted_mutual_info_score(y_train, km.predict(X_train))
gmm.set_params(n_components=5)
gmm.fit(datasets['Titanic']['X_train'])
gmm.score(datasets['Titanic']['X_train'])
gmm.bic(X_train)
# Silhouette plots
# fix: sklearn.neighbors.nearest_centroid was a private module path removed
# in scikit-learn 0.24 — import from the public package instead.
from sklearn.neighbors import NearestCentroid
# set m = either km or gmm
gmm.set_params(n_components=35)
km.set_params(n_clusters=60)
m = gmm
# fit & plot
#X_train = datasets['Titanic']['X_train']
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # Biodeg dataset
    km.fit(biodegX)
    gmm.fit(biodegX)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(biodegX)
    gmm_pred = gmm.predict(biodegX)
    SSE[k]['Biodeg'] = km.score(biodegX)
    ll[k]['Biodeg'] = gmm.score(biodegX)
    acc[k]['Biodeg']['Kmeans'] = cluster_acc(biodegY, km_pred)
    acc[k]['Biodeg']['GMM'] = cluster_acc(biodegY, gmm_pred)
    adjMI[k]['Biodeg']['Kmeans'] = ami(biodegY, km_pred)
    adjMI[k]['Biodeg']['GMM'] = ami(biodegY, gmm_pred)
    # Digits dataset
    km.fit(digitsX)
    gmm.fit(digitsX)
    km_pred = km.predict(digitsX)
    gmm_pred = gmm.predict(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
    print(k, clock() - st)
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # wine dataset
    km.fit(wineX)
    gmm.fit(wineX)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(wineX)
    gmm_pred = gmm.predict(wineX)
    SSE[k]['wine'] = km.score(wineX)
    ll[k]['wine'] = gmm.score(wineX)
    acc[k]['wine']['Kmeans'] = cluster_acc(wineY, km_pred)
    acc[k]['wine']['GMM'] = cluster_acc(wineY, gmm_pred)
    adjMI[k]['wine']['Kmeans'] = ami(wineY, km_pred)
    adjMI[k]['wine']['GMM'] = ami(wineY, gmm_pred)
    # cancer dataset
    km.fit(cancerX)
    gmm.fit(cancerX)
    km_pred = km.predict(cancerX)
    gmm_pred = gmm.predict(cancerX)
    SSE[k]['cancer'] = km.score(cancerX)
    ll[k]['cancer'] = gmm.score(cancerX)
    acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km_pred)
    acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm_pred)
    adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km_pred)
    adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm_pred)
    print(k, clock() - st)
## Adding cluster outputs for best parameters and saving at the end of the file
def main_logic(): out = './BASE/' # change the below value based on the readme.txt file instructions base = './BASE/' np.random.seed(0) madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon') madelon_X = madelon.drop('Class', 1).copy().values madelon_Y = madelon['Class'].copy().values character = pd.read_hdf(base + 'datasets.hdf', 'character') character_X = character.drop('Class', 1).copy().values character_Y = character['Class'].copy().values np.random.seed(0) # clusters = [2] clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40] madelon_X = StandardScaler().fit_transform(madelon_X) character_X = StandardScaler().fit_transform(character_X) # Data for 1-3 SSE = defaultdict(dict) ll = defaultdict(dict) Silhouette_dict = defaultdict(dict) acc = defaultdict(lambda: defaultdict(dict)) adjMI = defaultdict(lambda: defaultdict(dict)) km = kmeans(random_state=5) gmm = GMM(random_state=5) for j in clusters: st = clock() km.set_params(n_clusters=j) gmm.set_params(n_components=j) km.fit(madelon_X) gmm.fit(madelon_X) SSE[j]['Madelon'] = km.score(madelon_X) ll[j]['Madelon'] = gmm.score(madelon_X) test = km.predict(madelon_X) acc[j]['Madelon']['Kmeans'] = cluster_acc(madelon_Y, km.predict(madelon_X), j) acc[j]['Madelon']['GMM'] = cluster_acc(madelon_Y, gmm.predict(madelon_X)) adjMI[j]['Madelon']['Kmeans'] = ami(madelon_Y, km.predict(madelon_X)) adjMI[j]['Madelon']['GMM'] = ami(madelon_Y, gmm.predict(madelon_X)) print("Homogenity Score ,{}, Kmeans,".format(j), hs(madelon_Y, km.labels_)) print("Completeness Score ,{} ,Kmeans,".format(j), cs(madelon_Y, km.labels_)) label = km.labels_ gmmm = gmm.predict_proba(madelon_X) sil_coeff = silhouette_score(madelon_X, label, metric='euclidean') Silhouette_dict[j]['Madelon'] = sil_coeff print("For n_clusters={}, The Silhouette Coefficient is {}".format( j, sil_coeff)) km.fit(character_X) gmm.fit(character_X) SSE[j]['character'] = km.score(character_X) ll[j]['character'] = gmm.score(character_X) best = km.predict(character_X) 
        # character-dataset metrics for this j (continues main_logic's loop)
        acc[j]['character']['Kmeans'] = cluster_acc(character_Y,
                                                    km.predict(character_X),
                                                    j)
        # NOTE(review): the Kmeans call above passes a third argument (j),
        # unlike the GMM call below — confirm cluster_acc's signature.
        acc[j]['character']['GMM'] = cluster_acc(character_Y,
                                                 gmm.predict(character_X))
        adjMI[j]['character']['Kmeans'] = ami(character_Y,
                                              km.predict(character_X))
        adjMI[j]['character']['GMM'] = ami(character_Y,
                                           gmm.predict(character_X))
        label = km.labels_
        sil_coeff = silhouette_score(character_X, label, metric='euclidean')
        Silhouette_dict[j]['character'] = sil_coeff
        print(j, clock() - st)
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(character_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(character_Y, km.labels_))
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))
    # NOTE(review): rebinding Silhouette_dict to to_csv's return value leaves
    # it as None afterwards.
    Silhouette_dict = pd.DataFrame(Silhouette_dict).to_csv(out +
                                                           'Silhouette.csv')
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # NOTE(review): pd.Panel and .ix were removed in pandas 1.0 — this block
    # requires an old pandas version to run.
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)
    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'character'].to_csv(out + 'character_acc.csv')
    acc.ix[:, :, 'Madelon'].to_csv(out + 'Madelon acc.csv')
    adjMI.ix[:, :, 'character'].to_csv(out + 'character_adjMI.csv')
    adjMI.ix[:, :, 'Madelon'].to_csv(out + 'Madelon adjMI.csv')
    # %% NN fit data (2,3)
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values
    X_train, X_test, y_train, y_test = train_test_split(madelon_X,
                                                        madelon_Y,
                                                        test_size=0.3,
                                                        random_state=42)
    np.random.seed(0)
    # time each cluster-count's NN pipeline fit/predict
    for k in clusters:
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62),
                            verbose=0)
        km = kmeans(random_state=5, n_clusters=k)
        pipe = Pipeline([('km', km), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid,
        # NOTE(review): the fragment below appears to be the remainder of the
        # commented-out GridSearchCV call split at the chunk boundary above;
        # as code it is a syntax error — confirm against the original file.
        verbose=10)
        # NOTE(review): time.clock() was removed in Python 3.8; this section
        # requires Python <= 3.7 (or replacing with time.perf_counter()).
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick
        print("Traning time , {}, k means dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, k means component".format(k), ',', tock)
        print("Accuracy Score , {}, kmeans Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))
        grid = {'gmm__n_components': clusters}
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            verbose=0,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62))
        gmm = myGMM(random_state=43, n_components=k)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick
        print("Traning time , {}, gmm dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, gmm means component".format(k), ',', tock)
        print("Accuracy Score , {}, gmm means Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))
    # grid searches over cluster count + NN hyperparameters (Madelon, K-Means)
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)
    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster Kmeans.csv')
    # Madelon, GMM
    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster GMM.csv')
    # character, K-Means
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_Kmeans.csv')
    # character, GMM
    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_GMM.csv')
    # %% For chart 4/5
    # 2D t-SNE embeddings of both datasets, saved with their targets
    madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelon_X)
    character_X2D = TSNE(verbose=10, random_state=5).fit_transform(character_X)
    madelon2D = pd.DataFrame(np.hstack(
        (madelonX2D, np.atleast_2d(madelon_Y).T)),
        columns=['x', 'y', 'target'])
    character2D = pd.DataFrame(np.hstack(
        (character_X2D, np.atleast_2d(character_Y).T)),
        columns=['x', 'y', 'target'])
    madelon2D.to_csv(out + 'madelon2D.csv')
    character2D.to_csv(out + 'character2D.csv')
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # Diamonds dataset
    km.fit(diamondsX)
    gmm.fit(diamondsX)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(diamondsX)
    gmm_pred = gmm.predict(diamondsX)
    SSE[k]['Diamonds'] = km.score(diamondsX)
    ll[k]['Diamonds'] = gmm.score(diamondsX)
    acc[k]['Diamonds']['Kmeans'] = cluster_acc(diamondsY, km_pred)
    acc[k]['Diamonds']['GMM'] = cluster_acc(diamondsY, gmm_pred)
    adjMI[k]['Diamonds']['Kmeans'] = ami(diamondsY, km_pred)
    adjMI[k]['Diamonds']['GMM'] = ami(diamondsY, gmm_pred)
    # Digits dataset
    km.fit(digitsX)
    gmm.fit(digitsX)
    km_pred = km.predict(digitsX)
    gmm_pred = gmm.predict(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
    print(k, clock() - st)
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # Madelon dataset
    km.fit(madelonX)
    gmm.fit(madelonX)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(madelonX)
    gmm_pred = gmm.predict(madelonX)
    SSE[k]['Madelon'] = km.score(madelonX)
    ll[k]['Madelon'] = gmm.score(madelonX)
    acc[k]['Madelon']['Kmeans'] = cluster_acc(madelonY, km_pred)
    acc[k]['Madelon']['GMM'] = cluster_acc(madelonY, gmm_pred)
    adjMI[k]['Madelon']['Kmeans'] = ami(madelonY, km_pred)
    adjMI[k]['Madelon']['GMM'] = ami(madelonY, gmm_pred)
    # Digits dataset
    km.fit(digitsX)
    gmm.fit(digitsX)
    km_pred = km.predict(digitsX)
    gmm_pred = gmm.predict(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_pred)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_pred)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_pred)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_pred)
    print(k, clock() - st)
# negate scores so the frame holds positive SSE values (rows = k)
SSE = (-pd.DataFrame(SSE)).T
# Per-k silhouette accumulator; SSE/ll/acc/adjMI, clusters and out are
# defined earlier in the file.
silhouette = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
# 2D t-SNE embeddings used for the CSV visualization output below
# (digitsX2D is not used within this visible chunk)
abaloneX2D = TSNE(verbose=10, random_state=5).fit_transform(abaloneX)
digitsX2D = TSNE(verbose=10, random_state=5).fit_transform(digitsX)
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(abaloneX)
    gmm.fit(abaloneX)
    SSE[k]['abalone'] = km.score(abaloneX)
    ll[k]['abalone'] = gmm.score(abaloneX)
    acc[k]['abalone']['Kmeans'] = cluster_acc(abaloneY, km.predict(abaloneX))
    acc[k]['abalone']['GMM'] = cluster_acc(abaloneY, gmm.predict(abaloneX))
    adjMI[k]['abalone']['Kmeans'] = ami(abaloneY, km.predict(abaloneX))
    adjMI[k]['abalone']['GMM'] = ami(abaloneY, gmm.predict(abaloneX))
    silhouette[k]['abalone']['Kmeans'] = silhouette_score(abaloneX,
                                                          km.labels_,
                                                          metric='euclidean')
    silhouette[k]['abalone']['GMM'] = silhouette_score(abaloneX,
                                                       gmm.predict(abaloneX),
                                                       metric='euclidean')
    # save the t-SNE embedding with this k's K-Means labels as 'target'
    abalone2D = pd.DataFrame(np.hstack(
        (abaloneX2D, np.atleast_2d(km.predict(abaloneX)).T)),
        columns=['x', 'y', 'target'])
    abalone2D.to_csv(out + 'abalone2D_km_{}.csv'.format(k))
    # NOTE(review): chunk is truncated here mid-statement
    abalone2D = pd.DataFrame(np.hstack(
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
st = clock()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    # contraceptive dataset
    km.fit(contraX)
    gmm.fit(contraX)
    # predict once per model and reuse (was recomputed for every metric)
    km_pred = km.predict(contraX)
    gmm_pred = gmm.predict(contraX)
    SSE[k]['contra'] = km.score(contraX)
    ll[k]['contra'] = gmm.score(contraX)
    acc[k]['contra']['Kmeans'] = cluster_acc(contraY, km_pred)
    acc[k]['contra']['GMM'] = cluster_acc(contraY, gmm_pred)
    adjMI[k]['contra']['Kmeans'] = ami(contraY, km_pred)
    adjMI[k]['contra']['GMM'] = ami(contraY, gmm_pred)
    # cancer dataset
    km.fit(cancerX)
    gmm.fit(cancerX)
    km_pred = km.predict(cancerX)
    gmm_pred = gmm.predict(cancerX)
    SSE[k]['cancer'] = km.score(cancerX)
    ll[k]['cancer'] = gmm.score(cancerX)
    acc[k]['cancer']['Kmeans'] = cluster_acc(cancerY, km_pred)
    acc[k]['cancer']['GMM'] = cluster_acc(cancerY, gmm_pred)
    adjMI[k]['cancer']['Kmeans'] = ami(cancerY, km_pred)
    adjMI[k]['cancer']['GMM'] = ami(cancerY, gmm_pred)
    print(k, clock() - st)
## Keith Mertan: Adding cluster outputs for best parameters and saving at the end of the file