# plots for k-means with k = 2 clusters
kmean = sklearn.cluster.KMeans(n_clusters=2)

pred = kmean.fit_predict(X_used)

plt.close()
plt.figure()
# 3-D scatter of the last three coordinates, coloured by the k = 2 assignment
three_d_plot_funct(X_used[:,-3:],pred,save=False)
plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_2_clusters_3d.png")

# coordinates plus the k-means prediction, for the pair plot
pairplotsX         = pd.concat([pd.DataFrame(X_used),pd.DataFrame(pred)],axis=1)
pairplotsX.columns = ["x"+str(i) for i in range(X_used.shape[-1])]+["prediction"]

plt.close()
plt.figure()
# pair plot of all coordinates, coloured by the predicted cluster
pair_plot_funct(pairplotsX,save=False)
plt.savefig(images+"/tibshirani_gap/"+data_info+"_tibshirani_2_clusters_pairs.png")
plt.close()

# best number of clusters for each reference distribution (uniform, gaussian, t),
# one row per selection rule (the second row uses the maximum gap)
bests = pd.DataFrame([[best_uniform,best_gaussian,best_t],
                      [best_uniform_max_gap,best_gaussian_max_gap,best_t_max_gap]],
                     columns=["uniform","gaussian","t"])

# a 2-D example (found after most of the code above was written):
# https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/
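# A minimal sketch of the gap statistic described at the link above, using only a
# uniform reference drawn over the data's bounding box (this script also tries
# gaussian and t references elsewhere). The helper name, B, k_max, and the
# selection rule shown are illustrative assumptions, not this script's own code.
def gap_statistic_sketch(X, k_max=10, B=10, seed=0):
	rng = np.random.default_rng(seed)

	def log_wk(data, k):
		# log within-cluster sum of squares (KMeans exposes it as inertia_)
		km = sklearn.cluster.KMeans(n_clusters=k, n_init=10, random_state=seed).fit(data)
		return np.log(km.inertia_)

	mins, maxs = X.min(axis=0), X.max(axis=0)
	gaps, sks  = [], []
	for k in range(1, k_max + 1):
		# dispersion of B reference data sets drawn uniformly over the bounding box
		ref_logs = np.array([log_wk(rng.uniform(mins, maxs, size=X.shape), k)
		                     for _ in range(B)])
		gaps.append(ref_logs.mean() - log_wk(X, k))
		sks.append(ref_logs.std() * np.sqrt(1.0 + 1.0 / B))
	gaps, sks = np.array(gaps), np.array(sks)

	# smallest k with Gap(k) >= Gap(k+1) - s_{k+1}
	for k in range(k_max - 1):
		if gaps[k] >= gaps[k + 1] - sks[k + 1]:
			return k + 1, gaps
	return k_max, gaps

# usage (hypothetical): k_hat, gaps = gap_statistic_sketch(X_used)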




####################
#### Example #2 ####
####################
	# column names for the pair plot: one per feature plus the prediction
	pairplotsX.columns = ["x"+str(i) for i in range(len(grey_option))]+["prediction"]

	# store the prediction vector; if outliers were dropped from X, expand it
	# back to X_full's length, marking the removed observations with -1
	storage_prediction = prediction
	if X.shape[0] != X_full.shape[0]:
		storage_prediction             = np.ones(X_full.shape[0]) * -1
		storage_prediction[kept_truth] = prediction

	kmean_predictions[num_data][:, index_clusters, kk] = storage_prediction

	# file-name suffix shared by the images saved below
	image_extension    = "kmeans("+str(num_clusters)+")_"+data_names[num_data]+"_"+grey_output+".png"

	# pairs plots
	pair_plot_funct(pairplotsX,image_extension=image_extension)
	
	# 3d plots
	three_d_plot_funct(X,prediction,np.arange(X.shape[-1])[-3:],
		image_extension=image_extension)


	# progress indicator, one dash per iteration
	sys.stdout.write("-")
	sys.stdout.flush()

sys.stdout.write("\n")


###################
#### Dirichlet ####
###################