from matplotlib import pyplot as plt
from kmeans import initCentroid, clustering


def optimal_k(data):
    # Build the list of SSE values for k = 1..7
    sse = []
    for k in range(1, 8):
        centroid = initCentroid(data, k)
        clusters = clustering(data, centroid, k)
        sumtoCluster = 0
        for i in range(k):
            for j in range(len(data)):
                if clusters[j] == i:
                    # Squared Euclidean distance of the point to its centroid
                    sumtoCluster += ((data[j, 0] - centroid[i][0]) ** 2
                                     + (data[j, 1] - centroid[i][1]) ** 2)
        sse.append([k, sumtoCluster])

    # Plot the elbow curve (SSE against k)
    sseKey = [item[0] for item in sse]
    sseValue = [item[1] for item in sse]
    plt.figure()
    plt.plot(sseKey, sseValue)
    plt.xlabel("Number of k")
    plt.ylabel("SSE")
    plt.show()

    # Find the optimal k: if a drop in SSE is at least 30% of the first
    # drop, it still counts as a steep decrease, so the k after the last
    # such drop is taken as optimal.
    k_optimal = 2
    first_drop_sse = sse[0][1] - sse[1][1]
    for k in range(1, 5):
        if sse[k][1] - sse[k + 1][1] >= first_drop_sse * 0.3:
            k_optimal = sse[k + 1][0]
    return k_optimal
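# initCentroid and clustering are imported from kmeans.py, which is not
# shown here. A minimal sketch of what the elbow code above assumes about
# them; the bodies are illustrative assumptions, not the original
# implementation (the real clustering presumably iterates assignment and
# centroid-update steps until convergence rather than assigning once).

import numpy as np


def initCentroid(data, k):
    # Pick k distinct random rows of the data as the initial centroids.
    idx = np.random.choice(len(data), size=k, replace=False)
    return data[idx].copy()


def clustering(data, centroid, k):
    # Assign every point to the index of its nearest centroid
    # (squared Euclidean distance over the first two columns).
    dists = ((data[:, None, :2] - centroid[None, :, :2]) ** 2).sum(axis=2)
    return dists.argmin(axis=1)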
import sys
import time

import numpy as np
from matplotlib import pyplot as plt


def main():
    start = time.time()
    dataPath = sys.argv[1]
    data = np.loadtxt(open(dataPath, "rb"), delimiter=",")  # Load data
    np.random.seed(0)

    # dataset1: all digits; dataset2: digits 2, 4, 6, 7; dataset3: digits 6, 7
    data1 = data.copy()
    data3 = data[np.where(data[:, 1] == 6)]
    data3 = np.concatenate((data[np.where(data[:, 1] == 7)], data3), axis=0)
    data2 = np.concatenate((data[np.where(data[:, 1] == 2)], data3), axis=0)
    data2 = np.concatenate((data[np.where(data[:, 1] == 4)], data2), axis=0)
    dataList = [data1, data2, data3]

    # Analysis 1: WC-SSD and SC on each dataset over a range of K
    K_list = [2, 4, 8, 16, 32]
    for idx in range(3):
        wc_list = []
        sc_list = []
        for K in K_list:
            np.random.seed(0)
            classIdArr, repArr = clustering(dataList[idx], K)
            dataArr = np.insert(dataList[idx], [4], classIdArr, axis=1)
            wc = compute_wc(dataArr, K, repArr)
            sc = compute_sc(dataArr, K)
            wc_list.append(wc)
            sc_list.append(sc)

        fig, ax = plt.subplots()
        ax.plot(K_list, wc_list, "C0-o", ms=3)
        ax.grid()
        ax.set(xlabel="K", ylabel="WC-SSD",
               title="WC against K on dataset%d" % (idx + 1))
        plt.show()
        fig.savefig("wc%d.png" % (idx + 1))

        fig, ax = plt.subplots()
        ax.plot(K_list, sc_list, "C1-o", ms=3)
        ax.grid()
        ax.set(xlabel="K", ylabel="SC",
               title="SC against K on dataset%d" % (idx + 1))
        plt.show()
        fig.savefig("sc%d.png" % (idx + 1))

    end = time.time()
    runTime = end - start
    print("Run Time: %d min %f sec." % (runTime / 60, runTime - int(runTime / 60) * 60))
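# clustering, compute_wc and compute_sc are defined elsewhere in this
# module. A minimal sketch of the WC-SSD computation, assuming columns
# 2-3 hold the 2D features, column 4 the inserted cluster id, and repArr
# one centroid per cluster; an illustrative assumption, not the module's
# actual helper.

import numpy as np


def compute_wc(dataArr, K, repArr):
    # Within-cluster sum of squared distances: for every cluster, add the
    # squared Euclidean distance of each member point to its centroid.
    wc = 0.0
    for c in range(K):
        points = dataArr[dataArr[:, 4] == c][:, 2:4]
        wc += ((points - repArr[c]) ** 2).sum()
    return wc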
import numpy as np
from matplotlib import pyplot as plt

from elbow import optimal_k
from kmeans import initCentroid, clustering

# Load the data and turn it into an array
data = np.loadtxt('TrainsetTugas2.txt')
x_data = data[:, 0]
y_data = data[:, 1]

k = optimal_k(data)

# Random X,Y coordinates for the initial centroids
centroid = initCentroid(data, k)

# Clustering
clusters = clustering(data, centroid, k)

# Build the plot
colors = ['blue', 'forestgreen', 'red', 'darkgoldenrod', 'purple', 'cyan']
fig, ax = plt.subplots()
print('cluster')
print('total\t:', len(data), 'points')
for i in range(k):
    titik = np.array([data[j] for j in range(len(data)) if clusters[j] == i])
    print(i, '\t:', len(titik), 'points')
    ax.scatter(titik[:, 0], titik[:, 1], s=7, c=colors[i])
# Plot the centroids as black squares
ax.scatter(centroid[:, 0], centroid[:, 1], marker='s', s=100, c='black')

# Save the result to a txt file
result = np.column_stack((data, clusters))
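# The script above builds `result` but stops before writing it out, and
# the scatter plot is never displayed. A minimal completion; the output
# filename is an assumption, since the original does not name one.
np.savetxt('cluster_result.txt', result, fmt='%g')  # hypothetical filename
plt.show()  # render the scatter plot built above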
import csv
from time import time

import numpy as np
from numpy import genfromtxt

source = "/Users/mengqizhou/Desktop/datamining/programing3/data/initialdata/weighted_vectors.csv"
vector = genfromtxt(source, dtype=float, delimiter=',')
vector = np.array(vector)
# Offset the second column by 0.1 (the same preprocessing is applied to the seeds below)
vector[:, 1] += 0.1

file1 = "/Users/mengqizhou/Desktop/datamining/programing3/data/centroid/sample_cosine_200.csv"
file2 = "/Users/mengqizhou/Desktop/datamining/programing3/data/centroid/sample_euclidean_127.csv"

for iteration in range(0, 2):
    if iteration == 0:
        seed = genfromtxt(file1, dtype=float, delimiter=',')
        seed = np.array(seed)
        seed[:, 1] += 0.1
        t = time()
        mean, vnoc, nvoc = clustering(vector, seed, 'cosine')
        print("cosine," + str(len(mean)) + " clusters,time: " + str(time() - t) + " s")
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/centroid/cosine_" + str(len(mean)) + ".csv",
                   mean, '%5.2f', delimiter=",")
        with open("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/vnoc/cosine_" + str(len(vnoc)) + ".csv",
                  'w', newline='') as f:
            writer = csv.writer(f)
            for item in vnoc:
                writer.writerow(item)
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/nvoc/cosine_" + str(len(nvoc)) + ".csv",
                   nvoc, '%i', delimiter=",")
    elif iteration == 1:
        seed = genfromtxt(file2, dtype=float, delimiter=',')
        seed = np.array(seed)
        seed[:, 1] += 0.1
        t = time()
        mean, vnoc, nvoc = clustering(vector, seed, 'euclidean')
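# clustering(vector, seed, metric) is defined elsewhere; it appears to
# return the final centroids (mean) plus two per-cluster outputs (vnoc,
# nvoc). A minimal sketch of the two distance functions the metric
# argument presumably selects between; illustrative assumptions, not the
# original implementation.

import numpy as np


def euclidean_distance(a, b):
    # Straight-line distance between two vectors.
    return np.sqrt(((a - b) ** 2).sum())


def cosine_distance(a, b):
    # 1 minus the cosine similarity of the two vectors.
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))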
max_iterations = 20
dist = 1  # squared Euclidean distance
trials = 5

# Open the output file
outf_1 = open('results kmeans clustering.txt', 'w')
outf_1.write('Settings:' + ' \n ' +
             'max iterations: ' + str(max_iterations) + ' \n ' +
             'number of trials: ' + str(trials) + ' \n ' +
             'distance metric: ' + str(dist) + '\n')

# Repeat for different values of k
for k in range(2, 10):
    best_S = 0
    # Run several differently initialised clusterings on the original data
    # and keep the best one by silhouette coefficient
    for i in range(trials):
        new_data, new_clusters, new_S = clustering(data, k, max_iterations, dist)
        if new_S > best_S:
            best_data = new_data
            best_clusters = new_clusters
            best_S = new_S

    # Write results to the output file
    outf_1.write('Number of clusters: ' + str(k) + ' \t ' +
                 'Silhouette coefficient: ' + str(best_S) + '\n')
    outf_1.write(' Division of data types per cluster:' + '\n')
    for key in best_clusters:
        outf_1.write(' Cluster ' + str(key) + '\n' +
                     ' NB: ' + str(best_clusters[key][2]['NB']) + '\n' +
                     ' BRCA: ' + str(best_clusters[key][2]['BRCA']) + '\n' +
                     ' KIRC: ' + str(best_clusters[key][2]['KIRC']) + '\n' +
                     ' COAD/READ: ' + str(best_clusters[key][2]['COAD/READ']) + '\n')
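# clustering() above returns a silhouette coefficient S that the trial
# loop maximises. A minimal sketch of the standard silhouette formula,
# assuming a 2D point array and one integer label per point with every
# cluster holding at least two points; an illustration, not this
# script's implementation.

import numpy as np


def silhouette(points, labels):
    # For each point: a = mean distance to its own cluster, b = smallest
    # mean distance to any other cluster; the score is (b - a) / max(a, b),
    # averaged over all points.
    points = np.asarray(points, dtype=float)
    labels = np.asarray(labels)
    n = len(points)
    scores = []
    for i in range(n):
        d = np.linalg.norm(points - points[i], axis=1)
        same = (labels == labels[i]) & (np.arange(n) != i)
        a = d[same].mean()
        b = min(d[labels == c].mean()
                for c in set(labels.tolist()) if c != labels[i])
        scores.append((b - a) / max(a, b))
    return float(np.mean(scores))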