Example #1
0
def optimal_k(data):
    """Pick a cluster count with an elbow-style rule and plot the SSE curve.

    For every k from 1 to 7 the data is clustered through ``initCentroid``
    and ``clustering``; the squared distance of each point to the centroid
    of its assigned cluster is accumulated into that k's SSE.  The SSE
    curve is shown with matplotlib, then k is selected: starting from a
    default of 2, the answer is the largest k in 3..6 whose SSE drop
    relative to k - 1 is at least 30% of the drop from k=1 to k=2.

    Args:
        data: 2-D array-like indexed as ``data[j, 0]`` / ``data[j, 1]``,
            one point per row; only the first two columns are read.

    Returns:
        The selected number of clusters.
    """
    # Collect the SSE for each candidate k.
    candidate_ks = []
    sse_values = []
    n_points = len(data)
    for n_clusters in range(1, 8):
        centroid = initCentroid(data, n_clusters)
        clusters = clustering(data, centroid, n_clusters)
        total = 0
        for label in range(n_clusters):
            for j in range(n_points):
                if clusters[j] == label:
                    # Squared Euclidean distance of point j to its centroid.
                    dx = data[j, 0] - centroid[label][0]
                    dy = data[j, 1] - centroid[label][1]
                    total += (abs(dx**2) + abs(dy**2))
        candidate_ks.append(n_clusters)
        sse_values.append(total)

    # Plot SSE against the number of clusters.
    plt.figure()
    plt.plot(candidate_ks, sse_values)
    plt.xlabel("Number of k")
    plt.ylabel("SSE")
    plt.show()

    # A drop counts as "drastic" when it is at least 30% of the first drop
    # (k=1 -> k=2); the last k reached through a drastic drop wins.
    threshold = (sse_values[0] - sse_values[1]) * 0.3
    k_optimal = 2
    for idx in range(1, 5):
        if sse_values[idx] - sse_values[idx + 1] >= threshold:
            k_optimal = candidate_ks[idx + 1]
    return k_optimal
Example #2
0
def _show_and_save_metric(k_values, scores, fmt, label, title, out_path):
    """Plot one metric against K, display it, save it, and free the figure."""
    fig, ax = plt.subplots()
    # The line carries a label so that ax.legend() has an entry to draw.
    ax.plot(k_values, scores, fmt, ms=3, label=label)
    ax.grid()
    ax.legend()
    ax.set(xlabel="K", ylabel=label, title=title)
    plt.show()
    fig.savefig(out_path)
    # Close explicitly so figures do not pile up across datasets.
    plt.close(fig)


def main():
    """Cluster three subsets of a CSV file for several K and plot the scores.

    The CSV path is taken from ``sys.argv[1]``.  Three datasets are built:
    the full data, the rows whose second column is 4, 2, 7 or 6, and the
    rows whose second column is 7 or 6.  For every dataset and every K in
    ``[2, 4, 8, 16, 32]`` the data is clustered with ``clustering`` and
    scored with ``compute_wc`` and ``compute_sc``; each score series is
    plotted against K, shown, and written to ``wc<N>.png`` / ``sc<N>.png``.
    The total run time is printed at the end.

    Raises:
        SystemExit: if no data path was supplied on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <data.csv>" % sys.argv[0])

    start = time.time()
    # Passing the path lets numpy open *and close* the file itself.
    data = np.loadtxt(sys.argv[1], delimiter=",")  # Load data

    np.random.seed(0)

    data1 = data.copy()
    rows_2 = data[np.where(data[:, 1] == 2)]
    rows_4 = data[np.where(data[:, 1] == 4)]
    rows_6 = data[np.where(data[:, 1] == 6)]
    rows_7 = data[np.where(data[:, 1] == 7)]
    # Row order matches the original nested concatenations.
    data3 = np.concatenate((rows_7, rows_6), axis=0)
    data2 = np.concatenate((rows_4, rows_2, data3), axis=0)
    dataList = [data1, data2, data3]

    # Analysis 1
    K_list = [2, 4, 8, 16, 32]

    for idx, dataset in enumerate(dataList):
        wc_list = []
        sc_list = []
        for K in K_list:
            # Re-seed so every (dataset, K) run starts from the same state.
            np.random.seed(0)
            classIdArr, repArr = clustering(dataset, K)
            # Insert the cluster ids as a new column at index 4.
            dataArr = np.insert(dataset, [4], classIdArr, axis=1)
            wc_list.append(compute_wc(dataArr, K, repArr))
            sc_list.append(compute_sc(dataArr, K))

        _show_and_save_metric(
            K_list, wc_list, "C0-o", "WC-SSD",
            "WC against K on dataset%d" % (idx + 1),
            "wc%d.png" % (idx + 1))
        _show_and_save_metric(
            K_list, sc_list, "C1-o", "SC",
            "SC against K on dataset%d" % (idx + 1),
            "sc%d.png" % (idx + 1))

    end = time.time()
    runTime = end - start
    print("Run Time: %d min %f sec." % (runTime/60,runTime-int(runTime/60)*60))
Example #3
0
from matplotlib import pyplot as plt
from elbow import optimal_k
from kmeans import initCentroid, clustering

# Load the data set into an array.
data = np.loadtxt('TrainsetTugas2.txt')
x_data = data[:, 0]
y_data = data[:, 1]

# Choose the number of clusters (this also displays the SSE plot).
k = optimal_k(data)

# X,Y coordinates of the initial centroids (presumably drawn at random by
# initCentroid -- confirm against the kmeans module).
centroid = initCentroid(data, k)

# Assign every point to a cluster.
clusters = clustering(data, centroid, k)

# Build the scatter plot, one colour per cluster, and report cluster sizes.
colors = ['blue', 'forestgreen', 'red', 'darkgoldenrod', 'purple', 'cyan']
fig, ax = plt.subplots()
print('cluster')
print('total\t:', len(data), 'titik')
for i in range(k):
    # reshape keeps the array 2-D even when the cluster is empty, so the
    # column slices below do not raise IndexError.
    titik = np.array(
        [row for row, label in zip(data, clusters) if label == i]
    ).reshape(-1, data.shape[1])
    print(i, '\t:', len(titik), 'titik')
    ax.scatter(titik[:, 0], titik[:, 1], s=7, c=colors[i])
ax.scatter(centroid[:, 0], centroid[:, 1], marker='s', s=100, c='black')

# Append the cluster label of every point as an extra column.  The original
# note says this result is meant to be saved to a text file; no save call
# is visible in this part of the script.
result = np.column_stack((data, clusters))
# Run `clustering` on the weighted vectors twice: once with the cosine seed
# file and once with the euclidean seed file.
# NOTE(review): this snippet is Python 2 (print statement, csv file opened
# in 'wb' mode) and appears to be cut off inside the euclidean branch.

# Comma-separated input vectors, loaded as floats.
source = "/Users/mengqizhou/Desktop/datamining/programing3/data/initialdata/weighted_vectors.csv"
vector = genfromtxt(source, dtype=float,delimiter = ',')
vector = np.array(vector)
# Add 0.1 to column 1 of every row (reason not visible here -- TODO confirm).
for i in range(0,len(vector)):
            vector[i][1]+=0.1
# Seed files used for the cosine (file1) and euclidean (file2) runs.
file1="/Users/mengqizhou/Desktop/datamining/programing3/data/centroid/sample_cosine_200.csv"
file2="/Users/mengqizhou/Desktop/datamining/programing3/data/centroid/sample_euclidean_127.csv"
# iteration 0 -> cosine run, iteration 1 -> euclidean run.
for iteration in range(0,2):
    if iteration==0:
        # Load the cosine seeds and apply the same +0.1 shift to column 1.
        seed = genfromtxt(file1, dtype=float,delimiter = ',')
        seed = np.array(seed)       
        for i in range(0,len(seed)):
            seed[i][1]+=0.1
        # `time` is called directly, so it is presumably imported with
        # `from time import time` -- confirm at the top of the file.
        t=time()
        # clustering returns three values; their exact meaning is defined
        # by the clustering helper, which is not shown here.
        mean,vnoc,nvoc= clustering(vector,seed,'cosine')
        # Python 2 print statement: reports len(mean) and the elapsed time.
        print"cosine,"+str(len(mean))+" clusters,time: "+str(time()-t)+" s"
        # Save `mean` with two decimals; the file name embeds len(mean).
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/centroid/cosine_"+str(len(mean))+".csv", mean, '%5.2f',delimiter=",")
        # Write every item of `vnoc` as one CSV row.
        with open("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/vnoc/cosine_"+str(len(vnoc))+".csv", 'wb') as f:
            writer=csv.writer(f)
            for item in vnoc:
                writer.writerow(item)
        # Redundant: the `with` block above has already closed the file.
        f.close()
        # Save `nvoc` formatted as integers.
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/kmeans/nvoc/cosine_"+str(len(nvoc))+".csv", nvoc, '%i',delimiter=",")
    elif iteration==1:
        # Same preparation for the euclidean seeds.
        seed = genfromtxt(file2, dtype=float,delimiter = ',')
        seed = np.array(seed)
        for i in range(0,len(seed)):
            seed[i][1]+=0.1
        t=time()
        # NOTE(review): unlike the cosine branch, no timing output or result
        # files are written in the visible code.
        mean,vnoc,nvoc= clustering(vector,seed,'euclidean')
Example #5
0
# Settings for the k-means runs below.
max_iterations = 20
dist = 1  # squared euclidean distance
trials = 5

# Open the output file and record the settings.
# NOTE(review): no close() for outf_1 is visible in this part of the script;
# confirm it is closed further down.
outf_1 = open('results kmeans clustering.txt', 'w')
outf_1.write('Settings:' + ' \n ' + 'max iterations: ' + str(max_iterations) +
             ' \n ' + 'number of trials: ' + str(trials) + ' \n ' +
             'distance metric: ' + str(dist) + '\n')

# Repeat for different values of k.
for k in range(2, 10):
    # Start below every possible score: a silhouette coefficient may be zero
    # or negative, and with a baseline of 0 such a k would keep no clustering
    # at all (NameError on the first k, stale clusters from the previous k
    # afterwards).
    S = float('-inf')
    # Try several (initial) clusterings and keep the best-scoring one.
    for i in range(trials):
        new_data, new_clusters, new_S = clustering(data, k, max_iterations,
                                                   dist)
        if new_S > S:
            # NOTE(review): `data` is replaced here, so later trials and later
            # values of k are clustered on the returned data -- confirm that
            # this is intended.
            data = new_data
            clusters = new_clusters
            S = new_S

    # Write the results for this k to the output file.
    outf_1.write('Number of clusters: ' + str(k) + ' \t ' +
                 'Silhoette coefficient: ' + str(S) + '\n')
    outf_1.write(' Division of data types per cluster:' + '\n')
    for key in clusters:
        # Element 2 of each cluster entry holds the per-type counts.
        type_counts = clusters[key][2]
        outf_1.write('  Cluster ' + str(key) + '\n' + '  NB:        ' +
                     str(type_counts['NB']) + '\n' + '  BRCA:      ' +
                     str(type_counts['BRCA']) + '\n' + '  KIRC:      ' +
                     str(type_counts['KIRC']) + '\n' + '  COAD/READ: ' +
                     str(type_counts['COAD/READ']) + '\n')