def kmeans_driver(threshold, k, init): # set up the job args mr_job = MRKmeans( args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt']) # initialize the centroids centroid_points = [] #k = 4 if init == 'A': centroid_points = startCentroidsA(k) print "(A) K=4 uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n" elif init == 'B' or init == 'C': centroid_points = startCentroidsBC(k) print "(C) K=4 perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n" else: centroid_points = startCentroidsD(k) print "(D) K=4 \"trained\" centroids, determined by the sums across the classes\n" # write centroids to the expected file with open('centroids.txt', 'w+') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) f.close() # update centroids iteratively i = 0 code_clusters = [{}] * k while (1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] print "iteration" + str(i) + ":" with mr_job.make_runner() as runner: runner.run() # stream_output: get access of the output for line in runner.stream_output(): key, values = mr_job.parse_output_line(line) #print key, values centroid = values[0] codes = values[1] centroid_points[key] = centroid code_clusters[key] = codes # Update the centroids for the next iteration with open('centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) print "\n" i = i + 1 if (stop_criterion(centroid_points_old, centroid_points, threshold)): break print "\nTotal iterations:", i max_vals = [] total_vals = [] print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s') % ('cluster', 'human', 'cyborg', 'robot', 'spammer', 'total') print '=============================================================================' for idx, cluster in enumerate(code_clusters): zero_val = one_val = two_val = three_val = 0 total = float(sum(cluster.values())) if '0' in cluster.keys(): 
zero_val = cluster['0'] if '1' in cluster.keys(): one_val = cluster['1'] if '2' in cluster.keys(): two_val = cluster['2'] if '3' in cluster.keys(): three_val = cluster['3'] print('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d') % ( idx, zero_val, (zero_val / total * 100), one_val, (one_val / total * 100), two_val, (two_val / total * 100), three_val, (three_val / total * 100), total) #purity = sum of the max points for each cluster divided by sum of total points in each cluster max_vals.append(max(cluster.values())) total_vals.append(sum(cluster.values())) purity = float(sum(max_vals)) / (sum(total_vals)) print "purity = %.2f%%" % (100 * purity)
clusters = {} # stream_output: get access of the output for line in runner.stream_output(): key, value = mr_job.parse_output_line(line) centroid, codes = value centroid_points.append(centroid) clusters[key] = codes # Update the centroids for the next iteration with open('Centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) print "\n" i = i + 1 max_class = {} if (stop_criterion(centroid_points_old, centroid_points, 0.01)): print "Centroids\n" print centroid_points print "\n\n\n" print "Breakdown by class code:" for cluster_id, cluster in clusters.iteritems(): max_class[cluster_id] = max(cluster.values()) print "Cluster ID:", cluster_id print "Human:", cluster.get('0', 0) print "Cyborg:", cluster.get('1', 0) print "Robot:", cluster.get('2', 0) print "Spammer:", cluster.get('3', 0) print "\n" print "purity = ", sum(max_class.values()) / 1000.0 * 100 break
# Update centroids iteratively i = 0 while(1): # save previous centoids to check convergency centroid_points_old = centroid_points[:] print "iteration"+str(i)+":" with mr_job.make_runner() as runner: centroid_points = [] cluster_dist ={} runner.run() # stream_output: get access of the output for line in runner.stream_output(): key,value = mr_job.parse_output_line(line) centroid, codes = value centroid_points.append(centroid) cluster_dist[key]=codes i = i + 1 #check if we have convergence if(stop_criterion(centroid_points_old,centroid_points,0.001)): break #write new centroids back to file with open('Centroids.txt', 'w') as f: for centroid in centroid_points: f.writelines(','.join(map(str, centroid)) + '\n') f.close() calc_purity(cluster_dist)
# NOTE(review): single-iteration fragment, truncated mid-statistics — the
# totals computed at the end are used by display code outside this view.
# `centroid_points`, `mr_job`, `i`, `THRESHOLD`, and `np` come from
# surrounding cells.
# Binding (not copying) is safe here only because centroid_points is
# REBOUND to a new list inside the with-block, never mutated in place.
centroid_points_old = centroid_points
with mr_job.make_runner() as runner:
    #print "running iteration" + str(i) + ":"
    runner.run()
    centroid_points = []
    clusters = {}
    # stream_output: get access of the output
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        # (cluster key, (centroid vector, class-code counts))
        centroid, codes = value
        centroid_points.append(centroid)
        clusters[key] = codes
    if (stop_criterion(centroid_points_old, centroid_points, THRESHOLD)):
        print clusters
        # display statistics
        print "cluster distribution"
        print "-" * 80
        print "iteration # {}".format(i)
        # class-code legend: string keys '0'..'3' in the counts map to these
        codes = {0: 'Human', 1: 'Cyborg', 2: 'Robot', 3: 'Spammer'}
        # per-class totals summed across all clusters
        human_total = np.sum(
            [clusters[k].get('0', 0) for k in clusters.keys()])
        cyborg_total = np.sum(
            [clusters[k].get('1', 0) for k in clusters.keys()])
        robot_total = np.sum(
            [clusters[k].get('2', 0) for k in clusters.keys()])
        spammer_total = np.sum(
            [clusters[k].get('3', 0) for k in clusters.keys()])
# NOTE(review): near-duplicate of the previous cell, truncated right after
# printing the table header — the per-cluster rows are outside this view.
# `centroid_points`, `mr_job`, `i`, `THRESHOLD`, and `np` come from
# surrounding cells.
# Binding (not copying) is safe only because centroid_points is REBOUND
# to a fresh list inside the with-block, never mutated in place.
centroid_points_old = centroid_points
with mr_job.make_runner() as runner:
    #print "running iteration" + str(i) + ":"
    runner.run()
    centroid_points = []
    clusters = {}
    # stream_output: get access of the output
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        # (cluster key, (centroid vector, class-code counts))
        centroid, codes = value
        centroid_points.append(centroid)
        clusters[key] = codes
    if(stop_criterion(centroid_points_old, centroid_points, THRESHOLD)):
        print clusters
        # display statistics
        print "cluster distribution"
        print "-" * 80
        print "iteration # {}".format(i)
        # class-code legend: string keys '0'..'3' in the counts map to these
        codes = { 0:'Human', 1:'Cyborg', 2:'Robot', 3:'Spammer' }
        # per-class totals summed across all clusters
        # (the list-comp variable `k` leaks in Python 2 and shadows any
        # outer `k` — presumably harmless here; verify)
        human_total = np.sum([clusters[k].get('0', 0) for k in clusters.keys()])
        cyborg_total = np.sum([clusters[k].get('1', 0) for k in clusters.keys()])
        robot_total = np.sum([clusters[k].get('2', 0) for k in clusters.keys()])
        spammer_total = np.sum([clusters[k].get('3', 0) for k in clusters.keys()])
        max_class = {}
        print "-" * 80
        print "{0:>5} |{1:>12} (%) |{2:>12} (%) |{3:>12} (%) |{4:>12} (%)".format("k", "Human", "Cyborg", "Robot", "Spammer")
print "iteration"+str(i)+":" with mr_job.make_runner() as runner: runner.run() # stream_output: get access of the output for line in runner.stream_output(): key,value = mr_job.parse_output_line(line) print key, value centroid_points[key] = value # Update the centroids for the next iteration with open('Centroids.txt', 'w') as f: f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points) print "\n" i = i + 1 if(stop_criterion(centroid_points_old,centroid_points,0.01)): # python function break print "Centroids\n" print centroid_points # # Visualize the results # In[10]: pylab.plot(samples1[:, 0], samples1[:, 1],'*', color = 'red') pylab.plot(samples2[:, 0], samples2[:, 1],'o',color = 'blue') pylab.plot(samples3[:, 0], samples3[:, 1],'+',color = 'green') for point in centroid_points: pylab.plot(point[0], point[1], '*',color='pink',markersize=20) pylab.show()