Example #1
0
def kmeans_driver(threshold, k, init):
    # set up the job args
    mr_job = MRKmeans(
        args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt'])

    # initialize the centroids
    centroid_points = []
    #k = 4
    if init == 'A':
        centroid_points = startCentroidsA(k)
        print "(A) K=4 uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n"
    elif init == 'B' or init == 'C':
        centroid_points = startCentroidsBC(k)
        print "(C) K=4 perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n"
    else:
        centroid_points = startCentroidsD(k)
        print "(D) K=4 \"trained\" centroids, determined by the sums across the classes\n"

    # write centroids to the expected file
    with open('centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n'
                     for i in centroid_points)
    f.close()

    # update centroids iteratively
    i = 0
    code_clusters = [{}] * k
    while (1):
        # save previous centoids to check convergency
        centroid_points_old = centroid_points[:]
        print "iteration" + str(i) + ":"
        with mr_job.make_runner() as runner:
            runner.run()
            # stream_output: get access of the output
            for line in runner.stream_output():
                key, values = mr_job.parse_output_line(line)
                #print key, values
                centroid = values[0]
                codes = values[1]
                centroid_points[key] = centroid
                code_clusters[key] = codes

        # Update the centroids for the next iteration
        with open('centroids.txt', 'w') as f:
            f.writelines(','.join(str(j) for j in i) + '\n'
                         for i in centroid_points)

        print "\n"
        i = i + 1
        if (stop_criterion(centroid_points_old, centroid_points, threshold)):
            break

    print "\nTotal iterations:", i

    max_vals = []
    total_vals = []
    print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s') % ('cluster', 'human', 'cyborg',
                                                 'robot', 'spammer', 'total')
    print '============================================================================='
    for idx, cluster in enumerate(code_clusters):
        zero_val = one_val = two_val = three_val = 0
        total = float(sum(cluster.values()))
        if '0' in cluster.keys(): zero_val = cluster['0']
        if '1' in cluster.keys(): one_val = cluster['1']
        if '2' in cluster.keys(): two_val = cluster['2']
        if '3' in cluster.keys(): three_val = cluster['3']

        print('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d') % (
            idx, zero_val, (zero_val / total * 100), one_val,
            (one_val / total * 100), two_val,
            (two_val / total * 100), three_val,
            (three_val / total * 100), total)

        #purity = sum of the max points for each cluster divided by sum of total points in each cluster
        max_vals.append(max(cluster.values()))
        total_vals.append(sum(cluster.values()))

    purity = float(sum(max_vals)) / (sum(total_vals))
    print "purity = %.2f%%" % (100 * purity)
Example #2
0
        # NOTE(review): fragment starts mid-loop; the enclosing while-loop and
        # the `with ... as runner` block that define runner, mr_job, i and
        # centroid_points_old are outside this excerpt.
        clusters = {}
        # stream_output: get access of the output
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            # each output line is (cluster_id, [centroid, class-code counts])
            centroid, codes = value
            centroid_points.append(centroid)
            clusters[key] = codes

    # Update the centroids for the next iteration
    with open('Centroids.txt', 'w') as f:
        f.writelines(','.join(str(j) for j in i) + '\n'
                     for i in centroid_points)

    print "\n"
    i = i + 1
    max_class = {}
    # hard-coded 0.01 convergence threshold
    if (stop_criterion(centroid_points_old, centroid_points, 0.01)):
        print "Centroids\n"
        print centroid_points
        print "\n\n\n"
        print "Breakdown by class code:"
        # class codes: '0' human, '1' cyborg, '2' robot, '3' spammer
        for cluster_id, cluster in clusters.iteritems():
            # majority-class count per cluster, used for purity below
            max_class[cluster_id] = max(cluster.values())
            print "Cluster ID:", cluster_id
            print "Human:", cluster.get('0', 0)
            print "Cyborg:", cluster.get('1', 0)
            print "Robot:", cluster.get('2', 0)
            print "Spammer:", cluster.get('3', 0)
            print "\n"
        # purity assumes exactly 1000 users total -- TODO confirm against input
        print "purity = ", sum(max_class.values()) / 1000.0 * 100
        break
Example #3
0
    
# Update centroids iteratively
i = 0
while(1):
    # save previous centoids to check convergency
    centroid_points_old = centroid_points[:]
    print "iteration"+str(i)+":"
    with mr_job.make_runner() as runner: 
        centroid_points = []
        cluster_dist ={}
        runner.run()
        # stream_output: get access of the output 
        for line in runner.stream_output():
            key,value =  mr_job.parse_output_line(line)
            centroid, codes = value
            centroid_points.append(centroid)
            cluster_dist[key]=codes
    i = i + 1
    
    #check if we have convergence
    if(stop_criterion(centroid_points_old,centroid_points,0.001)):
        break
 
    #write new centroids back to file 
    with open('Centroids.txt', 'w') as f:
        for centroid in centroid_points:
            f.writelines(','.join(map(str, centroid)) + '\n')
        f.close()

calc_purity(cluster_dist) 
Example #4
0
    # NOTE(review): fragment starts mid-iteration; the enclosing loop and the
    # names i, mr_job, THRESHOLD, np and stop_criterion come from outside
    # this excerpt.
    # No copy is taken here; this aliasing is only safe because
    # centroid_points is rebound to a fresh list inside the runner block below.
    centroid_points_old = centroid_points

    with mr_job.make_runner() as runner:
        #print "running iteration" + str(i) + ":"
        runner.run()
        centroid_points = []
        clusters = {}

        # stream_output: get access of the output
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            # each output line is (cluster_id, [centroid, class-code counts])
            centroid, codes = value
            centroid_points.append(centroid)
            clusters[key] = codes

    if (stop_criterion(centroid_points_old, centroid_points, THRESHOLD)):
        print clusters
        # display statistics
        print "cluster distribution"
        print "-" * 80
        print "iteration # {}".format(i)
        codes = {0: 'Human', 1: 'Cyborg', 2: 'Robot', 3: 'Spammer'}

        # totals per class code across all clusters
        # ('0' Human, '1' Cyborg, '2' Robot, '3' Spammer per `codes` above)
        human_total = np.sum(
            [clusters[k].get('0', 0) for k in clusters.keys()])
        cyborg_total = np.sum(
            [clusters[k].get('1', 0) for k in clusters.keys()])
        robot_total = np.sum(
            [clusters[k].get('2', 0) for k in clusters.keys()])
        spammer_total = np.sum(
            [clusters[k].get('3', 0) for k in clusters.keys()])
    # NOTE(review): this section appears to be a duplicated paste of the
    # preceding iteration fragment (same aliasing caveat applies:
    # centroid_points is rebound to a new list inside the runner block).
    centroid_points_old = centroid_points

    with mr_job.make_runner() as runner: 
        #print "running iteration" + str(i) + ":"
        runner.run()
        centroid_points = []
        clusters = {}

        # stream_output: get access of the output 
        for line in runner.stream_output():
            key, value =  mr_job.parse_output_line(line)
            # each output line is (cluster_id, [centroid, class-code counts])
            centroid, codes = value
            centroid_points.append(centroid)
            clusters[key] = codes

    if(stop_criterion(centroid_points_old, centroid_points, THRESHOLD)):
        print clusters
        # display statistics
        print "cluster distribution"
        print "-" * 80
        print "iteration # {}".format(i)
        codes = { 0:'Human', 1:'Cyborg', 2:'Robot', 3:'Spammer' }

        # totals per class code across all clusters (see `codes` above)
        human_total   = np.sum([clusters[k].get('0', 0) for k in clusters.keys()])
        cyborg_total  = np.sum([clusters[k].get('1', 0) for k in clusters.keys()])
        robot_total   = np.sum([clusters[k].get('2', 0) for k in clusters.keys()])
        spammer_total = np.sum([clusters[k].get('3', 0) for k in clusters.keys()])

        max_class = {}
        print "-" * 80
        # header row for the per-cluster breakdown table (body not in excerpt)
        print "{0:>5} |{1:>12} (%) |{2:>12} (%) |{3:>12} (%) |{4:>12} (%)".format("k", "Human", "Cyborg", "Robot", "Spammer")
    # NOTE(review): fragment starts mid-loop; the enclosing while-loop and the
    # names i, mr_job, centroid_points, centroid_points_old and stop_criterion
    # are outside this excerpt.
    print "iteration"+str(i)+":"
    with mr_job.make_runner() as runner: 
        runner.run()
        # stream_output: get access of the output 
        for line in runner.stream_output():
            key,value =  mr_job.parse_output_line(line)
            print key, value
            centroid_points[key] = value

        # Update the centroids for the next iteration.
        # The `i` inside the generator expression is local to it (generator
        # expressions have their own scope), so the iteration counter `i`
        # below is not clobbered.
        with open('Centroids.txt', 'w') as f:
            f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)

    print "\n"
    i = i + 1
    # hard-coded 0.01 convergence threshold
    if(stop_criterion(centroid_points_old,centroid_points,0.01)): # python function
        break
print "Centroids\n"
print centroid_points


# # Visualize the results

# In[10]:

# Scatter-plot the three sample groups with distinct markers/colors, then
# overlay each final centroid as a large pink star.
# NOTE(review): samples1/2/3 and centroid_points come from earlier cells;
# the [:, 0]/[:, 1] indexing implies 2-D (n, 2) arrays -- confirm upstream.
pylab.plot(samples1[:, 0], samples1[:, 1],'*', color = 'red')
pylab.plot(samples2[:, 0], samples2[:, 1],'o',color = 'blue')
pylab.plot(samples3[:, 0], samples3[:, 1],'+',color = 'green')
for point in centroid_points:
    # only the first two coordinates of each centroid are drawn
    pylab.plot(point[0], point[1], '*',color='pink',markersize=20)
pylab.show()