예제 #1
0
    return centroids


# Seed the centroid file with the initial centroids, one comma-separated
# line per centroid (presumably read back by the job on each run -- confirm).
# The `with` block closes the file, so no explicit close() is needed.
centroid_points = startCentroidsBC(k)
with open('Centroids.txt', 'w') as f:
    f.writelines(','.join(str(coord) for coord in point) + '\n'
                 for point in centroid_points)

# Update centroids iteratively
# NOTE(review): `i` is only used for the iteration label printed below; no
# increment of `i` and no loop exit are visible in this excerpt.
i = 0
while (1):
    # Keep a shallow copy of the previous centroids to check convergence.
    centroid_points_old = centroid_points[:]
    print "iteration" + str(i) + ":"
    with mr_job.make_runner() as runner:
        runner.run()
        # Both collections are rebuilt from scratch from this run's output.
        centroid_points = []
        clusters = {}
        # stream_output: get access to the job output, line by line
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            # Each value unpacks into a (centroid, codes) pair. Centroids are
            # appended in the order the output is streamed, not indexed by key.
            centroid, codes = value
            centroid_points.append(centroid)
            clusters[key] = codes

    # Update the centroids for the next iteration: one comma-separated line
    # per centroid. The `i` inside the generator expression is scoped to that
    # expression and does not overwrite the iteration counter `i`.
    with open('Centroids.txt', 'w') as f:
        f.writelines(','.join(str(j) for j in i) + '\n'
                     for i in centroid_points)
예제 #2
0
def _write_centroids(centroid_points, path='centroids.txt'):
    """Overwrite *path* with one comma-separated line per centroid."""
    with open(path, 'w') as out:
        out.writelines(','.join(str(coord) for coord in point) + '\n'
                       for point in centroid_points)


def _report_purity(code_clusters):
    """Print the per-cluster class-code breakdown and the overall purity.

    Each cluster is a dict mapping the class codes '0'..'3' (printed under
    the human / cyborg / robot / spammer columns) to counts.  A cluster with
    no counts is reported with zero counts and 0.00% shares instead of
    raising on the division or on ``max()`` of an empty sequence.
    """
    print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s' % (
        'cluster', 'human', 'cyborg', 'robot', 'spammer', 'total'))
    print('=============================================================================')

    max_vals = []
    total_vals = []
    for idx, cluster in enumerate(code_clusters):
        counts = list(cluster.values())
        total = float(sum(counts))

        row = [idx]
        for code in ('0', '1', '2', '3'):
            count = cluster.get(code, 0)
            # Guard the share against an empty cluster (total == 0).
            share = (count / total * 100) if total else 0.0
            row.extend((count, share))
        row.append(total)
        print('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d'
              % tuple(row))

        # purity = sum of the largest class count of each cluster divided by
        # the sum of the total points of each cluster
        max_vals.append(max(counts) if counts else 0)
        total_vals.append(sum(counts))

    grand_total = sum(total_vals)
    purity = float(sum(max_vals)) / grand_total if grand_total else 0.0
    print("purity = %.2f%%" % (100 * purity))


def kmeans_driver(threshold, k, init):
    """Run the MRKmeans job repeatedly until the centroids converge.

    The initial centroids are written to 'centroids.txt', the job is run,
    the centroids it emits replace the current ones and are written back to
    the file, and the loop stops once ``stop_criterion`` returns true for
    the previous and the new centroids.  Finally a per-cluster class
    breakdown and the purity are printed.

    Args:
        threshold: value forwarded to ``stop_criterion``.
        k: number of centroids; also the number of cluster slots reported.
        init: initialisation scheme: 'A' uses ``startCentroidsA``, 'B' or
            'C' use ``startCentroidsBC``, anything else uses
            ``startCentroidsD``.

    NOTE(review): MRKmeans, the startCentroids* helpers and stop_criterion
    are defined outside this block; the job is presumably an mrjob job whose
    output keys are integer centroid indices in ``range(k)`` -- confirm.
    """
    # set up the job args
    mr_job = MRKmeans(
        args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt'])

    # initialize the centroids; the banner reports the actual k and scheme
    # instead of a hard-coded "K=4" / "(C)".
    if init == 'A':
        centroid_points = startCentroidsA(k)
        print("(A) K=%d uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n" % k)
    elif init in ('B', 'C'):
        centroid_points = startCentroidsBC(k)
        print("(%s) K=%d perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n" % (init, k))
    else:
        centroid_points = startCentroidsD(k)
        print("(D) K=%d \"trained\" centroids, determined by the sums across the classes\n" % k)

    # write centroids to the file the job is given via --file
    _write_centroids(centroid_points)

    # update centroids iteratively
    iterations = 0
    # One independent dict per cluster; a cluster the job never reports on
    # keeps its empty dict.
    code_clusters = [{} for _ in range(k)]
    while True:
        # keep a shallow copy of the previous centroids to check convergence
        centroid_points_old = centroid_points[:]
        print("iteration" + str(iterations) + ":")
        with mr_job.make_runner() as runner:
            runner.run()
            # stream_output: get access to the job output, line by line
            for line in runner.stream_output():
                key, values = mr_job.parse_output_line(line)
                # values[0] is the new centroid, values[1] the class-code
                # counts of the cluster identified by key.
                centroid_points[key] = values[0]
                code_clusters[key] = values[1]

        # update the centroid file for the next iteration
        _write_centroids(centroid_points)

        print("\n")
        iterations += 1
        if stop_criterion(centroid_points_old, centroid_points, threshold):
            break

    print("\nTotal iterations: %s" % iterations)

    _report_purity(code_clusters)
예제 #3
0
        total = int(cluster[2])#get the total count of words
        feature = map(lambda x:((1.0 * float(x)) / total),cluster[3:]) #normalise
        centroid_points.append(feature)
    with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)
    f.close()

print('Centroid Type: %s' % cen_type)

# Repeat the job until stop_criterion reports that the centroids converged.
i = 0
while True:
    # Shallow copy of the centroids from the previous round, kept for the
    # convergence check below.
    centroid_points_old = centroid_points[:]
    print("iteration{0}:".format(i))
    with mr_job.make_runner() as runner:
        # Start both collections afresh before the job runs.
        centroid_points = []
        cluster_dist = {}
        runner.run()
        # Walk the job output line by line.
        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            # Every value is a (centroid, codes) pair.
            centroid, codes = value
            centroid_points.append(centroid)
            cluster_dist[key] = codes
    i += 1

    # Stop as soon as the old and new centroids satisfy the criterion.
    if stop_criterion(centroid_points_old, centroid_points, 0.001):
        break