Beispiel #1
0
def kmeans_driver(threshold, k, init):
    # set up the job args
    mr_job = MRKmeans(
        args=['topUsers_Apr-Jul_2014_1000-words.txt', '--file=centroids.txt'])

    # initialize the centroids
    centroid_points = []
    #k = 4
    if init == 'A':
        centroid_points = startCentroidsA(k)
        print "(A) K=4 uniform random centroid-distributions over the 1000 words (generate 1000 random numbers and normalize the vectors)\n"
    elif init == 'B' or init == 'C':
        centroid_points = startCentroidsBC(k)
        print "(C) K=4 perturbation-centroids, randomly perturbed from the aggregated (user-wide) distribution\n"
    else:
        centroid_points = startCentroidsD(k)
        print "(D) K=4 \"trained\" centroids, determined by the sums across the classes\n"

    # write centroids to the expected file
    with open('centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n'
                     for i in centroid_points)
    f.close()

    # update centroids iteratively
    i = 0
    code_clusters = [{}] * k
    while (1):
        # save previous centoids to check convergency
        centroid_points_old = centroid_points[:]
        print "iteration" + str(i) + ":"
        with mr_job.make_runner() as runner:
            runner.run()
            # stream_output: get access of the output
            for line in runner.stream_output():
                key, values = mr_job.parse_output_line(line)
                #print key, values
                centroid = values[0]
                codes = values[1]
                centroid_points[key] = centroid
                code_clusters[key] = codes

        # Update the centroids for the next iteration
        with open('centroids.txt', 'w') as f:
            f.writelines(','.join(str(j) for j in i) + '\n'
                         for i in centroid_points)

        print "\n"
        i = i + 1
        if (stop_criterion(centroid_points_old, centroid_points, threshold)):
            break

    print "\nTotal iterations:", i

    max_vals = []
    total_vals = []
    print('\n%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s') % ('cluster', 'human', 'cyborg',
                                                 'robot', 'spammer', 'total')
    print '============================================================================='
    for idx, cluster in enumerate(code_clusters):
        zero_val = one_val = two_val = three_val = 0
        total = float(sum(cluster.values()))
        if '0' in cluster.keys(): zero_val = cluster['0']
        if '1' in cluster.keys(): one_val = cluster['1']
        if '2' in cluster.keys(): two_val = cluster['2']
        if '3' in cluster.keys(): three_val = cluster['3']

        print('%d\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d (%.2f%%)\t%d') % (
            idx, zero_val, (zero_val / total * 100), one_val,
            (one_val / total * 100), two_val,
            (two_val / total * 100), three_val,
            (three_val / total * 100), total)

        #purity = sum of the max points for each cluster divided by sum of total points in each cluster
        max_vals.append(max(cluster.values()))
        total_vals.append(sum(cluster.values()))

    purity = float(sum(max_vals)) / (sum(total_vals))
    print "purity = %.2f%%" % (100 * purity)
#!/usr/bin/env python
#START STUDENT CODE45_RUNNER
# Driver setup: configure the MRJob k-means runner and the module-level
# globals shared by the helper functions defined later in the file.
import numpy as np
import sys
from Kmeans import MRKmeans, stop_criterion

# set the randomizer seed so results are the same each time.
np.random.seed(0)

# define mrjob runner
# NOTE(review): this passes '--file=Centroids.txt' (capital C) while the
# driver above writes 'centroids.txt' — confirm which filename the job reads.
mr_job = MRKmeans(
    args=["topUsers_Apr-Jul_2014_1000-words.txt", '--file=Centroids.txt'])

# initial centroids (reassigned later) and number of clusters
centroid_points = []
k = 4
# maps class codes (as string-formatted floats) to human-readable labels
class_codes = {
    '0.0': 'Human',
    '1.0': 'Cyborg',
    '2.0': 'Robot',
    '3.0': 'Spammer'
}


def startCentroidsBC(k):
    import re
    counter = 0
    for line in open(
            "topUsers_Apr-Jul_2014_1000-words_summaries.txt").readlines():
        if counter == 1:
            data = re.split(",", line)
            globalAggregate = [
Beispiel #3
0
from numpy import random
import numpy as np
from Kmeans import MRKmeans, stop_criterion
import sys
from custom_func import calc_purity
mr_job = MRKmeans(args=['topUsers_Apr-Jul_2014_1000-words.txt'])

random.seed(0)
# number of features (presumably one per word in the 1000-word vocabulary)
n= 1000

# get centroid type and number of clusters from the command line
# NOTE(review): k is only bound when more than two argv entries are given;
# with fewer arguments the use of k below raises NameError — confirm intended.
if len(sys.argv) >2: k = int(sys.argv[2])
cen_type = sys.argv[1]

# Generate initial centroids
centroid_points = []

# based on the centroid type generate centroids
if(cen_type=='Uniform'):
    # k random vectors of length n, each normalized to sum to 1
    rand_int = random.uniform(size=[k,n])
    total = np.sum(rand_int,axis=1)
    centroid_points = (rand_int.T/total).T
    # one comma-separated centroid per line, as the job expects
    with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)
    f.close()  # NOTE(review): redundant — the with-block already closed f
    
elif(cen_type=='Perturbation'):
    data = [s.split('\n')[0].split(',') for s in 
                   open("topUsers_Apr-Jul_2014_1000-words_summaries.txt").readlines()][1]
Beispiel #4
0
import numpy as np
import sys
from Kmeans import MRKmeans, stop_criterion

# initialize variables
SOURCE = "topUsers_Apr-Jul_2014_1000-words.txt"
SUMMARY = "topUsers_Apr-Jul_2014_1000-words_summaries.txt"
CENTROIDS = "/tmp/centroids"
THRESHOLD = 0.001

# set the randomizer seed so results are the same each time.
np.random.seed(0)

# define mrjob runner
mr_job = MRKmeans(args=[SOURCE])

# validate driver inputs - K and distribution type
if len(sys.argv) != 3:
    print "Invalid number of arguments. Pass k (cluster size) and centroid distribution type (uniform, perturbed, normal)"
    sys.exit(1)

k = sys.argv[1]
try:
    k = int(k)
except:
    raise TypeError("Invalid k. k must be an integer")

distr_type = sys.argv[2]
if distr_type not in ['uniform', 'perturbed', 'trained']:
    print "Invalid centroid distribution type. Type should be uniform, perturbed or trained."
    sys.exit(1)
Beispiel #5
0
            centroids[idx][1] = centroids[idx][1] + y
        centroids[idx][0] = centroids[idx][0]/num[idx]
        centroids[idx][1] = centroids[idx][1]/num[idx]

        yield idx,(centroids[idx][0],centroids[idx][1])
      
# Standard script entry point: start the MRKmeans job when run directly.
if __name__ == '__main__':
    MRKmeans.run()

## Driver ##

# IPython/notebook magics: auto-reload edited modules during development.
%reload_ext autoreload
%autoreload 2
from numpy import random
from Kmeans import MRKmeans, stop_criterion
mr_job = MRKmeans(args=['Kmeandata.csv', '--file=Centroids.txt'])

# Generate k initial 2-D centroids, each coordinate uniform in [-3, 3)
centroid_points = []
k = 3
for i in range(k):
    centroid_points.append([random.uniform(-3,3),random.uniform(-3,3)])
# write one comma-separated centroid per line for the job to read
with open('Centroids.txt', 'w+') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)
# Initiate the W0, W1 

# Update centroids iteratively
i = 0
while(1):
    # save previous centroids to check convergence
# New Centroids = initial centroids
# 
# While(1):
# + Calculate new centroids
# + stop if new centroids close to old centroids
# + Updates centroids 

# In[11]:

#get_ipython().magic(u'reload_ext autoreload')
#get_ipython().magic(u'autoreload 2')
%reload_ext autoreload
%autoreload 2
from numpy import random
from Kmeans import MRKmeans, stop_criterion
mr_job = MRKmeans(args=['Kmeandata.csv', '--file=Centroids.txt']) # training data, initial centriods coded below

#Geneate initial centroids
centroid_points = []
k = 3
for i in range(k):
    random.seed(8888)
    centroid_points.append([random.uniform(-3,3),random.uniform(-3,3)])
with open('Centroids.txt', 'w') as f:
        f.writelines(','.join(str(j) for j in i) + '\n' for i in centroid_points)

# Update centroids iteratively
i = 0
while(1):
    # save previous centroids to check convergence
    centroid_points_old = centroid_points[:] # store the current version of the centroids