def main():
    """Run up to 20 k-means iterations on the Enron corpus.

    Reads the starting centroids (documents chosen by a prior canopy/
    initialization step) from intermediateResults.txt, then repeatedly runs
    MRkMeansIter.  After each pass the centroids the job wrote back are
    reloaded and the mean Jaccard distance to the previous centroid set is
    printed as a convergence measure.
    """
    # Initial centroids left behind by the canopy step.
    resultsPath = os.path.join(PROJECT_ROOT, 'intermediateResults.txt')
    with open(resultsPath) as fileIn:
        centroidsJson = fileIn.read()

    # Corpus consumed by the MapReduce job.
    filePath = os.path.join(PROJECT_ROOT, 'enron_corpus.txt')

    #
    # Iterate a fixed number of times.  The delta-based convergence test
    # (while delta > 0.01) is intentionally disabled; 20 passes caps the run.
    #
    loop_count = 0
    while loop_count < 20:
        # centroidsJson format: [ document1, document2, ..., documentN ]
        oldCentroids = json.loads(centroidsJson)

        # One MapReduce pass; the job rewrites intermediateResults.txt.
        mrJob2 = MRkMeansIter(args=[filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # Reload the centers this iteration just produced.
        with open(resultsPath) as fileIn:
            centroidsJson = fileIn.read()
        newCentroids = json.loads(centroidsJson)

        # Mean pairwise Jaccard distance between matching old/new centers.
        # (Compute each distance once — the original computed it twice and
        # bound the first result to an unused local.)
        delta = 0.0
        for i in range(len(newCentroids)):
            delta += jaccard_dist(newCentroids[i], oldCentroids[i])
        delta = delta / len(newCentroids)

        print(delta)
        loop_count += 1
# Example #2
# 0
def main():
    """Initialize k=3 centroids with MRkMeansInit, then iterate MRkMeansIter
    until the summed centroid movement drops to 0.01 or below.

    Both jobs exchange centroid state through intermediateResults.txt under
    centPath; each iteration's total movement is printed.
    """
    filePath = '/Users/ashwin/repos/mlclass/kMeans3/input.txt'
    # Directory holding intermediateResults.txt (was assigned twice in the
    # original; once is enough).
    centPath = "/Users/ashwin/repos/mlclass/kMeans3/"
    kMeansStr = '3'

    # Seed the intermediate-results file with starting centroids.
    mrJob = MRkMeansInit([filePath] + ["--k", kMeansStr, "--pathName", centPath])
    with mrJob.make_runner() as runner:
        runner.run()

    # Starting centroids, to compare against after each iteration.
    with open(centPath + "intermediateResults.txt") as fileIn:
        centroidsJson = fileIn.read()

    delta = 10  # above the threshold so the loop body runs at least once
    # Begin iteration on change in centroids
    while delta > 0.01:
        # Parse old centroid values.
        oldCentroids = json.loads(centroidsJson)

        # One k-means pass; the job rewrites intermediateResults.txt.
        mrJob2 = MRkMeansIter(args=[filePath] + ["--k", kMeansStr, "--pathName", centPath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # Compare new centroids to old ones.
        with open(centPath + "intermediateResults.txt") as fileIn:
            centroidsJson = fileIn.read()
        newCentroids = json.loads(centroidsJson)

        # Total distance moved by all centers this pass.
        delta = 0.0
        for i in range(len(newCentroids)):
            delta += dist(newCentroids[i], oldCentroids[i])

        print(delta)
# Example #3
# 0
def main():
    """Drive k-means iterations on EMR, polling the centroid file in S3
    until the total centroid movement is 0.01 or less.

    Assumes the initializer has already seeded the S3 results object (the
    MRkMeansInit invocation is intentionally disabled).
    """
    input_path = '/home/mike-bowles/pyWorkspace/mapReducers/src/kMeans3/input.txt'

    # S3 object holding the current centroid set; each job run refreshes it.
    cent_path = "s3://mike-mrjob/kMeans/centroids/intermediateResults.txt"
    s3_key = EMRJobRunner().get_s3_key(cent_path)
    centers_json = s3_key.get_contents_as_string()

    movement = 10  # start above the threshold so we iterate at least once
    while movement > 0.01:
        # Centers from before this pass.
        previous = json.loads(centers_json)

        # One EMR pass; the job writes fresh centroids back to S3.
        step_job = MRkMeansIter(args=['-r', 'emr', input_path])
        with step_job.make_runner() as active_runner:
            active_runner.run()

        # Pull the updated centers and measure how far they moved.
        centers_json = s3_key.get_contents_as_string()
        current = json.loads(centers_json)

        movement = 0.0
        for idx in range(len(current)):
            movement += dist(current[idx], previous[idx])

        print(movement)
# Example #4
# 0
def main():
    """Initialize centroids with MRkMeansInit, then iterate MRkMeansIter
    until the summed centroid movement drops to 0.001 or below.

    State is exchanged through intermediateResults.txt under PROJECT_ROOT;
    each pass prints the movement and the new centers.
    """
    # First run the initializer to get starting centroids.
    filePath = os.path.join(PROJECT_ROOT, 'input.txt')
    mrJob = MRkMeansInit(args=[filePath])
    with mrJob.make_runner() as runner:
        runner.run()

    # Starting centroids, to compare with values after each iteration.
    centPath = os.path.join(PROJECT_ROOT, 'intermediateResults.txt')
    with open(centPath) as fileIn:
        centroidsJson = fileIn.read()

    delta = 10  # above the threshold so the loop body runs at least once
    # Begin iteration on change in centroids
    while delta > 0.001:
        # Parse old centroid values.
        oldCentroids = json.loads(centroidsJson)

        # One k-means pass; the job rewrites intermediateResults.txt.
        mrJob2 = MRkMeansIter(args=[filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # Compare new centroids to old ones.
        with open(centPath) as fileIn:
            centroidsJson = fileIn.read()
        newCentroids = json.loads(centroidsJson)

        # Total distance moved by all centers this pass.
        delta = 0.0
        for i in range(len(newCentroids)):
            delta += dist(newCentroids[i], oldCentroids[i])

        print("delta={0},  centers={1}".format(delta, str(newCentroids)))
def main():
    """Seed centroids via MRkMeansInit, then loop MRkMeansIter until the
    centers move by a total of 0.001 or less, reporting each pass."""
    corpus = os.path.join(PROJECT_ROOT, 'input.txt')

    # Initialization job writes the first centroid set to disk.
    init_job = MRkMeansInit(args=[corpus])
    with init_job.make_runner() as r:
        r.run()

    # File through which successive jobs exchange centroid state.
    results_file = os.path.join(PROJECT_ROOT, 'intermediateResults.txt')
    with open(results_file) as fh:
        raw_centers = fh.read()

    movement = 10  # forces at least one iteration
    while movement > 0.001:
        before = json.loads(raw_centers)

        # Run a single k-means pass; it rewrites the results file.
        step_job = MRkMeansIter(args=[corpus])
        with step_job.make_runner() as r:
            r.run()

        # Reload the updated centers and total up how far they moved.
        with open(results_file) as fh:
            raw_centers = fh.read()
        after = json.loads(raw_centers)

        movement = sum(dist(after[i], before[i]) for i in range(len(after)))

        print("delta={0},  centers={1}".format(movement, str(after)))