コード例 #1
0
ファイル: kMeans.py プロジェクト: AshKash/kit-sink
def main():
    """Iterate the k-means MapReduce job on EMR until the centroids settle.

    Loads the current centroids (a JSON document) from S3, then repeatedly
    runs one ``MRkMeansIter`` job on EMR, re-reads the centroids from the
    same S3 key and sums ``dist`` over each new/old centroid pair.  That sum
    is printed after every pass, and the loop ends once it is no longer
    greater than 0.01.

    NOTE(review): presumably MRkMeansIter rewrites the centroid file on S3
    between the two reads -- confirm against the job's implementation.
    """
    filePath = '/home/mike-bowles/pyWorkspace/mapReducers/src/kMeans3/input.txt'

    # First run the initializer to get starting centroids.  It is kept
    # disabled here; re-enable one of the variants below to regenerate them.
    #mrJob = MRkMeansInit(args=[filePath])
    #mrJob = MRkMeansInit(args=['-r', 'emr', filePath])
    #with mrJob.make_runner() as runner:
    #    runner.run()

    # Pull out the centroid values to compare with values after one iteration.
    centPath = "s3://mike-mrjob/kMeans/centroids/intermediateResults.txt"
    key = EMRJobRunner().get_s3_key(centPath)
    oldCentroids = json.loads(key.get_contents_as_string())

    # Seed above the threshold so the loop body runs at least once.
    delta = 10
    # Iterate until the total change in centroids drops to the threshold.
    while delta > 0.01:
        # Run one iteration of the clustering job.
        mrJob2 = MRkMeansIter(args=['-r', 'emr', filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # Re-read the centroids and measure how far they moved.
        newCentroids = json.loads(key.get_contents_as_string())

        # Index into oldCentroids (rather than zip) so a shorter old list
        # still raises IndexError instead of being silently truncated; the
        # 0.0 start keeps the result a float even for an empty list.
        delta = sum(
            (dist(newCentroid, oldCentroids[i])
             for i, newCentroid in enumerate(newCentroids)),
            0.0)

        # Parenthesized form prints identically on Python 2 and parses on 3.
        print(delta)

        # The centroids just read are the baseline for the next pass; no
        # need to parse the same JSON a second time.
        oldCentroids = newCentroids