import json
import os

# MRkMeansIter, jaccard_dist, and PROJECT_ROOT are defined elsewhere in this project.

def main():
    # initialize: take the existing canopy centers as the starting centroids,
    # read from the intermediate results file
    resultsPath = os.path.join(PROJECT_ROOT, 'intermediateResults.txt')
    fileIn = open(resultsPath)
    centroidsJson = fileIn.read()
    fileIn.close()

    # input file
    filePath = os.path.join(PROJECT_ROOT, 'enron_corpus.txt')

    # delta threshold that we consider "close enough" to converged
    delta = 10

    # begin iterating on the change in centroids
    #while delta > 0.01:
    loop_count = 0
    while loop_count < 20:
        # parse old centroid values
        # centroidsJson format: [document1, document2, ..., documentN]
        oldCentroids = json.loads(centroidsJson)

        # run one iteration
        mrJob2 = MRkMeansIter(args=[filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare the new centroids to the old ones to determine whether
        # we are close enough to a final clustering solution
        fileIn = open(resultsPath)
        centroidsJson = fileIn.read()
        fileIn.close()
        newCentroids = json.loads(centroidsJson)

        delta = 0.0
        for i in range(len(newCentroids)):
            delta += jaccard_dist(newCentroids[i], oldCentroids[i])
        delta = delta / len(newCentroids)
        print(delta)
        loop_count += 1
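# The driver above averages jaccard_dist over corresponding centroid pairs,
# but none of these listings define it. A minimal sketch follows, assuming
# each centroid "document" is an iterable of tokens (the actual centroid
# format isn't shown in the listings, so treat that as an assumption):

def jaccard_dist(doc_a, doc_b):
    """Jaccard distance: 1 - |A & B| / |A | B| over the documents' word sets."""
    a, b = set(doc_a), set(doc_b)
    if not a and not b:
        return 0.0  # two empty documents are identical
    return 1.0 - float(len(a & b)) / len(a | b)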
import json

def main():
    # first run the initializer to get starting centroids
    filePath = '/Users/ashwin/repos/mlclass/kMeans3/input.txt'
    centPath = '/Users/ashwin/repos/mlclass/kMeans3/'
    kMeansStr = '3'
    mrJob = MRkMeansInit(args=[filePath, "--k", kMeansStr, "--pathName", centPath])
    #mrJob = MRkMeansInit(args=[filePath])
    with mrJob.make_runner() as runner:
        runner.run()

    # pull out the centroid values to compare with values after one iteration
    fileIn = open(centPath + "intermediateResults.txt")
    centroidsJson = fileIn.read()
    fileIn.close()

    delta = 10
    # begin iterating on the change in centroids
    while delta > 0.01:
        # parse old centroid values
        oldCentroids = json.loads(centroidsJson)

        # run one iteration
        mrJob2 = MRkMeansIter(args=[filePath, "--k", kMeansStr, "--pathName", centPath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare new centroids to old ones
        fileIn = open(centPath + "intermediateResults.txt")
        centroidsJson = fileIn.read()
        fileIn.close()
        newCentroids = json.loads(centroidsJson)

        kMeans = len(newCentroids)
        delta = 0.0
        for i in range(kMeans):
            delta += dist(newCentroids[i], oldCentroids[i])
        print(delta)
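# dist() is referenced here and in the variants below but never defined in
# these listings. For numeric centroid vectors, plain Euclidean distance is
# the conventional choice for k-means; a sketch, assuming equal-length lists
# of coordinates:

import math

def dist(center_a, center_b):
    """Euclidean distance between two centroids given as coordinate lists."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(center_a, center_b)))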
import json

from mrjob.emr import EMRJobRunner

def main():
    # first run the initializer to get starting centroids
    filePath = '/home/mike-bowles/pyWorkspace/mapReducers/src/kMeans3/input.txt'
    #mrJob = MRkMeansInit(args=[filePath])
    #mrJob = MRkMeansInit(args=['-r', 'emr', filePath])
    #with mrJob.make_runner() as runner:
    #    runner.run()

    # pull the centroid values from S3 to compare with values after one iteration
    centPath = "s3://mike-mrjob/kMeans/centroids/intermediateResults.txt"
    key = EMRJobRunner().get_s3_key(centPath)
    centroidsJson = key.get_contents_as_string()

    delta = 10
    # begin iterating on the change in centroids
    while delta > 0.01:
        # parse old centroid values
        oldCentroids = json.loads(centroidsJson)

        # run one iteration on EMR
        mrJob2 = MRkMeansIter(args=['-r', 'emr', filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare new centroids to old ones (re-read the S3 key)
        centroidsJson = key.get_contents_as_string()
        newCentroids = json.loads(centroidsJson)

        kMeans = len(newCentroids)
        delta = 0.0
        for i in range(kMeans):
            delta += dist(newCentroids[i], oldCentroids[i])
        print(delta)
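# Note: EMRJobRunner().get_s3_key() and the boto Key's get_contents_as_string()
# come from older mrjob/boto releases. On a current stack, a roughly equivalent
# fetch of the centroids file with boto3 (bucket and key names taken from
# centPath above) might look like this sketch:

import boto3

def read_centroids_json(bucket='mike-mrjob',
                        key='kMeans/centroids/intermediateResults.txt'):
    """Fetch the intermediate centroid JSON from S3 via boto3."""
    obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    return obj['Body'].read().decode('utf-8')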
import json
import os

def main():
    # first run the initializer to get starting centroids
    filePath = os.path.join(PROJECT_ROOT, 'input.txt')
    mrJob = MRkMeansInit(args=[filePath])
    with mrJob.make_runner() as runner:
        runner.run()

    # pull out the centroid values to compare with values after one iteration
    centPath = os.path.join(PROJECT_ROOT, 'intermediateResults.txt')
    fileIn = open(centPath)
    centroidsJson = fileIn.read()
    fileIn.close()

    delta = 10
    # begin iterating on the change in centroids
    while delta > 0.001:
        # parse old centroid values
        oldCentroids = json.loads(centroidsJson)

        # run one iteration
        mrJob2 = MRkMeansIter(args=[filePath])
        with mrJob2.make_runner() as runner:
            runner.run()

        # compare new centroids to old ones
        fileIn = open(centPath)
        centroidsJson = fileIn.read()
        fileIn.close()
        newCentroids = json.loads(centroidsJson)

        kMeans = len(newCentroids)
        delta = 0.0
        for i in range(kMeans):
            delta += dist(newCentroids[i], oldCentroids[i])
        print("delta={0}, centers={1}".format(delta, str(newCentroids)))
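# Each of these drivers is meant to run as a script, so the usual
# entry-point guard applies:

if __name__ == '__main__':
    main()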