def knn(p, train, k):
    """Does a K-nearest-neighbors classification on a point p given a
    training set.

    Parameters:
    - p: Numpy array with the point to classify
    - train: An iterable object with (p, label) entries, where
             p is a numpy array representing a point in the
             training set, and label is the label for that point.
    - k: The number of neighbors to use.

    Returns a tuple (max_label, neighbors):
    - max_label: the value find_majority() returns for the labels of the
                 selected neighbors, or None when no neighbor was selected
                 (e.g. the training set is empty).
    - neighbors: a list of [distance, point, label] lists ordered from the
                 farthest selected neighbor to the nearest one. It holds at
                 most k entries, and fewer when the training set has fewer
                 than k points at a finite distance.
    """
    # The query point only needs to be converted once, not once per
    # training point.
    query = p.astype("float")

    # Max-heap of the best candidates seen so far, emulated with heapq's
    # min-heap by storing negated distances. Entries are
    # (-distance, sequence number, point, label); the unique sequence number
    # breaks ties so that two equally distant neighbors never fall through
    # to comparing the numpy point arrays (which raises ValueError).
    heap = []
    for seq, (pt, label) in enumerate(train):
        dist = distance(query, pt)
        # Until k candidates are held any finite distance qualifies;
        # afterwards only something closer than the current worst does.
        worst = -heap[0][0] if len(heap) >= k else float("inf")
        if dist < worst:
            entry = (-dist, seq, pt, label)
            if len(heap) < k:
                heapq.heappush(heap, entry)
            else:
                heapq.heapreplace(heap, entry)

    # Labels are handed over in heap order, before sorting.
    labels = [label for _, _, _, label in heap]
    max_label = find_majority(labels) if labels else None

    # Sorting on the negated distance puts the farthest neighbor first.
    heap.sort()
    neighbors = [[-neg_dist, pt, label] for neg_dist, _, pt, label in heap]

    return (max_label, neighbors)
def get_k_means(self, centroids):
    """Perform one assignment/update step of k-means.

    Every point in self.points is assigned to the centroid with the
    smallest distance() to it, and each centroid is then recomputed as the
    mean of the points assigned to it.

    Parameters:
    - centroids: Indexable sequence of the current centroids; expected to
                 hold self.k entries, each comparable to a point through
                 distance().

    Returns a tuple (new_centroids, point_assignment):
    - new_centroids: numpy array with self.k rows and
                     self.points.shape[1] columns holding the updated
                     centroids.
    - point_assignment: list with, for every point in self.points (in
                        order), the index of the centroid it was assigned
                        to.
    """
    # Assignment step: index of the nearest centroid for every point.
    point_assignment = [
        numpy.argmin([distance(point, c) for c in centroids])
        for point in self.points
    ]

    # Update step: accumulate per-cluster sums and member counts.
    new_centroids = numpy.zeros((self.k, self.points.shape[1]))
    points_in_cluster = [0] * self.k
    for point, cluster in zip(self.points, point_assignment):
        new_centroids[cluster] += point
        points_in_cluster[cluster] += 1

    for cluster, count in enumerate(points_in_cluster):
        if count:
            new_centroids[cluster] /= count
        else:
            # A cluster that attracted no points has no mean; dividing by
            # zero would turn its centroid into NaN and poison every later
            # distance computation. Keep the previous centroid instead.
            new_centroids[cluster] = centroids[cluster]

    return new_centroids, point_assignment
points = data.to_array() km = KMeans(points, args.k) centroids = km.select_random_centroids() iteration = 1 while True: print "Iteration #%i with centroids %s" % (iteration, [(x[0], x[1]) for x in centroids]) new_centroids, point_assignment = km.get_k_means(centroids) if args.outfile == None: gen_scatter_plot(points, point_assignment, centroids) distances = [distance(c1, c2) for c1, c2 in zip(centroids, new_centroids)] max_d = max(distances) if max_d <= args.cutoff: print "Max shift: %.2f <= %.2f" % (max_d, args.cutoff) print "Algorithm has converged." if args.outfile == None: gen_scatter_plot(points, point_assignment, new_centroids) break else: print "Max shift: %.2f > %.2f" % (max_d, args.cutoff) print "Algorithm hasn't converged yet." centroids = new_centroids iteration += 1 if args.outfile != None:
km = KMeans(points, args.k) centroids = km.select_random_centroids() iteration = 1 while True: print "Iteration #%i with centroids %s" % (iteration, [(x[0], x[1]) for x in centroids]) new_centroids, point_assignment = km.get_k_means(centroids) if args.outfile == None: gen_scatter_plot(points, point_assignment, centroids) distances = [distance(c1, c2) for c1, c2 in zip(centroids, new_centroids)] max_d = max(distances) if max_d <= args.cutoff: print "Max shift: %.2f <= %.2f" % (max_d, args.cutoff) print "Algorithm has converged." if args.outfile == None: gen_scatter_plot(points, point_assignment, new_centroids) break else: print "Max shift: %.2f > %.2f" % (max_d, args.cutoff) print "Algorithm hasn't converged yet." centroids = new_centroids iteration += 1 if args.outfile != None: