from typing import List, Tuple
import random
import tqdm

# Vector, vector_mean, standard_deviation, subtract, squared_distance,
# distance, add, dot, and gradient_step are the helpers built up in
# earlier chapters (linear algebra, statistics, gradient descent).

def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    """returns the means and standard deviations for each position"""
    dim = len(data[0])

    means = vector_mean(data)
    stdevs = [standard_deviation([vector[i] for vector in data])
              for i in range(dim)]

    return means, stdevs
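# A quick sanity check for scale (illustrative, not from the original;
# it assumes standard_deviation uses the sample (n-1) convention from
# the statistics chapter):
vectors = [[-3, -1, 1], [-1, 0, 1], [1, 1, 1]]
means, stdevs = scale(vectors)
assert means == [-1, 0, 1]
assert stdevs == [2, 1, 0]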
def cluster_means(k: int,
                  inputs: List[Vector],
                  assignments: List[int]) -> List[Vector]:
    # clusters[i] contains the inputs whose assignment is i
    clusters = [[] for _ in range(k)]
    for input, assignment in zip(inputs, assignments):
        clusters[assignment].append(input)

    # if a cluster is empty, just use a random point
    return [vector_mean(cluster) if cluster else random.choice(inputs)
            for cluster in clusters]
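# Illustrative check (not from the original): with k=2, the two points
# assigned to cluster 0 average together, and cluster 1 keeps its lone
# point. (An empty cluster would fall back to a random input instead.)
assert cluster_means(2,
                     [[0, 0], [2, 2], [10, 10]],
                     [0, 0, 1]) == [[1, 1], [10, 10]]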
def least_squares_fit_ridge(xs: List[Vector],
                            ys: List[float],
                            alpha: float,
                            learning_rate: float,
                            num_steps: int,
                            batch_size: int = 1) -> Vector:
    # Start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in range(num_steps):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([sqerror_ridge_gradient(x, y, guess, alpha)
                                    for x, y in zip(batch_xs, batch_ys)])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
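# least_squares_fit_ridge relies on sqerror_ridge_gradient, which isn't
# defined in this section. A minimal sketch, assuming the model
# y = dot(x, beta), that beta[0] is an unpenalized intercept, that `add`
# comes from the linear algebra chapter, and using the sqerror_gradient
# sketched just after least_squares_fit below:
def ridge_penalty_gradient(beta: Vector, alpha: float) -> Vector:
    """gradient of just the ridge penalty (the intercept is not penalized)"""
    return [0.] + [2 * alpha * beta_j for beta_j in beta[1:]]

def sqerror_ridge_gradient(x: Vector, y: float,
                           beta: Vector, alpha: float) -> Vector:
    """the gradient for one squared-error term, including the ridge penalty"""
    return add(sqerror_gradient(x, y, beta),
               ridge_penalty_gradient(beta, alpha))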
def least_squares_fit(xs: List[Vector],
                      ys: List[float],
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """
    Find the beta that minimizes the sum of squared errors
    assuming the model y = dot(x, beta).
    """
    # Start with a random guess
    guess = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc="least squares fit"):
        for start in range(0, len(xs), batch_size):
            batch_xs = xs[start:start + batch_size]
            batch_ys = ys[start:start + batch_size]

            gradient = vector_mean([sqerror_gradient(x, y, guess)
                                    for x, y in zip(batch_xs, batch_ys)])
            guess = gradient_step(guess, gradient, -learning_rate)

    return guess
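# least_squares_fit (and the ridge version above) needs the per-example
# gradient of the squared error. A minimal sketch, assuming the model
# y = dot(x, beta) with `dot` from the linear algebra chapter:
def sqerror_gradient(x: Vector, y: float, beta: Vector) -> Vector:
    """the gradient of (dot(x, beta) - y) ** 2 with respect to beta"""
    err = dot(x, beta) - y
    return [2 * err * x_i for x_i in x]

# main() below also trains a KMeans clusterer that isn't defined in this
# section. A minimal sketch built on cluster_means above; the stopping
# rule (iterate until no assignment changes) is one common choice and is
# an assumption here, not necessarily the original class:
class KMeans:
    def __init__(self, k: int) -> None:
        self.k = k            # number of clusters
        self.means = None     # computed by train()

    def classify(self, input: Vector) -> int:
        """return the index of the cluster closest to the input"""
        return min(range(self.k),
                   key=lambda i: squared_distance(input, self.means[i]))

    def train(self, inputs: List[Vector]) -> None:
        # start with random assignments
        assignments = [random.randrange(self.k) for _ in inputs]

        while True:
            # compute means and find new assignments
            self.means = cluster_means(self.k, inputs, assignments)
            new_assignments = [self.classify(input) for input in inputs]

            # if no assignment changed, we've converged
            if new_assignments == assignments:
                return

            assignments = new_assignments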
def main():
    inputs: List[List[float]] = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],
                                 [21,27],[-49,15],[26,13],[-46,5],[-34,-1],
                                 [11,15],[-49,0],[-22,-16],[19,28],[-12,-8],
                                 [-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]

    random.seed(12)                   # so you get the same results as me
    clusterer = KMeans(k=3)
    clusterer.train(inputs)
    means = sorted(clusterer.means)   # sort for the unit test

    assert len(means) == 3

    # Check that the means are close to what we expect.
    assert squared_distance(means[0], [-44, 5]) < 1
    assert squared_distance(means[1], [-16, -10]) < 1
    assert squared_distance(means[2], [18, 20]) < 1

    random.seed(0)
    clusterer = KMeans(k=2)
    clusterer.train(inputs)
    means = sorted(clusterer.means)

    assert len(means) == 2
    assert squared_distance(means[0], [-26, -5]) < 1
    assert squared_distance(means[1], [18, 20]) < 1

    from matplotlib import pyplot as plt

    def squared_clustering_errors(inputs: List[Vector], k: int) -> float:
        """finds the total squared error from k-means clustering the inputs"""
        clusterer = KMeans(k)
        clusterer.train(inputs)
        means = clusterer.means
        assignments = [clusterer.classify(input) for input in inputs]

        return sum(squared_distance(input, means[cluster])
                   for input, cluster in zip(inputs, assignments))

    # now plot from 1 up to len(inputs) clusters
    ks = range(1, len(inputs) + 1)
    errors = [squared_clustering_errors(inputs, k) for k in ks]

    plt.plot(ks, errors)
    plt.xticks(ks)
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.title("Total Error vs. # of Clusters")
    # plt.show()

    plt.savefig('im/total_error_vs_num_clusters')
    plt.gca().clear()

    image_path = r"girl_with_book.jpg"    # wherever your image is
    import matplotlib.image as mpimg
    img = mpimg.imread(image_path) / 256  # rescale to between 0 and 1

    # .tolist() converts a numpy array to a Python list
    pixels = [pixel.tolist() for row in img for pixel in row]

    clusterer = KMeans(5)
    clusterer.train(pixels)   # this might take a while

    def recolor(pixel: Vector) -> Vector:
        cluster = clusterer.classify(pixel)   # index of the closest cluster
        return clusterer.means[cluster]       # mean of the closest cluster

    new_img = [[recolor(pixel) for pixel in row]   # recolor this row of pixels
               for row in img]                     # for each row in the image

    plt.close()
    plt.imshow(new_img)
    plt.axis('off')
    # plt.show()

    plt.savefig('im/recolored_girl_with_book.jpg')
    plt.gca().clear()

    base_cluster = bottom_up_cluster(inputs)

    three_clusters = [get_values(cluster)
                      for cluster in generate_clusters(base_cluster, 3)]

    # sort smallest to largest
    tc = sorted(three_clusters, key=len)
    assert len(tc) == 3
    assert [len(c) for c in tc] == [2, 4, 14]
    assert sorted(tc[0]) == [[11, 15], [13, 13]]

    plt.close()

    for i, cluster, marker, color in zip([1, 2, 3],
                                         three_clusters,
                                         ['D', 'o', '*'],
                                         ['r', 'g', 'b']):
        xs, ys = zip(*cluster)  # magic unzipping trick
        plt.scatter(xs, ys, color=color, marker=marker)

        # put a number at the mean of the cluster
        x, y = vector_mean(cluster)
        plt.plot(x, y, marker='$' + str(i) + '$', color='black')

    plt.title("User Locations -- 3 Bottom-Up Clusters, Min")
    plt.xlabel("blocks east of city center")
    plt.ylabel("blocks north of city center")
    # plt.show()

    plt.savefig('im/bottom_up_clusters_min.png')
    plt.gca().clear()
    plt.close()

    base_cluster_max = bottom_up_cluster(inputs, max)
    three_clusters_max = [get_values(cluster)
                          for cluster in generate_clusters(base_cluster_max, 3)]

    for i, cluster, marker, color in zip([1, 2, 3],
                                         three_clusters_max,
                                         ['D', 'o', '*'],
                                         ['r', 'g', 'b']):
        xs, ys = zip(*cluster)  # magic unzipping trick
        plt.scatter(xs, ys, color=color, marker=marker)

        # put a number at the mean of the cluster
        x, y = vector_mean(cluster)
        plt.plot(x, y, marker='$' + str(i) + '$', color='black')

    plt.title("User Locations -- 3 Bottom-Up Clusters, Max")
    plt.xlabel("blocks east of city center")
    plt.ylabel("blocks north of city center")

    plt.savefig('im/bottom_up_clusters_max.png')
    plt.gca().clear()
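# main() above also calls bottom_up_cluster, get_values, and
# generate_clusters, none of which appear in this section. Below is a
# sketch consistent with how they're used: a cluster is either a leaf
# (one point) or a merge of two clusters tagged with its merge order,
# and generate_clusters unmerges the most recent merges first. The
# `distance` function is assumed from the linear algebra chapter.
from typing import Callable, NamedTuple, Union

class Leaf(NamedTuple):
    value: Vector

class Merged(NamedTuple):
    children: tuple
    order: int    # how many clusters were left when this merge happened

Cluster = Union[Leaf, Merged]

def get_values(cluster: Cluster) -> List[Vector]:
    """all the values contained in this cluster (recursively)"""
    if isinstance(cluster, Leaf):
        return [cluster.value]
    else:
        return [value
                for child in cluster.children
                for value in get_values(child)]

def cluster_distance(cluster1: Cluster,
                     cluster2: Cluster,
                     distance_agg: Callable = min) -> float:
    """aggregate the pairwise distances between the two clusters' values"""
    return distance_agg([distance(v1, v2)
                         for v1 in get_values(cluster1)
                         for v2 in get_values(cluster2)])

def get_merge_order(cluster: Cluster) -> float:
    """leaves were never merged, so treat them as merged 'last'"""
    return float('inf') if isinstance(cluster, Leaf) else cluster.order

def bottom_up_cluster(inputs: List[Vector],
                      distance_agg: Callable = min) -> Cluster:
    # start with every input as a one-point (leaf) cluster
    clusters: List[Cluster] = [Leaf(input) for input in inputs]

    while len(clusters) > 1:
        # find the two closest clusters
        c1, c2 = min(((c1, c2)
                      for i, c1 in enumerate(clusters)
                      for c2 in clusters[:i]),
                     key=lambda pair: cluster_distance(pair[0], pair[1],
                                                       distance_agg))

        # replace them with their merge, tagged with the merge order
        clusters = [c for c in clusters if c != c1 and c != c2]
        clusters.append(Merged((c1, c2), order=len(clusters)))

    return clusters[0]

def generate_clusters(base_cluster: Cluster,
                      num_clusters: int) -> List[Cluster]:
    clusters = [base_cluster]
    while len(clusters) < num_clusters:
        # unmerge the most recently merged cluster (smallest order)
        next_cluster = min(clusters, key=get_merge_order)
        clusters = [c for c in clusters if c != next_cluster]
        clusters.extend(next_cluster.children)
    return clusters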
def de_mean(data: List[Vector]) -> List[Vector]:
    """Recenters the data to have mean 0 in every dimension"""
    mean = vector_mean(data)
    return [subtract(vector, mean) for vector in data]
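# Illustrative check (not from the original): after de_mean, every
# dimension averages to zero.
assert de_mean([[1, 2], [3, 4]]) == [[-1, -1], [1, 1]]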