def fit(X, k, do_plot=False):
    """Cluster the rows of X with k-medians and return the fitted model.

    Points are assigned to the median with the smallest L1 (Manhattan)
    distance, and each median is then recomputed as the feature-wise median
    of its assigned points. This repeats until no assignment changes.

    Parameters
    ----------
    X : numpy array with two dimensions (N points by D features)
    k : number of clusters
    do_plot : when true and D == 2, show the final clustering

    Returns
    -------
    dict with keys 'medians' (k-by-D array) plus 'predict' and 'error',
    which hold this module's ``predict`` and ``error`` functions.
    """
    N, D = X.shape
    y = np.ones(N)

    # Random initialization: every median starts at a randomly drawn row of X
    # (draws are independent, so two medians may start at the same point).
    medians = np.zeros((k, D))
    for kk in range(k):
        medians[kk] = X[np.random.randint(N)]

    while True:
        y_old = y

        # L1 distance from every point to every median, summed over ALL D
        # features (broadcast to an N-by-k-by-D difference array).
        dist = np.abs(X[:, np.newaxis, :] - medians[np.newaxis, :, :]).sum(axis=2)
        # A NaN median belongs to an empty cluster; make it unreachable so
        # argmin never selects it.
        dist[np.isnan(dist)] = np.inf
        y = np.argmin(dist, axis=1)

        # Update medians
        for kk in range(k):
            cluster = X[y == kk]
            # An empty cluster gets NaN (what np.median would produce),
            # without triggering the empty-slice warning.
            medians[kk] = np.median(cluster, axis=0) if len(cluster) else np.nan

        changes = np.sum(y != y_old)
        print('Running K-medians, changes in cluster assignment = {}'.format(
            changes))

        # Stop if no point changed cluster
        if changes == 0:
            break

    model = dict()
    model['medians'] = medians
    model['predict'] = predict
    model['error'] = error

    if do_plot and D == 2:
        utils.plot_2dclustering(X, y)
        print("Displaying figure...")
        plt.show()

    return model
def closure_1_3_1():
    """Fit k-means 50 times and save a plot of the lowest-error clustering.

    Uses ``X`` from the enclosing scope. Each restart builds a fresh
    ``Kmeans`` model with 4 clusters; the model with the smallest
    ``error(X)`` is plotted and written to ../figs.
    """
    n_clusters = 4
    n_restarts = 50

    winner = None
    lowest = np.inf
    for _ in range(n_restarts):
        candidate = Kmeans(n_clusters)
        candidate.fit(X)
        score = candidate.error(X)
        # Only a strictly smaller error replaces the current best.
        if not score < lowest:
            continue
        lowest = score
        winner = candidate

    plt.figure()
    labels = winner.predict(X)
    utils.plot_2dclustering(X, labels)

    fname = os.path.join("..", "figs", "kmeans_outliers_best_model.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
def fit(X, k, do_plot=False):
    """Cluster the rows of X with k-medians and return the fitted model.

    Points are assigned to the median with the smallest L1 (Manhattan)
    distance, and each median is then recomputed as the feature-wise median
    of its assigned points. This repeats until no assignment changes.

    Parameters
    ----------
    X : numpy array with two dimensions (N points by D features)
    k : number of clusters
    do_plot : when true and D == 2, show the final clustering

    Returns
    -------
    dict with keys 'medians' (k-by-D array) plus 'predict' and 'error',
    which hold this module's ``predict`` and ``error`` functions.
    """
    N, D = X.shape
    y = np.ones(N)

    # Random initialization: every median starts at a randomly drawn row of X.
    medians = np.zeros((k, D))
    for kk in range(k):
        medians[kk] = X[np.random.randint(N)]

    while True:
        # y is rebound to a NEW array below, so y_old keeps the previous
        # assignment. Writing into y in place would make y_old an alias,
        # and the change count would always be 0.
        y_old = y

        # True L1 distance: sum over features of |x - median|, for every
        # point/median pair (N-by-k result).
        dist = np.abs(X[:, np.newaxis, :] - medians[np.newaxis, :, :]).sum(axis=2)
        # np.argmin would pick a NaN entry, so make empty (NaN) clusters
        # unreachable instead.
        dist[np.isnan(dist)] = np.inf
        y = np.argmin(dist, axis=1)

        # Update medians
        medians = np.zeros((k, D))
        for kk in range(k):
            members = X[y == kk]
            # An empty cluster gets NaN (what np.median would produce),
            # without triggering the empty-slice warning.
            medians[kk] = np.median(members, axis=0) if len(members) else np.nan

        changes = np.sum(y != y_old)
        print('Running K-medians, changes in cluster assignment = {}'.format(
            changes))

        # Stop if no point changed cluster
        if changes == 0:
            break

    if do_plot and D == 2:
        utils.plot_2dclustering(X, y)
        print("Displaying figure...")
        plt.show()

    model = dict()
    model['medians'] = medians
    model['predict'] = predict
    model['error'] = error
    return model
def fit(X, k, do_plot=False):
    """Run k-means on the rows of X and return the fitted model.

    Alternates between assigning each point to its nearest mean (squared
    Euclidean distance from ``utils.euclidean_dist_squared``) and recomputing
    every mean, until an iteration changes no assignment.

    Parameters
    ----------
    X : numpy array with two dimensions (N points by D features)
    k : number of clusters
    do_plot : when true and D == 2, show the final clustering

    Returns
    -------
    dict with keys 'means' (k-by-D array) plus 'predict' and 'error',
    which hold this module's ``predict`` and ``error`` functions.
    """
    N, D = X.shape

    # Seed every mean with a randomly drawn row of X.
    seed_rows = [np.random.randint(N) for _ in range(k)]
    means = np.zeros((k, D))
    for slot, row in enumerate(seed_rows):
        means[slot] = X[row]

    labels = np.ones(N)
    converged = False
    while not converged:
        previous = labels

        # Assignment step: nearest mean; NaN distances are treated as infinite.
        sq_dist = utils.euclidean_dist_squared(X, means)
        sq_dist[np.isnan(sq_dist)] = np.inf
        labels = np.argmin(sq_dist, axis=1)

        # Update step: rebuild the means from scratch.
        means = np.zeros_like(means)
        for cluster_id in range(k):
            means[cluster_id] = X[labels == cluster_id].mean(axis=0)

        changes = (labels != previous).sum()
        print('Running K-means, changes in cluster assignment = {}'.format(changes))

        # Done once an iteration moves no point to a different cluster.
        converged = changes == 0

    if do_plot and D == 2:
        utils.plot_2dclustering(X, labels)
        print("Displaying figure...")
        plt.show()

    return {'means': means, 'predict': predict, 'error': error}
'--question', required=True, choices=[ '1', '1.1', '1.2', '1.3', '1.4', '2', '2.2', '4', '4.1', '4.3' ]) io_args = parser.parse_args() question = io_args.question if question == '1': X = utils.load_dataset('clusterData')['X'] model = Kmeans(k=4) model.fit(X) utils.plot_2dclustering(X, model.predict(X)) fname = os.path.join("..", "figs", "kmeans_basic.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) if question == '1.1': X = utils.load_dataset('clusterData')['X'] # part 1: implement kmeans.error # part 2: get clustering with lowest error out of 50 random initialization best_model = None min_error = np.inf for i in range(50): model = Kmeans(k=4)
print("Testing error =", te_err) if question == '3.1': X = utils.load_dataset('clusterData')['X'] model = kmeans.fit(X, k=4) low = model['error'](model, X) for i in range(49): new_model = kmeans.fit(X, k=4) err = new_model['error'](new_model, X) if err < low: model = new_model low = err utils.plot_2dclustering(X, model['predict'](model, X)) print("Displaying figure...") plt.title("K-Means on clusterData") plt.show() # part 1: implement kmeans.error # part 2: get clustering with lowest error out of 50 random initialization if question == '3.2': X = utils.load_dataset('clusterData')['X'] # part 3: plot min error across 50 random inits, as k is varied from 1 to 10 low = np.zeros(10) for k in range(1, 11):