def create_splits(self, X):
    """Compute per-feature thresholds from single-feature k-means runs.

    For every column of ``X`` the column is turned into a one-column
    matrix, one Kmeans model is fit to it 50 times, and the ``means`` of the
    fit with the lowest reported ``error`` are kept. The per-column results
    are stored, in column order, in ``self.thresholds``.

    Args:
        X: Array whose ``shape`` unpacks into exactly two values
            (rows, features).

    NOTE(review): ``k`` is not bound inside this method -- presumably it is
    a name from an outer (module) scope; confirm against the rest of the
    file.
    """
    # Unpacking also enforces that X is two-dimensional.
    _, num_features = X.shape

    # One entry per feature: the means of that feature's best clustering.
    self.thresholds = []
    for col in range(num_features):
        # The model is given an (n, 1) column matrix, not a flat (n,) vector.
        column = X[:, col].reshape(-1, 1)

        # A single model instance is refit on every restart.
        clusterer = Kmeans(k=k)
        lowest_error = np.inf
        best_means = None
        for _ in range(50):
            clusterer.fit(column)
            current_error = clusterer.error(column)
            # Strict comparison: on ties the earlier result is kept.
            if current_error < lowest_error:
                lowest_error, best_means = current_error, clusterer.means

        self.thresholds.append(best_means)
def closure_1_3_1():
    """Plot the lowest-error k=4 clustering found over 50 Kmeans fits.

    Fits 50 separate Kmeans models with k=4 to ``X`` (not a parameter; it is
    resolved from an outer scope), keeps the model whose ``error(X)`` is
    smallest, draws its cluster assignments with
    ``utils.plot_2dclustering`` on a new figure, and saves the figure to
    ``../figs/kmeans_outliers_best_model.png``.
    """
    num_clusters = 4
    winner = None
    lowest_error = np.inf

    for _ in range(50):
        candidate = Kmeans(num_clusters)
        candidate.fit(X)
        candidate_error = candidate.error(X)
        # Strict comparison: on ties the earlier model is kept.
        if candidate_error < lowest_error:
            winner, lowest_error = candidate, candidate_error

    plt.figure()
    assignments = winner.predict(X)
    utils.plot_2dclustering(X, assignments)

    fname = os.path.join("..", "figs", "kmeans_outliers_best_model.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
def closure_1_3_2():
    """Plot the best-of-50 Kmeans training error for k = 1 through 10.

    For each k, 50 separate Kmeans models are fit to ``X`` (not a parameter;
    it is resolved from an outer scope) and the smallest ``error(X)`` is
    recorded. The recorded errors are plotted against k on a new figure,
    which is saved to ``../figs/kmeans_err_k_outliers.png``.
    """
    k_values = list(range(1, 11))
    lowest_errors = []

    for num_clusters in k_values:
        lowest = np.inf
        for _ in range(50):
            candidate = Kmeans(num_clusters)
            candidate.fit(X)
            # Keeps the running minimum; the current value wins ties.
            lowest = min(lowest, candidate.error(X))
        lowest_errors.append(lowest)

    plt.figure()
    plt.plot(k_values, lowest_errors)
    plt.xlabel('k')
    plt.ylabel('Error')
    plt.title('k-means training error as k increases')

    fname = os.path.join("..", "figs", "kmeans_err_k_outliers.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
fname = os.path.join("..", "figs", "kmeans_basic.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) if question == '1.1': X = utils.load_dataset('clusterData')['X'] # part 1: implement kmeans.error # part 2: get clustering with lowest error out of 50 random initialization best_model = None min_error = np.inf for i in range(50): model = Kmeans(k=4) model.fit(X) error = model.error(X) if error < min_error: min_error = error best_model = model utils.plot_2dclustering(X, best_model.predict(X)) fname = os.path.join("..", "figs", "kmeans_50_inits.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) if question == '1.2': # part 3: plot min error across 50 random inits, as k is varied from 1 to 10 X = utils.load_dataset('clusterData')['X']
model.fit(X) y = model.predict(X) plt.scatter(X[:, 0], X[:, 1], c=y, cmap="jet") fname = os.path.join("..", "figs", "kmeans_basic.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == '5.1': X = load_dataset('clusterData.pkl')['X'] model = Kmeans(k=4) low = model.error(X) for i in range(49): new_model = Kmeans(k=4) err = new_model.error(X) if err < low: model = new_model low = err utils.plot_2dclustering(X, model['predict'](model, X)) print("Displaying figure...") plt.title("K-Means on clusterData") plt.show() elif question == '5.2': X = load_dataset('clusterData.pkl')['X'] elif question == '5.3': X = load_dataset('clusterData2.pkl')['X']
plt.xlabel("Random clustering k=4") fname = os.path.join("..", "figs", "1.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) if question == '1.1': X = utils.load_dataset('clusterData')['X'] models = [] errors = np.zeros(50) for i in range(0, 50): model = Kmeans(k=4) model.fit(X) models.append(model) errors[i] = model.error(X) model = models.pop(np.argmin(errors)) utils.plot_2dclustering(X, model.predict(X)) plt.xlabel("Best of 50 clusterings k=4") fname = os.path.join("..", "figs", "1.1.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) if question == '1.2': X = utils.load_dataset('clusterData')['X'] k = np.arange(0, 12) min_errors = np.zeros(k.size) for kk in k: models = []
plot_2dclustering(X, model.predict(X)) fname = os.path.join("..", "figs", "kmeans_basic.png") # plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == '3.1': X = load_dataset('clusterData.pkl')['X'] error = float('inf') model = Kmeans(k=4) model.fit(X) error = model.error(X) for i in range(49): model_2 = Kmeans(k=4) model_2.fit(X) if model_2.error(X) < error: model = model_2 plot_2dclustering(X, model.predict(X)) fname = os.path.join("..", "figs", "kmeans_3_1.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == '3.2': X = load_dataset('clusterData.pkl')['X'] errors = np.ones([10, ]) * -1 for i in range(50): kVal = random.randint(1, 10) model = Kmeans(k=kVal) model.fit(X) error = model.error(X)