def main():
    if len(sys.argv) < 2:
        print("Usage:\n\t{} [housing-data]".format(sys.argv[0]))
        sys.exit(1)

    dataset = reader.read(sys.argv[1], delim=' ')

    # Expand the features with nonlinear functions, then rejoin features and
    # labels so the cross-validator can split the dataset itself.
    features, labels = util.fldivide(dataset)
    features, scale = scaling.unit_scale(features)
    features = util.basis_expand(features, lambda x: x ** 2, lambda x: x ** 3)
    features = np.hstack([features, np.ones((len(features), 1))])
    dataset = util.fljoin(features, labels)

    reg = NormalEquationLinearRegressor(regularization=1e-8)
    cv = CrossValidator(reg)
    feat_indices, feat_errors = cv.best_3features_topN(dataset, n=5)

    for indices, err in zip(feat_indices, feat_errors):
        bestfeats = np.dstack([features[:, i] for i in indices]).squeeze()
        data = util.fljoin(bestfeats, labels)
        reg.train(data)
        print(reg.w)
        print("indices = {}, err = {}".format(indices, err))
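util.fldivide and util.fljoin are used throughout but not shown; these are minimal sketches of what they might look like, assuming the labels live in the last column of a dataset array (the real helpers in util.py may differ):

def fldivide(dataset):
    # Assumed behavior: split a dataset into (features, labels),
    # with the labels stored in the last column.
    return dataset[:, :-1], dataset[:, -1]

def fljoin(features, labels):
    # Assumed behavior: reattach the label column to the feature matrix.
    return np.hstack([features, labels.reshape((len(labels), 1))])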
def train(self, x):
    x, y = util.fldivide(x)
    # Solve the regularized normal equation w = (X^T X + lambda*I)^-1 X^T y.
    # If the matrix is singular (e.g. a column of x is all zeros), the
    # inversion fails and we fall back to NaN weights.
    try:
        self.w = np.linalg.inv(x.T.dot(x) + self.l * np.eye(len(x[0]))).dot(x.T).dot(y)
    except np.linalg.LinAlgError as e:
        print(e)
        self.w = np.empty(len(x[0]))
        self.w[:] = np.nan
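train only fits self.w; the predict and error methods that the cross-validator and the driver script call are not shown. A plausible sketch, assuming predictions are the plain linear model x.dot(w) and error is mean squared error:

def predict(self, x):
    # Assumed: linear model, so predictions are just x.dot(w)
    return x.dot(self.w)

def error(self, predicted, actual):
    # Assumed: mean squared error between predictions and labels
    return np.mean((predicted - actual) ** 2)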
def best_2features(self, dataset, k=10):
    feats, labels = util.fldivide(dataset)
    N = len(feats[0])
    # Unevaluated pairs stay at infinity so argmin only picks evaluated ones
    err = np.full((N, N), np.inf)
    for i in range(N):
        for j in range(i + 1, N):
            f1, f2, l = feats[:, i], feats[:, j], labels
            d = np.dstack((f1, f2, l)).squeeze()
            err[i, j] = self.kfold(d, k=k)
    minpos = np.unravel_index(err.argmin(), err.shape)
    return minpos[0], minpos[1], err[minpos[0], minpos[1]]
def train(self, x):
    x, y = util.fldivide(x)
    iters = 0
    m, n = x.shape
    # Batch gradient descent with L2 regularization:
    #   w_j <- w_j - (alpha / m) * (sum_i (w.x_i - y_i) * x_ij + lambda * w_j)
    self.w = np.zeros(n)
    new_w = np.ones_like(self.w)
    while np.linalg.norm(self.w - new_w) > self.convergence:
        np.copyto(self.w, new_w)
        for j in range(n):
            diff = sum((self.w.dot(x[i]) - y[i]) * x[i, j] for i in range(m))
            new_w[j] = self.w[j] - (self.learn_rate / m) * (diff + self.l * self.w[j])
        iters += 1
    self.w = new_w
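train assumes the instance already carries learn_rate, l, and convergence; this is a hypothetical constructor matching those attribute names (the real class may define it differently):

class GradientDescentLinearRegressor(object):
    def __init__(self, learn_rate=0.1, regularization=0.0, convergence=1e-6):
        # Hypothetical defaults; only the attribute names are implied by train()
        self.learn_rate = learn_rate
        self.l = regularization
        self.convergence = convergence
        self.w = None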
def main():
    if len(sys.argv) < 3:
        print("Usage:\n\t{} [trainfile] [testfile]".format(sys.argv[0]))
        sys.exit(1)

    train_file, test_file = sys.argv[1:3]
    train_data, train_labels = util.fldivide(read(train_file))
    test_data, test_labels = util.fldivide(read(test_file))

    for i in range(5):
        nth_train_data = util.make_nth_order(train_data, i)
        nth_train = np.hstack((nth_train_data,
                               train_labels.reshape((len(train_labels), 1))))
        nth_test_data = util.make_nth_order(test_data, i)

        model = GradientDescentLinearRegressor(learn_rate=0.4, regularization=1e1)
        model.train(nth_train)
        predicted = model.predict(nth_test_data)
        mse = model.error(predicted, test_labels)

        plot_scatter_curve(test_data, test_labels, model.w, fignum=i,
                           title="Gradient Descent, order {}, alpha={}, lambda={}, "
                                 "mse={}".format(i, model.learn_rate, model.l, mse))
    plt.show()
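util.make_nth_order isn't shown here; a hypothetical sketch, assuming it turns a single input column into polynomial features up to the given order (the real helper in util.py may differ):

def make_nth_order(data, order):
    # Hypothetical: build the design matrix [1, x, x^2, ..., x^order]
    # from a one-dimensional input.
    x = np.asarray(data).reshape(len(data), -1)[:, 0]
    return np.column_stack([x ** p for p in range(order + 1)])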
def kfold(self, dataset, k=10):
    # Divide the data into k groups
    kgroups = util.kdivide(dataset, k)
    err = []
    for i in range(len(kgroups)):
        # Concatenate the training samples (leave out group i for testing)
        train_set = np.vstack(kgroups[:i] + kgroups[i+1:])
        test_set = kgroups[i]

        # Train the classifier, then test it against the held-out group
        self.clf.train(train_set)
        test_f, test_l = util.fldivide(test_set)
        predicted_l = self.clf.predict(test_f)
        err.append(self.clf.error(predicted_l, test_l))

    # Return the mean error across all k groups
    return sum(err) / len(err)
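kfold relies on util.kdivide handing back a list of row groups it can slice and re-stack; a minimal sketch, assuming it shuffles the rows and splits them into k roughly equal chunks:

def kdivide(dataset, k):
    # Assumed: shuffle the rows, then split into k roughly equal groups.
    # np.array_split returns a plain list, so kgroups[:i] + kgroups[i+1:] works.
    shuffled = np.random.permutation(dataset)
    return np.array_split(shuffled, k)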
def best_3features_topN(self, dataset, n=1, k=10):
    feats, labels = util.fldivide(dataset)
    N = len(feats[0])
    err = np.empty((N, N, N))
    err[:] = np.nan

    for i in range(N):
        for j in range(i + 1, N):
            for m in range(j + 1, N):
                f1, f2, f3, l = feats[:, i], feats[:, j], feats[:, m], labels
                d = np.dstack((f1, f2, f3, l)).squeeze()
                err[i, j, m] = self.kfold(d, k=k)

    # Return the indices of the top n three-feature sets and their errors
    indices = []
    errors = []
    for _ in range(n):
        minpos = np.unravel_index(np.nanargmin(err), err.shape)
        indices.append(minpos)
        errors.append(err[minpos])
        err[minpos] = np.nan

    return indices, errors