Esempio n. 1
0
def main():
    """Load housing data, expand features, and report the best 3-feature subsets."""
    if len(sys.argv) < 2:
        print("Usage:\n\t{} [housing-data]".format(sys.argv[0]))
        sys.exit(1)

    dataset = reader.read(sys.argv[1], delim=' ')

    # Expand the feature matrix with nonlinear basis functions, then
    # rejoin features and labels so downstream code sees one dataset.
    feats, labels = util.fldivide(dataset)
    feats, scale = scaling.unit_scale(feats)
    feats = util.basis_expand(feats, lambda x: x ** 2, lambda x: x ** 3)
    bias_column = np.ones((len(feats), 1))
    feats = np.hstack([feats, bias_column])
    dataset = util.fljoin(feats, labels)

    reg = NormalEquationLinearRegressor(regularization=1e-8)
    cv = CrossValidator(reg)

    # Retrain on each of the top-5 feature triples and show the fitted weights.
    feat_indices, feat_errors = cv.best_3features_topN(dataset, n=5)
    for indices, err in zip(feat_indices, feat_errors):
        chosen = np.dstack([feats[:, i] for i in indices]).squeeze()
        reg.train(util.fljoin(chosen, labels))
        print(reg.w)
        print("indices = {}, err = {}".format(indices, err))
Esempio n. 2
0
 def train(self, x):
     """Fit ridge-regularized weights via the normal equation.

     x: dataset whose last column is the label (split by util.fldivide).
     Sets self.w to the solved weight vector, or to all-NaN when the
     normal-equation matrix is singular.
     """
     x, y = util.fldivide(x)

     # Solve (X^T X + l*I) w = X^T y. Even with regularization the matrix
     # can be singular (e.g. an all-zero feature column) -- fall back to
     # NaN weights instead of crashing.
     try:
         self.w = np.linalg.inv(x.T.dot(x) + self.l * np.eye(len(x[0]))).dot(x.T).dot(y)
     except np.linalg.LinAlgError as e:
         # np.linalg.linalg.LinAlgError was a private alias removed from
         # modern NumPy; np.linalg.LinAlgError is the public name.
         print(e)
         # np.NAN was removed in NumPy 2.0; np.full replaces empty+fill.
         self.w = np.full(len(x[0]), np.nan)
Esempio n. 3
0
 def best_2features(self, dataset, k=10):
     """Exhaustively cross-validate every pair of features.

     dataset: feature/label matrix (split by util.fldivide).
     k: number of folds, forwarded to self.kfold.
     Returns (i, j, err): indices of the lowest-error pair and that error.
     """
     feats, labels = util.fldivide(dataset)
     N = len(feats[0])
     # Untested cells (diagonal and lower triangle) must be +inf so argmin
     # can never pick them; the original np.ones matrix would win whenever
     # every real k-fold error exceeded 1.
     err = np.full((N, N), np.inf)
     for i in range(N):
         for j in range(i + 1, N):
             d = np.dstack((feats[:, i], feats[:, j], labels)).squeeze()
             # Forward k -- the original ignored the parameter and always
             # used kfold's default.
             err[i, j] = self.kfold(d, k)
     minpos = np.unravel_index(err.argmin(), err.shape)
     return minpos[0], minpos[1], err[minpos]
Esempio n. 4
0
    def train(self, x):
        """Fit weights by batch gradient descent with L2 regularization.

        x: dataset whose last column is the label (split by util.fldivide).
        Iterates until the norm of the weight update drops below
        self.convergence; the result is stored in self.w.
        """
        x, y = util.fldivide(x)
        m, n = x.shape
        # Start from a deterministic state: np.empty leaves arbitrary
        # garbage that can accidentally sit within `convergence` of new_w
        # and skip the loop entirely.
        self.w = np.zeros(n)
        new_w = np.ones_like(self.w)

        while np.linalg.norm(self.w - new_w) > self.convergence:
            np.copyto(self.w, new_w)
            for j in range(n):
                # Gradient of the squared error wrt w_j, plus the ridge term.
                diff = sum((self.w.dot(x[i]) - y[i]) * x[i, j] for i in range(m))
                new_w[j] = self.w[j] - (self.learn_rate / m) * (diff + self.l * self.w[j])
        self.w = new_w
Esempio n. 5
0
def main():
    """Train models of increasing polynomial order and plot each on the test set."""
    # Two file arguments are required; the original `< 2` guard let a
    # missing test file through and crashed on the unpacking below.
    if len(sys.argv) < 3:
        print("Usage:\n\t{} [trainfile] [testfile]".format(sys.argv[0]))
        sys.exit(1)

    # Slice to exactly two entries so extra arguments don't raise ValueError.
    train_file, test_file = sys.argv[1:3]
    train_data, train_labels = util.fldivide(read(train_file))
    test_data, test_labels   = util.fldivide(read(test_file))

    for i in range(5):
        # Expand the raw features to an order-i polynomial basis.
        nth_train_data = util.make_nth_order(train_data, i)
        nth_train = np.hstack((nth_train_data, train_labels.reshape((len(train_labels), 1))))
        nth_test_data  = util.make_nth_order(test_data, i)

        model = GradientDescentLinearRegressor(learn_rate=0.4, regularization=1e1)
        model.train(nth_train)
        predicted = model.predict(nth_test_data)

        mse = model.error(predicted, test_labels)

        plot_scatter_curve(test_data, test_labels, model.w, fignum=i,
                title="Gradient Descent, order {}, alpha={}, lambda={}, mse={}".format(i, model.learn_rate, model.l, mse))

    plt.show()
Esempio n. 6
0
    def kfold(self, dataset, k=10):
        """Estimate generalization error by k-fold cross validation.

        Splits `dataset` into k groups; for each group, trains the wrapped
        classifier on the remaining groups and tests on the held-out one.
        Returns the mean error over all k folds.
        """
        kgroups = util.kdivide(dataset, k)
        errors = []
        for i, test_set in enumerate(kgroups):
            # Training set is every group except the held-out group i.
            train_set = np.vstack(kgroups[:i] + kgroups[i + 1:])

            # Train, then score the held-out fold.
            self.clf.train(train_set)
            test_f, test_l = util.fldivide(test_set)
            predicted_l = self.clf.predict(test_f)
            errors.append(self.clf.error(predicted_l, test_l))

        return sum(errors) / len(errors)
Esempio n. 7
0
    def best_3features_topN(self, dataset, n=1, k=10):
        """Cross-validate every triple of features and return the n best.

        dataset: feature/label matrix (split by util.fldivide).
        n: how many of the lowest-error triples to return.
        k: number of folds, forwarded to self.kfold.
        Returns (indices, errors): parallel lists of index triples and
        their k-fold errors, best first.
        """
        feats, labels = util.fldivide(dataset)
        N = len(feats[0])
        # np.NAN was removed in NumPy 2.0; np.full replaces empty+fill.
        err = np.full((N, N, N), np.nan)
        # The inner loop variable must not be named `k` -- the original
        # shadowed the fold-count parameter, which was consequently never
        # forwarded to kfold.
        for i in range(N):
            for j in range(i + 1, N):
                for m in range(j + 1, N):
                    d = np.dstack((feats[:, i], feats[:, j], feats[:, m], labels)).squeeze()
                    err[i, j, m] = self.kfold(d, k)

        # Pull off the n smallest errors, masking each found cell with NaN
        # so nanargmin finds the next-best cell on the following pass.
        indices = []
        errors  = []
        for _ in range(n):
            minpos = np.unravel_index(np.nanargmin(err), err.shape)
            indices.append(minpos)
            errors.append(err[minpos])
            err[minpos] = np.nan
        return indices, errors