Example #1
    def inner_optimization(self, iterations, prox_every=25):
        budget = self.group_budget
        for t in range(iterations):
            print()
            np.random.shuffle(self.train)
            for x in iterview(self.train, colors.green % 'Pass %s' % (t+1)):

                S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                self.gradient(x.N, x.tags, S)
                S.backprop()

                if budget is not None and self.sparse.step % prox_every == 0:
                    self.dense.prox_budget(budget)

                self.sparse.step += 1

            assert np.isfinite(self.sparse.w).all()
            assert np.isfinite(self.dense.w).all()

            # Make sure to call the prox update before finishing this pass. This
            # keeps the number of features within the budget.
            if budget is not None:
                self.dense.prox_budget(budget)

            self.after_inner_pass()
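A minimal sketch of what the periodic `prox_budget` call might be doing, assuming (this is an assumption, not the actual implementation) that it applies a group-sparsity proximal step that keeps only the `budget` groups with the largest L2 norm and zeroes out the rest:

    import numpy as np

    def prox_budget_sketch(w, groups, budget):
        # Hypothetical illustration: keep at most `budget` groups nonzero.
        # `w` is the dense weight vector; `groups` is a list of index lists,
        # one per group, as in the group structure used above.
        norms = np.array([np.linalg.norm(w[g]) for g in groups])
        keep = set(np.argsort(-norms)[:budget])   # groups with the largest norms
        for j, g in enumerate(groups):
            if j not in keep:
                w[g] = 0.0
        return w

Calling a step like this every `prox_every` updates, and once more at the end of the pass, keeps the number of active feature groups within the budget, which is what the comment in the code above describes.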
Example #2
    def test_gradient(self, T):

        D = 100

        # Give the updater 'trivial' parameters so that they don't complicate
        # the test.
        sparse = LazyRegularizedAdagrad(D*self.A, C=0, L=2, eta=1.0, fudge=1)
        sparse.w[:] = np.random.uniform(-1, 1, size=sparse.d)
        sparse.step = 0

        #groups = [[i] for i in range(self.H)]
        groups = self.group_structure()
        
        dense = OnlineProx(groups, self.H, C=0, L=2, eta=1.0, fudge=1)
        dense.w[:] = np.random.uniform(-1, 1, size=dense.d)

        # Since updates are done in place, we need to copy the original
        # parameters so that we can later 'infer' the gradient step.
        sparse_W_copy = np.array(sparse.w, copy=True)
        dense_W_copy = np.array(dense.w, copy=True)

        x = MockInstance(T, self.A, D=D, K=5)

        S = ScoringModel(x, self.A, self.feature_backoff, sparse, dense)
        self.gradient(T, x.tags, S)
        S.backprop()

        def func():
            S = ScoringModel(x, self.A, self.feature_backoff, sparse, dense)
            return self.objective(T, x.tags, S)

        if 0:
            # TODO: we don't run this test because it doesn't pass! This is
            # because lazy adagrad manipulates the stepsize somewhat
            # unpredictably in order to get the benefit of inlining (avoiding
            # allocating temporary data structures to buffer adjoints before
            # propagating them).

            g = sparse_W_copy - sparse.finalize()
            sparse.w[:] = sparse_W_copy
            dense.w[:] = dense_W_copy
            [keys] = np.nonzero(g)
            fdcheck(func, sparse.w, g, keys)

        # figure out what the gradient step must have been.
        g = dense_W_copy - dense.w   # updater is descent

        sparse.w[:] = sparse_W_copy
        dense.w[:] = dense_W_copy
        c = fdcheck(func, dense.w, g)
        assert c.pearson >= 0.999999
        assert c.max_err <= 1e-8
        assert np.allclose(c.expect, c.got)
        print('[test gradient]:', colors.light.green % 'pass')
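The analytic gradient here is recovered indirectly: the updater modifies the weights in place with a descent step, so `dense_W_copy - dense.w` reconstructs the step it took, and `fdcheck` compares that against numerical derivatives of `func`. A hypothetical minimal version of such a finite-difference check (the real `fdcheck` additionally reports the Pearson correlation, max error, and expected/got vectors used in the assertions above) might look like:

    import numpy as np

    def fd_gradient(func, w, keys=None, eps=1e-5):
        # Hypothetical stand-in for `fdcheck`: estimate d func / d w[k] by
        # centered finite differences at each index k, restoring w afterwards.
        keys = range(len(w)) if keys is None else keys
        est = np.zeros(len(w))
        for k in keys:
            was = w[k]
            w[k] = was + eps
            f_plus = func()
            w[k] = was - eps
            f_minus = func()
            w[k] = was
            est[k] = (f_plus - f_minus) / (2 * eps)
        return est

The estimate can then be compared against the analytic `g`, e.g. with `np.allclose` or a correlation coefficient, which is roughly what the `c.pearson`, `c.max_err`, and `c.expect`/`c.got` assertions check.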
Example #3
    def test_overfitting(self, T, y=None):
        D = 100
        groups = []
        dense = OnlineProx(groups, self.H, C=0, L=2, eta=1.0, fudge=1)
        dense.w[:] = np.random.uniform(-1, 1, size=dense.d)

        sparse = LazyRegularizedAdagrad(D*self.A, C=0, L=2, eta=1.0, fudge=1)
        sparse.w[:] = np.random.uniform(-1, 1, size=sparse.d)

        x = MockInstance(T, self.A, D=D, K=5, y=y)

        print()
        #print('[test overfitting]')
        for _ in range(10):
            S = ScoringModel(x, self.A, self.feature_backoff, sparse, dense)
            self.gradient(T, x.tags, S)
            S.backprop()
            y = self.predict(x.N, S)
            #print('obj: %g, acc: %.2f' % (self.objective(T, x.tags, S),
            #                              (y==x.tags).mean()))

        y = self.predict(x.N, S)
        assert (y==x.tags).all()
        print('[test overfitting]', colors.light.green % 'pass')
Example #4
 def func():
     S = ScoringModel(x, self.A, self.feature_backoff, sparse, dense)
     return self.objective(T, x.tags, S)
Example #5
 def predict(self, x):
     "Predict tags for `Instance x`."
     S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
     y = super(ActiveSet, self).predict(x.N, S)
     return self.sigma.lookup_many(y)
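Here `self.sigma` acts as the tag alphabet: the base `predict` returns integer tag ids, and `lookup_many` presumably maps those ids back to their symbols. A hypothetical minimal alphabet illustrating that round trip (not the actual `sigma` implementation):

    class Alphabet:
        # Hypothetical stand-in for `sigma`: maps symbols to integer ids and back.
        def __init__(self, symbols):
            self._syms = list(symbols)
            self._ids = {s: i for i, s in enumerate(self._syms)}
        def __iter__(self):
            return iter(self._syms)
        def lookup_many(self, ids):
            return [self._syms[i] for i in ids]

    sigma = Alphabet('abc')
    print(sigma.lookup_many([2, 0, 1]))   # ['c', 'a', 'b']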
Example #6
    def active_set(self):
        for outer in range(1, self.outer_iterations+1):
            print()
            print(colors.green % '=====================')
            print(colors.green % 'Outer %s' % outer)

            self.inner_optimization(self.inner_iterations)

            if outer != self.outer_iterations:
                print()
                print(colors.yellow % 'Grow %s' % outer)

                # old feature index
                old = {c: self.context_feature_id(c) for c in self.C}
                w = self.dense.w.copy()
                q = np.array(self.dense.q, copy=True)

                TEST_EXPECT = 0

                if TEST_EXPECT:
                    # Record expectations under the previous model. Technically,
                    # these are observed-minus-expected feature counts.
                    predictions = []
                    for x in self.train:
                        S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                        self.gradient(x.N, x.tags, S)   # don't backprop through the scoring model because we don't change the parameters.
                        predictions.append({k: S.d_dense[i] for k,i in old.items()})

                # "Grow" Z by extending active features with on more character.
                active = self.active_features()

                # Heuristic: Use an intelligent guess for the 'new' q values in
                # the next iteration.
                #
                # This improves the active set's ability to monotonically improve
                # after growing. Otherwise, adagrad will update too aggressively
                # compared to the sensible alternative of starting at the last
                # seen value (if possible) or at the fudge value.
                #
                # In other words, new features get huge learning rates compared
                # to existing ones. Features that used to exist also get fairly
                # large learning rates, because adagrad learning rates decrease
                # quickly with time as they are 1/sqrt(sum-of-squares).
                #
                # I found that guessing the mean q works better than min or max.
                self.dense.w[:] = 0
                self.dense.q[:] = float(q.mean())   # [2018-08-13 Mon] the use of `float` is a workaround for "BufferError: Object is not writable."

                # Grow active contexts to the right.
                cc = {p+(y,) for p in active for y in self.sigma}

                ####
                # Note that just because we extended a bunch of active elements
                # by all elements of sigma, this does not mean that we are
                # last-character closed.
                #
                # Feel free to check via the following (failing) assertion
                #
                #   assert set(prefix_closure(cc)) == set(last_char_sub_closure(self.sigma, prefix_closure(cc)))
                #
                # The reason is that some elements go to zero and, thus, get
                # pruned. This is the same reason why `active` is not
                # automatically prefix closed.

                ####
                # Is the growing set prefix closed by construction?
                #
                # No. The grown set is not prefix closed either, because it's
                # possible for a parent to be zero while its children are nonzero.
                #
                # Here is an assertion that will fail.
                #
                # assert set(prefix_closure(cc)) == set(cc)
                #
                #cc = set(prefix_closure(cc))

                ####
                # XXX: In general, we probably do not want to do last-char-sub
                # closure. I've added it because it seems to help us more
                # closely preserve the distribution after manipulating the
                # active set.
                #cc = set(last_char_sub_closure(self.sigma, cc))

                # Filter active set by allowed-context constraints, if supplied.
                if self.allowed_contexts:
                    cc &= set(self.allowed_contexts)

                # Update DFA and group lasso data structures.
                self.update(self.sigma, cc)
                self.dense.set_groups(self.group_structure())
                print(colors.yellow % '=> new', '|C| = %s' % len(self.C))

                # Copy previous weights
                for c in self.C:
                    i = self.context_feature_id(c)
                    if c in old:
                        o = old[c]
                        self.dense.w[i] = w[o]
                        self.dense.q[i] = q[o]

                if 0:
                    print()
                    print(colors.light.red % 'is accuracy the same???????')
                    self.after_inner_pass()
                    print(colors.light.red % '^^^^^^^^^^^^^^^^^^^^^^^^^^^')
                    print()

                if TEST_EXPECT:
                    # DEBUGGING: check that expectations match
                    #
                    # I'm not sure this test is implemented perfectly because we
                    # need to compute the expected value of all the old features
                    # under the new model.
                    #
                    # We get away with using the new model because it has backoff
                    # features.
                    #
                    # In the case of a unigram model (order-0 model), this test
                    # fails. Why? Are the unigrams used incorrectly?
                    #
                    new = {c: self.context_feature_id(c) for c in self.C}

                    for x, want in zip(self.train, predictions):
                        S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                        self.gradient(x.N, x.tags, S)    # don't backprop through the scoring model because we don't change the parameters.

                        # just check on *old* features.
                        E = {k: 0 for k in want}
                        E.update({k: S.d_dense[new[k]] for k in want if k in new})

                        # XXX: filter down to features in both vectors, I guess?
                        E = {k: v for k, v in E.items() if k in new}
                        want = {k: v for k, v in want.items() if k in new}

                        c = compare(want, E, verbose=1)

                        if c.pearson <= .99:
                            c.show()
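The long comment about seeding `q` refers to the usual AdaGrad step size, roughly eta / (fudge + sqrt(q)) where `q` accumulates squared gradients. A small illustrative sketch of why brand-new features (q = 0) take much larger steps than established ones, and why seeding new features with the mean of the old `q` damps that (the exact formula shown is the standard AdaGrad rate and is only assumed to match this implementation):

    import numpy as np

    eta, fudge = 1.0, 1e-4
    q_old = np.array([250.0, 400.0, 900.0])   # accumulated squared gradients of existing features
    q_new = 0.0                               # a feature introduced by the grow step starts at zero

    def rate(q):
        return eta / (fudge + np.sqrt(q))

    print(rate(q_old))          # small steps for well-trained features
    print(rate(q_new))          # enormous step (about eta / fudge) for the new feature
    print(rate(q_old.mean()))   # seeding with mean(q) keeps new features comparable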