Example #1
def test_evalb_unofficial():
    "Compare evalb_unofficial to evalb (official)."
    from ldp.parsing.util import binarize
    from ldp.prune.example import Setup
    from ldp.parse.benchmark import Parser
    from ldp.parse import leftchild
    from arsenal.iterview import iterview
    for grammar in ['medium']:
        s = Setup(grammar=grammar, maxlength=30, train=0, dev=50)
        parser = Parser(leftchild, s.grammar, chomsky=0)
        for e in iterview(s.dev):
            m = e.mask
            state = parser(e, m)
            ucoarse = parser.decode(e, state.derivation)

            # TODO: Technically, average evalb([x_i]) over sentences is
            # *NOT* the same as evalb([x_1...x_n]) on the corpus.
            #
            # This is a "macro v. micro average" problem.

            unofficial = lambda a, b: fpr(*evalb_unofficial(a, b))[0]

            fb = unofficial(e.gold_unbinarized, binarize(ucoarse))
            f = unofficial(e.gold_unbinarized, ucoarse)
            h = evalb(e.gold_unbinarized, ucoarse)
            assert abs(fb - f) < 1e-8, "binarization shouldn't affect scores."
            assert abs(f - h) < 1e-4
    print '[test/evalb unofficial] pass'
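
An aside on the macro- vs. micro-average TODO above: averaging per-sentence F1 (macro) is not the same quantity as F1 computed from pooled corpus counts (micro). A small self-contained sketch with hypothetical bracket counts makes the gap concrete:

# Hypothetical (matched, gold, predicted) bracket counts for two sentences.
sents = [(1.0, 2.0, 2.0), (9.0, 10.0, 10.0)]

def f1(match, gold, pred):
    p, r = match / pred, match / gold
    return 2 * p * r / (p + r)

macro = sum(f1(*s) for s in sents) / len(sents)   # mean of per-sentence F1 = 0.7
micro = f1(*map(sum, zip(*sents)))                # F1 of pooled counts ~= 0.833
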
Example #2
def quick_fdcheck(func, w, g, n_checks=20, eps=1e-5, verbose=1, progressbar=1):
    """
    Check gradient along random directions (a faster alternative to axis-aligned directions).

    Tim Vieira (2017) "How to test gradient implementations"
    https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/

    """
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.flatten()

    w = np.asarray(w.flat)
    g = np.asarray(g.flat)

    dim = len(w)

    for k in (iterview(keys) if progressbar else keys):
        d = spherical(dim)
        G[k] = g.dot(d)
        w[:] = was + eps * d
        b = func()
        w[:] = was - eps * d
        a = func()
        w[:] = was
        H[k] = (b - a) / (2 * eps)

    return compare(H, G, verbose=verbose)
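
A minimal sketch of the calling convention (assuming the `quick_fdcheck` above, along with `spherical` and `compare` from arsenal, is in scope): `func` takes no arguments and reads `w` from the enclosing scope, and the analytic gradient is supplied as `g`. The quadratic objective here is a made-up test case whose gradient is known in closed form.

import numpy as np

w = np.random.randn(50)
g = w.copy()   # analytic gradient of f(w) = 0.5 * ||w||^2 is w itself

# quick_fdcheck perturbs `w` in place along random directions and compares
# directional finite differences against g.dot(direction).
quick_fdcheck(lambda: 0.5 * w.dot(w), w, g)
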
Example #3
def quick_fdcheck(func, w, g, n_checks=20, eps=1e-5, verbose=1, progressbar=1):
    """
    Check gradient along random directions (a faster alternative to axis-aligned directions).

    Tim Vieira (2017) "How to test gradient implementations"
    https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/

    """
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.flatten()

    w = np.asarray(w.flat)
    g = np.asarray(g.flat)

    dim = len(w)

    for k in (iterview(keys) if progressbar else keys):
        d = spherical(dim)
        G[k] = g.dot(d)
        w[:] = was + eps*d
        b = func()
        w[:] = was - eps*d
        a = func()
        w[:] = was
        H[k] = (b-a) / (2*eps)

    return compare(H, G, verbose=verbose)
Example #4
    def test():
        a0,b0,c0 = range(1, 10), range(21, 30), range(81, 90)
        test = zip(a0,b0,c0)
        a, b, c = iunzip(test)
        a, b, c = map(list, (a,b,c))
        assert a == a0 and b == b0 and c == c0
        recombined = zip(a, b, c)
        assert recombined == test

        def example_iterview():
            for _ in iterview(xrange(400), every=20):
                sleep(0.01)
        #example_iterview()

        X = [[0 for i in xrange(4)] for j in xrange(4)]
        for (k,(i,j)) in enumerate(cross_lower_triangle(range(4))):
            X[i][j] = k+1
        target = [[0, 0, 0, 0],
                  [1, 0, 0, 0],
                  [2, 3, 0, 0],
                  [4, 5, 6, 0]]
        assert X == target

        import numpy as np
        d = (np.random.rand(20, 3) - 0.5) * 100
        A = np.average(d, axis=0)
        a = last(rolling_average(d))
        assert np.linalg.norm(A - a) < 1e-10  # roughly zero difference

        import doctest; doctest.testmod()

        for _ in iterview(range(100), 1):
            sleep(.1)
Example #5
    def sample(self, data, num=1000):
        """ sample """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1

            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]

            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
            
        # TODO: put into log
        #print ; print inside
        #print correct1 / total, correct2 / total
        return samples
Example #6
    def optimize_chunk(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                gt0, gt, ge = self.features.potentials_catchup(
                    tree, self.updater.w, self.updater)
                dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
                self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
                self.features.update(tree, dge, dgt0, dgt, self.updater)
                self.updater.step += 1

            self.save_segmenter(self.weights, i)
            train_acc, train_f1 = self.decode(self.train, VITERBI)
            dev_acc, dev_f1 = self.decode(self.dev, VITERBI)
            test_acc, test_f1 = self.decode(self.test, VITERBI)
            logging.info("chunk epoch {0} train acc: {1}".format(i, train_acc))
            logging.info("chunk epoch {0} dev acc: {1}".format(i, dev_acc))
            logging.info("chunk epoch {0} test acc: {1}".format(i, test_acc))
            logging.info("chunk epoch {0} train f1: {1}".format(i, train_f1))
            logging.info("chunk epoch {0} dev f1: {1}".format(i, dev_f1))
            logging.info("chunk epoch {0} test f1: {1}".format(i, test_f1))
Example #7
def preprocess_bubs_format(bubs, output):
    """Convert grammar from bubs-parser into ldp-friendly csv format. The result is
    an equivalent grammar, which is much faster to load because it has been
    integerized.

    Given a gzipped grammar from bubs-parser, e.g. `eng.M2.gr.gz`, this function
    will generate four files:

    - eng.M2.gr.csv: grammar rules
    - eng.M2.lex.csv: lexical rules
    - eng.M2.lex.alphabet: mapping from terminals to integers
    - eng.M2.sym.alphabet: mapping from syms to integers

    """

    sym = Alphabet()
    lex = Alphabet()

    import gzip
    lines = gzip.open(bubs, 'rb').readlines()
    reading_lex = False

    l = []
    f = []
    for line in iterview(lines[1:]):  # drop first line

        if line.startswith('===== LEXICON'):
            reading_lex = True
            continue

        x = line.strip().split()
        if not x:
            continue

        lhs = x[0]
        rhs = tuple(b for b in x[2:-1])
        score = x[-1]
        if len(rhs) == 1:
            rhs = (rhs[0], '')
        y, z = rhs
        lhs = sym[lhs]

        y = lex[y] if reading_lex else sym[y]
        z = sym[z] if z else -1

        if reading_lex:
            l.append({'score': score, 'head': lhs, 'left': y})
        else:
            f.append({'score': score, 'head': lhs, 'left': y, 'right': z})

    # non-gzipped loads faster.
    #DataFrame(f).to_csv(gzip.open(output + '.gr.csv.gz', 'wb'))
    #DataFrame(l).to_csv(gzip.open(output + '.lex.csv.gz', 'wb'))

    DataFrame(f).to_csv(output + '.gr.csv')
    DataFrame(l).to_csv(output + '.lex.csv')
    sym.save(output + '.sym.alphabet')
    lex.save(output + '.lex.alphabet')
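
Usage follows the naming pattern described in the docstring; for instance (a sketch, assuming the gzipped grammar file exists on disk):

# Reads the gzipped bubs-parser grammar and writes eng.M2.gr.csv,
# eng.M2.lex.csv, eng.M2.lex.alphabet, and eng.M2.sym.alphabet.
preprocess_bubs_format('eng.M2.gr.gz', 'eng.M2')
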
Example #8
def fdcheck(func,
            w,
            g,
            keys=None,
            eps=1e-5,
            quiet=0,
            verbose=1,
            progressbar=1,
            throw=True):
    """
    Finite-difference check.

    Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):
            # support for sparse vectors represented as a dictionary-like object.
            keys = list(w.keys())
            d = {}
        else:
            # use flat views, if need be.
            if len(w.shape) > 1: w = w.flat
            if len(g.shape) > 1: g = g.flat
            d = np.zeros_like(w)
            keys = list(range(len(w)))  # TODO: these keys have lost their names, so not good for debugging.
    else:
        d = {}

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b - a) / (2 * eps)

    if throw and not np.allclose([d[k] for k in keys], [g[k] for k in keys]):
        compare(d, g, verbose=True)
        raise AssertionError('^^^^ see compare above')

    return compare([d[k] for k in keys], [g[k] for k in keys], verbose=verbose)
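
Since this version detects dict-like parameters via `hasattr(w, 'keys')`, it also covers sparse feature vectors. A toy sketch (assuming this `fdcheck` is in scope; the feature names are made up):

from collections import defaultdict

w = defaultdict(float, {'bias': 0.5, 'word=the': -1.2})
g = {k: 2 * v for k, v in w.items()}   # gradient of sum_k w_k**2 is 2*w_k

# `func` reads `w` from the enclosing scope; each named coordinate is
# perturbed by +/- eps and the centered difference is compared against g.
fdcheck(lambda: sum(v * v for v in w.values()), w, g)
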
Example #9
 def sample_ur(self, data, num_samples=1000):
     """ samples the UR """
     samples = self.transducer.sample(data, num=num_samples)
     # note: the loop variable is named `sample_dist` so it does not shadow
     # the outer `samples` list.
     for tree, sample_dist in iterview(zip(data, samples),
                                       colored('Sampling', 'red')):
         tree.ur_samples = []
         viterbi_ur = self.transducer.decode(tree)[1]
         tree.ur_samples.append(viterbi_ur)
         for sample in sample_dist:
             tree.ur_samples.append(sample)
Example #10
File: crf.py Project: endymecy/pgm
 def perceptron(self, data, rate=0.01, iterations=20, validate=None):
     """ Parameter estimation with the perceptron algorithm. """
     W = self.W
     for i in range(iterations):
         for x in iterview(data, msg='Iteration %s' % i):
             for k in self.path_features(x, self.argmax(x)):
                 W[k] -= rate
             for k in x.target_features:
                 W[k] += rate
         if validate:
             validate(self, i)
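
The update above is the standard structured-perceptron rule: demote the features of the model's current argmax, promote the features of the gold target (when the two coincide, the updates cancel). A self-contained multiclass analogue on made-up data:

from collections import defaultdict

def predict(W, feats, labels=('A', 'B')):
    # argmax over labels, scoring each by its summed feature weights
    return max(labels, key=lambda y: sum(W[y, f] for f in feats))

W = defaultdict(float)
rate = 0.01
for feats, gold in [(['f1', 'f2'], 'A'), (['f2'], 'B')] * 20:
    guess = predict(W, feats)
    for f in feats:
        W[guess, f] -= rate   # demote the prediction
        W[gold, f] += rate    # promote the target (no-op when guess == gold)
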
Example #11
File: crf.py Project: TPLink32/nlp
    def test_gradient(self, data, subsetsize=100):
        def fd(x, i, eps=1e-5):
            """Compute `i`th component of the finite-difference approximation to the
            gradient of log-likelihood at current parameters on example `x`.

            """

            was = self.W[i]  # record value

            self.W[i] = was + eps
            b = self.likelihood(x)

            self.W[i] = was - eps
            a = self.likelihood(x)

            self.W[i] = was  # restore original value

            return (b - a) / 2 / eps

        for x in iterview(data, msg='test grad'):

            g = defaultdict(float)
            for k, v in self.expectation(x).iteritems():
                g[k] -= 1 * v
            for k in x.target_features:
                g[k] += 1

            # pick a subset of features to test
            d = np.random.choice(g.keys(), subsetsize, replace=0)

            f = {}
            for i in iterview(d, msg='fd approx'):  # loop over active features
                f[i] = fd(x, i)

            from arsenal.math import compare
            compare([f[k] for k in d], [g[k] for k in d],
                    name='test gradient %s' % x,
                    scatter=1,
                    show_regression=1)
            import pylab as pl
            pl.show()
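
The inner `fd` uses a centered difference, whose truncation error is O(eps^2), versus O(eps) for a one-sided difference — which is why it is the default in all of the checkers above. A quick standalone illustration on a toy function:

import numpy as np

f, x, eps = np.exp, 1.0, 1e-5   # f'(1) = e, known exactly

centered = (f(x + eps) - f(x - eps)) / (2 * eps)
forward = (f(x + eps) - f(x)) / eps
print(abs(centered - np.e))   # ~4e-11, error scales like eps**2
print(abs(forward - np.e))    # ~1e-5, error scales like eps
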
Example #12
 def optimize(self, iterations=10, start=0):
     """ optimize the model  """
     #np.random.shuffle(self.train)
     for i in xrange(iterations):
         for instance in iterview(self.train, colored('Pass %s' % (i+1+start), 'blue')):
             psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
             x, y = instance.sr, instance.ur
             #print "LL", self.model.ll(x, y, psi, minx=MINX, miny=MINY)
             # note: `dll` returns the gradient directly; no need to preallocate dpsi.
             dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
             self.features.update(instance, dpsi, self.updater)
             self.updater.step += 1
Example #13
 def perceptron(self, data, rate=0.01, iterations=20, validate=None):
     """ Parameter estimation with the perceptron algorithm. """
     self.preprocess(data)
     W = self.W
     for i in xrange(iterations):
         for x in iterview(data, msg='Iteration %s' % i):
             for k in self.path_features(x, self.argmax(x)):
                 W[k] -= rate
             for k in x.target_features:
                 W[k] += rate
         if validate:
             validate(self, i)
Example #14
File: crf.py Project: endymecy/pgm
 def sgd(self, data, iterations=20, a0=10, validate=None):
     """ Parameter estimation with stochastic gradient descent (sgd). """
     W = self.W
     for i in range(iterations):
         rate = a0 / (np.sqrt(i) + 1)
         for x in iterview(data, msg='Iteration %s' % i):
             for k, v in self.expectation(x).iteritems():
                 W[k] -= rate * v
             for k in x.target_features:
                 W[k] += rate
         if validate:
             validate(self, i)
Example #15
    def test_gradient(self, data, subsetsize=100):

        def fd(x, i, eps=1e-5):
            """Compute `i`th component of the finite-difference approximation to the
            gradient of log-likelihood at current parameters on example `x`.

            """

            was = self.W[i]   # record value

            self.W[i] = was+eps
            b = self.likelihood(x)

            self.W[i] = was-eps
            a = self.likelihood(x)

            self.W[i] = was   # restore original value

            return (b - a) / 2 / eps

        for x in iterview(data, msg='test grad'):

            g = defaultdict(float)
            for k, v in self.expectation(x).iteritems():
                g[k] -= 1*v
            for k in x.target_features:
                g[k] += 1

            # pick a subset of features to test
            d = np.random.choice(g.keys(), subsetsize, replace=0)

            f = {}
            for i in iterview(d, msg='fd approx'):     # loop over active features
                f[i] = fd(x, i)

            from arsenal.math import compare
            compare([f[k] for k in d],
                    [g[k] for k in d], name='test gradient %s' % x, scatter=1, show_regression=1)
            import pylab as pl
            pl.show()
Example #16
 def sgd(self, data, iterations=20, a0=10, validate=None):
     """ Parameter estimation with stochastic gradient descent (sgd). """
     self.preprocess(data)
     W = self.W
     for i in xrange(iterations):
         rate = a0 / (sqrt(i) + 1)
         for x in iterview(data, msg='Iteration %s' % i):
             for k, v in self.expectation(x).iteritems():
                 W[k] -= rate*v
             for k in x.target_features:
                 W[k] += rate
         if validate:
             validate(self, i)
Example #17
def fdcheck(func,
            w,
            g,
            keys=None,
            eps=1e-5,
            quiet=0,
            verbose=1,
            progressbar=1):
    """
    Finite-difference check.

    Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
            d = {}
        else:
            d = np.zeros_like(w)

            # use flat views, if need be.
            if len(w.shape) > 1:
                w = w.flat
            if len(g.shape) > 1:
                g = g.flat
            if len(d.shape) > 1:
                d = d.flat

            keys = range(len(w))

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b - a) / (2 * eps)

    return compare([d[k] for k in keys], [g[k] for k in keys], verbose=verbose)
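
For dense numpy parameters the same function takes its array branch (`keys` become flat indices). A minimal sketch, assuming this `fdcheck` and arsenal's `compare` are in scope:

import numpy as np

w = np.random.randn(5)
g = np.cos(w)   # analytic gradient of sum(sin(w))

# `func` reads `w` from the enclosing scope; fdcheck perturbs one
# coordinate at a time and restores it afterwards.
fdcheck(lambda: np.sin(w).sum(), w, g)
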
Example #18
def quick_fdcheck(func,
                  w,
                  g,
                  n_checks=20,
                  eps=1e-5,
                  quiet=True,
                  verbose=False,
                  progressbar=False,
                  throw=True):
    """
    Check gradient along random directions (a faster alternative to axis-aligned directions).

    Tim Vieira (2017) "How to test gradient implementations"
    https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/

    """

    if quiet:
        verbose = 0
        progressbar = 0

    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.flatten()

    w = np.asarray(w.flat)
    g = np.asarray(g.flat)

    dim = len(w)

    for k in (iterview(keys) if progressbar else keys):
        d = spherical(dim)
        G[k] = g.dot(d)
        w[:] = was + eps * d
        b = func()
        w[:] = was - eps * d
        a = func()
        w[:] = was
        H[k] = (b - a) / (2 * eps)

    different = not np.allclose(list(H.values()), list(G.values()))
    if verbose or different:
        compare(H, G, verbose=True)

    if different and throw:
        raise AssertionError('^^^^ see compare above')

    return compare(H, G, verbose=False)
Example #19
def main():
    import pylab as pl
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform
    t = Timer('test')

    for i in iterview(range(1, 20)):
        for _ in range(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)

    a = t.plot_feature('i')
    print(a)
    pl.show()
Example #20
def main():
    import pylab as pl
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform
    t = Timer('test')

    for i in iterview(xrange(1, 20)):
        for _ in xrange(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)

    a = t.plot_feature('i')
    print a
    pl.show()
Example #21
def fdcheck(func, w, g, keys=None, eps=1e-5, quiet=0, verbose=1, progressbar=1):
    """
    Finite-difference check.

    Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
            d = {}
        else:
            d = np.zeros_like(w)

            # use flat views, if need be.
            if len(w.shape) > 1:
                w = w.flat
            if len(g.shape) > 1:
                g = g.flat
            if len(d.shape) > 1:
                d = d.flat

            keys = range(len(w))

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b-a) / (2*eps)

    return compare([d[k] for k in keys],
                   [g[k] for k in keys],
                   verbose=verbose)
Example #22
def quick_fdcheck(func, w, g, n_checks, eps=1e-5, verbose=1, progressbar=1):
    "Check gradient along random directions (a faster alternative to axis-aligned directions)."
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.copy()
    for k in (iterview(keys) if progressbar else keys):
        d = spherical(w.shape[0])
        G[k] = g.dot(d)
        w[:] = was + eps * d
        b = func()
        w[:] = was - eps * d
        a = func()
        w[:] = was
        H[k] = (b - a) / (2 * eps)

    return compare(H, G, verbose=verbose)
Example #23
def quick_fdcheck(func, w, g, n_checks, eps=1e-5, verbose=1, progressbar=1):
    "Check gradient along random directions (a faster alternative to axis-aligned directions)."
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.copy()
    for k in (iterview(keys) if progressbar else keys):
        d = spherical(w.shape[0])
        G[k] = g.dot(d)
        w[:] = was + eps*d
        b = func()
        w[:] = was - eps*d
        a = func()
        w[:] = was
        H[k] = (b-a) / (2*eps)

    return compare(H, G, verbose=verbose)
Example #24
 def evaluate(self, data, maximum=100000000):
     """ decode the model """
     correct, total = 0, 0
     counter = 0
     for instance in iterview(data, colored('Decoding', 'red')):
         if counter == maximum:
             break
         psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
         ur1 = instance.ur
         results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
         ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
         score, ur2 = results[0], results[1]
         if ur1 == ur2:
             correct += 1
         print ur1, ur2
         total += 1
         counter += 1
     print
     return float(correct) / total
Example #25
def fdcheck(func, w, g, keys=None, eps=1e-5, quiet=0, verbose=1, progressbar=1):
    """
    Finite-difference check.

    Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):  # support for sparse vectors represented as a dictionary-like object.
            keys = list(w.keys())
            d = {}
        else:
            # use flat views, if need be.
            if len(w.shape) > 1: w = w.flat
            if len(g.shape) > 1: g = g.flat
            d = np.zeros_like(w)
            keys = list(range(len(w)))    # TODO: these keys have lost their names. So not good for debugging.
    else:
        d = {}

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b-a) / (2*eps)

    return compare([d[k] for k in keys],
                   [g[k] for k in keys],
                   verbose=verbose)
Example #26
File: crf.py Project: endymecy/pgm
 def very_sgd(self, data, iterations=20, a0=10, R=1, validate=None):
     """Parameter estimation with stochastic gradient descent (sgd) where
     expectations are estimated by sampling.
     """
     assert R > 0
     W = self.W
     for i in range(iterations):
         rate = a0 / (np.sqrt(i) + 1)
         for x in iterview(data, msg='Iteration %s' % i):
             r = 0
             for y in self.sample(x):
                 r += 1
                 for k in self.path_features(x, y):
                     W[k] -= rate / R
                 if r >= R:
                     break
             for k in x.target_features:
                 W[k] += rate
         if validate:
             validate(self, i)
Example #27
    def very_sgd(self, data, iterations=20, a0=10, R=1, validate=None):
        """Parameter estimation with stochastic gradient descent (sgd) where
        expectations are estimated by sampling.

        """
        assert R > 0
        self.preprocess(data)
        W = self.W
        for i in xrange(iterations):
            rate = a0 / (sqrt(i) + 1)
            for x in iterview(data, msg='Iteration %s' % i):
                r = 0
                for y in self.sample(x):
                    r += 1
                    for k in self.path_features(x, y):
                        W[k] -= rate / R
                    if r >= R:
                        break
                for k in x.target_features:
                    W[k] += rate
            if validate:
                validate(self, i)
Example #28
def _test_sample_tree(example, grammar, N):
    #    gold = {(X,I,K) for (X,I,K) in example.gold_items if (I,K) in example.nodes}
    print()
    _forest = parse_forest(example, grammar)
    # apply temperature to grammar rules
    forest = Hypergraph()
    forest.root = _forest.root
    for e in _forest.edges:
        c = LogVal.Zero()
        c.logeq(e.weight)
        forest.edge(c, e.head, *e.body)
    # run inside-outside
    B, A = sum_product(forest)
    Z = B[forest.root]
    # compute marginals and recall from samples
    #    sample_recall = 0.0
    m = defaultdict(float)
    for _ in iterview(range(N)):
        t = sample(forest, B)
        for s in t.subtrees():
            x = s.label()
            m[x] += 1.0 / N
#            xx = rename(grammar, x)
#            sample_recall += (xx in gold) * 1.0 / N
    # convert node names and marginalize-out the time index
    IO = defaultdict(float)
    for x in forest.incoming:
        IO[x] += (B[x] * A[x] / Z).to_real()
    # check marginals
    threshold = 1e-4
    for x in IO:
        (I, K, X, T) = x
        if K - I > 1:
            a = IO[x]
            b = m[x]
            if a > threshold or b > threshold:
                print('[%s %s %8s, %s] %7.3f %7.3f' \
                    % (I, K, X, T, a, b))
                assert abs(a - b) < 0.05
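
The test follows a general pattern for validating samplers: draw many samples, turn counts into empirical frequencies, and compare against exact marginals (here computed by inside-outside). The same pattern, stripped of the hypergraph machinery, on a known categorical distribution:

import numpy as np

p = np.array([0.5, 0.3, 0.2])   # exact marginals of a toy distribution
N = 10000
counts = np.bincount(np.random.choice(3, size=N, p=p), minlength=3)
m = counts / float(N)           # empirical frequencies from N samples

# same acceptance criterion as the hypergraph test above
assert np.all(np.abs(m - p) < 0.05)
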
Example #29
    def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0):
        """ optimize jointly using importance sampling """
        # TODO: unit test
        self.updater.eta = eta1
        for i in xrange(iterations):
            samples = self.transducer.sample(self.train, num=num_samples)
            for tree, sample in iterview(zip(self.train, samples),
                                         colored('Pass %s' % (i + 1), 'blue')):
                # compute approximate partition function
                logZ = NINF
                strings, weights = [], []
                for (ur, count) in sample.items():
                    score = self.transducer.ll(tree, ur)
                    if self.segmenter_type == CHUNK:
                        score += self.score_chunk(tree, ur)
                    elif self.segmenter_type == TREE:
                        score += self.score_tree(tree, ur)
                    # TODO: double check
                    logZ = logaddexp(logZ, score)
                    weights.append(score)
                    strings.append(ur)

                # normalize the importance weights by the (approximate) log partition function
                weights = [weight - logZ for weight in weights]  # TODO: double check

                # take a transducer weight gradient step with the importance samples
                self.transducer.step_is(tree, strings, weights, eta=eta2)
                # take a segmenter weight gradient step with the importance samples
                for ur, weight in zip(sample, weights):
                    if self.segmenter_type == CHUNK:
                        self.is_chunk(tree, ur, weight)
                    elif self.segmenter_type == TREE:
                        self.is_tree(tree, ur, weight)
                self.updater.step += 1
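
The `logZ` bookkeeping above is the usual self-normalized importance-sampling trick, done in log space for numerical stability. A tiny standalone illustration with toy scores:

import numpy as np

scores = np.array([-1.0, -2.5, -0.3])   # unnormalized log-weights
logZ = np.logaddexp.reduce(scores)      # log-sum-exp, as in the loop above
weights = scores - logZ                 # normalized log-weights
assert abs(np.exp(weights).sum() - 1.0) < 1e-12
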
Example #30
def check_gradient(f, grad, theta, alphabet=None, eps=1e-4, tol=0.01, skip_zero=True,
                    verbose=True, progress=True, keys=None, random_subset=None):
    """Check gradient that `f(theta) == grad` by centered-difference approximation.

    Provides feedback on which dimensions differ

    Arguments:

     - `f`: function we are taking the gradient of.

     - `grad`: What we think the gradient is at `theta`.

     - `theta`: the current parameter vector (perturbed in place, then restored).

     - `alphabet` (optional): a bijective map from strings to integers. Expects
       `arsenal.alphabet.Alphabet` instance. This is used to map integer-valued
       dimensions to human-readable names (e.g., strings).

     - `eps`: perturbation size

     - `tol`: relative error above which a dimension is deemed an error.

     - `skip_zero`: skip dimensions where both values are approximately zero,
       since good relative error is hard to get for values near zero.

     - `random_subset`: number of dimensions to probe (useful in high
       dimensions because this test is linear in the dimensionality of
       `theta`).

    """

    import numpy as np
    from numpy import zeros_like
    from arsenal.terminal import green, red, yellow
    from random import sample
    from arsenal.math import cosine

    fails = 0

    grad = np.asarray(grad)

    if keys is None:
        if alphabet is not None:
            keys = list(alphabet._flip.keys())
            assert len(alphabet), 'Alphabet is empty.'
        else:
            keys = list(range(len(theta)))
        if random_subset is not None:
            if hasattr(random_subset, '__iter__'):
                keys = list(random_subset)
            else:
                keys = sample(keys, min(len(keys), random_subset))

    assert len(keys) > 0

    fd = zeros_like(theta)
    for i in (iterview(keys, msg='checkgrad') if progress else keys):
        was = theta[i]
        # perturb right
        theta[i] = was + eps
        right = f(theta)
        # perturb left
        theta[i] = was - eps
        left = f(theta)
        # reset
        theta[i] = was
        # centered difference
        fd[i] = (right - left) / (2*eps)

    w = max(list(map(len, list(alphabet.keys())))) if alphabet is not None else 0

    nzeros = 0

    for i in keys:
        # check relative error

        if skip_zero and abs(fd[i]) < 1e-10 and abs(grad[i]) < 1e-10:  # both approximately zero
            nzeros += 1
            continue

        relative_error = abs(fd[i] - grad[i]) / max(abs(fd[i]), abs(grad[i]))
        if relative_error > tol:
            name = alphabet.lookup(i) if alphabet is not None else i
            fails += 1

            if verbose:
                print(red % 'dim = %s rel-err = %5.3f' % (('%%-%ss' % w) % (name,), relative_error), \
                    'want: %g; got: %g' % (fd[i], grad[i]))
            else:
                assert False, \
                    'dim = %s rel-err = %5.3f, want: %g; got: %g' \
                    % (('%%-%ss' % w) % (name,), relative_error, fd[i], grad[i])

    if nzeros * 1.0 / len(keys) >= 0.75:
        print(yellow % '[warning] checkgradient skipped a lot of approximately zero components ' \
            'percentage= %g (%s/%s)' % (nzeros * 1.0 / len(keys), nzeros, len(keys)))

    if verbose:
        print('gradient:', end=' ')
        if not fails:
            print(green % 'OK', end=' ')
        else:
            print(red % 'failed %s of %s' % (fails, len(keys)), end=' ')

        print('cosine similarity: %g' % cosine(grad[keys], fd[keys]))
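
A sketch of a typical call (assuming the `check_gradient` above and `iterview` are importable; the quadratic test problem is made up and its gradient is known in closed form):

import numpy as np

A = np.diag([1.0, 2.0, 3.0])
theta = np.random.randn(3)
grad = A.dot(theta)   # exact gradient of 0.5 * theta' A theta (A symmetric)

check_gradient(lambda t: 0.5 * t.dot(A).dot(t), grad, theta)
# expected output ends with: gradient: OK cosine similarity: ~1
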
Example #31
def check_gradient(f, grad, theta, alphabet=None, eps=1e-4, tol=0.01, skip_zero=True,
                    verbose=True, progress=True, keys=None, random_subset=None):

    """Check gradient that `f(theta) == grad` by centered-difference approximation.

    Provides feedback on which dimensions differ

    Arguments:

     - `f`: function we are taking the gradient of.

     - `grad`: What we think the gradient is at `theta`.

     - `theta`: the current parameter vector (perturbed in place, then restored).

     - `alphabet` (optional): a bijective map from strings to integers. Expects
       `arsenal.alphabet.Alphabet` instance. This is used to map integer-valued
       dimensions to human-readable names (e.g., strings).

     - `eps`: perturbation size

     - `tol`: relative error above which a dimension is deemed an error.

     - `skip_zero`: skip dimensions where both values are approximately zero,
       since good relative error is hard to get for values near zero.

     - `random_subset`: number of dimensions to probe (useful in high
       dimensions because this test is linear in the dimensionality of
       `theta`).

    """

    fails = 0

    grad = np.asarray(grad)

    if keys is None:
        if alphabet is not None:
            keys = alphabet._flip.keys()
            assert len(alphabet), 'Alphabet is empty.'
        else:
            keys = range(len(theta))
        if random_subset is not None:
            if hasattr(random_subset, '__iter__'):
                keys = list(random_subset)
            else:
                keys = sample(keys, min(len(keys), random_subset))

    assert len(keys) > 0

    fd = zeros_like(theta)
    for i in (iterview(keys, msg='checkgrad') if progress else keys):
        was = theta[i]
        # perturb right
        theta[i] = was + eps
        right = f(theta)
        # perturb left
        theta[i] = was - eps
        left = f(theta)
        # reset
        theta[i] = was
        # centered difference
        fd[i] = (right - left) / (2*eps)

    w = max(map(len, alphabet.keys())) if alphabet is not None else 0

    nzeros = 0

    for i in keys:
        # check relative error

        if skip_zero and abs(fd[i]) < 1e-10 and abs(grad[i]) < 1e-10:  # both approximately zero
            nzeros += 1
            continue

        relative_error = abs(fd[i] - grad[i]) / max(abs(fd[i]), abs(grad[i]))
        if relative_error > tol:
            name = alphabet.lookup(i) if alphabet is not None else i
            fails += 1

            if verbose:
                print red % 'dim = %s rel-err = %5.3f' % (('%%-%ss' % w) % (name,), relative_error), \
                    'want: %g; got: %g' % (fd[i], grad[i])
            else:
                assert False, \
                    'dim = %s rel-err = %5.3f, want: %g; got: %g' \
                    % (('%%-%ss' % w) % (name,), relative_error, fd[i], grad[i])

    if nzeros * 1.0 / len(keys) >= 0.75:
        print yellow % '[warning] checkgradient skipped a lot of approximately zero components ' \
            'percentage= %g (%s/%s)' % (nzeros * 1.0 / len(keys), nzeros, len(keys))

    if verbose:
        print 'gradient:',
        if not fails:
            print green % 'OK',
        else:
            print red % 'failed %s of %s' % (fails, len(keys)),

        print 'cosine similarity: %g' % cosine(grad[keys], fd[keys])
Example #32
 def example_iterview():
     for _ in iterview(xrange(400), every=20):
         sleep(0.01)
Example #33
    def decode(self, data, data_type, decode_type=None, sep=u"#"):
        """ decode the chunker """
        if decode_type is None:
            decode_type = self.decode_type

        if decode_type == ORACLE:
            self.oracle_ur(data)
        elif decode_type == BASELINE:
            self.baseline_ur(data)
        elif decode_type == VITERBI:
            self.decode_ur(data)
        elif decode_type == SAMPLE:
            self.sample_ur(data)
        else:
            raise Exception('Illicit Decode Type')

        ur_correct, ur_total = 0, 0
        correct, f1, tree_f1, lev, total = 0, 0, 0, 0, 0
        for tree in iterview(data, colored('Decoding', 'red')):
            max_ur, max_score = None, NINF
            counter = 0
            for ur in tree.ur_samples:
                tree.update_ur(ur)
                counter += 1
                score = self.transducer.ll(tree, ur)
                #print
                #print "LL", self.transducer.ll(tree, ur)
                if self.segmenter_type == CHUNK:
                    score += self.score_chunk(tree, ur)
                    #print "SCORE", self.score_chunk(tree, ur)
                    #print ur
                    #raw_input()
                elif self.segmenter_type == TREE:
                    score += self.score_tree(tree, ur)
                # take the best importance sample
                if score >= max_score:
                    max_score = score
                    max_ur = ur
                    #print "counter", counter
            if max_ur == tree.ur_gold:
                ur_correct += 1
            ur_total += 1
            truth, guess, tree_f1_tmp = None, None, None
            if self.segmenter_type == CHUNK:
                truth, guess = self.decode_chunk(tree, max_ur)
            elif self.segmenter_type == TREE:
                truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur)
                tree_f1 += tree_f1_tmp

            # ACCURACY
            if truth == guess:
                correct += 1
            # LEVENSHTEIN
            lev += Levenshtein.distance(sep.join(truth), sep.join(guess))
            # F1
            set1, set2 = set(guess), set(truth)
            p, r = 0, 0
            for e in set1:
                if e in set2:
                    p += 1
            for e in set2:
                if e in set1:
                    r += 1
            p /= len(set1)
            r /= len(set2)
            if p + r > 0:
                f1 += 2 * p * r / (p + r)
            total += 1

        logging.info("decoder type: {0}".format(decode_type))
        logging.info("{0} ur acc: {1}".format(*(data_type,
                                                ur_correct / total)))
        logging.info("{0} seg acc: {1}".format(*(data_type, correct / total)))
        logging.info("{0} f1: {1}".format(*(data_type, f1 / total)))
        logging.info("{0} edit: {1}".format(*(data_type, lev / total)))
        if self.segmenter_type == TREE:
            logging.info("{0} tree f1: {1}".format(*(data_type,
                                                     tree_f1 / total)))
Example #34
 def oracle_ur(self, data):
     """ uses the oracle  UR """
     for tree in iterview(data, colored('Updating Oracle UR', 'red')):
         tree.ur_samples = []
         tree.ur_samples.append(tree.ur_gold)
Example #35
 def decode_ur(self, data):
     """ decodes the UR """
     for tree in iterview(data, colored('Updating Viterbi UR', 'red')):
         tree.ur_samples = []
         viterbi_ur = self.transducer.decode(tree)[1]
         tree.ur_samples.append(viterbi_ur)
Example #36
 def baseline_ur(self, data):
     """ baseline ur """
     for tree in iterview(data, colored('Updating Baseline UR', 'red')):
         tree.ur_samples = []
         tree.ur_samples.append(tree.sr)
Example #37
def load_results(results, _args, filters=()):
    jobs_running = [y.split()[0] for y in list(file('tmp/jobs'))[2:]]

    data = []
    jobs = []
    msgs = []
    for x in iterview(results.glob('*')):

        # Extract name of the experiment, {YYYY-MM-DD}-{NAME}-{argument hash}
        name = x.basename()[11:].split('-')[0]

        if _args.jobids:
            if (x / 'sge-jobid.txt').exists():
                jobid = (x / 'sge-jobid.txt').text().strip()
                if jobid not in _args.jobids:
                    continue
            else:
                continue

        else:
            if not any(p == name for p in filters):  # substring match
                continue

        # TODO: the finish file doesn't get written if we call qdel, but the log
        # files now contain some timestamps, so we should grab the last
        # timestamp logged as the "finish time."
        done = (x / 'finish').exists()

        if (x / 'sge-jobid.txt').exists():
            jobid = (x / 'sge-jobid.txt').text().strip()
        else:
            jobid = None

        dump_exists = False
        log_exists = False
        args_exists = False

        args = {}

        d = x / 'dump'
        if d.exists():
            dump_exists = True
        else:
            # Dump doesn't exist for reasons other than it failed to get
            # scheduled. This might have to do with lack of permissions or a
            # failure in the python code (e.g., ImportError).
            assert jobid is None, x
            # try again, but this time it's not nested.
            d = x
            if d.exists():
                dump_exists = True

        if dump_exists:
            # log.csv only appears once the job has produced its first data point.
            if (d / 'log.csv').exists():
                log_exists = True

            if (d / 'args.pkl').exists():
                # load command-line arguments from pickle and add prefix (`'args_'`)
                # to avoid collisions with other column names.
                args = load(d / 'args.pkl')
                args = {'args_' + k: v for k, v in args.__dict__.items()}
                args_exists = True

        if not args_exists or not log_exists or not dump_exists:
            miss = []
            if jobid is None:
                miss.append('jobid')
            if not args_exists:
                miss.append('args')
            if not log_exists:
                miss.append('log')
            if not dump_exists:
                miss.append('dump')
            msgs.append('%s %s' %
                        (x, yellow % '(missing: %s)' % ' '.join(miss)))

        # Note: two jobs /might/ get assigned the same job id, which might cause
        # 'hash collision' style problems.
        running = (jobid in jobs_running)

        if args_exists:
            start = to_datetime((x / 'start').text())
            elapsed = (start.now() - start).total_seconds()
        else:
            continue

        J = dict(directory=x,
                 jobid=jobid,
                 start=start,
                 elapsed=elapsed,
                 done=done,
                 log_exists=log_exists,
                 dump_exists=dump_exists,
                 args_exists=args_exists,
                 running=running)

        # TODO: use dedicated CLI options for filtering jobs. The eval trick
        # (below) is unnecessary and provides a clunky filter notation.
        class __args:
            pass

        for k, v in args.items():
            if k.startswith('args_'):
                setattr(__args, k[5:], v)  # drop 'args_' prefix
        skip = 0
        for ff in _args.filter:
            fff = ff.replace('df.args_', '__args.')
            if not eval(fff):
                skip = 1
                break
        if skip:
            continue

        J.update(args)
        jobs.append(J)

        if not log_exists:
            continue

        log = get_log(d / 'log.csv', _args.accuracy, _args.runtime)

        if log is None:
            continue

        for (_, e) in log.iterrows():
            b = e.to_dict()
            status = dict(name=name,
                          jobid=jobid,
                          running=running,
                          done=done,
                          log=d / 'log.csv')
            b.update(J)
            b.update(status)
            b.update(args)
            data.append(b)

        # add column for time-elapsed (in days) since initial iteration.
        el = log.datetime.map(to_datetime).tolist()
        log['elapsed'] = [(x - el[0]).total_seconds() / (24 * 60 * 60)
                          for x in el]

        # TODO: move to after filters are applied
        if _args.lc:  # enable to view learning curves super-imposed on one another.
            learning_curve_handler(_args, args, log, jobid)

    for msg in msgs:
        print msg

    return data, jobs
Example #38
def paired_permutation_test(xs, ys, statistic=np.mean,
                            threshold=0.05,
                            R=10_000,
                            verbose=1):
    "Pair permutation test"

    def effect(xs, ys):
        return np.abs(statistic(xs) - statistic(ys))  # two-sided: A != B

    ra = statistic(xs)
    rb = statistic(ys)
    diff = effect(xs, ys)  # observed difference

    n = len(xs)
    k = 0
    reps = range(R)
    if verbose: reps = iterview(reps, msg='perm test')
    for _ in reps:
        # randomly generate a vector of zeros and ones (uniformly).
        swaps = np.random.randint(0, 2, n).astype(bool)  # flip n coins
        k += diff <= effect(
            np.select([swaps, ~swaps], [xs, ys]),  # swap elements accordingly
            np.select([~swaps, swaps], [xs, ys]))

    s = k / R

    #threshold = 0.5*threshold
    #s *= 0.5   # because we have a two-sided test.

    if verbose:
        # which system has higher reward? is it significant?
        asig = (colors.red %