Example #1
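Trains a StringCRF reference tagger on tagged_references.txt with the perceptron trainer and reports phrase-level F1 on both the train and test splits during validation.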
import re

# NOTE: fromSGML, Instance, partition, F1, iterview, extract_contiguous,
# build_domain and StringCRF are assumed to come from the surrounding
# CRF project; only the standard-library import is shown here.
def main(proportion=None, iterations=20, save='model.pkl~', load=None):

    class Token(object):
        def __init__(self, form):
            self.form = form
            self.attributes = []
        def add(self, features):
            """ Add features to this Token. """
            self.attributes.extend(features)

    def token_features(tk):
        """ very basic feature extraction. """
        w = tk.form
        yield 'word=' + w
        yield 'simplified=' + re.sub('[0-9]', '0', re.sub(r'[^a-zA-Z0-9().,]', '', w.lower()))
        for c in re.findall('[^a-zA-Z0-9]', w):  # non-alpha-numeric
            yield 'contains(%r)' % c
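
    # Example: token_features on a Token with form 'McCallum,' yields
    # 'word=McCallum,', 'simplified=mccallum,' and "contains(',')".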

    def preprocessing(s):
        """ Run instance thru feature extraction. """
        s[0].add(['first-token'])
        s[-1].add(['last-token'])
        for tk in s:
            tk.add(token_features(tk))
        # previous token features
        for t in xrange(1, len(s)):
            s[t].add(f + '@-1' for f in token_features(s[t-1]))
        # next token features
        for t in xrange(len(s) - 1):
            s[t].add(f + '@+1' for f in token_features(s[t+1]))
        return s

    def get_data(f):
        for x in fromSGML(f, linegrouper="<NEW.*?>", bioencoding=False):
            x, y = zip(*[(Token(w), y) for y, w in x])
            preprocessing(x)
            yield Instance(x, truth=y)


    train, test = partition(get_data('tagged_references.txt'), proportion)


    def validate(model, iteration=None):

        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            scorer = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i, begin, end) uniquely identifies the span
                for (label, begin, end) in truth:
                    scorer.add_relevant(label, (i, begin, end))
                for (label, begin, end) in predict:
                    scorer.add_retrieved(label, (i, begin, end))
            print
            return scorer.scores(verbose=True)

        f1(train, name='TRAIN')
        f1(test, name='TEST')

        print
        # average per-instance likelihood over the training set
        print 'likelihood:', sum(map(crf.likelihood, iterview(train))) / len(train)
        print
        print

    if load:
        crf = StringCRF.load(load)
        validate(crf)
        return

    # Create and train CRF
    (L, A) = build_domain(train)   # label set and feature alphabet

    print len(L), 'labels'
    print len(A), 'features'

    crf = StringCRF(L, A)

    # pick the training algorithm by index: 0 = sgd, 1 = perceptron
    fit = [crf.sgd, crf.perceptron][1]
    fit(train, iterations=iterations, validate=validate)

    if save:
        crf.save(save)
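
The phrase-level scorer above relies on extract_contiguous, whose implementation is not shown. A minimal sketch of the behaviour it is assumed to have, grouping maximal runs of identical labels into (label, begin, end) spans (extract_contiguous_sketch is a hypothetical stand-in, not the project's function):

def extract_contiguous_sketch(labels):
    """ Group maximal runs of identical labels into (label, begin, end) spans. """
    spans = []
    begin = 0
    for i in xrange(1, len(labels) + 1):
        # close the current run when the label changes or the sequence ends
        if i == len(labels) or labels[i] != labels[begin]:
            spans.append((labels[begin], begin, i - 1))
            begin = i
    return spans

print extract_contiguous_sketch(['author', 'author', 'title', 'title', 'year'])
# -> [('author', 0, 1), ('title', 2, 3), ('year', 4, 4)]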
Example #2
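A variant that trains with crf.very_sgd; validation computes the average train-set likelihood and appends it to a live plot, while the F1 and weight-sparsity reports are left commented out.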
import re
import numpy as np

# NOTE: fromSGML, Instance, partition, F1, iterview, extract_contiguous,
# build_domain and StringCRF are assumed to come from the surrounding
# CRF project.
def main(proportion=None, iterations=20, save='model.pkl~', load=None):

    class Token(object):
        def __init__(self, form):
            self.form = form
            self.attributes = []
        def add(self, features):
            """ Add features to this Token. """
            self.attributes.extend(features)

    def token_features(tk):
        """ very basic feature extraction. """
        w = tk.form
        yield 'word=' + w
        yield 'simplified=' + re.sub('[0-9]', '0', re.sub(r'[^a-zA-Z0-9().,]', '', w.lower()))
        for c in re.findall('[^a-zA-Z0-9]', w):  # non-alpha-numeric
            yield 'contains(%r)' % c

    def preprocessing(s):
        """ Run instance thru feature extraction. """
        s[0].add(['first-token'])
        s[-1].add(['last-token'])
        for tk in s:
            tk.add(token_features(tk))
        # previous token features
        for t in xrange(1, len(s)):
            s[t].add(f + '@-1' for f in token_features(s[t-1]))
        # next token features
        for t in xrange(len(s) - 1):
            s[t].add(f + '@+1' for f in token_features(s[t+1]))
        return s

    def get_data(f):
        for x in fromSGML(f, linegrouper="<NEW.*?>", bioencoding=False):
            x, y = zip(*[(Token(w), y) for y, w in x])
            preprocessing(x)
            yield Instance(x, truth=y)


    train, test = partition(get_data('tagged_references.txt'), proportion)


    def validate(model, iteration=None):

        def f1(data, name):
            print
            print 'Phrase-based F1:', name
            scorer = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i, begin, end) uniquely identifies the span
                for (label, begin, end) in truth:
                    scorer.add_relevant(label, (i, begin, end))
                for (label, begin, end) in predict:
                    scorer.add_retrieved(label, (i, begin, end))
            print
            return scorer.scores(verbose=True)

        def weight_sparsity(W, t=0.0001):
            # share of weights whose magnitude exceeds the threshold t
            a = (np.abs(W) > t).sum()
            b = W.size
            print '%.2f%% (%s/%s) sparsity' % (a*100.0/b, a, b)

#        f1(train, name='TRAIN')
#        if test:
#            f1(test, name='TEST')

#        print
#        weight_sparsity(model.W)
        # average per-instance likelihood over the training set
        llh = sum(map(crf.likelihood, iterview(train, msg='llh'))) / len(train)

        # live-plot the training likelihood curve (uses arsenal's viz helper)
        from arsenal.viz.util import lineplot
        with lineplot('llh') as d:
            d.append(llh)

        print
        print 'likelihood:', llh
        print
        print

    if load:
        crf = StringCRF.load(load)
        validate(crf)
        return

    # Create and train CRF
    (L, A) = build_domain(train)

    print len(L), 'labels'
    print len(A), 'features'

    crf = StringCRF(L, A)

    if 0:  # flip to 1 to sanity-check the analytic gradient on a few instances
        print 'testing gradient....'
        crf.preprocess(train)
        crf.W[:] = np.random.uniform(-1, 1, size=crf.W.shape)
        crf.test_gradient(train[:10])
        print 'testing....done'

    # pick the training algorithm by index: 0 = sgd, 1 = perceptron, 2 = very_sgd
    fit = [crf.sgd, crf.perceptron, crf.very_sgd][2]
    fit(train, iterations=iterations, validate=validate)

    if save:
        crf.save(save)
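
The validate routines treat each labelled span, keyed by (i, begin, end), as one retrievable item. The F1 class itself is not shown; a minimal sketch of the protocol it is assumed to follow (F1Sketch is hypothetical, and the real scores(verbose=True) presumably also prints per-label breakdowns):

class F1Sketch(object):
    """ Hypothetical stand-in for the F1 accumulator used in validate. """
    def __init__(self):
        self.relevant = set()    # gold (label, span) items
        self.retrieved = set()   # predicted (label, span) items
    def add_relevant(self, label, span):
        self.relevant.add((label, span))
    def add_retrieved(self, label, span):
        self.retrieved.add((label, span))
    def scores(self):
        # precision/recall over exact (label, span) matches
        tp = len(self.relevant & self.retrieved)
        p = tp / max(len(self.retrieved), 1.0)
        r = tp / max(len(self.relevant), 1.0)
        f = 2 * p * r / max(p + r, 1e-12)
        return p, r, f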
Example #3
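The same pipeline trained with crf.sgd; validation reports phrase-level F1 on both splits, the share of non-negligible weights, and the average train-set likelihood.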
import re
import numpy as np

# NOTE: fromSGML, Instance, partition, F1, iterview, extract_contiguous,
# build_domain and StringCRF are assumed to come from the surrounding
# CRF project.
def main(proportion=None, iterations=20, save="model.pkl~", load=None):
    class Token(object):
        def __init__(self, form):
            self.form = form
            self.attributes = []

        def add(self, features):
            """ Add features to this Token. """
            self.attributes.extend(features)

    def token_features(tk):
        """ very basic feature extraction. """
        w = tk.form
        yield "word=" + w
        yield "simplified=" + re.sub("[0-9]", "0", re.sub("[^a-zA-Z0-9()\.\,]", "", w.lower()))
        for c in re.findall("[^a-zA-Z0-9]", w):  # non-alpha-numeric
            yield "contains(%r)" % c

    def preprocessing(s):
        """ Run instance thru feature extraction. """
        s[0].add(["first-token"])
        s[-1].add(["last-token"])
        for tk in s:
            tk.add(token_features(tk))
        # previous token features
        for t in xrange(1, len(s)):
            s[t].add(f + "@-1" for f in token_features(s[t - 1]))
        # next token features
        for t in xrange(len(s) - 1):
            s[t].add(f + "@+1" for f in token_features(s[t + 1]))
        return s

    def get_data(f):
        for x in fromSGML(f, linegrouper="<NEW.*?>", bioencoding=False):
            x, y = zip(*[(Token(w), y) for y, w in x])
            preprocessing(x)
            yield Instance(x, truth=y)

    train, test = partition(get_data("tagged_references.txt"), proportion)

    def validate(model, iteration=None):
        def f1(data, name):
            print
            print "Phrase-based F1:", name
            scorer = F1()
            for i, x in enumerate(iterview(data)):
                predict = extract_contiguous(model(x))
                truth = extract_contiguous(x.truth)
                # (i, begin, end) uniquely identifies the span
                for (label, begin, end) in truth:
                    scorer.add_relevant(label, (i, begin, end))
                for (label, begin, end) in predict:
                    scorer.add_retrieved(label, (i, begin, end))
            print
            return scorer.scores(verbose=True)

        def weight_sparsity(W, t=0.0001):
            # share of weights whose magnitude exceeds the threshold t
            a = (np.abs(W) > t).sum()
            b = W.size
            print "%.2f%% (%s/%s) sparsity" % (a * 100.0 / b, a, b)

        f1(train, name="TRAIN")
        f1(test, name="TEST")

        print
        weight_sparsity(model.W)
        print
        print "likelihood:", sum(map(crf.likelihood, iterview(train))) / len(train)
        print
        print

    if load:
        crf = StringCRF.load(load)
        validate(crf)
        return

    # Create and train CRF
    (L, A) = build_domain(train)   # label set and feature alphabet

    print len(L), "labels"
    print len(A), "features"

    crf = StringCRF(L, A)

    # pick the training algorithm by index: 0 = sgd, 1 = perceptron
    fit = [crf.sgd, crf.perceptron][0]
    fit(train, iterations=iterations, validate=validate)

    if save:
        crf.save(save)
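
None of the examples shows an entry point. A minimal invocation sketch, under the assumption that proportion is the train-side fraction handed to partition:

if __name__ == "__main__":
    # hypothetical call: hold out 30% of the data for testing,
    # train for 20 passes, and write the model to disk
    main(proportion=0.7, iterations=20, save="model.pkl~")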