Exemple #1
0
def build_model(fmt='binary'):
    """Train an NE chunker on the ACE corpora, evaluate it, and pickle it.

    :param fmt: annotation format passed through to ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    print('Loading training data...')
    corpus_names = ('corpora/ace_data/ace.dev',
                    'corpora/ace_data/ace.heldout',
                    'corpora/ace_data/bbn.dev',
                    'corpora/ace_data/muc.dev')
    train_paths = [find(name) for name in corpus_names]
    train_data = [postag_tree(tree) for tree in load_ace_data(train_paths, fmt)]
    print('Training...')
    cp = NEChunkParser(train_data)
    # Release the (large) training corpus before loading evaluation data.
    del train_data

    print('Loading eval data...')
    eval_trees = load_ace_data([find('corpora/ace_data/ace.eval')], fmt)
    eval_data = [postag_tree(tree) for tree in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for index, gold in enumerate(eval_data):
        guess = cp.parse(gold.leaves())
        chunkscore.score(gold, guess)
        # Show a detailed comparison for the first few sentences only.
        if index < 3:
            cmp_chunks(gold, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
def build_model(fmt='binary'):
    """Train an ACE named-entity chunker, report its ChunkScore, and save it.

    :param fmt: annotation format understood by ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    print('Loading training data...')
    train_paths = [
        find('corpora/ace_data/ace.dev'),
        find('corpora/ace_data/ace.heldout'),
        find('corpora/ace_data/bbn.dev'),
        find('corpora/ace_data/muc.dev'),
    ]
    train_data = list(map(postag_tree, load_ace_data(train_paths, fmt)))
    print('Training...')
    chunker = NEChunkParser(train_data)
    del train_data  # the corpus is large; free it before evaluation

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_data = list(map(postag_tree, load_ace_data(eval_paths, fmt)))

    print('Evaluating...')
    scorer = ChunkScore()
    for sent_no, gold_tree in enumerate(eval_data):
        guessed_tree = chunker.parse(gold_tree.leaves())
        scorer.score(gold_tree, guessed_tree)
        if sent_no < 3:
            # Print gold vs. guess for the first three sentences.
            cmp_chunks(gold_tree, guessed_tree)
    print(scorer)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(chunker, outfile, -1)

    return chunker
def build_event_chunking_model():
    """Train an event chunker on parsed training data, evaluate it on the
    parsed evaluation data, print the resulting ChunkScore, and return the
    trained chunker.

    :return: the trained ``EventChunkParser``.
    """
    # Assemble training data, splitting token/PoS pairs
    train_corpus = []
    for tree in load_training_data('parsed'):
        for sentence_tree in tree:
            newtree = split_tree_tokens(sentence_tree)
            train_corpus.append(newtree)
    # Train chunker
    chunker = EventChunkParser(train_corpus)
    # Free the training corpus before loading the evaluation data.
    del train_corpus
    # Load evaluation data, splitting token/PoS pairs
    eval_corpus = []
    for tree in load_evaluation_data('parsed'):
        for sentence_tree in tree:
            eval_corpus.append(split_tree_tokens(sentence_tree))
    # Evaluate model
    # NOTE: converted the Python 2 ``print`` statements to the print()
    # function for consistency with the surrounding Python 3 code.
    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        # Normalize the parser output to the tagged-tree form before scoring.
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        if i < 3:
            cmp_chunks(correct, guessed)
    print(chunkscore)
    return chunker
Exemple #4
0
def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)

    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp
Exemple #5
0
def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
    print("Saving chunker to {0}...".format(outfilename))

    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Exemple #6
0
    def evaluate(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Strip the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object reflecting the
        performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        score = ChunkScore()
        for tree in gold:
            score.score(tree, self.parse(tree.leaves()))
        return score
Exemple #7
0
    def evaluate(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object reflecting the
        performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        for gold_tree in gold:
            guessed = self.parse(gold_tree.leaves())
            chunkscore.score(gold_tree, guessed)
        return chunkscore
Exemple #8
0
    def compare(self, ann2, labels=None):
        """
        Estimate the accuracy of annotation/prediction assuming the current
        Annotation is the gold standard.

        :param ann2: the Annotation whose chunks are scored against this one.
        :param labels: chunk labels to score; defaults to the labels present
            in both annotations.
        :return: NLTK ChunkScore object
        """
        if labels is None:
            # Only score labels that both annotations actually use.
            labels = tuple(label for label in self.labels if label in ann2.labels)
        gold_tree = nltk.chunk.conllstr2tree(self.to_iob()[1], chunk_types=labels)
        pred_tree = nltk.chunk.conllstr2tree(ann2.to_iob()[1], chunk_types=labels)

        chunk_score = ChunkScore()
        chunk_score.score(gold_tree, pred_tree)
        return chunk_score
Exemple #9
0

def make_tree(tree):
    """Recursively convert a nested ``(label, score, child, ...)`` tuple into
    an ``ImmutableTree``, skipping the element at index 1; anything that is
    not a tuple of length > 2 is returned unchanged."""
    if not (isinstance(tree, tuple) and len(tree) > 2):
        return tree
    label, children = tree[0], tree[2:]
    return ImmutableTree(label, [make_tree(child) for child in children])


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: %s grammar.pickle" % (sys.argv[0])
        exit(1)

    (grammar, sentence_tags) = pickle.load(open(sys.argv[1]))

    score = ChunkScore()
    for (gold, s) in list(sentences())[0:10]:
        print ' '.join(s)
        parses = list(parse(s, grammar, sentence_tags))
        if parses:
            guess = make_tree(max(parses, key=operator.itemgetter(1)))
            print gold
            print guess
            score.score(gold, guess)
    print 'Accuracy:', score.accuracy()
    print 'Precision:', score.precision()
    print 'Recall:', score.recall()
    print 'F Measure:', score.f_measure()
Exemple #10
0
        if s not in parse(f, grammar):
            print "[ERROR] " + format(f)

def make_tree(tree):
    """Recursively rebuild a ``(label, score, child, ...)`` tuple as an
    ``ImmutableTree`` (the element at index 1 is skipped); other values are
    returned as-is."""
    if isinstance(tree, tuple) and len(tree) > 2:
        subtrees = [make_tree(node) for node in tree[2:]]
        return ImmutableTree(tree[0], subtrees)
    return tree
            
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: %s grammar.pickle"%(sys.argv[0])
        exit(1)

    (grammar,sentence_tags) = pickle.load(open(sys.argv[1]))

    score = ChunkScore()
    for (gold, s) in list(sentences())[0:10]:
        print ' '.join(s)
        parses = list(parse(s, grammar, sentence_tags))
        if parses:
            guess = make_tree(max(parses, key=operator.itemgetter(1)))
            print gold
            print guess
            score.score(gold, guess)
    print 'Accuracy:', score.accuracy()
    print 'Precision:', score.precision()
    print 'Recall:', score.recall()
    print 'F Measure:', score.f_measure()