Beispiel #1
0
def do_hmm(documents, split):
    """Train an HMM tagger on one part of `documents` and tag the rest.

    `bifurcate(documents, split)` divides the documents into a training and
    a test portion.  The tagger is trained on the token/label pairs of the
    training documents.  For every test document the tagger's prediction is
    compared with the document's own labels, the result of
    `nltk.pos_tag(doc.literal)` is printed through `gloss`, and each token
    is collected under its `(predicted_label, gold_label)` key.

    NOTE(review): `results` is filled but neither returned nor reported in
    the lines visible here -- confirm what the caller is meant to receive.
    """
    train, test = bifurcate(documents, split)

    # train does NOT accept generators, so hand it a real list.
    tagger = HiddenMarkovModelTagger.train(
        [doc.token_label_pairs() for doc in train])

    # (predicted_label, gold_label) -> tokens that received that pairing
    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        # TODO: precision and recall are not computed yet.

        token_tag_pairs = nltk.pos_tag(doc.literal)
        # Parenthesised single-argument print: identical output on Python 2
        # and no longer a SyntaxError on Python 3.
        print('\n-----------\n' + gloss(token_tag_pairs))

        # zip() stops at the shorter sequence, so surplus predictions or
        # gold labels are silently ignored.
        aligned = zip(predicted, gold_labels)
        for (token, predicted_label), gold_label in aligned:
            results[(predicted_label, gold_label)].append(token)
Beispiel #2
0
def do_hmm(documents, split):
    """Train an HMM tagger on one part of `documents` and tag the rest.

    `bifurcate(documents, split)` divides the documents into a training and
    a test portion.  The tagger is trained on the token/label pairs of the
    training documents.  For every test document the tagger's prediction is
    compared with the document's own labels, the result of
    `nltk.pos_tag(doc.literal)` is printed through `gloss`, and each token
    is collected under its `(predicted_label, gold_label)` key.

    NOTE(review): `results` is filled but neither returned nor reported in
    the lines visible here -- confirm what the caller is meant to receive.
    """
    train, test = bifurcate(documents, split)

    # train does NOT accept generators, so hand it a real list.
    tagger = HiddenMarkovModelTagger.train(
        [doc.token_label_pairs() for doc in train])

    # (predicted_label, gold_label) -> tokens that received that pairing
    results = defaultdict(list)
    for doc in test:
        predicted = tagger.tag(doc.tokens)
        gold_labels = doc.labels
        # TODO: precision and recall are not computed yet.

        token_tag_pairs = nltk.pos_tag(doc.literal)
        # Parenthesised single-argument print: identical output on Python 2
        # and no longer a SyntaxError on Python 3.
        print('\n-----------\n' + gloss(token_tag_pairs))

        # zip() stops at the shorter sequence, so surplus predictions or
        # gold labels are silently ignored.
        aligned = zip(predicted, gold_labels)
        for (token, predicted_label), gold_label in aligned:
            results[(predicted_label, gold_label)].append(token)
Beispiel #3
0
 def __str__(self):
     """Return `gloss` applied to this object's token/POS-tag pairs."""
     pairs = self.token_pos_tag_pairs()
     return gloss(pairs)
Beispiel #4
0
    parser.add_argument('--train')
    parser.add_argument('--tag')
    parser.add_argument('--model', default='crf.model')
    opts = parser.parse_args()

    # Report the version string of CRFsuite.
    print('CRFSuite v%s' % crfsuite.version())

    if opts.train:
        # Training mode: feed every instance read from the --train file to a
        # Trainer, then save to the --model path.
        trainer = Trainer()

        with open(opts.train) as lines:
            for data, labels in read_svm_format(lines):
                # The constant 0 is presumably the CRFsuite group id --
                # TODO confirm against Trainer.append.
                trainer.append(data, labels, 0)

        trainer.save(opts.model)
    else:
        # Tagging mode needs an input file.  Without this check, a run with
        # neither --train nor --tag crashed inside open(None).
        if opts.tag is None:
            parser.error('one of --train or --tag is required')

        tagger = Tagger(opts.model)
        with open(opts.tag) as lines:
            for data, gold_labels in read_svm_format(lines):
                predicted_labels = tagger.tag(data)
                tokens = [item[0].attr for item in data]
                # list() keeps gloss() receiving a list on Python 3, where
                # zip() returns an iterator; on Python 2 it is just a copy.
                print(gloss(list(zip(tokens, predicted_labels, gold_labels))))

                # TODO: also output the predicted labels with their marginal
                # probabilities, e.g. via tagger.probability(predicted_labels)
                # and tagger.marginal(label, i) for each position i.
Beispiel #5
0
    parser.add_argument('--train')
    parser.add_argument('--tag')
    parser.add_argument('--model', default='crf.model')
    opts = parser.parse_args()

    # Report the version string of CRFsuite.
    print('CRFSuite v%s' % crfsuite.version())

    if opts.train:
        # Training mode: feed every instance read from the --train file to a
        # Trainer, then save to the --model path.
        trainer = Trainer()

        with open(opts.train) as lines:
            for data, labels in read_svm_format(lines):
                # The constant 0 is presumably the CRFsuite group id --
                # TODO confirm against Trainer.append.
                trainer.append(data, labels, 0)

        trainer.save(opts.model)
    else:
        # Tagging mode needs an input file.  Without this check, a run with
        # neither --train nor --tag crashed inside open(None).
        if opts.tag is None:
            parser.error('one of --train or --tag is required')

        tagger = Tagger(opts.model)
        with open(opts.tag) as lines:
            for data, gold_labels in read_svm_format(lines):
                predicted_labels = tagger.tag(data)
                tokens = [item[0].attr for item in data]
                # list() keeps gloss() receiving a list on Python 3, where
                # zip() returns an iterator; on Python 2 it is just a copy.
                print(gloss(list(zip(tokens, predicted_labels, gold_labels))))

                # TODO: also output the predicted labels with their marginal
                # probabilities, e.g. via tagger.probability(predicted_labels)
                # and tagger.marginal(label, i) for each position i.
Beispiel #6
0
 def __str__(self):
     """Return `gloss` applied to this object's token/POS-tag pairs."""
     pairs = self.token_pos_tag_pairs()
     return gloss(pairs)