def do_hmm(documents, split):
    """Train an HMM tagger on one part of `documents` and tag the rest.

    `bifurcate(documents, split)` supplies the (train, test) partition.
    For every test document the result of `nltk.pos_tag` on its `literal`
    is printed through `gloss`, and each token is filed in a local table
    keyed by `(predicted_label, gold_label)`.

    NOTE(review): as written, the table is neither returned nor reported,
    and precision/recall are not computed yet.
    """
    training_docs, held_out_docs = bifurcate(documents, split)

    # The trainer does NOT accept generators, so hand it a real list.
    labeled_sequences = [d.token_label_pairs() for d in training_docs]
    tagger = HiddenMarkovModelTagger.train(labeled_sequences)

    outcomes = defaultdict(list)
    for document in held_out_docs:
        tagged = tagger.tag(document.tokens)
        gold = document.labels

        pos_tagged = nltk.pos_tag(document.literal)
        print('\n-----------\n' + gloss(pos_tagged))

        for pair, gold_label in zip(tagged, gold):
            token, predicted_label = pair
            outcomes[(predicted_label, gold_label)].append(token)
def do_hmm(documents, split):
    """Train an HMM tagger on one part of `documents` and tag the rest.

    `bifurcate(documents, split)` supplies the (train, test) partition.
    For every test document the result of `nltk.pos_tag` on its `literal`
    is printed through `gloss`, and each token is filed in a local table
    keyed by `(predicted_label, gold_label)`.

    NOTE(review): as written, the table is neither returned nor reported,
    and precision/recall are not computed yet.
    """
    training_docs, held_out_docs = bifurcate(documents, split)

    # The trainer does NOT accept generators, so hand it a real list.
    labeled_sequences = [d.token_label_pairs() for d in training_docs]
    tagger = HiddenMarkovModelTagger.train(labeled_sequences)

    outcomes = defaultdict(list)
    for document in held_out_docs:
        tagged = tagger.tag(document.tokens)
        gold = document.labels

        pos_tagged = nltk.pos_tag(document.literal)
        print('\n-----------\n' + gloss(pos_tagged))

        for pair, gold_label in zip(tagged, gold):
            token, predicted_label = pair
            outcomes[(predicted_label, gold_label)].append(token)
def __str__(self):
    """Return `gloss` applied to the pairs from `token_pos_tag_pairs()`."""
    pairs = self.token_pos_tag_pairs()
    return gloss(pairs)
# Command-line driver: either train a CRF model from a file (--train) or tag
# a file with an existing model (--tag).  `parser` is created earlier in the
# file; it is assumed to be an argparse.ArgumentParser.
parser.add_argument('--train')
parser.add_argument('--tag')
parser.add_argument('--model', default='crf.model')
opts = parser.parse_args()

# Without either input file the tagging branch would call open(None) and
# fail with an opaque TypeError; report a proper usage error instead.
if not opts.train and not opts.tag:
    parser.error('one of --train or --tag is required')

# This demonstrates how to obtain the version string of CRFsuite.
print('CRFSuite v%s' % crfsuite.version())

if opts.train:
    # --train takes precedence when both options are supplied.
    trainer = Trainer()
    # Read training instances from the --train file and hand them to the
    # trainer one sequence at a time.
    with open(opts.train) as lines:
        for data, labels in read_svm_format(lines):
            trainer.append(data, labels, 0)
    trainer.save(opts.model)
else:
    tagger = Tagger(opts.model)
    with open(opts.tag) as lines:
        for data, gold_labels in read_svm_format(lines):
            predicted_labels = tagger.tag(data)
            # The first attribute of each item is used as the token text
            # for display.
            tokens = [item[0].attr for item in data]
            print(gloss(zip(tokens, predicted_labels, gold_labels)))
            # Not enabled yet: per-sequence and per-label probabilities.
            # total_probability = tagger.probability(predicted_labels)
            # marginals = [tagger.marginal(label, i) for i, label in enumerate(predicted_labels)]
            # Output the predicted labels with their marginal probabilities.
            # print '%s:%d-%f' % (y, )