def postag( templates=None, tagged_data=None, num_sents=1000, max_rules=300, min_score=3, min_acc=None, train=0.8, trace=3, randomize=False, ruleformat="str", incremental_stats=False, template_stats=False, error_output=None, serialize_output=None, learning_curve_output=None, learning_curve_take=300, baseline_backoff_tagger=None, separate_baseline_data=False, cache_baseline_tagger=None): """ Brill Tagger Demonstration :param templates: how many sentences of training and testing data to use :type templates: list of Template :param tagged_data: maximum number of rule instances to create :type tagged_data: C{int} :param num_sents: how many sentences of training and testing data to use :type num_sents: C{int} :param max_rules: maximum number of rule instances to create :type max_rules: C{int} :param min_score: the minimum score for a rule in order for it to be considered :type min_score: C{int} :param min_acc: the minimum score for a rule in order for it to be considered :type min_acc: C{float} :param train: the fraction of the the corpus to be used for training (1=all) :type train: C{float} :param trace: the level of diagnostic tracing output to produce (0-4) :type trace: C{int} :param randomize: whether the training data should be a random subset of the corpus :type randomize: C{bool} :param ruleformat: rule output format, one of "str", "repr", "verbose" :type ruleformat: C{str} :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow) :type incremental_stats: C{bool} :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing :type template_stats: C{bool} :param error_output: the file where errors will be saved :type error_output: C{string} :param serialize_output: the file where the learned tbl tagger will be saved :type serialize_output: C{string} :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available) :type learning_curve_output: C{string} :param learning_curve_take: how many rules plotted :type learning_curve_take: C{int} :param baseline_backoff_tagger: the file where rules will be saved :type baseline_backoff_tagger: tagger :param separate_baseline_data: use a fraction of the training data exclusively for training baseline :type separate_baseline_data: C{bool} :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get deterministic output from the baseline unigram tagger between python versions) :type cache_baseline_tagger: C{string} Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data. Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high). """ # defaults baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER if templates is None: from nltk.tag.brill import describe_template_sets, brill24 # some pre-built template sets taken from typical systems or publications are # available. Print a list with describe_template_sets() # for instance: templates = brill24() (training_data, baseline_data, gold_data, testing_data) = \ _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data) # creating (or reloading from cache) a baseline tagger (unigram tagger) # this is just a mechanism for getting deterministic output from the baseline between # python versions if cache_baseline_tagger: if not os.path.exists(cache_baseline_tagger): baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) with open(cache_baseline_tagger, 'w') as print_rules: pickle.dump(baseline_tagger, print_rules) print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger)) with open(cache_baseline_tagger, "r") as print_rules: baseline_tagger= pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger)) else: baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) print("Trained baseline tagger") if gold_data: print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data))) # creating a Brill tagger tbrill = time.time() trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat) print("Training tbl tagger...") brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill)) if gold_data: print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) # printing the learned rules, if learned silently if trace == 1: print("\nLearned rules: ") for (ruleno, rule) in enumerate(brill_tagger.rules(),1): print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat))) # printing template statistics (optionally including comparison with the training data) # note: if not separate_baseline_data, then baseline accuracy will be artificially high if incremental_stats: print("Incrementally tagging the test data, collecting individual rule statistics") (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data) print(" Rule statistics collected") if not separate_baseline_data: print("WARNING: train_stats asked for separate_baseline_data=True; the baseline " "will be artificially high") trainstats = brill_tagger.train_stats() if template_stats: brill_tagger.print_template_statistics(teststats) if learning_curve_output: _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take) print("Wrote plot of learning curve to {0}".format(learning_curve_output)) else: print("Tagging the test data") taggedtest = brill_tagger.batch_tag(testing_data) if template_stats: brill_tagger.print_template_statistics() # writing error analysis to file if error_output is not None: with open(error_output, 'w') as f: f.write('Errors for Brill Tagger %r\n\n' % serialize_output) for e in error_list(gold_data, taggedtest): f.write(e+'\n') print("Wrote tagger errors including context to {0}".format(error_output)) # serializing the tagger to a pickle file and reloading (just to see it works) if serialize_output is not None: taggedtest = brill_tagger.batch_tag(testing_data) with open(serialize_output, 'w') as print_rules: pickle.dump(brill_tagger, print_rules) print("Wrote pickled tagger to {0}".format(serialize_output)) with open(serialize_output, "r") as print_rules: brill_tagger_reloaded = pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(serialize_output)) taggedtest_reloaded = brill_tagger.batch_tag(testing_data) if taggedtest == taggedtest_reloaded: print("Reloaded tagger tried on test set, results identical") else: print("PROBLEM: Reloaded tagger gave different results on test set")
def postag( templates=None, tagged_data=None, num_sents=1000, max_rules=300, min_score=3, min_acc=None, train=0.8, trace=3, randomize=False, ruleformat="str", incremental_stats=False, template_stats=False, error_output=None, serialize_output=None, learning_curve_output=None, learning_curve_take=300, baseline_backoff_tagger=None, separate_baseline_data=False, cache_baseline_tagger=None): """ Brill Tagger Demonstration :param templates: how many sentences of training and testing data to use :type templates: list of Template :param tagged_data: maximum number of rule instances to create :type tagged_data: C{int} :param num_sents: how many sentences of training and testing data to use :type num_sents: C{int} :param max_rules: maximum number of rule instances to create :type max_rules: C{int} :param min_score: the minimum score for a rule in order for it to be considered :type min_score: C{int} :param min_acc: the minimum score for a rule in order for it to be considered :type min_acc: C{float} :param train: the fraction of the the corpus to be used for training (1=all) :type train: C{float} :param trace: the level of diagnostic tracing output to produce (0-4) :type trace: C{int} :param randomize: whether the training data should be a random subset of the corpus :type randomize: C{bool} :param ruleformat: rule output format, one of "str", "repr", "verbose" :type ruleformat: C{str} :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow) :type incremental_stats: C{bool} :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing :type template_stats: C{bool} :param error_output: the file where errors will be saved :type error_output: C{string} :param serialize_output: the file where the learned tbl tagger will be saved :type serialize_output: C{string} :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available) :type learning_curve_output: C{string} :param learning_curve_take: how many rules plotted :type learning_curve_take: C{int} :param baseline_backoff_tagger: the file where rules will be saved :type baseline_backoff_tagger: tagger :param separate_baseline_data: use a fraction of the training data exclusively for training baseline :type separate_baseline_data: C{bool} :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get deterministic output from the baseline unigram tagger between python versions) :type cache_baseline_tagger: C{string} Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data. Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high). """ # defaults baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER if templates is None: from nltk.tag.brill import describe_template_sets, brill24 # some pre-built template sets taken from typical systems or publications are # available. Print a list with describe_template_sets() # for instance: templates = brill24() (training_data, baseline_data, gold_data, testing_data) = \ _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data) # creating (or reloading from cache) a baseline tagger (unigram tagger) # this is just a mechanism for getting deterministic output from the baseline between # python versions if cache_baseline_tagger: if not os.path.exists(cache_baseline_tagger): baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) with open(cache_baseline_tagger, 'w') as print_rules: pickle.dump(baseline_tagger, print_rules) print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger)) with open(cache_baseline_tagger, "r") as print_rules: baseline_tagger= pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger)) else: baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) print("Trained baseline tagger") if gold_data: print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data))) # creating a Brill tagger tbrill = time.time() trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat) print("Training tbl tagger...") brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill)) if gold_data: print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) # printing the learned rules, if learned silently if trace == 1: print("\nLearned rules: ") for (ruleno, rule) in enumerate(brill_tagger.rules(),1): print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat))) # printing template statistics (optionally including comparison with the training data) # note: if not separate_baseline_data, then baseline accuracy will be artificially high if incremental_stats: print("Incrementally tagging the test data, collecting individual rule statistics") (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data) print(" Rule statistics collected") if not separate_baseline_data: print("WARNING: train_stats asked for separate_baseline_data=True; the baseline " "will be artificially high") trainstats = brill_tagger.train_stats() if template_stats: brill_tagger.print_template_statistics(teststats) if learning_curve_output: _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take) print("Wrote plot of learning curve to {0}".format(learning_curve_output)) else: print("Tagging the test data") taggedtest = brill_tagger.tag_sents(testing_data) if template_stats: brill_tagger.print_template_statistics() # writing error analysis to file if error_output is not None: with open(error_output, 'w') as f: f.write('Errors for Brill Tagger %r\n\n' % serialize_output) f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n') print("Wrote tagger errors including context to {0}".format(error_output)) # serializing the tagger to a pickle file and reloading (just to see it works) if serialize_output is not None: taggedtest = brill_tagger.tag_sents(testing_data) with open(serialize_output, 'w') as print_rules: pickle.dump(brill_tagger, print_rules) print("Wrote pickled tagger to {0}".format(serialize_output)) with open(serialize_output, "r") as print_rules: brill_tagger_reloaded = pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(serialize_output)) taggedtest_reloaded = brill_tagger.tag_sents(testing_data) if taggedtest == taggedtest_reloaded: print("Reloaded tagger tried on test set, results identical") else: print("PROBLEM: Reloaded tagger gave different results on test set")