    return score


if __name__ == "__main__":
    MODE = sys.argv[1]

    if MODE == "train":
        n = int(sys.argv[2])

        print("reading...")
        text = get_text(n)

        print("training...")
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.ABBREV = 0.3
        trainer.train(text, verbose=True)
        del text

        print("building tokenizer...")
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        abbrevs = tokenizer._params.abbrev_types
        print(sorted(abbrevs))
        print("%i abbreviations" % len(abbrevs))

        target_abbrevs = [
            "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms",
            "seq", "o.r.s"
        ]
        for target in target_abbrevs:
            print(target, target in abbrevs, score(trainer, target))
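
        # A quick sanity check (not in the original snippet): run the freshly
        # built tokenizer on a made-up sample to see the learned abbreviations
        # at work.
        sample = "See sec. 12, i.e. the clause added by Dr. Smith. It passed."
        for sent in tokenizer.tokenize(sample):
            print(repr(sent))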
Example #2

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from pymongo import MongoClient
from progressbar import ProgressBar


client = MongoClient()
db = client.legislation
bills = db.bills

trainer = PunktTrainer()

# set custom parameters
extra_collocations = {(u'sec', u'##number##')}
extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

trainer.ABBREV = 0.3
"""cut-off value whether a 'token' is an abbreviation"""

trainer.IGNORE_ABBREV_PENALTY = False
"""allows the disabling of the abbreviation penalty heuristic, which
exponentially disadvantages words that are found at times without a
final period."""

trainer.ABBREV_BACKOFF = 5
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""

trainer.COLLOCATION = 7.88
"""minimal log-likelihood value that two tokens need to be considered
as a collocation"""

trainer.SENT_STARTER = 30
"""minimal log-likelihood value that a token requires to be considered
as a frequent sentence starter"""
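
# A hedged sketch (not in the original snippet, which is truncated here):
# stream the bill texts out of MongoDB into the trainer, then fold the custom
# sets defined above into the learned parameters before building a tokenizer.
# The 'text' field name is a guess.
for bill in bills.find():
    trainer.train(bill['text'], finalize=False)
trainer.finalize_training()

params = trainer.get_params()
params.collocations |= extra_collocations
params.sent_starters |= extra_sentence_starters
tokenizer = PunktSentenceTokenizer(params)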
Example #3
import fileinput
import pickle
import sys

from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()
# configuration
trainer.ABBREV = 0.3  # cut-off value for deciding whether a ‘token’ is an abbreviation
trainer.ABBREV_BACKOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # if True, disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line, finalize=False)
    #print(line)

#trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
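
# The usage line ("{} < TEXT > MODEL") suggests the trained parameters are
# pickled to stdout; a plausible completion of this truncated snippet:
pickle.dump(params, sys.stdout.buffer)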