Ejemplo n.º 1
0
as a frequent sentence starter"""

# Include as potential collocations all word pairs where the first word ends
# in a period. It may be useful in corpora where there is a lot of variation
# that makes abbreviations like Mr difficult to identify.
trainer.INCLUDE_ALL_COLLOCS = False

# Include as potential collocations all word pairs where the first word is an
# abbreviation. Such collocations override the orthographic heuristic, but
# not the sentence starter heuristic. This is overridden by
# INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
# and ordinals are considered.
trainer.INCLUDE_ABBREV_COLLOCS = False

# Minimum bound on the number of times a bigram needs to appear before it can
# be considered a collocation, in addition to log likelihood statistics.
# This is useful when INCLUDE_ALL_COLLOCS is True.
trainer.MIN_COLLOC_FREQ = 1

# Feed every document to the trainer. Finalization is switched off per call
# (finalize=False) and performed once, explicitly, after the loop.
progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

# A parenthesized single-argument print is valid both as a Python 2 print
# statement and as a Python 3 function call, and emits the same text.
print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Persist the learned parameters with the highest available pickle protocol.
params = trainer.get_params()
with open('sentence_tokenizer_params.pkl', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
Ejemplo n.º 2
0
as a frequent sentence starter"""

# Include as potential collocations all word pairs where the first word ends
# in a period. It may be useful in corpora where there is a lot of variation
# that makes abbreviations like Mr difficult to identify.
trainer.INCLUDE_ALL_COLLOCS = False

# Include as potential collocations all word pairs where the first word is an
# abbreviation. Such collocations override the orthographic heuristic, but
# not the sentence starter heuristic. This is overridden by
# INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
# and ordinals are considered.
trainer.INCLUDE_ABBREV_COLLOCS = False

# Minimum bound on the number of times a bigram needs to appear before it can
# be considered a collocation, in addition to log likelihood statistics.
# This is useful when INCLUDE_ALL_COLLOCS is True.
trainer.MIN_COLLOC_FREQ = 1


# Read in training corpus.
# Alternative selections kept for reference (whole collection / first 10000):
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1}, no_cursor_timeout=True)

# Train trainer
# NOTE(review): cursor.count() is deprecated in newer PyMongo releases --
# confirm the installed version still provides it.
# NOTE(review): pbar is started here but never advanced or finished in the
# visible loop (and `i` is unused) -- presumably the snippet is truncated;
# verify against the full script.
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    # Take the first stored text version of the document.
    # next(iter(...values())) behaves the same on Python 2 and Python 3,
    # unlike the Python 2-only .itervalues().next().
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
Ejemplo n.º 3
0
from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

# Training text is read line by line via fileinput (STDIN, or files named on
# the command line); the pickled Punkt parameters are written to STDOUT.
if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# --- configuration ---
# Cut-off value whether a 'token' is an abbreviation.
trainer.ABBREV = 0.3
# Upper cut-off for Mikheev's (2002) abbreviation detection algorithm.
trainer.ABBREV_CUTOFF = 5
# Minimal log-likelihood value that two tokens need to be considered as a
# collocation.
trainer.COLLOCATION = 7.88
# When True, disables the abbreviation penalty heuristic, which exponentially
# disadvantages words that are found at times without a final period.
trainer.IGNORE_ABBREV_PENALTY = False
# Include as potential collocations all word pairs where the first word is an
# abbreviation - such collocations override the orthographic heuristic, but
# not the sentence starter heuristic.
trainer.INCLUDE_ABBREV_COLLOCS = True
# Include as potential collocations all word pairs where the first word ends
# in a period - it may be useful in corpora where there is a lot of variation
# that makes abbreviations like Mr difficult to identify.
trainer.INCLUDE_ALL_COLLOCS = False
# Minimum bound on the number of times a bigram needs to appear before it can
# be considered a collocation - useful when INCLUDE_*_COLLOCS are used.
trainer.MIN_COLLOC_FREQ = 3
# Minimal log-likelihood value that a token requires to be considered as a
# frequent sentence starter.
trainer.SENT_STARTER = 30

# Accumulate statistics one input line at a time. Finalization is deferred
# (finalize=False) so it runs once, in finalize_training() below, instead of
# being repeated after every single line.
for line in fileinput.input():
    trainer.train(line, finalize=False)

# trainer.freq_threshold()  # optional step, left disabled as in the original
trainer.finalize_training()
params = trainer.get_params()
# Pickle output is binary, so write to the underlying byte stream of STDOUT.
pickle.dump(params, sys.stdout.buffer)