Example #1
def get_nltk_sent_tokenizer(container, lang):

    assert lang in ["zh", "en"], "Unknown language."

    trainer = PunktTrainer()
    if isinstance(container, Container):
        article_paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/", ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        article_paths = container
    else:
        raise ValueError("Cannot parse container with class {}".\
         format(container.__class__))

    missing_count = 0
    for path in article_paths:
        try:
            article = get_article_as_lowercase_string(path)
            trainer.train(text=article, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(path))
            missing_count += 1
    print("{} articles not found.".format(missing_count))

    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer
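A minimal usage sketch for the helper above, assuming it is importable alongside NLTK's Punkt classes; the article paths and sample text below are placeholders, not files from the original project:

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# Placeholder paths; passing a plain list exercises the isinstance(container, list) branch.
article_paths = ["article_0001.en.txt", "article_0002.en.txt"]
tokenizer = get_nltk_sent_tokenizer(article_paths, lang="en")

# The helper lowercases its training data, so lowercase input matches best.
text = "dr. smith examined the patient. no acute distress was noted."
for sentence in tokenizer.tokenize(text):
    print(sentence)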
Example #2
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    punkt = PunktTrainer()
    # Set these to true to include collocations more leniently, then increase MIN_COLLOC_FREQ to restrict again
    # punkt.INCLUDE_ALL_COLLOCS = False
    # punkt.INCLUDE_ABBREV_COLLOCS = False
    # punkt.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically never have actual sentence boundaries.
    for fin in input:
        click.echo('Training on %s' % fin.name)
        sentences = fin.read()  #.replace('.\n', '. \n\n')
        punkt.train(sentences, finalize=False, verbose=True)
    punkt.finalize_training(verbose=True)
    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        punkt._params.abbrev_types.update(abbreviations)
    if colloc:
        collocations = [
            tuple(l.split('. ', 1)) for l in colloc.read().strip().split('\n')
        ]
        click.echo('Manually adding collocs: %s' % collocations)
        punkt._params.collocations.update(collocations)
    model = PunktSentenceTokenizer(punkt.get_params())
    pickle.dump(model, output, protocol=pickle.HIGHEST_PROTOCOL)
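Since the command above pickles a complete PunktSentenceTokenizer, using the trained model later is just a matter of unpickling it; the file name in this sketch is an assumption standing in for whatever was passed as output:

import pickle

# "chem_punkt.pickle" is a hypothetical name for the file written by the command above.
with open("chem_punkt.pickle", "rb") as f:
    tokenizer = pickle.load(f)

print(tokenizer.tokenize("The mixture was stirred at 50 degC for 2 h. Yield: 85%."))
# Manually added abbreviations end up in the tokenizer's Punkt parameters.
print(sorted(tokenizer._params.abbrev_types)[:10])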
Example #3
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    k = 0
    skipped_ = 0
    custom_ = 0

    punkt = PunktTrainer()
    input_ = codecs.open(trainfile, encoding='utf-8')

    for sentence in input_:
        k += 1
        if k % 100 == 0:
            print('trained from sentences: ' + str(k))
        try:
            punkt.train(sentence, finalize=False, verbose=False)
        except Exception:
            skipped_ += 1

    input_.close()

    if abbreviationfile != '':
        abbreviations_ = codecs.open(abbreviationfile, encoding='utf-8')
        for abbr in abbreviations_:
            try:
                # Strip the trailing newline so the abbreviation sits directly
                # in front of the period that Punkt should learn from.
                punkt.train('Start ' + abbr.strip() + '. End.', finalize=False, verbose=False)
                custom_ += 1
            except Exception:
                pass
        abbreviations_.close()

    punkt.finalize_training(verbose=False)

    model = PunktSentenceTokenizer(punkt.get_params())
    # Pickle the tokenizer as bytes; a plain binary file handle is sufficient here.
    model_output = open(modelfile, mode='wb')
    pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)
    model_output.close()

    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
Example #4
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])

    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a

    if lang is None:
        print >> sys.stderr, "Must pass -l language on the command line!"
        sys.exit(1)
    if lang == 'en':
        print >> sys.stderr, "Don't train for -l en!  We are using the pre-trained punkt tokenizer from NLTK."
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)

    tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=lang_vars)
    pickle.dump(tokenizer, open('LingwoNLP/punkt-'+lang+'.pickle','wt'))
Example #5
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])

    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a

    if lang is None:
        print >> sys.stderr, "Must pass -l language on the command line!"
        sys.exit(1)
    if lang == 'en':
        print >> sys.stderr, "Don't train for -l en!  We are using the pre-trained punkt tokenizer from NLTK."
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)

    tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                       lang_vars=lang_vars)
    pickle.dump(tokenizer, open('LingwoNLP/punkt-' + lang + '.pickle', 'wt'))
Example #6
def train_punktsent(trainfile, modelfile):
  """ 
  Trains an unsupervised NLTK punkt SENTENCE tokenizer. 
  *trainfile* is the filename for the input file. s
  *modelfile* is the filename for the model output file.
  """
  punkt = PunktTrainer()
  try:
    with codecs.open(trainfile, 'r','utf8') as fin:
      punkt.train(fin.read(), finalize=False, verbose=False)
  except KeyboardInterrupt:
    print 'KeyboardInterrupt: Stopping the reading of the dump early!'
  ##HACK: Adds abbreviations from rb_tokenizer.
  abbrv_sent = " ".join([i.strip() for i in \
                         codecs.open('abbrev.lex','r','utf8').readlines()])
  abbrv_sent = "Start"+abbrv_sent+"End."
  punkt.train(abbrv_sent,finalize=False, verbose=False)
  # Finalize and outputs trained model.
  punkt.finalize_training(verbose=True)
  model = PunktSentenceTokenizer(punkt.get_params())
  with open(modelfile, mode='wb') as fout:
    pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
  return model
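A quick usage sketch for train_punktsent; both file names are placeholders, and abbrev.lex must exist in the working directory because of the hack above:

model = train_punktsent('train_corpus.txt', 'punkt_model.pickle')
print(model.tokenize(u'Dr. Watson arrived at 10 a.m. He left an hour later.'))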
Example #7
def train_punktsent(trainfile, modelfile):
    """ 
  Trains an unsupervised NLTK punkt SENTENCE tokenizer. 
  *trainfile* is the filename for the input file. s
  *modelfile* is the filename for the model output file.
  """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print 'KeyboardInterrupt: Stopping the reading of the dump early!'
    ##HACK: Adds abbreviations from rb_tokenizer.
    abbrv_sent = " ".join([i.strip() for i in \
                           codecs.open('abbrev.lex','r','utf8').readlines()])
    abbrv_sent = "Start" + abbrv_sent + "End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)
    # Finalize and outputs trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
Example #8
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional
    extra abbreviations to include.
    :param text:
    :param extra_abbrevs:
    :return:
    """

    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)

    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()

    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())

    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
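A small usage sketch for build_sentence_model; the text and extra abbreviations below are invented for illustration:

sample_text = (
    "The agreement was signed by Acme Corp. on 3 Jan. 2020. "
    "It takes effect immediately. See Sec. 4 for termination terms."
)
tokenizer = build_sentence_model(sample_text, extra_abbrevs={"Corp.", "Sec.", "Jan."})
for sentence in tokenizer.tokenize(sample_text):
    print(sentence)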
Example #9
# Read in training corpus
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1}, no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = line['text_versions'].itervalues().next()
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print "Finalizing training..."
trainer.finalize_training(verbose=True)
print "Training done."

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters

with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print "Params: %s" % repr(params)

# Create tokenizer
tokenizer = PunktSentenceTokenizer(params)

# Dump pickled tokenizer
Example #10
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""
""""""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

print "Finalizing training..."
trainer.finalize_training(verbose=True)
print "Training done."

params = trainer.get_params()
with open('sentence_tokenizer_params.pkl', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print "Params: %s" % repr(params)

# set custom parameters
# extra_collocations = {(u'sec', u'##number##')}
# extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

# add in custom collocations etc
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters
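If the commented-out custom parameters above were enabled, merging them and building the tokenizer could look like the following sketch (the extra sets simply mirror the commented examples and are illustrative; PunktSentenceTokenizer is assumed to be imported as in the earlier examples):

extra_collocations = {(u'sec', u'##number##')}
extra_sentence_starters = {u'(##number##)'}

# Union the hand-picked entries into the trained parameters, then rebuild.
params.collocations = params.collocations | extra_collocations
params.sent_starters = params.sent_starters | extra_sentence_starters

tokenizer = PunktSentenceTokenizer(params)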
Example #11
import fileinput
import pickle
import sys

from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()
# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    #print(line)

#trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
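Because this script writes the PunktParameters (not a tokenizer) to stdout, rebuilding a usable tokenizer from the saved MODEL file is a separate step; 'punkt.model' below stands in for whatever the MODEL redirect pointed at:

import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Load the pickled parameters and wrap them in a tokenizer.
with open('punkt.model', 'rb') as f:
    params = pickle.load(f)

tokenizer = PunktSentenceTokenizer(params)
print(tokenizer.tokenize('Fig. 2 shows the effect. It is significant.'))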