def get_nltk_sent_tokenizer(container, lang):
    """Train a Punkt sentence tokenizer on every article in *container*."""
    assert lang in ["zh", "en"], "Unknown language."

    trainer = PunktTrainer()
    if isinstance(container, Container):
        article_paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/",
            ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        article_paths = container
    else:
        raise ValueError("Cannot parse container with class {}".format(
            container.__class__))

    # Feed articles to the trainer one at a time; finalize only once all
    # training text has been seen.
    missing_count = 0
    for path in article_paths:
        try:
            article = get_article_as_lowercase_string(path)
            trainer.train(text=article, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(path))
            missing_count += 1
    print("{} articles not found.".format(missing_count))

    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer
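# Usage sketch (hypothetical paths; assumes `Container` and
# `get_article_as_lowercase_string` from the surrounding module):
#
#   tokenizer = get_nltk_sent_tokenizer(["articles/1.en", "articles/2.en"], "en")
#   tokenizer.tokenize("Dr. Smith arrived. He sat down.")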
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    punkt = PunktTrainer()
    # Set these to True to include collocations more leniently, then increase
    # MIN_COLLOC_FREQ to restrict again:
    # punkt.INCLUDE_ALL_COLLOCS = True
    # punkt.INCLUDE_ABBREV_COLLOCS = True
    # punkt.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically
    # never have actual sentence boundaries.
    for fin in input:
        click.echo('Training on %s' % fin.name)
        sentences = fin.read()  # .replace('.\n', '. \n\n')
        punkt.train(sentences, finalize=False, verbose=True)
    punkt.finalize_training(verbose=True)
    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        punkt._params.abbrev_types.update(abbreviations)
    if colloc:
        collocations = [tuple(l.split('. ', 1))
                        for l in colloc.read().strip().split('\n')]
        click.echo('Manually adding collocs: %s' % collocations)
        punkt._params.collocations.update(collocations)
    model = PunktSentenceTokenizer(punkt.get_params())
    pickle.dump(model, output, protocol=pickle.HIGHEST_PROTOCOL)
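# The click decorators are not shown in this excerpt; the command is
# presumably registered along these lines (option names and types are
# assumptions, not the original wiring):
#
#   @click.command()
#   @click.argument('input', type=click.File('r', encoding='utf8'), nargs=-1)
#   @click.option('--output', '-o', type=click.File('wb'))
#   @click.option('--abbr', type=click.File('r', encoding='utf8'))
#   @click.option('--colloc', type=click.File('r', encoding='utf8'))
#   @click.pass_context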
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    k = 0
    skipped_ = 0
    custom_ = 0
    punkt = PunktTrainer()

    # Train incrementally, one sentence per line.
    input_ = codecs.open(trainfile, encoding='utf-8')
    for sentence in input_:
        k += 1
        if k % 100 == 0:
            print('trained from sentences: ' + str(k))
        try:
            punkt.train(sentence, finalize=False, verbose=False)
        except Exception:
            skipped_ += 1
    input_.close()

    # Optionally teach the trainer custom abbreviations by embedding each one
    # in a minimal carrier sentence.
    if abbreviationfile != '':
        abbreviations_ = codecs.open(abbreviationfile, encoding='utf-8')
        for abbr in abbreviations_:
            try:
                punkt.train('Start ' + abbr.strip() + '. End.',
                            finalize=False, verbose=False)
                custom_ += 1
            except Exception:
                pass
        abbreviations_.close()

    punkt.finalize_training(verbose=False)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as model_output:
        pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)
    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
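# Usage sketch (hypothetical file names):
#
#   train_tokenizer('sentences.txt', 'abbreviations.txt', 'punkt.pickle')
#   with open('punkt.pickle', 'rb') as f:
#       tokenizer = pickle.load(f)
#   tokenizer.tokenize('See fig. 3 for details. The results follow.')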
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])
    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a
    if lang is None:
        print("Must pass -l language on the command line!", file=sys.stderr)
        sys.exit(1)
    if lang == 'en':
        print("Don't train for -l en! We are using the pre-trained punkt "
              "tokenizer from NLTK.", file=sys.stderr)
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=lang_vars)
    # Pickle to a binary file; the original 'wt' text mode would corrupt the pickle.
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as f:
        pickle.dump(tokenizer, f)
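# MyPunktLanguageVars is referenced but not defined in this excerpt. A minimal
# sketch of what such a subclass usually looks like (the extra end-of-sentence
# character here is an assumption, not the original definition):
#
#   from nltk.tokenize.punkt import PunktLanguageVars
#
#   class MyPunktLanguageVars(PunktLanguageVars):
#       sent_end_chars = ('.', '?', '!', '\u3002')  # e.g. add the CJK full stop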
def train_punktsent(trainfile, modelfile):
    """Trains an unsupervised NLTK punkt SENTENCE tokenizer.

    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')

    # HACK: Adds abbreviations from rb_tokenizer by joining them into a
    # single space-separated carrier sentence.
    abbrv_sent = " ".join([i.strip() for i in
                           codecs.open('abbrev.lex', 'r', 'utf8').readlines()])
    abbrv_sent = "Start " + abbrv_sent + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)

    # Finalize and output the trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
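# Judging by the join above, abbrev.lex is expected to hold one abbreviation
# per line, e.g. (hypothetical contents):
#
#   Dr.
#   e.g.
#   Fig.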
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional extra abbreviations to include.
    :param text: raw text to train the Punkt parameters on
    :param extra_abbrevs: optional iterable of abbreviations to force into the model
    :return: a PunktSentenceTokenizer built from the trained parameters
    """
    # Set up Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)

    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()

    # Add any extras if passed; Punkt stores abbreviations lowercased and
    # without the trailing period.
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())

    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
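# Usage sketch (corpus_text is a hypothetical training string):
#
#   tokenizer = build_sentence_model(corpus_text, extra_abbrevs=["Fig.", "No."])
#   tokenizer.tokenize("As shown in Fig. 2, the effect is small. It persists.")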
# Read in training corpus
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}},
                    {"text_versions": 1}, no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters
with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# Create tokenizer
tokenizer = PunktSentenceTokenizer(params)

# Dump pickled tokenizer
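# The excerpt ends at the dump step; a plausible completion (file name
# assumed):
#
#   with open('sentence_tokenizer.pickle', 'wb') as f:
#       pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)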
trainer.INCLUDE_ABBREV_COLLOCS = True
"""this includes as potential collocations all word pairs where the first
word is an abbreviation - such collocations override the orthographic
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to appear
before it can be considered a collocation, in addition to log likelihood
statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

params = trainer.get_params()
with open('sentence_tokenizer_params.pkl', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# set custom parameters
# extra_collocations = {(u'sec', u'##number##')}
# extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

# add in custom collocations etc
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters
import fileinput
import pickle
import sys
from os.path import basename

from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation; such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # includes as potential collocations all word pairs where the first word ends in a period; may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation; useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    # print(line)
    # trainer.freq_threshold()

trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
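# Usage sketch: the script reads text on stdin and writes pickled
# PunktParameters to stdout, so a consumer might do (script and file names
# are hypothetical):
#
#   python train_punkt_params.py < corpus.txt > punkt.params
#
#   import pickle
#   from nltk.tokenize.punkt import PunktSentenceTokenizer
#   with open('punkt.params', 'rb') as f:
#       tokenizer = PunktSentenceTokenizer(pickle.load(f))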