def train_sentence_tokenizer(self: object, text: str):
    """Train sentence tokenizer."""
    language_punkt_vars = PunktLanguageVars
    # Set punctuation
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = self.punctuation + self.strict_punctuation
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    # Set abbreviations
    trainer = PunktTrainer(text, language_punkt_vars)
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    if self.abbreviations:
        for abbreviation in self.abbreviations:
            tokenizer._params.abbrev_types.add(abbreviation)
    return tokenizer
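A minimal usage sketch for a training method like the one above; the owning object (`tokenizer_trainer`), the corpus path, and the sample text are illustrative assumptions, not taken from the original code:

# Hypothetical caller of train_sentence_tokenizer(); `tokenizer_trainer` stands in
# for whatever object defines punctuation, strict, strict_punctuation and abbreviations.
with open('training_corpus.txt', encoding='utf-8') as corpus_file:  # assumed path
    training_text = corpus_file.read()

sentence_tokenizer = tokenizer_trainer.train_sentence_tokenizer(training_text)
for sentence in sentence_tokenizer.tokenize('Cf. the first example. Here is a second sentence.'):
    print(sentence)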
import pickle

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer


def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars
        # language_punkt_vars.sent_end_chars=tuple(args.end_chars)
        print("# Training sent tokenizer")
        trainer = PunktTrainer(contents, language_punkt_vars)
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        params = trainer.get_params()
        tokenizer = PunktSentenceTokenizer(params)
        # Register domain-specific abbreviations so they are not treated as sentence ends.
        for abbreviation in ('brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec', 'mt',
                             'asst', 'mr', 'c/insp', 'sta', 'sto'):
            tokenizer._params.abbrev_types.add(abbreviation)
        pickle.dump(tokenizer, sent_tokenizer)
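A hedged sketch of how the tokenizer pickled by `train()` above could be loaded and applied later; the model path and the sample text are assumptions:

import pickle

# Hypothetical consumer of the file written by train(); 'sent_tokenizer.pickle'
# is an assumed target path, not one named in the original script.
with open('sent_tokenizer.pickle', 'rb') as model_file:
    sentence_tokenizer = pickle.load(model_file)

sample = "Brgy. officials met with Sen. Cruz. The meeting ended at noon."
for sentence in sentence_tokenizer.tokenize(sample):
    print(sentence)
# Because 'brgy' and 'sen' were added as abbreviations, the periods after them
# should not be treated as sentence boundaries.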
def constructor():
    # Note: `self` is not a parameter here; this helper only works when nested
    # inside a method (or otherwise closing over an object named `self`).
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train_tokens(self.words())
    params = trainer.get_params()
    return PunktSentenceTokenizer(params)
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt. At the moment it performs worse
    than the plain English one (most likely because there is not that much data).
    """
    collection = database["crawled-data"]
    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" trainer.COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" trainer.SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" trainer.INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" trainer.INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" trainer.MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" # Read in training corpus # cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" trainer.COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" trainer.SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" trainer.INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" trainer.INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" trainer.MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" progress = ProgressBar() for doc in progress(docs): trainer.train(doc, finalize=False, verbose=False)
import fileinput
import pickle
import sys
from os.path import basename

from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_BACKOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    # print(line)

# trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
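Because the script above pickles the Punkt parameters rather than a tokenizer object, here is a hedged sketch of how the resulting MODEL file could be consumed later; the file name and sample text are assumptions:

import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Hypothetical consumer of the MODEL output: load the pickled parameters and
# wrap them in a tokenizer. 'model.pickle' is an assumed file name.
with open('model.pickle', 'rb') as model_file:
    params = pickle.load(model_file)

tokenizer = PunktSentenceTokenizer(params)
print(tokenizer.tokenize('Approx. half of the samples failed. The rest passed.'))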
from extract_features import parse_tess
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')
tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
print('Xenophon tokens: ' + str(len(tokenizer.tokenize(text))))
print()

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train(text, verbose=True)

new_tokenizer = PunktSentenceTokenizer(trainer.get_params())
print('tokenizers equal? ' + str(tokenizer == new_tokenizer))
print('tokenization equal? ' + str(tokenizer.tokenize(text) == new_tokenizer.tokenize(text)))

old_tok_out = open('feature_data/old_tok.txt', mode='w')
old_tok_out.write('\n'.join(tokenizer.tokenize(text)))
new_tok_out = open('feature_data/new_tok.txt', mode='w')
new_tok_out.write('\n'.join(new_tokenizer.tokenize(text)))

'''
There seem to be very few abbreviations in the tesserae corpus. This means training the
PunktSentenceTokenizer might not yield any improvement. From the paper abstract:
"[Punkt sentence tokenization training] is based on the assumption that a large number of
ambiguities in the determination of sentence boundaries can be eliminated once abbreviations
have been identified."