import codecs
import pickle

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer


def train_punktsent(trainfile, modelfile):
    """
    Trains an unsupervised NLTK punkt SENTENCE tokenizer.
    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    ## HACK: Adds abbreviations from rb_tokenizer.
    abbrv_sent = " ".join([i.strip() for i in
                           codecs.open('abbrev.lex', 'r', 'utf8').readlines()])
    abbrv_sent = "Start " + abbrv_sent + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)
    # Finalize and output the trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
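# Hedged usage sketch (not part of the original function): 'corpus.txt' and
# 'punkt_sent.pickle' are hypothetical file names. It only illustrates that the
# pickled object above is the tokenizer itself, so it can be loaded and used directly.
if __name__ == '__main__':
    train_punktsent('corpus.txt', 'punkt_sent.pickle')
    with open('punkt_sent.pickle', 'rb') as fin:
        tokenizer = pickle.load(fin)
    print(tokenizer.tokenize('Dr. Smith arrived at 5 p.m. He left soon after.'))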
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer


def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional extra abbreviations to include.
    :param text: training text for the Punkt trainer
    :param extra_abbrevs: optional iterable of abbreviations to add to the model
    :return: a PunktSentenceTokenizer built from the trained parameters
    """
    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)

    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()

    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())

    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
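# Hedged usage example, not part of the original module: the corpus string and
# the extra abbreviations below are illustrative placeholders only.
sample_text = "The court cited 12 U.S.C. Sec. 1832 in its ruling. The motion was denied."
sample_tokenizer = build_sentence_model(sample_text, extra_abbrevs=["U.S.C.", "Sec."])
print(sample_tokenizer.tokenize("See 12 U.S.C. Sec. 1832 for details. The rule still applies."))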
'''Does the same thing as split_sent.py, but expects the file to be uncompressed
and that the columns are src_url, tgt_url, src_line, tgt_line, adq_score, dom'''
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import nltk.data, sys, gzip

train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()
    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)
    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)
    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en = PunktSentenceTokenizer()
    #tokenizer_de = PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
text = "" from nltk.corpus import gutenberg for file_id in gutenberg.fileids(): text += gutenberg.raw(file_id) print len(text) soup = BeautifulSoup(open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(), 'html.parser') from pprint import pprint from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer trainer = PunktTrainer() trainer.INCLUDE_ALL_COLLOCS = True trainer.train(text) tokenizer = PunktSentenceTokenizer(trainer.get_params()) sentences = soup.get_text(' ') sentence_list= tokenizer.tokenize(sentences) from pymongo import MongoClient client = MongoClient('mongodb://localhost:27017/') db=client['nlp'] coll=db['Keywords_list']
trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to appear
before it can be considered a collocation, in addition to log likelihood
statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

# Read in training corpus
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1},
                    no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters

with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))
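# Hedged follow-up sketch (not in the original script): it only illustrates how
# the parameters pickled above could be turned back into a tokenizer later;
# PunktSentenceTokenizer accepts a PunktParameters object in place of training text.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('sentence_tokenizer_params.pickle', 'rb') as f:
    loaded_params = pickle.load(f)
bill_tokenizer = PunktSentenceTokenizer(loaded_params)
print(bill_tokenizer.tokenize("The bill amends Sec. 4 of the Act. It passed the House."))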
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (';', '.', '!', '?',)

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True

with open('./corpus.txt', 'r') as fs:
    text = fs.read()

trainer.train(text, verbose=True)
params = trainer.get_params()

with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)
trainer.INCLUDE_ABBREV_COLLOCS = False
"""this includes as potential collocations all word pairs where the first word
is an abbreviation. Such collocations override the orthographic heuristic, but
not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS,
and if both are false, only collocations with initials and ordinals are
considered."""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to appear
before it can be considered a collocation, in addition to log likelihood
statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

params = trainer.get_params()
with open('sentence_tokenizer_params.pkl', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# set custom parameters (see the hedged sketch below for one way to merge these in)
# extra_collocations = {(u'sec', u'##number##')}
# extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}
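# Hedged sketch, not part of the original script: one way the commented-out
# "custom parameters" above could be folded into the trained parameters; in
# practice this would go before the pickle.dump call so the extras are saved.
# Note that Punkt stores abbreviation types lowercased and without the trailing
# period, so 'U.S.C' is added as 'u.s.c'.
extra_collocations = {('sec', '##number##')}
extra_sentence_starters = {'(##number##)'}
extra_abbreviations = {'u.s.c', 'usc'}

params.collocations |= extra_collocations
params.sent_starters |= extra_sentence_starters
params.abbrev_types |= extra_abbreviations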
# coding: utf-8
import codecs
import sys

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

training = ''.join(
    codecs.open('IT-TrainingCorpus.txt', 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(sys.argv[1], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
clean = [s for s in sentences if s.find('<strong>') != -1]

codecs.open('clean-gold', 'wb', 'utf-8').writelines([s + '\n' for s in clean])
import fileinput
import pickle
import sys
from os.path import basename

from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    #print(line)
    #trainer.freq_threshold()

trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
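# Hedged companion sketch, not part of the original script: assuming it was
# saved as the hypothetical train_punkt.py and run as
#   python3 train_punkt.py < corpus.txt > punkt.params
# the parameters dumped to stdout above could be reloaded like this.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('punkt.params', 'rb') as model_file:
    loaded_tokenizer = PunktSentenceTokenizer(pickle.load(model_file))
print(loaded_tokenizer.tokenize('Prof. Leitner trained the model. It splits sentences.'))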
temperature = default_temperature
top_k = default_top_k
top_p = default_top_p
min_length = default_min_length
do_sample = default_do_sample
num_return_sequences = default_num_return_sequences
num_beams = default_num_beams
no_repeat_ngram_size = default_no_repeat_ngram_size
early_stopping = default_early_stopping

sample_sentences = ""
for file_id in gutenberg.fileids():
    sample_sentences += gutenberg.raw(file_id)

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(sample_sentences)

sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())
sentence_tokenizer._params.abbrev_types.add('dr')


def clean_prediction(text):
    if full_sentences:
        sentences = drop_incomplete_sentences(text)
    else:
        sentences = text
    return sentences.replace(stop_token, '').strip('\n').strip()


def drop_incomplete_sentences(text):
    sentences = sentence_tokenizer.tokenize(text)
PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True

corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'

# Obtain all the files to parse by traversing through the directory
file_names = sorted(list({current_path + os.sep + current_file_name
                          for current_path, current_dir_names, current_file_names in
                          os.walk(corpus_dir)
                          for current_file_name in current_file_names
                          if current_file_name.endswith('.' + file_extension)}))

counter = 1
for file_name in file_names:
    file_text = file_parsers[file_extension](file_name)
    trainer.train(file_text, verbose=False, finalize=False)
    print_progress_bar(counter, len(file_names))
    counter += 1

with open(lang + '.pickle', 'wb') as pickle_file:
    pickle_file.write(
        pickle.dumps(PunktSentenceTokenizer(trainer.get_params())))

# params = trainer.get_params()
# tkzr = PunktSentenceTokenizer(params)
#
# s = 'test test test test test. test test test test. test test. test test; test test test.'
# s = 'test test test. test test test test test; test test. test test'
# print(tkzr.tokenize(s))
# print(TokenizeSentence('greek').tokenize_sentences(s))
# coding: utf-8
import codecs
from sys import argv, exit

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

if len(argv) != 3:
    print("Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__)
    exit(1)

training = ''.join(codecs.open(argv[1], 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(argv[2], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)

codecs.open('split', 'wb', 'utf-8').writelines([s + '\n' for s in sentences])
    Session = sessionmaker(bind=engine)
    session = Session()
    return session


if __name__ == "__main__":
    session = loadSession()
    res = session.query(Story).all()

    all_text = ''
    for story in res:
        all_text += story.text
        all_text += ' '

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(all_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Test the tokenizer on a piece of text
    sentences = "Работаю в провинциальном городе в магазине отделочных материалов и сантехники.Заходит к нам на днях надменная пергидролевая дева, покрытая слоем штукатурки толщиной в палец. Собирается с мыслями, напускает на себя важный вид и обращается ко мне:Дева, медленно и с видом опытного сантехника: Молодой человек, у вас ванны железные есть?Я: Нет, у нас только акрил. Металлических нет.Дева: Молодой человек, я не спрашиваю металлические, я спрашиваю железные!Я: Извините, железных тоже нет.Дева презрительно смотрит на меня, бурчит что-то себе под нос, и, виляя бедрами, уходит. Смотрим в окно. Выходит. Подходит к побитой жизнью шестерке, деловито садится на переднее сиденье, подзывает торопливо курящего поодаль водителя.Дева, возмущенно: Понабрали крестьян, металлические ванны от железных не отличают!Водитель, тяжело вздохнув, затаптывает окурок, занимает свое место, и экипаж отправляется дальше, на поиски волшебной неметаллической ванны из железа."
    print(tokenizer.tokenize(sentences))

    # View the learned abbreviations
    print(tokenizer._params.abbrev_types)  # set([...])

    raise
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()

tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()

'''
I got the internal PunktParameters object from the cltk pickle file that was
trained on Xenophon's Anabasis
(https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt),
and I also got the internal PunktParameters object from a PunktTrainer that I
created by training on Xenophon's Anabasis from the tesserae corpus
(https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).