def summarize_pdf(article_text):
    """Split *article_text* into sentences and build several token views of it.

    A Punkt sentence tokenizer is trained on the text itself. Every sentence
    is then word-tokenized, lower-cased, POS-tagged and lemmatized.

    Returns a 4-tuple ``(sent_list, clean_sent, sentence_lists, sentence_list)``:

    * ``sent_list`` -- for each sentence, the list of kept lemmas;
    * ``clean_sent`` -- for each sentence, those lemmas joined by one space;
    * ``sentence_lists`` -- for each sentence, all lower-cased tokens joined
      by one space;
    * ``sentence_list`` -- the sentences as returned by the Punkt tokenizer.
    """
    trainer = PunktTrainer()
    trainer.train(article_text)
    sentence_list = PunktSentenceTokenizer(
        trainer.get_params()).tokenize(article_text)

    # Build the helpers once instead of once per sentence / per word.
    word_tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    non_digit = re.compile(r"[^\d]")

    sentence_lists = []
    sent_list = []
    clean_sent = []
    for sent in sentence_list:
        words = [token.lower() for token in word_tokenizer.tokenize(sent)]
        sentence_lists.append(" ".join(words))

        lemmas = []
        for word, tag in pos_tag(words):
            # Map the tagger's tag prefix onto the lemmatizer's POS letters;
            # anything that is not a noun, verb or adverb is treated as 'a'.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            lemma = lemmatizer.lemmatize(word, pos)
            # Keep a lemma only if it is not listed in `punc` and contains
            # at least one non-digit character.
            if lemma not in punc and non_digit.search(lemma):
                lemmas.append(lemma.lower())

        clean_sent.append(' '.join(lemmas))
        sent_list.append(lemmas)
    return sent_list, clean_sent, sentence_lists, sentence_list
def train_sentence_tokenizer(self: object, text: str):
    """Train a Punkt sentence tokenizer on *text* and return it.

    Reads ``self.punctuation``, ``self.strict``, ``self.strict_punctuation``
    and ``self.abbreviations`` to customise the sentence-end characters and
    to add known abbreviations to the trained parameters.
    """
    language_punkt_vars = PunktLanguageVars
    # Set punctuation.
    # NOTE(review): the assignment below lands on the PunktLanguageVars class
    # itself, so it is visible to every other user of that class in the
    # process. Kept as-is because existing behaviour may rely on it.
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = (
                self.punctuation + self.strict_punctuation)
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    lang_vars = language_punkt_vars()

    # PunktTrainer's second positional parameter is ``verbose``, so the
    # language vars must be passed by keyword. The collocation flags are
    # consulted while training runs, so they are set before any text is fed.
    trainer = PunktTrainer(lang_vars=lang_vars)
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    if text:
        trainer.train(text, finalize=False)

    # get_params() finalizes the training if that has not happened yet.
    tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                       lang_vars=lang_vars)

    # Set abbreviations.
    if self.abbreviations:
        tokenizer._params.abbrev_types.update(self.abbreviations)
    return tokenizer
def train_sentence_tokenizer(self: object, text: str):
    """Train a Punkt sentence tokenizer on *text* and return it.

    Reads ``self.punctuation``, ``self.strict``, ``self.strict_punctuation``
    and ``self.abbreviations`` to customise the sentence-end characters and
    to add known abbreviations to the trained parameters.
    """
    language_punkt_vars = PunktLanguageVars
    # Set punctuation.
    # NOTE(review): the assignment below lands on the PunktLanguageVars class
    # itself, so it is visible to every other user of that class in the
    # process. Kept as-is because existing behaviour may rely on it.
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = (
                self.punctuation + self.strict_punctuation)
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    lang_vars = language_punkt_vars()

    # PunktTrainer's second positional parameter is ``verbose``, so the
    # language vars must be passed by keyword. The collocation flags are
    # consulted while training runs, so they are set before any text is fed.
    trainer = PunktTrainer(lang_vars=lang_vars)
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    if text:
        trainer.train(text, finalize=False)

    # get_params() finalizes the training if that has not happened yet.
    tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                       lang_vars=lang_vars)

    # Set abbreviations.
    if self.abbreviations:
        tokenizer._params.abbrev_types.update(self.abbreviations)
    return tokenizer
def train(src, tgt):
    """Train a Punkt sentence tokenizer on file *src* and pickle it to *tgt*.

    *src* is read as UTF-8 text. A fixed set of extra abbreviations is added
    to the trained parameters before the tokenizer is written out.
    """
    with open(src, 'r', encoding='utf-8') as infile:
        contents = infile.read()

    print("# Training sent tokenizer")
    # The collocation flags are consulted while training runs, so they are
    # set before the text is fed in. Nothing is passed positionally after the
    # text: that slot is ``verbose``, not the language vars.
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    if contents:
        trainer.train(contents, finalize=False)

    # get_params() finalizes the training if that has not happened yet.
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update((
        'brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec', 'mt', 'asst',
        'mr', 'c/insp', 'sta', 'sto',
    ))

    # Open the target only once there is something to write, so a failed
    # training run does not leave an empty model file behind.
    with open(tgt, 'wb') as sent_tokenizer:
        pickle.dump(tokenizer, sent_tokenizer)
class Punket_tokenizer:
    """Punkt sentence tokenizer that is cached on disk as a pickle.

    On construction the tokenizer is loaded from ``punket_tokenizer.pk`` when
    that file exists. Otherwise it is trained on the raw text of every
    Gutenberg corpus file, extended with common abbreviations, and pickled.
    """

    def __init__(self):
        self.modelfile = 'punket_tokenizer.pk'
        if os.path.exists(self.modelfile):
            self.tokenizer = self.punkt_tokenize_load()
        else:
            self.trainer = PunktTrainer()
            # Join once instead of growing a string file by file.
            text = "".join(
                gutenberg.raw(file_id) for file_id in gutenberg.fileids())
            self.trainer.INCLUDE_ALL_COLLOCS = True
            self.trainer.train(text)
            self.tokenizer = PunktSentenceTokenizer(
                self.trainer.get_params())
            # Titles, "no" and month names; 'jul' was missing from the
            # original month list and is included here.
            self.tokenizer._params.abbrev_types.update((
                'dr', 'mr', 'mrs', 'miss', 'ms', 'no',
                'jan', 'feb', 'mar', 'apr', 'may', 'jun',
                'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
            ))
            with open(self.modelfile, mode='wb') as fout:
                pickle.dump(self.tokenizer, fout,
                            protocol=pickle.HIGHEST_PROTOCOL)

    def punkt_tokenize_load(self):
        """Return the tokenizer unpickled from ``self.modelfile``."""
        # NOTE: unpickling can execute arbitrary code; the model file must
        # come from a trusted source.
        with open(self.modelfile, mode='rb') as fin:
            punket_tokenizer = pickle.load(fin)
        return punket_tokenizer

    def puket_tokenizer_add_rule(self, word):
        """Register *word* as an abbreviation type of the tokenizer."""
        self.tokenizer._params.abbrev_types.add(word)

    def punket_sentence_tokenizer(self, sentences):
        """Return the result of tokenizing *sentences* into sentences."""
        return self.tokenizer.tokenize(sentences)
def score(trainer: PunktTrainer, typ: str) -> float:
    """Return the abbreviation score of candidate *typ*.

    *typ* is the candidate without its final period. The score is the log
    likelihood reported by ``trainer._dunning_log_likelihood`` multiplied by
    three scaling factors (length, period count, no-period penalty).
    """
    # The final period is not part of `typ`, hence the "+ 1".
    period_count = typ.count('.') + 1
    nonperiod_count = len(typ) - period_count + 1

    # How often the candidate was seen followed by a period, and how often
    # on its own. The log likelihood ratio tells whether "<candidate>." acts
    # as one unit (high value) or as two independent units (low value).
    seen_with_period = trainer._type_fdist[typ + '.']
    seen_without_period = trainer._type_fdist[typ]
    log_likelihood = trainer._dunning_log_likelihood(
        seen_with_period + seen_without_period,
        trainer._num_period_toks,
        seen_with_period,
        trainer._type_fdist.N(),
    )

    # Scaling factors applied to the basic log likelihood ratio:
    #   length  - a longer candidate is less likely to be an abbreviation
    #   periods - more periods make an abbreviation more likely
    #   penalty - occurrences without a period are penalized, unless the
    #             trainer's IGNORE_ABBREV_PENALTY switches the penalty off
    length_factor = math.exp(-nonperiod_count)
    penalty_factor = int(trainer.IGNORE_ABBREV_PENALTY)
    if not penalty_factor:
        penalty_factor = math.pow(nonperiod_count, -seen_without_period)

    return log_likelihood * length_factor * period_count * penalty_factor
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    # input  - iterable of open, readable files with the training text
    # output - writable binary file that receives the pickled tokenizer
    # abbr   - optional file with one abbreviation per line
    # colloc - optional file with one collocation per line, its two parts
    #          separated by '. '
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

    trainer = PunktTrainer()
    # Set these to true to include collocations more leniently, then
    # increase MIN_COLLOC_FREQ to restrict again:
    #   trainer.INCLUDE_ALL_COLLOCS = False
    #   trainer.INCLUDE_ABBREV_COLLOCS = False
    #   trainer.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically
    # never have actual sentence boundaries.
    for source in input:
        click.echo('Training on %s' % source.name)
        # Finalization is postponed until every file has been consumed.
        trainer.train(source.read(), finalize=False, verbose=True)
    trainer.finalize_training(verbose=True)

    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        trainer._params.abbrev_types.update(abbreviations)

    if colloc:
        collocations = []
        for entry in colloc.read().strip().split('\n'):
            collocations.append(tuple(entry.split('. ', 1)))
        click.echo('Manually adding collocs: %s' % collocations)
        trainer._params.collocations.update(collocations)

    pickle.dump(PunktSentenceTokenizer(trainer.get_params()), output,
                protocol=pickle.HIGHEST_PROTOCOL)
def constructor():
    """Return a Punkt sentence tokenizer trained on ``self.words()``.

    ``self`` is not a parameter; it is taken from the enclosing scope.
    """
    punkt_trainer = PunktTrainer()
    # Both collocation switches are turned on before any token is seen.
    punkt_trainer.INCLUDE_ABBREV_COLLOCS = True
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train_tokens(self.words())
    return PunktSentenceTokenizer(punkt_trainer.get_params())
def get_nltk_sent_tokenizer(container, lang):
    """Train and return a Punkt sentence tokenizer for the given articles.

    *container* is either a ``Container`` (its article paths for *lang* are
    looked up under the crawler directory) or a plain list of article paths.
    *lang* must be ``"zh"`` or ``"en"``. Articles whose file is missing are
    reported and skipped.

    Raises ``ValueError`` for any other kind of *container*.
    """
    assert lang in ["zh", "en"], "Unknown language."
    punkt_trainer = PunktTrainer()

    if isinstance(container, Container):
        paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/",
            ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        paths = container
    else:
        message = "Cannot parse container with class {}".format(
            container.__class__)
        raise ValueError(message)

    not_found = 0
    for article_path in paths:
        try:
            lowered = get_article_as_lowercase_string(article_path)
            # Finalization happens once, after the last article.
            punkt_trainer.train(text=lowered, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(article_path))
            not_found += 1
    print("{} articles not found.".format(not_found))

    punkt_trainer.finalize_training()
    return PunktSentenceTokenizer(punkt_trainer.get_params())
def train_from_file(training_file):
    """Make a ruleset from a file.

    Trains a ``PunktTrainer`` on the text of *training_file* and pickles the
    trainer to ``latin.pickle`` in the current directory.
    """
    language_punkt_vars = PunktLanguageVars
    # NOTE(review): these assignments land on the PunktLanguageVars class
    # itself and are therefore visible process-wide. Kept because the class
    # attributes are what a default-constructed instance reads.
    language_punkt_vars.sent_end_chars = ('.', '?', ':')
    language_punkt_vars.internal_punctuation = (',', ';')

    with open(training_file) as opened_training_file:
        train_data = opened_training_file.read()

    # The second positional parameter of PunktTrainer is ``verbose``; the
    # language vars must go in by keyword, and as an instance.
    trainer = PunktTrainer(train_data, lang_vars=language_punkt_vars())

    with open('latin.pickle', 'wb') as open_pickle_file:
        pickle.dump(trainer, open_pickle_file)
def __init__(self, vocab_size=None):
    """Create an empty, not-yet-finalized instance.

    *vocab_size* is stored unchanged on ``self.vocab_size``.
    """
    self.vocab_size = vocab_size

    # Private tokenization helpers.
    self._sent_tokenizer = PunktSentenceTokenizer()
    self._sent_trainer = PunktTrainer()
    self._word_tokenizer = WordTokenizer()

    # Private bookkeeping: nothing is indexed or finalized yet.
    self._index_to_dfreq = None
    self._is_finalized = False

    # word -> number of times the word was seen across the entire corpus.
    self.word_to_freq = OrderedCounter()
    # word -> number of documents that contained the word.
    self.word_to_dfreq = OrderedCounter()
    # How many documents have been trained on so far.
    self.num_docs = 0

    # Filled while fitting documents: word -> index into the vocabulary.
    self.word_to_index = OrderedDict()
def rank_sentences(text, sentence_scores, title="", n=7):
    """Return the sentences of *text* that are among the *n* best scored.

    *sentence_scores* maps lower-cased sentences to scores. When *title* is
    not empty, every score is first multiplied by
    ``1 + similarity_score(title, sentence)``.

    NOTE: that boost is written back into *sentence_scores*, so the caller's
    dict is modified in place.

    The result keeps the order in which the sentences occur in *text*.
    """
    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train(text)
    splitter = PunktSentenceTokenizer(punkt_trainer.get_params())

    if title != "":
        for sentence in sentence_scores:
            sentence_scores[sentence] *= (
                1 + similarity_score(title, sentence))

    # Highest score first; ties keep the dict's own order (stable sort).
    by_score = sorted(sentence_scores.items(),
                      key=lambda item: item[1], reverse=True)
    best = dict(islice(by_score, n))

    return [candidate for candidate in splitter.tokenize(text)
            if candidate.lower() in best]
def main():
    """Train a Punkt tokenizer for the language given with ``-l``.

    The trained tokenizer is pickled to ``LingwoNLP/punkt-<lang>.pickle``.
    Exits with status 1 when ``-l`` is missing or is ``en``.
    """
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])
    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a

    # sys.stderr.write behaves the same on Python 2 and Python 3.
    if lang is None:
        sys.stderr.write("Must pass -l language on the command line!\n")
        sys.exit(1)
    if lang == 'en':
        sys.stderr.write(
            "Don't train for -l en! We are using the pre-trained punkt "
            "tokenizer from NLTK.\n")
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                       lang_vars=lang_vars)

    # Pickles are byte streams: write in binary mode, and close the file
    # deterministically so the model is fully flushed to disk.
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as model_file:
        pickle.dump(tokenizer, model_file)
def get_tokenizer(training_text):
    """Return a Punkt sentence tokenizer trained on *training_text*.

    All period-final word pairs are considered as collocation candidates,
    and the entries of ``ABBREVIATIONS`` are added as abbreviation types.
    """
    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train(training_text)

    sentence_tokenizer = PunktSentenceTokenizer(punkt_trainer.get_params())
    known_abbreviations = sentence_tokenizer._params.abbrev_types
    known_abbreviations.update(ABBREVIATIONS)
    return sentence_tokenizer
def main():
    """Train a Punkt tokenizer for the language given with ``-l``.

    The trained tokenizer is pickled to ``LingwoNLP/punkt-<lang>.pickle``.
    Exits with status 1 when ``-l`` is missing or is ``en``.
    """
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])
    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a

    # sys.stderr.write behaves the same on Python 2 and Python 3.
    if lang is None:
        sys.stderr.write("Must pass -l language on the command line!\n")
        sys.exit(1)
    if lang == 'en':
        sys.stderr.write(
            "Don't train for -l en! We are using the pre-trained punkt "
            "tokenizer from NLTK.\n")
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params(),
                                       lang_vars=lang_vars)

    # Pickles are byte streams: write in binary mode, and close the file
    # deterministically so the model is fully flushed to disk.
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as model_file:
        pickle.dump(tokenizer, model_file)
def train_latin_from_file():
    """Open a training set file and write a Latin pickle trainer.

    Reads ``training_sentences.txt`` from the ``sentence_tokens_latin``
    directory below ``cltk_data`` and pickles the trainer next to it as
    ``latin.pickle``.
    """
    training_file = 'training_sentences.txt'
    training_path = os.path.join(cltk_data, 'compiled',
                                 'sentence_tokens_latin/', training_file)
    with open(training_path, 'r') as f:
        train_data = f.read()

    language_vars = PunktLanguageVars
    # NOTE(review): these assignments land on the PunktLanguageVars class
    # itself and are therefore visible process-wide. Kept because the class
    # attributes are what a default-constructed instance reads.
    language_vars.sent_end_chars = ('.', '?', ';', ':')
    language_vars.internal_punctuation = ','

    # The second positional parameter of PunktTrainer is ``verbose``; the
    # language vars must go in by keyword, and as an instance.
    trainer = PunktTrainer(train_data, lang_vars=language_vars())

    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled',
                               'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'wb') as f:
        pickle.dump(trainer, f)
def train_greek_from_file():
    """Train a Punkt trainer on the Greek training set and pickle it.

    Reads ``training_sentences.txt`` from the ``sentence_tokens_greek``
    directory below ``cltk_data`` and pickles the trainer next to it as
    ``greek.pickle``.
    """
    language_punkt_vars = PunktLanguageVars
    # NOTE(review): these assignments land on the PunktLanguageVars class
    # itself and are therefore visible process-wide. Kept because the class
    # attributes are what a default-constructed instance reads.
    language_punkt_vars.sent_end_chars = ('.', ';',)
    language_punkt_vars.internal_punctuation = (',', '·')

    training_file = 'training_sentences.txt'
    training_path = os.path.join(cltk_data, 'compiled',
                                 'sentence_tokens_greek/', training_file)
    with open(training_path) as f:
        train_data = f.read()

    # Build the trainer. The second positional parameter of PunktTrainer is
    # ``verbose``; the language vars must go in by keyword, as an instance.
    trainer = PunktTrainer(train_data, lang_vars=language_punkt_vars())

    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled',
                               'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'wb') as f:
        pickle.dump(trainer, f)
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    """Train a Punkt sentence tokenizer and pickle it to *modelfile*.

    *trainfile* is read as UTF-8 and fed to the trainer line by line; lines
    the trainer rejects are counted as skipped. When *abbreviationfile* is
    not the empty string, each non-blank line of it is wrapped in a small
    synthetic sentence so the trainer sees it followed by a period.
    Progress and a final summary are printed.
    """
    skipped_ = 0
    custom_ = 0
    punkt = PunktTrainer()

    with codecs.open(trainfile, encoding='utf-8') as input_:
        for k, sentence in enumerate(input_, 1):
            if k % 100 == 0:
                print('trained from sentences :' + str(k))
            try:
                punkt.train(sentence, finalize=False, verbose=False)
            except Exception:
                # Best effort: a line the trainer cannot handle is skipped,
                # but Ctrl-C and interpreter exit are no longer swallowed.
                skipped_ += 1

    if abbreviationfile != '':
        with codecs.open(abbreviationfile, encoding='utf-8') as abbreviations_:
            for abbr in abbreviations_:
                # Drop the line break, otherwise it would sit between the
                # abbreviation and the period appended below.
                abbr = abbr.strip()
                if not abbr:
                    continue
                try:
                    punkt.train('Start ' + abbr + '. End.',
                                finalize=False, verbose=False)
                    custom_ += 1
                except Exception:
                    pass

    punkt.finalize_training(verbose=False)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as model_output:
        pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)

    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
def trainSentenceTokenizer(self):
    """Return a Punkt sentence tokenizer trained on the Gutenberg corpus.

    The raw text of every Gutenberg file id is concatenated and used as
    training text; ``dr`` and ``fig`` are added as abbreviation types.
    """
    corpus_text = "".join(
        gutenberg.raw(file_id) for file_id in gutenberg.fileids())

    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train(corpus_text)

    sentence_tokenizer = PunktSentenceTokenizer(punkt_trainer.get_params())
    sentence_tokenizer._params.abbrev_types.update(('dr', 'fig'))
    return sentence_tokenizer
def train_punktsent(trainfile, modelfile):
    """
    Trains an unsupervised NLTK punkt SENTENCE tokenizer.

    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.

    Returns the trained tokenizer, which is also pickled to *modelfile*.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        # print(...) with one argument prints the same on Python 2 and 3.
        print('KeyboardInterrupt: Stopping the reading of the dump early!')

    ##HACK: Adds abbreviations from rb_tokenizer.
    with codecs.open('abbrev.lex', 'r', 'utf8') as abbrev_file:
        abbreviations = [line.strip() for line in abbrev_file.readlines()]
    # The marker words are separated from the list by spaces; without them
    # the first and last abbreviation would be fused with "Start" and "End.".
    abbrv_sent = "Start " + " ".join(abbreviations) + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)

    # Finalize and outputs trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.

    The tokenizer is loaded from ``sentence_tokenizer.pickle`` when that file
    exists. Otherwise it is trained on the corpus of *language* (falling
    back to English when the corpus lookup raises ``LookupError``) and then
    pickled to that path for future use.
    """
    # NOTE(review): the cache path does not include the language, so a
    # pickle trained for one language appears to be reused for all others.
    pickle_path = 'sentence_tokenizer.pickle'

    try:
        # NOTE: unpickling can execute arbitrary code; the cache file must
        # come from a trusted source.
        with open(pickle_path, 'rb') as input_file:
            return load(input_file)
    except FileNotFoundError:
        # No cached tokenizer yet: train one below.
        pass

    try:
        # Get the paths to each file the bot will be trained with
        corpus_files = list_corpus_files('core.corpus.{language}'.format(
            language=language.ENGLISH_NAME.lower()
        ))
    except LookupError:
        # Fall back to English sentence splitting rules if a language is
        # not supported
        corpus_files = list_corpus_files('core.corpus.{language}'.format(
            language=languages.ENG.ENGLISH_NAME.lower()
        ))

    # Every statement is added in upper and in lower case.
    sentences = []
    for corpus, _categories, _file_path in load_corpus(*corpus_files):
        for conversation in corpus:
            for text in conversation:
                sentences.append(text.upper())
                sentences.append(text.lower())

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train('\n'.join(sentences))
    sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Pickle the sentence tokenizer for future use
    with open(pickle_path, 'wb') as output_file:
        dump(sentence_tokenizer, output_file, -1)

    return sentence_tokenizer
def trainSentenceTokenizer():
    """
    Train a custom sentence tokenizer with Punkt and pickle it.

    The training text is built from the ``ABSTRACT_DOCUMENT`` field of every
    record in the ``crawled-data`` collection where that field is not None.
    The model is written to ``latvianPunkt2.pickle``.

    At the moment it performs worse than the plain English one (most likely
    because there is not that much data).
    """
    records = database["crawled-data"].find(
        {ABSTRACT_DOCUMENT: {"$ne": None}})
    # Each abstract is followed by a single space.
    training_text = "".join(
        record[ABSTRACT_DOCUMENT] + " " for record in records)

    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.INCLUDE_ABBREV_COLLOCS = True
    punkt_trainer.train(training_text)

    sentence_model = nltk.PunktSentenceTokenizer(punkt_trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as model_file:
        pickle.dump(sentence_model, model_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
def train_punktsent(trainfile, modelfile):
    """
    Trains an unsupervised NLTK punkt SENTENCE tokenizer.

    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.

    Returns the trained tokenizer, which is also pickled to *modelfile*.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        # print(...) with one argument prints the same on Python 2 and 3.
        print('KeyboardInterrupt: Stopping the reading of the dump early!')

    ##HACK: Adds abbreviations from rb_tokenizer.
    with codecs.open('abbrev.lex', 'r', 'utf8') as abbrev_file:
        abbreviations = [line.strip() for line in abbrev_file.readlines()]
    # The marker words are separated from the list by spaces; without them
    # the first and last abbreviation would be fused with "Start" and "End.".
    abbrv_sent = "Start " + " ".join(abbreviations) + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)

    # Finalize and outputs trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
    """Return a Punkt sentence tokenizer that knows the given abbreviations.

    *abbrevWordList* is appended to the shared ``rule['ABBREV_WORDS']`` list,
    and every entry of that list is registered as an abbreviation type.
    *xml* and *spentSplitList* are not used by this method.
    """
    # A custom PunktLanguageVars subclass (sentence-end characters '?', '!'
    # plus the entries of spentSplitList) used to be sketched here; it is
    # not active.
    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train('sss')  # placeholder training text
    params = punkt_trainer.get_params()

    # Add the sentence-splitting exceptions. The trainer hands out the same
    # parameter object on every get_params() call, so adding to it here is
    # what the returned tokenizer will see.
    rule['ABBREV_WORDS'].extend(abbrevWordList)
    params.abbrev_types.update(rule['ABBREV_WORDS'])

    return PunktSentenceTokenizer(params)
def score_sentences(text, word_scores, unique):
    """Return a dict mapping each lower-cased sentence of *text* to a score.

    A Punkt tokenizer is trained on *text* and applied to its lower-cased
    form. A sentence's score is the sum of ``word_scores`` over the
    lemmatized words of the sentence that are contained in *unique*.
    """
    punkt_trainer = PunktTrainer()
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train(text)
    splitter = PunktSentenceTokenizer(punkt_trainer.get_params())

    sent_score = {}
    for sentence in splitter.tokenize(text.lower()):
        lemmas = [lemmatizer.lemmatize(word)
                  for word in clean_text(sentence)]
        sent_score[sentence] = sum(
            word_scores[lemma] for lemma in lemmas if lemma in unique)
    return sent_score
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional extra abbreviations.

    :param text: training text for the Punkt trainer
    :param extra_abbrevs: optional iterable of abbreviations; each one is
        stripped of leading/trailing periods and lower-cased before it is
        added to the trained parameters
    :return: a PunktSentenceTokenizer built from those parameters
    """
    # Train without finalizing, then finalize explicitly and quietly.
    trainer = PunktTrainer()
    trainer.train(text, verbose=False, finalize=False)
    trainer.finalize_training(verbose=False)
    params = trainer.get_params()

    if extra_abbrevs is not None:
        params.abbrev_types.update(
            abbrev.strip(".").lower() for abbrev in extra_abbrevs)

    return PunktSentenceTokenizer(params)
def get_V(self, topics_file_name, other_file):
    """Vectorize the sentences of a topics file.

    When *other_file* equals ``True``, *topics_file_name* is used as the
    path as given; otherwise it is looked up inside the Opinosis topics
    directory. The text is split into sentences by a Punkt tokenizer
    trained on the text itself.

    Returns ``(X_train_counts, X_train_tf, tokenizer, bigram_vectorizer)``:
    the uni-/bigram count matrix, its tf-idf transform, the sentence
    tokenizer and the fitted count vectorizer.
    """
    # Deliberately an equality test: only values equal to True pick the
    # raw path.
    if other_file == True:  # noqa: E712
        path = topics_file_name
    else:
        path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)

    # Undecodable bytes are dropped; the handle is closed right after
    # reading.
    with open(path, encoding="utf8", errors='ignore') as topics_file:
        text = topics_file.read()

    # get the X_train_counts and X_train_tf
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    X = tokenizer.tokenize(text)

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b',
                                        min_df=1)
    X_train_counts = bigram_vectorizer.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
def create_sentences(text_file, min_sentence_len):
    """Split *text_file* into sentences and append them to the dataset file.

    A Punkt tokenizer is trained on the whole file and then applied line by
    line. Every sentence longer than *min_sentence_len* characters is
    appended to ``dataset/sentences.txt``, followed by a blank line.
    """
    # Read the file once, read-only; the lines serve both as training text
    # and as the units that are tokenized afterwards.
    with open(text_file, "r") as input_file:
        lines = input_file.readlines()

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train("".join(lines))
    tokenizer = PunktSentenceTokenizer(trainer.get_params())

    sentences = []
    for line in lines:
        sentences.extend(tokenizer.tokenize(line))

    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
import fileinput
import pickle
import sys
from os.path import basename

from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# configuration
# cut-off value whether a 'token' is an abbreviation
trainer.ABBREV = 0.3
# upper cut-off for Mikheev's (2002) abbreviation detection algorithm
# (the trainer's attribute is ABBREV_BACKOFF; ABBREV_CUTOFF is not read)
trainer.ABBREV_BACKOFF = 5
# minimal log-likelihood value that two tokens need to be considered as a
# collocation
trainer.COLLOCATION = 7.88
# disables the abbreviation penalty heuristic, which exponentially
# disadvantages words that are found at times without a final period
trainer.IGNORE_ABBREV_PENALTY = False
# include as potential collocations all word pairs where the first word is
# an abbreviation - such collocations override the orthographic heuristic,
# but not the sentence starter heuristic
trainer.INCLUDE_ABBREV_COLLOCS = True
# this includes as potential collocations all word pairs where the first
# word ends in a period - it may be useful in corpora where there is a lot
# of variation that makes abbreviations like Mr difficult to identify
trainer.INCLUDE_ALL_COLLOCS = False
# minimum bound on the number of times a bigram needs to appear before it
# can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.MIN_COLLOC_FREQ = 3
# minimal log-likelihood value that a token requires to be considered as a
# frequent sentence starter
trainer.SENT_STARTER = 30

# Feed the input line by line, but finalize only once at the end instead of
# recomputing sentence starters and collocations after every line.
for line in fileinput.input():
    trainer.train(line, finalize=False)
trainer.finalize_training()
# trainer.freq_threshold() was left disabled here.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle
from pymongo import MongoClient
from progressbar import ProgressBar

# Bills collection of the "legislation" database on the default client.
client = MongoClient()
db = client.legislation
bills = db.bills

trainer = PunktTrainer()

# Custom parameters.
extra_collocations = {(u'sec', u'##number##')}
extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

# Cut-off value whether a 'token' is an abbreviation.
trainer.ABBREV = 0.3

# Allows the disabling of the abbreviation penalty heuristic, which
# exponentially disadvantages words that are found at times without a
# final period.
trainer.IGNORE_ABBREV_PENALTY = False

# Upper cut-off for Mikheev's (2002) abbreviation detection algorithm.
trainer.ABBREV_BACKOFF = 5

# Minimal log-likelihood value that two tokens need to be considered as a
# collocation.
trainer.COLLOCATION = 7.88
# coding: utf-8
import codecs
from sys import argv, exit
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

# Usage: <script> <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>
# Trains Punkt on the first file, splits the second file into sentences and
# writes them, one per line, to the file "split" in the current directory.
if len(argv) != 3:
    # print(...) with one argument prints the same on Python 2 and 3.
    print("Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__)
    exit(1)

with codecs.open(argv[1], 'rb', 'utf-8') as training_file:
    training = training_file.read()

trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

with codecs.open(argv[2], 'rb', 'utf-8') as text_file:
    text = text_file.read()
sentences = tokenizer.tokenize(text)

# Closing the output explicitly guarantees the sentences are flushed.
with codecs.open('split', 'wb', 'utf-8') as split_file:
    split_file.writelines([s + '\n' for s in sentences])
from pprint import pprint
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# The Gutenberg corpus is no longer used as training text; the files below
# are read instead.
# All four files are read with the same explicit encoding, so the result
# does not depend on the platform's default encoding.
corpus_parts = []
for corpus_name in ('eminescu.txt', 'hogas.txt', 'bucuresti.txt',
                    'pesteri.txt'):
    with open(corpus_name, 'r', encoding='utf8') as file:
        corpus_parts.append(file.read())
text = "".join(corpus_parts)
# print(len(text))

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)

# The trainer itself (not a tokenizer) is pickled.
with open('model.txt', 'wb') as file:
    pickle.dump(trainer, file)
from bs4 import BeautifulSoup
from nltk.corpus import gutenberg
from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from pymongo import MongoClient

# Training text: the raw text of every Gutenberg file, joined once.
text = "".join(gutenberg.raw(file_id) for file_id in gutenberg.fileids())
# print(...) with one argument prints the same on Python 2 and 3.
print(len(text))

# Parse the XML document; the file handle is closed right after reading.
with open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml") as xml_file:
    soup = BeautifulSoup(xml_file.read(), 'html.parser')

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

# Split the document's text content into sentences.
sentences = soup.get_text(' ')
sentence_list = tokenizer.tokenize(sentences)

client = MongoClient('mongodb://localhost:27017/')
db = client['nlp']
coll = db['Keywords_list']
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (';', '.', '!', '?')

# Read the corpus, train Punkt on it, and pickle the learned parameters
# (not a tokenizer) for later use.
with open('./corpus.txt', 'r') as fs:
    text = fs.read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text, verbose=True)
params = trainer.get_params()

with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)
num_nonperiods, -count_without_period) score = ll * f_length * f_periods * f_penalty return score if __name__ == "__main__": MODE = sys.argv[1] if MODE == "train": n = int(sys.argv[2]) print("reading...") text = get_text(n) print("training...") trainer = PunktTrainer() trainer.INCLUDE_ALL_COLLOCS = True trainer.ABBREV = 0.3 trainer.train(text, verbose=True) del text print("building tokenizer...") tokenizer = PunktSentenceTokenizer(trainer.get_params()) abbrevs = tokenizer._params.abbrev_types print(sorted(abbrevs)) print("%i abbreviations" % len(abbrevs)) target_abbrevs = [ "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms", "seq", "o.r.s" ]
'''Does the same thing as split_sent.py, but expects the file to be uncompressed and that the columns are src_url, tgt_url, src_line, tgt_line, adq_score, dom''' from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer import nltk.data, sys, gzip train = False if train: with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open( "de_corp", 'rt', encoding='utf-8') as decorp: text_en = encorp.read() text_de = decorp.read() trainer_en = PunktTrainer() trainer_en.INCLUDE_ALL_COLLOCS = True trainer_en.train(text_en) trainer_de = PunktTrainer() trainer_de.INCLUDE_ALL_COLLOCS = True trainer_de.train(text_de) tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params()) tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params()) else: #tokenizer_en=PunktSentenceTokenizer() #tokenizer_de=PunktSentenceTokenizer() #nltk.download('punkt') tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle') tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle') mismatch = 0 with open(sys.argv[1]) as filtered: for line in filtered:
outer += 1 trainer.train("\n".join(lines), finalize=False) count = 0 lines = [] count += 1 if preprocess: line = preprocess(line) lines.append(line) # open model vars = PunktLanguageVars vars.sent_end_chars = (u".", u"?", u"!", u")", u"\"", u"'", u":", u"|", u"»", u"]") trainer = PunktTrainer(lang_vars=vars()) train_with_file(gull_fn, trainer) for fn in glob(os.path.join(avis_path, '*.s')): train_with_file(fn, trainer, preprocess=lambda x: avis_pat.match(x.strip()).group(1)) params = trainer.get_params() punkt = PunktSentenceTokenizer(params) cPickle.dump(punkt, 'punkt-norwegian-open.pickle') # full model vars = PunktLanguageVars vars.sent_end_chars = (u".", u"?", u"!", u")", u"\"", u"'", u":", u"|", u"»", u"]") trainer = PunktTrainer(lang_vars=vars())