def train_sentence_tokenizer(self: object, text: str):
    """Train a sentence tokenizer."""
    language_punkt_vars = PunktLanguageVars()
    # Set punctuation
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = (
                self.punctuation + self.strict_punctuation
            )
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    # Collocation flags must be set before training for them to take effect.
    trainer = PunktTrainer(lang_vars=language_punkt_vars)
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Set abbreviations
    if self.abbreviations:
        for abbreviation in self.abbreviations:
            tokenizer._params.abbrev_types.add(abbreviation)
    return tokenizer
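# A minimal, self-contained sketch of the same pattern (the sample text and
# the 'dr' abbreviation are illustrative, not from the snippet above).
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train("Dr. Smith arrived. He sat down. Dr. Jones left.")
tokenizer = PunktSentenceTokenizer(trainer.get_params())
tokenizer._params.abbrev_types.add('dr')
print(tokenizer.tokenize("Dr. Smith arrived. He sat down."))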
def constructor(self):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train_tokens(self.words())
    params = trainer.get_params()
    return PunktSentenceTokenizer(params)
import pickle

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer

def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars()
        # language_punkt_vars.sent_end_chars = tuple(args.end_chars)
        print("# Training sent tokenizer")
        trainer = PunktTrainer(lang_vars=language_punkt_vars)
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train(contents)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Register known abbreviations so they do not trigger sentence breaks.
        for abbreviation in ('brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec',
                             'mt', 'asst', 'mr', 'c/insp', 'sta', 'sto'):
            tokenizer._params.abbrev_types.add(abbreviation)
        pickle.dump(tokenizer, sent_tokenizer)
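# Hedged usage sketch for train() above; the file names and sample sentence
# are illustrative.
train('news_corpus.txt', 'tokenizer.pickle')
with open('tokenizer.pickle', 'rb') as f:
    tok = pickle.load(f)
print(tok.tokenize('Sen. Cruz met Dr. Reyes. They spoke at length.'))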
from collections import OrderedDict
from itertools import islice

def rank_sentences(text, sentence_scores, title="", n=7):
    final_sentences = []
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Boost the score of sentences that resemble the title, if one is given.
    for s in sentence_scores:
        if title == "":
            break
        sentence_scores[s] *= (1 + similarity_score(title, s))
    sc = sentence_scores.copy()
    sc = OrderedDict(sorted(sc.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))
    # Emit the top-n sentences in their original document order.
    proper_sentences = sent_tokenizer.tokenize(text)
    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)
    return final_sentences
def get_tokenizer(training_text):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(training_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update(ABBREVIATIONS)
    return tokenizer
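# Illustrative use of get_tokenizer(); the ABBREVIATIONS constant it relies
# on is defined elsewhere in the source project, so a stand-in is provided.
ABBREVIATIONS = {'mr', 'dr'}

tokenizer = get_tokenizer("Mr. Brown left early. Dr. Lee stayed.")
print(tokenizer.tokenize("Mr. Brown left early. Dr. Lee stayed."))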
def trainSentenceTokenizer(self):
    text = ""
    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.add('dr')
    tokenizer._params.abbrev_types.add('fig')
    return tokenizer
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.
    """
    pickle_path = 'sentence_tokenizer.pickle'
    try:
        input_file = open(pickle_path, 'rb')
        sentence_tokenizer = load(input_file)
        input_file.close()
    except FileNotFoundError:
        data_file_paths = []
        sentences = []
        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence-splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))
        data_file_paths.extend(corpus_files)
        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))
        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Pickle the sentence tokenizer for future use
        output_file = open(pickle_path, 'wb')
        dump(sentence_tokenizer, output_file, -1)
        output_file.close()
    return sentence_tokenizer
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt. At the moment it performs
    worse than the plain English one (most likely due to the limited amount
    of training data).
    """
    collection = database["crawled-data"]
    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
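# Companion sketch: load the pickled Latvian model produced above and apply
# it; the sample sentence is illustrative.
import pickle

with open("latvianPunkt2.pickle", "rb") as fin:
    latvian_tokenizer = pickle.load(fin)
print(latvian_tokenizer.tokenize("Sveiki! Kā jums klājas?"))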
def score_sentences(text, word_scores, unique):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_score = {}
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = sent_tokenizer.tokenize(text.lower())
    for s in sentences:
        words = clean_text(s)
        sent_score[s] = 0
        for w in words:
            w = lemmatizer.lemmatize(w)
            if w in unique:
                sent_score[s] += word_scores[w]
    return sent_score
def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
    # class BulletPointLangVars(PunktLanguageVars):
    #     sent_end_chars = ('?', '!')
    #     for i in range(len(spentSplitList)):
    #         sent_end_chars = sent_end_chars + tuple(spentSplitList[i])
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    train_data = 'sss'  # placeholder training text
    trainer.train(train_data)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=BulletPointLangVars())
    # Add sentence-splitting exceptions
    rule['ABBREV_WORDS'].extend(abbrevWordList)
    for i in rule['ABBREV_WORDS']:
        tokenizer._params.abbrev_types.add(i)
    return tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_V(self, topics_file_name, other_file):
    if other_file:
        path = topics_file_name
    else:
        path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)
    with open(path, encoding="utf8", errors='ignore') as f:
        text = f.read()
    # Get the X_train_counts and X_train_tf
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    X = tokenizer.tokenize(text)
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b', min_df=1)
    X_train_counts = bigram_vectorizer.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()
    trainer.train(paragraphs)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)
    sentences = []
    with open(text_file, "r") as input_file:
        for line in input_file.readlines():
            sentences.extend(tokenizer.tokenize(line))
    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
'''Does the same thing as split_sent.py, but expects the file to be
uncompressed and that the columns are src_url, tgt_url, src_line, tgt_line,
adq_score, dom'''
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import nltk.data, sys, gzip

train = False

if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()
    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)
    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)
    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    # tokenizer_en = PunktSentenceTokenizer()
    # tokenizer_de = PunktSentenceTokenizer()
    # nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
from bs4 import BeautifulSoup
from nltk.corpus import gutenberg
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from pprint import pprint
from pymongo import MongoClient

text = ""
for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))

soup = BeautifulSoup(open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(),
                     'html.parser')

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')
sentence_list = tokenizer.tokenize(sentences)

client = MongoClient('mongodb://localhost:27017/')
db = client['nlp']
coll = db['Keywords_list']
"""allows the disabling of the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period.""" trainer.ABBREV_BACKOFF = 5 """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" trainer.COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" trainer.SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" trainer.INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" trainer.INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" trainer.MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log
import fileinput
import pickle
import sys

from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()

# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_BACKOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation; such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # includes as potential collocations all word pairs where the first word ends in a period; may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation; useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

# defer finalization until all input has been consumed
for line in fileinput.input():
    trainer.train(line, finalize=False)
    # print(line)

# trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
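# Companion sketch: the script above pickles PunktParameters rather than a
# tokenizer, so a loader must wrap the parameters itself; 'punkt.params' and
# the sample text are illustrative.
import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('punkt.params', 'rb') as model:
    params = pickle.load(model)
tokenizer = PunktSentenceTokenizer(params)
print(tokenizer.tokenize('One sentence. Another sentence.'))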
"""allows the disabling of the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period.""" trainer.ABBREV_BACKOFF = 5 """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" trainer.COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" trainer.SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" trainer.INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" trainer.INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" trainer.MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log