Example #1
import codecs
import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer


def train_punktsent(trainfile, modelfile):
  """
  Trains an unsupervised NLTK punkt SENTENCE tokenizer.
  *trainfile* is the filename for the input file.
  *modelfile* is the filename for the model output file.
  """
  punkt = PunktTrainer()
  try:
    with codecs.open(trainfile, 'r','utf8') as fin:
      punkt.train(fin.read(), finalize=False, verbose=False)
  except KeyboardInterrupt:
    print('KeyboardInterrupt: Stopping the reading of the dump early!')
  ##HACK: Adds abbreviations from rb_tokenizer.
  abbrv_sent = " ".join([i.strip() for i in \
                         codecs.open('abbrev.lex','r','utf8').readlines()])
  abbrv_sent = "Start"+abbrv_sent+"End."
  punkt.train(abbrv_sent,finalize=False, verbose=False)
  # Finalize and output the trained model.
  punkt.finalize_training(verbose=True)
  model = PunktSentenceTokenizer(punkt.get_params())
  with open(modelfile, mode='wb') as fout:
    pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
  return model
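A minimal usage sketch for train_punktsent; the corpus and model filenames here are placeholders, not part of the original example:

# Sketch only: 'corpus.txt' and 'punkt_model.pickle' are hypothetical paths.
model = train_punktsent('corpus.txt', 'punkt_model.pickle')
print(model.tokenize('Dr. Smith arrived at 9 a.m. He was early.'))

# The pickled tokenizer can be reloaded later without retraining.
with open('punkt_model.pickle', 'rb') as fin:
    reloaded = pickle.load(fin)
print(reloaded.tokenize('Dr. Smith arrived at 9 a.m. He was early.'))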
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional
    extra abbreviations to include.
    :param text:
    :param extra_abbrevs:
    :return:
    """

    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)

    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()

    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())

    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
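A short usage sketch for build_sentence_model; the sample text and abbreviations are illustrative only:

sample_text = "The hearing starts at 10 a.m. on Friday. Bring the exhibits, e.g. the signed forms."
tokenizer = build_sentence_model(sample_text, extra_abbrevs=["a.m.", "e.g."])
print(tokenizer.tokenize("See the appendix, e.g. Table 2. It has the details."))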
Example #3
'''Does the same thing as split_sent.py, but expects the file to be uncompressed and the columns to be src_url, tgt_url, src_line, tgt_line, adq_score, dom'''
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import nltk.data, sys, gzip

train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()

    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)

    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)

    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en=PunktSentenceTokenizer()
    #tokenizer_de=PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
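        # Sketch only: a possible loop body, assuming the tab-separated
        # columns described in the module docstring
        # (src_url, tgt_url, src_line, tgt_line, adq_score, dom):
        src_url, tgt_url, src_line, tgt_line, adq_score, dom = line.rstrip('\n').split('\t')
        src_sents = tokenizer_en.tokenize(src_line)
        tgt_sents = tokenizer_de.tokenize(tgt_line)
        if len(src_sents) != len(tgt_sents):
            mismatch += 1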
Example #4
text = ""
from nltk.corpus import gutenberg

for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))
from bs4 import BeautifulSoup

soup = BeautifulSoup(open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(), 'html.parser')
from pprint import pprint

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()

trainer.INCLUDE_ALL_COLLOCS = True

trainer.train(text)

tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')

sentence_list = tokenizer.tokenize(sentences)

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')

db = client['nlp']

coll = db['Keywords_list']
trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""


import pickle

from progressbar import ProgressBar

# Read in training corpus ('bills' is assumed to be a MongoDB collection defined elsewhere)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True)
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1}, no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print "Finalizing training..."
trainer.finalize_training(verbose=True)
print "Training done."

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters

with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print "Params: %s" % repr(params)
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (
    ';',
    '.',
    '!',
    '?',
)
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True

with open('./corpus.txt', 'r') as fs:
    text = fs.read()

trainer.train(text, verbose=True)
params = trainer.get_params()
with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)
Example #7
trainer.INCLUDE_ABBREV_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word is an abbreviation. Such collocations override the orthographic
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""
""""""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

print "Finalizing training..."
trainer.finalize_training(verbose=True)
print "Training done."

params = trainer.get_params()
with open('sentence_tokenizer_params.pkl', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print "Params: %s" % repr(params)

# set custom parameters
# extra_collocations = {(u'sec', u'##number##')}
# extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}
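A small sketch (toy text, illustrative only) of how the INCLUDE_ALL_COLLOCS and MIN_COLLOC_FREQ settings above can change what Punkt learns as collocations:

from nltk.tokenize.punkt import PunktTrainer

toy_text = ("Sec. 2 amends the act. Sec. 3 repeals it. Sec. 4 takes effect on Jan. 1. ") * 20

default_trainer = PunktTrainer()
default_trainer.train(toy_text, verbose=False)

greedy_trainer = PunktTrainer()
greedy_trainer.INCLUDE_ALL_COLLOCS = True
greedy_trainer.MIN_COLLOC_FREQ = 1
greedy_trainer.train(toy_text, verbose=False)

# With the greedier settings, the trainer typically admits more bigram collocations.
print(len(default_trainer.get_params().collocations))
print(len(greedy_trainer.get_params().collocations))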
# coding: utf-8
import codecs
import sys

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer
training = ''.join(
    codecs.open('IT-TrainingCorpus.txt', 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)
text = ''.join(codecs.open(sys.argv[1], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
clean = [s for s in sentences if '<strong>' in s]
codecs.open('clean-gold', 'wb', 'utf-8').writelines([s + '\n' for s in clean])
Example #9
import fileinput
import pickle
import sys
from os.path import basename

from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()
# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    #print(line)

#trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
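Per the usage message above, the script is run as '<script> < TEXT > MODEL'; a loading sketch for the resulting model file, with 'MODEL' as a placeholder path:

import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('MODEL', 'rb') as model_file:
    loaded_params = pickle.load(model_file)
tokenizer = PunktSentenceTokenizer(loaded_params)
print(sorted(loaded_params.abbrev_types))
print(tokenizer.tokenize('Fig. 2 shows the results. They look fine.'))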
Example #11
temperature = default_temperature
top_k = default_top_k
top_p = default_top_p
min_length = default_min_length
do_sample = default_do_sample
num_return_sequences = default_num_return_sequences
num_beams = default_num_beams
no_repeat_ngram_size = default_no_repeat_ngram_size
early_stopping = default_early_stopping

sample_sentences = ""
for file_id in gutenberg.fileids():
    sample_sentences += gutenberg.raw(file_id)
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(sample_sentences)
sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())
sentence_tokenizer._params.abbrev_types.add('dr')


def clean_prediction(text):
    if full_sentences:
        sentences = drop_incomplete_sentences(text)
    else:
        sentences = text

    return sentences.replace(stop_token, '').strip('\n').strip()


def drop_incomplete_sentences(text):
    sentences = sentence_tokenizer.tokenize(text)
Example #12
import os
import pickle

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')
trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True

corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'
# Obtain all the files to parse by walking the corpus directory
file_names = sorted({
    current_path + os.sep + current_file_name
    for current_path, current_dir_names, current_file_names in os.walk(corpus_dir)
    for current_file_name in current_file_names
    if current_file_name.endswith('.' + file_extension)
})

counter = 1
for file_name in file_names:
    file_text = file_parsers[file_extension](file_name)
    trainer.train(file_text, verbose=False, finalize=False)
    print_progress_bar(counter, len(file_names))
    counter += 1

with open(lang + '.pickle', 'wb') as pickle_file:
    pickle_file.write(
        pickle.dumps(PunktSentenceTokenizer(trainer.get_params())))

# params = trainer.get_params()
# tkzr = PunktSentenceTokenizer(params)
# # s = 'test test test test test. test test test test. test test. test test; test test test.'
# s = 'test test test. test test test test test; test test. test test'
# print(tkzr.tokenize(s))
# print(TokenizeSentence('greek').tokenize_sentences(s))
# coding: utf-8
import codecs
from sys import argv, exit

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

if len(argv) != 3:
    print "Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__
    exit(1)

training = ''.join(codecs.open(argv[1], 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)
text = ''.join(codecs.open(argv[2], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
codecs.open('split', 'wb', 'utf-8').writelines([s + '\n' for s in sentences])
Example #14
from sqlalchemy.orm import sessionmaker

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer


def loadSession():
    # 'engine' and the 'Story' model are assumed to be defined earlier in the module.
    Session = sessionmaker(bind=engine)
    session = Session()
    return session


if __name__ == "__main__":
    session = loadSession()
    res = session.query(Story).all()
    all_text = ''
    for story in res:
        all_text += story.text
        all_text += ' '

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(all_text)

    tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Test the tokenizer on a piece of text
    sentences = "Работаю в провинциальном городе в магазине отделочных материалов и сантехники.Заходит к нам на днях надменная пергидролевая дева, покрытая слоем штукатурки толщиной в палец. Собирается с мыслями, напускает на себя важный вид и обращается ко мне:Дева, медленно и с видом опытного сантехника: Молодой человек, у вас ванны железные есть?Я: Нет, у нас только акрил. Металлических нет.Дева: Молодой человек, я не спрашиваю металлические, я спрашиваю железные!Я: Извините, железных тоже нет.Дева презрительно смотрит на меня, бурчит что-то себе под нос, и, виляя бедрами, уходит. Смотрим в окно. Выходит. Подходит к побитой жизнью шестерке, деловито садится на переднее сиденье, подзывает торопливо курящего поодаль водителя.Дева, возмущенно: Понабрали крестьян, металлические ванны от железных не отличают!Водитель, тяжело вздохнув, затаптывает окурок, занимает свое место, и экипаж отправляется дальше, на поиски волшебной неметаллической ванны из железа."

    print(tokenizer.tokenize(sentences))
    # prints the sentence list detected in the Russian sample text above

    # View the learned abbreviations
    print(tokenizer._params.abbrev_types)
    # set([...])

    raise
Example #15
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer
from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')
new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()

tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()
'''
I got the internal PunktParameters object from the cltk pickle file that was trained on Xenophon's Anabasis (https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt), and I also got the internal PunktParameters object from a PunktTrainer that I created by training on Xenophon's Anabasis from the tesserae corpus (https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).