from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
from functools import reduce
import os
from os.path import join
from extract_features import _get_filenames, parse_tess
from greek_features import composite_files_to_exclude
from textual_feature import sentence_tokenizers

corpus_dir = join('tesserae', 'texts', 'grc')
files = _get_filenames(corpus_dir, 'tess', composite_files_to_exclude)

f = open('sentence_counts.csv', mode='w')
f.write(
    'Data: https://github.com/timgianitsos/tesserae/tree/master/texts/grc,Project: https://www.qcrit.org,Author: Tim Gianitsos ([email protected]),Repo (Private): https://github.com/jdexter476/ProseVerseClassification.git,Code commit: '
    + os.popen('git rev-parse HEAD').read().strip() + ',Corpus commit: ' +
    os.popen('git -C "./tesserae" rev-parse HEAD').read().strip() + '\n')
f.write('file name,number of sentences\n')
for file in files:
    file_text = parse_tess(file)
    num_sentences = len(
        sentence_tokenizers['ancient_greek'].tokenize(file_text))
    f.write(file[file.rindex(os.sep) + 1:] + ',' + str(num_sentences) + '\n')
f.close()
print('Success!')
Example #2
from extract_features import parse_tess
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')
tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
print('Xenophon tokens: ' + str(len(tokenizer.tokenize(text))))
print()

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
# Treat all word pairs whose first token ends in a period (and pairs whose
# first token is an abbreviation) as candidate collocations during training
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train(text, verbose=True)

new_tokenizer = PunktSentenceTokenizer(trainer.get_params())
print('tokenizers equal? ' + str(tokenizer == new_tokenizer))
print('tokenization equal? ' +
      str(tokenizer.tokenize(text) == new_tokenizer.tokenize(text)))

with open('feature_data/old_tok.txt', mode='w') as old_tok_out:
    old_tok_out.write('\n'.join(tokenizer.tokenize(text)))
with open('feature_data/new_tok.txt', mode='w') as new_tok_out:
    new_tok_out.write('\n'.join(new_tokenizer.tokenize(text)))
'''
There seem to be very few abbreviations in the tesserae corpus. This means that training the PunktSentenceTokenizer might not yield any improvement.
From paper abstract: "[Punkt sentence tokenization training] is based on the assumption that a large number of ambiguities in the determination of sentence boundaries can be eliminated once abbreviations have been identified."
'''
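
# A quick, hedged check of the observation above: inspect what the trainer
# actually learned. get_params() returns a PunktParameters object whose
# abbrev_types, collocations, and sent_starters sets summarize the training;
# if abbrev_types is (nearly) empty for this corpus, training is unlikely to
# change the tokenization much. The variable name learned_params is mine.
learned_params = trainer.get_params()
print('learned abbreviations: ' + str(sorted(learned_params.abbrev_types)))
print('learned collocations: ' + str(len(learned_params.collocations)))
print('learned sentence starters: ' + str(len(learned_params.sent_starters)))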

Example #3
    'tesserae/texts/grc/xenophon.anabasis.tess')

file_names = sorted({
    current_path + os.sep + current_file_name
    for current_path, current_dir_names, current_file_names in os.walk(
        'tesserae/texts/grc') for current_file_name in current_file_names
    if current_file_name.endswith('.tess')
})

counter = 1
diff1 = 0
diff2 = 0
diff3 = 0
for file_name in file_names:
    file_text = parse_tess(file_name)
    diff1 += 1 if before_params.tokenize(file_text) != after_params.tokenize(
        file_text) else 0
    diff2 += 1 if before_params2.tokenize(file_text) != after_params2.tokenize(
        file_text) else 0
    diff3 += 1 if before_params3.tokenize(file_text) != after_params3.tokenize(
        file_text) else 0
    print_progress_bar(counter, len(file_names))
    counter += 1
print('Differences between pickle loads: ' + str(diff1))
print('Differences between default PunktSentenceTokenizers: ' + str(diff2))
print('Differences between trained PunktSentenceTokenizers: ' + str(diff3))
'''
Changing class variables on PunktLanguageVars seems to have no effect on any of the tokenizers, whether they were created before or after the change.
'''
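
# One possible reason (hedged): PunktLanguageVars lazily compiles and caches
# its regexes, so tokenizers that have already been built or exercised may keep
# behaving as before regardless of later class-variable reassignments. A less
# fragile route is to hand the tokenizer a PunktLanguageVars instance at
# construction time. This is an illustrative sketch; GreekLanguageVars and its
# character choices are assumptions, not part of the original experiment.
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer

class GreekLanguageVars(PunktLanguageVars):
    # In Greek the question mark is written as a semicolon
    sent_end_chars = ('.', ';')

greek_tokenizer = PunktSentenceTokenizer(lang_vars=GreekLanguageVars())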
Example #4
# 			print(reg_sentences)
# 			print('\n\n\n\n\n\n\n\n\n')
# 			print(no_period_sentences)
# 			sys.exit()

skip = ['tesserae/texts/grc/plutarch.de_fortuna.tess']  # Insert files to skip here
for current_path, current_dir_names, current_file_names in os.walk(
        'tesserae/texts/grc', topdown=True):
    for current_file_name in current_file_names:
        if current_path + os.sep + current_file_name in skip:
            print('Skipping ' + current_path + os.sep + current_file_name +
                  '...')
            continue
        print('Reading ' + current_path + os.sep + current_file_name + '...')
        s = parse_tess(current_path + os.sep + current_file_name)
        old_sentences = old_t.tokenize(s)
        no_period_sentences = new_reg_no_per_t.tokenize(s)
        if old_sentences != no_period_sentences:
            print(old_sentences)
            print('\n\n\n\n\n\n\n\n\n')
            print(no_period_sentences)
            sys.exit()
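
# A hedged alternative to the dump-and-exit comparison above: print a unified
# diff of the two sentence lists so that only the disagreements show up (e.g.
# the single-letter splits and the slant-quote handling described in the notes
# below). Uses only the standard library; the function and argument names are
# illustrative, not part of the original script.
import difflib

def show_tokenizer_disagreements(tokenizer_a, tokenizer_b, text, context=1):
    sentences_a = tokenizer_a.tokenize(text)
    sentences_b = tokenizer_b.tokenize(text)
    for line in difflib.unified_diff(
            sentences_a, sentences_b, fromfile='old', tofile='new',
            lineterm='', n=context):
        print(line)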
'''
I made a sentence tokenizer with default parameters (1).
I made a sentence tokenizer by passing in a word tokenizer (2).
I made a sentence tokenizer by passing in a word tokenizer similar to the previous word tokenizer, but without a period in the regex (3).

Comparing (2) and (3), they were very similar, with only a few numbers different. There were some occurrences of a single Greek letter followed by a period that (2) would split into two sentences, whereas (3) would keep as one sentence. I opted to keep (3) because the original regexes in punkt.py didn't have a period, and because (3) was probably treating the single Greek letter as an abbreviation, which is desirable behavior.

Comparing (1) and (3), they were less similar to each other than (2) and (3) were to each other, but still quite similar. The main difference I found was that (1) would NOT recognize slant quotes, while (3) DID recognize them.