from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
from functools import reduce
import os
from os.path import join

from extract_features import _get_filenames, parse_tess
from greek_features import composite_files_to_exclude
from textual_feature import sentence_tokenizers

corpus_dir = join('tesserae', 'texts', 'grc')
files = _get_filenames(corpus_dir, 'tess', composite_files_to_exclude)

# Write a provenance header (corpus and code commits), then one row per file
# with the file name and its sentence count.
with open('sentence_counts.csv', mode='w') as f:
    f.write(
        'Data: https://github.com/timgianitsos/tesserae/tree/master/texts/grc,'
        'Project: https://www.qcrit.org,'
        'Author: Tim Gianitsos ([email protected]),'
        'Repo (Private): https://github.com/jdexter476/ProseVerseClassification.git,'
        'Code commit: ' + os.popen('git rev-parse HEAD').read().strip() +
        ',Corpus commit: ' + os.popen('git -C "./tesserae" rev-parse HEAD').read().strip() + '\n'
    )
    f.write('file name,number of sentences\n')
    for file in files:
        file_text = parse_tess(file)
        num_sentences = len(sentence_tokenizers['ancient_greek'].tokenize(file_text))
        f.write(file[file.rindex(os.sep) + 1:] + ',' + str(num_sentences) + '\n')

print('Success!')
from extract_features import parse_tess
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

# Adjust the class-level Punkt punctuation settings for Ancient Greek: the Greek
# question mark (which looks like a semicolon) ends a sentence, and the ano teleia
# ('·') is sentence-internal.
PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

# Load the previously pickled Ancient Greek sentence tokenizer.
tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
print('Xenophon tokens: ' + str(len(tokenizer.tokenize(text))))
print()

# Train a fresh Punkt tokenizer on the same text for comparison.
trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train(text, verbose=True)
new_tokenizer = PunktSentenceTokenizer(trainer.get_params())

print('tokenizers equal? ' + str(tokenizer == new_tokenizer))
print('tokenization equal? ' + str(tokenizer.tokenize(text) == new_tokenizer.tokenize(text)))

with open('feature_data/old_tok.txt', mode='w') as old_tok_out:
    old_tok_out.write('\n'.join(tokenizer.tokenize(text)))
with open('feature_data/new_tok.txt', mode='w') as new_tok_out:
    new_tok_out.write('\n'.join(new_tokenizer.tokenize(text)))

'''
There seem to be very few abbreviations in the tesserae corpus. This means training the
PunktSentenceTokenizer might not yield any improvement. From the paper abstract:
"[Punkt sentence tokenization training] is based on the assumption that a large number of
ambiguities in the determination of sentence boundaries can be eliminated once abbreviations
have been identified."
'''
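# A quick way to check the note above is to inspect what the trainer actually learned.
# Minimal sketch (not part of the original script): it reuses the `trainer` object trained
# above and prints the abbreviations, collocations, and sentence starters recorded in the
# PunktParameters it produced.
learned = trainer.get_params()
print('abbreviations (' + str(len(learned.abbrev_types)) + '): ' + str(sorted(learned.abbrev_types)))
print('collocations (' + str(len(learned.collocations)) + '): ' + str(sorted(learned.collocations)))
print('sentence starters (' + str(len(learned.sent_starters)) + '): ' + str(sorted(learned.sent_starters)))
# If abbrev_types comes back nearly empty, the trained tokenizer has little room to improve
# on the pickled one, which is consistent with the observation above.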
'tesserae/texts/grc/xenophon.anabasis.tess')

file_names = sorted(
    list({
        current_path + os.sep + current_file_name
        for current_path, current_dir_names, current_file_names in os.walk('tesserae/texts/grc')
        for current_file_name in current_file_names
        if current_file_name.endswith('.' + 'tess')
    }))

# Count, for each pair of tokenizers (constructed earlier, before and after the
# PunktLanguageVars class variables were changed), how many files tokenize differently.
counter = 1
diff1 = 0
diff2 = 0
diff3 = 0
for file_name in file_names:
    file_text = parse_tess(file_name)
    diff1 += 1 if before_params.tokenize(file_text) != after_params.tokenize(file_text) else 0
    diff2 += 1 if before_params2.tokenize(file_text) != after_params2.tokenize(file_text) else 0
    diff3 += 1 if before_params3.tokenize(file_text) != after_params3.tokenize(file_text) else 0
    print_progress_bar(counter, len(file_names))
    counter += 1

print('Differences between pickle loads: ' + str(diff1))
print('Differences between default PunktSentenceTokenizers: ' + str(diff2))
print('Differences between trained PunktSentenceTokenizers: ' + str(diff3))

'''
Changing class variables in PunktLanguageVars seems to have no effect on any of the
tokenizers before and after.
'''
'''
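# If mutating the PunktLanguageVars class attributes is not taking hold (a pickled tokenizer
# carries its own language-vars object, and its compiled regexes may already be cached), the
# more reliable route is to subclass PunktLanguageVars and pass an instance in at construction
# time. Minimal sketch, with a hypothetical GreekLanguageVars subclass and a made-up sample
# sentence (neither is part of the original experiment):
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer


class GreekLanguageVars(PunktLanguageVars):
    # End sentences on the period and the Greek question mark (which looks like a semicolon);
    # treat the comma, ano teleia, and colon as sentence-internal punctuation.
    sent_end_chars = ('.', ';')
    internal_punctuation = (',', '·', ':')


greek_tokenizer = PunktSentenceTokenizer(lang_vars=GreekLanguageVars())
print(greek_tokenizer.tokenize('οἱ δὲ στρατιῶται ἐθαύμαζον· τί λέγεις; ὁ δὲ Κῦρος ἀπῆλθεν.'))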
# print(reg_sentences)
# print('\n\n\n\n\n\n\n\n\n')
# print(no_period_sentences)
# sys.exit()

skip = ['tesserae/texts/grc/plutarch.de_fortuna.tess']  # Insert files to skip here

# Compare the existing tokenizer (old_t) against the one built with a word-tokenizer regex
# that omits the period (new_reg_no_per_t), stopping at the first file where they disagree.
for current_path, current_dir_names, current_file_names in os.walk('tesserae/texts/grc', topdown=True):
    for current_file_name in current_file_names:
        if current_path + os.sep + current_file_name in skip:
            print('Skipping ' + current_path + os.sep + current_file_name + '...')
            continue
        print('Reading ' + current_path + os.sep + current_file_name + '...')
        s = parse_tess(current_path + os.sep + current_file_name)
        old_sentences = old_t.tokenize(s)
        no_period_sentences = new_reg_no_per_t.tokenize(s)
        if old_sentences != no_period_sentences:
            print(old_sentences)
            print('\n\n\n\n\n\n\n\n\n')
            print(no_period_sentences)
            sys.exit()

'''
I made a sentence tokenizer with default parameters (1). I made a sentence tokenizer by
passing in the word tokenizer (2). I made a sentence tokenizer by passing in a word tokenizer
similar to the previous one, but without a period in the regex (3).

Comparing (2) and (3): they were very similar, with only a few differences involving numbers.
There were some occurrences of a single Greek letter followed by a period that (2) would split
into two sentences, whereas (3) would keep as one sentence. I opted to keep (3) because the
original regexes in punkt.py did not include a period, and because it was probably treating
the single Greek letter as an abbreviation, which is desirable behavior.

Comparing (1) and (3): they were less similar to each other than (2) and (3) were, but still
quite similar. The main difference I found was that (1) would NOT recognize slant quotes,
whereas (3) DID recognize them.
'''
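# The single-letter-plus-period case described above is the standard Punkt abbreviation
# mechanism: a period-final token listed in abbrev_types is not taken as a sentence boundary.
# Minimal sketch of that effect (the sample string and the choice of 'α' are made up for
# illustration, not taken from the corpus):
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

sample = 'ἔγραψε ταῦτα ἐν τῷ α. βιβλίῳ. οἱ δὲ στρατιῶται ἔμειναν.'

# Default parameters: the tokenizer may split after the lone 'α.'.
default_tokenizer = PunktSentenceTokenizer()
print(default_tokenizer.tokenize(sample))

# Parameters that list 'α' as an abbreviation: the period after it should no longer be
# treated as a sentence break.
abbrev_params = PunktParameters()
abbrev_params.abbrev_types.add('α')
abbrev_tokenizer = PunktSentenceTokenizer(abbrev_params)
print(abbrev_tokenizer.tokenize(sample))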