def display_non_duplicate_plutarch(filenames):
    """Print which files in the Plutarch corpus directory duplicate another file.

    Hashes the text contents of every file in ``filenames``, groups files whose
    contents collide, and prints (a) the files under
    ``tesserae/texts/grc/plutarch/`` that were flagged as duplicates, then
    (b) every ``.tess`` file in that directory, each list sorted and
    newline-separated, separated by a blank line.

    :param filenames: iterable of paths to readable text files
    :returns: None (output goes to stdout)
    """
    # Hash each file's contents; identical contents -> identical hash.
    # NOTE(review): builtin hash() of str is randomized across processes
    # (PYTHONHASHSEED) but stable within a single run, which is all that
    # within-run duplicate detection needs.
    content_hash = {}
    for name in filenames:
        # 'with' closes each handle promptly; the original leaked one open
        # file object per input file.
        with open(name) as fh:
            content_hash[name] = hash(fh.read())

    # Invert the mapping: hash -> set of filenames sharing that content.
    by_hash = {}
    for name, digest in content_hash.items():
        by_hash.setdefault(digest, set()).add(name)

    # Any hash shared by >1 file marks a duplicate group.
    dup_groups = (group for group in by_hash.values() if len(group) > 1)
    dup_plutarch_filenames = sorted(
        member for group in dup_groups for member in group
        if member.startswith('tesserae/texts/grc/plutarch/'))

    plutarch_filenames = sorted(
        _get_filenames('tesserae/texts/grc/plutarch', 'tess', set()))

    # Files in the plutarch directory that have been identified as duplicates
    print('\n'.join(dup_plutarch_filenames))
    print()
    # All files in the plutarch directory
    print('\n'.join(plutarch_filenames))
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
from functools import reduce
import os
from os.path import join

from extract_features import _get_filenames, parse_tess
from greek_features import composite_files_to_exclude
from textual_feature import sentence_tokenizers

# Script: count the sentences in every .tess file of the Greek Tesserae
# corpus (excluding composite files) and write the tallies to
# sentence_counts.csv.
corpus_dir = join('tesserae', 'texts', 'grc')
files = _get_filenames(corpus_dir, 'tess', composite_files_to_exclude)

# 'with' guarantees the CSV is flushed and closed even if tokenization of
# some file raises; the original opened the file and never closed it.
with open('sentence_counts.csv', mode='w') as f:
    # Provenance header: data source, project, author, plus the exact commits
    # of this repo and of the corpus submodule (adjacent string literals
    # concatenate — the written text is unchanged).
    f.write(
        'Data: https://github.com/timgianitsos/tesserae/tree/master/texts/grc,'
        'Project: https://www.qcrit.org,'
        'Author: Tim Gianitsos ([email protected]),'
        'Repo (Private): https://github.com/jdexter476/ProseVerseClassification.git,'
        'Code commit: ' + os.popen('git rev-parse HEAD').read().strip() +
        ',Corpus commit: ' +
        os.popen('git -C "./tesserae" rev-parse HEAD').read().strip() + '\n')
    f.write('file name,number of sentences\n')
    for file in files:
        file_text = parse_tess(file)
        num_sentences = len(
            sentence_tokenizers['ancient_greek'].tokenize(file_text))
        # basename() keeps only the filename; unlike rindex(os.sep) it also
        # copes with a path that contains no separator at all.
        f.write(os.path.basename(file) + ',' + str(num_sentences) + '\n')
print('Success!')
if line[1] not in labels: labels[line[1]] = len(labels) vg[line[0]] = line[1] cnt.update(vg.values()) assert len(vg) == 141 assert vg['aeschylus.agamemnon.tess'] == 'drama' assert vg['tryphiodorus.the_taking_of_ilios.tess'] == 'epic' assert sum(cnt.values()) == len(pg) + len(vg) print('Verse genres:', set(vg.values())) print('Counts:', cnt) print('Category key:', labels) filename_to_path = { s[s.rindex(os.sep) + 1:]: s for s in _get_filenames('tesserae/texts/grc', 'tess', set()) } path_to_filename = { s: s[s.rindex(os.sep) + 1:] for s in _get_filenames('tesserae/texts/grc', 'tess', set()) } assert len(filename_to_path) == len(path_to_filename) file_to_genre = dict(**pg, **vg) output_file = 'genre_labels.csv' print('Writing to ' + output_file + '...') f = open(output_file, 'w') f.write(','.join(k + ':' + str(v) for k, v in labels.items()) + '\n') f.write('Filename,Genre\n') for path in sorted( filename_to_path.values() ): #Iterate over sorted values so that the order will match prosody_labels.csv