def preprocessing():
    """Load the Latin and Greek texts used by the whole-text comparison.

    The thesaurus, Latin and Greek filenames are obtained from
    ``xls.get_whole_text_comparison_Args()``.  Each text is then passed to
    ``xls.build_search_dictionary`` with the lemmatized flag set to ``True``.

    Returns:
        A 6-tuple ``(Latin_word_num, Latin_search_dict, Latin_text,
        Greek_word_num, Greek_search_dict, Greek_text)`` holding, for each
        language, the three values returned by
        ``xls.build_search_dictionary``.
    """
    # Get filenames for the thesaurus, the Latin text, and the Greek text.
    thesaurus_filename, latin_filename, greek_filename = (
        xls.get_whole_text_comparison_Args()
    )

    # Read the thesaurus CSV file.  The resulting dictionary is not part of
    # this function's return value; the call is kept so that anything it does
    # besides returning (presumably failing early on an unreadable file --
    # TODO confirm) still happens.
    ths.build_thesaurus(thesaurus_filename)

    # Read both the Latin text and the Greek text into dictionaries for easy
    # access.
    latin_word_num, latin_search_dict, latin_text = xls.build_search_dictionary(
        latin_filename, "Latin", True
    )
    # This must be a call: assigning a tuple to xls.build_search_dictionary
    # would replace the function itself and hand back the raw arguments.
    greek_word_num, greek_search_dict, greek_text = xls.build_search_dictionary(
        greek_filename, "Greek", True
    )

    return (
        latin_word_num,
        latin_search_dict,
        latin_text,
        greek_word_num,
        greek_search_dict,
        greek_text,
    )
def preprocessing():
    """Import the CLTK model corpora and load the thesaurus and Greek text.

    A ``CorpusImporter`` is created for Latin and then for Greek, and the
    matching model corpus is imported through each.  The thesaurus and Greek
    filenames come from ``xls.get_search_by_phrase_Args()``.

    Returns:
        A 4-tuple ``(transDict, Greek_word_num, Greek_search_dict,
        Greek_text)``: the result of ``ths.build_thesaurus`` followed by the
        three values returned by ``xls.build_search_dictionary`` for the
        Greek text (called with the lemmatized flag set to ``True``).
    """
    # Latin first, then Greek -- same order as the imports were always done.
    for language, corpus_name in (
        ('latin', 'latin_models_cltk'),
        ('greek', 'greek_models_cltk'),
    ):
        importer = CorpusImporter(language)
        importer.import_corpus(corpus_name)

    # Filenames for the thesaurus and the Greek text.
    thesaurus_path, greek_path = xls.get_search_by_phrase_Args()

    # Thesaurus CSV file -> dictionary object.
    thesaurus = ths.build_thesaurus(thesaurus_path)

    # Greek text -> word count, search dictionary and text.
    word_count, search_index, text = xls.build_search_dictionary(
        greek_path, "Greek", True
    )

    return thesaurus, word_count, search_index, text
def test_build_search_dict(curr_test, filename, words_in_file , language, lemmatized_version = False):
    """Check ``xls.build_search_dictionary`` against a manual scan of the file.

    The search dictionary is built for ``filename`` and then the file is
    re-read one character at a time, splitting it into words on separator
    characters.  Each non-empty normalized word must be a key of the search
    dictionary, and its running word index must be among that key's indices.

    Args:
        curr_test: Result object; ``passed`` is set to ``False`` and a message
            is appended to ``errors`` for every failed check.  ``passed`` is
            never set to ``True`` here.
        filename: Path of the text file to index and re-read.
        words_in_file: Word count expected from ``build_search_dictionary``.
        language: Passed through to ``build_search_dictionary``.
        lemmatized_version: Passed through to ``build_search_dictionary``.

    Returns:
        The same ``curr_test`` object.
    """
    word_num, search_dict, _indexed_corpus = xls.build_search_dictionary(
        filename, language, lemmatized_version
    )

    if word_num != words_in_file:
        curr_test.passed = False
        curr_test.errors.append("Wrong number of words added (only " + str(word_num) + " out of " + str(words_in_file) + " words added) ")

    # Raw string so the backslash sequences reach the regex engine without
    # relying on Python's handling of invalid string escapes.
    # NOTE(review): `\p{Z}` is understood by the third-party `regex` package
    # but rejected by the standard-library `re`; presumably this file binds
    # `re` to `regex` -- confirm.
    separator = re.compile(r"[\p{Z}\t\r\n\v\f\s]")

    i = 0  # index of the next non-empty word in the file
    curr_word = ""
    # `with` guarantees the file is closed even if a check below raises.
    with open(filename, 'r') as test_file:
        while True:
            char = test_file.read(1)
            if separator.sub("", char) != "":
                # Ordinary character: keep accumulating the current word.
                curr_word += char
            else:
                # Separator or end of file (read(1) returned ""): the word
                # accumulated so far is complete.
                curr_word = normalize_word(curr_word)
                if curr_word != "":
                    if curr_word not in search_dict:
                        curr_test.passed = False
                        error_message = curr_word + " was not found in the search dict"
                        curr_test.errors.append(error_message)
                    elif i not in search_dict[curr_word]:
                        curr_test.passed = False
                        error_message = curr_word + " did not have the proper index in the search dict"
                        error_message += "\n\t\t\t word number: " + str(i) + " Indices: " + str(search_dict[curr_word])
                        curr_test.errors.append(error_message)
                    i += 1
                curr_word = ""
            # Stop only after the final word has been flushed above.
            if char == "":
                break

    return curr_test
def process_corpus(self, filename, language, make_IndexedText=True, use_lemmatized_text=False):
    """Build the search dictionary for a corpus and store it on the instance.

    ``corpus_ready`` is cleared before the build and set again only after
    ``word_num``, ``search_dict`` and ``indexed_corpus`` have been stored, so
    it stays ``False`` if ``xls.build_search_dictionary`` raises.

    Args:
        filename: Passed to ``xls.build_search_dictionary``.
        language: Passed to ``xls.build_search_dictionary``.
        make_IndexedText: Accepted but not used by this method.
        use_lemmatized_text: Passed to ``xls.build_search_dictionary`` as its
            third argument.
    """
    self.corpus_ready = False

    built = xls.build_search_dictionary(filename, language, use_lemmatized_text)
    # Expect exactly three values back: word count, search dict, indexed corpus.
    self.word_num, self.search_dict, self.indexed_corpus = built

    self.corpus_ready = True