コード例 #1
0
def preprocessing():
	"""Build the Latin and Greek search dictionaries for a whole-text comparison.

	The thesaurus, Latin and Greek file names are obtained from
	``xls.get_whole_text_comparison_Args()``.

	Returns:
		A 6-tuple ``(Latin_word_num, Latin_search_dict, Latin_text,
		Greek_word_num, Greek_search_dict, Greek_text)``, i.e. the three
		values produced by ``xls.build_search_dictionary`` for the Latin
		file followed by the three values produced for the Greek file.
	"""
	# Get filenames for the thesaurus, the Latin text, and the Greek text
	thesaurus_filename, Latin_filename, Greek_filename = xls.get_whole_text_comparison_Args()

	# The thesaurus is not part of the return value; the call is kept so a
	# problem with the thesaurus file still surfaces here, as it did before.
	ths.build_thesaurus(thesaurus_filename)

	# Read both the Latin text and the Greek text into search dictionaries
	Latin_word_num, Latin_search_dict, Latin_text = xls.build_search_dictionary(Latin_filename, "Latin", True)

	# This must be a call: the previous "xls.build_search_dictionary = (...)"
	# replaced the module function with a tuple and unpacked the file name and
	# the literals "Greek"/True into the Greek results.
	Greek_word_num, Greek_search_dict, Greek_text = xls.build_search_dictionary(Greek_filename, "Greek", True)

	return Latin_word_num, Latin_search_dict, Latin_text, Greek_word_num, Greek_search_dict, Greek_text
コード例 #2
0
def preprocessing():
	"""Import the CLTK model corpora and load the data for a search by phrase.

	Returns:
		A 4-tuple ``(transDict, Greek_word_num, Greek_search_dict, Greek_text)``:
		the object returned by ``ths.build_thesaurus`` followed by the three
		values returned by ``xls.build_search_dictionary`` for the Greek file.
	"""
	# Import the model corpora through CorpusImporter: Latin first, then Greek
	for language, corpus_name in (('latin', 'latin_models_cltk'), ('greek', 'greek_models_cltk')):
		CorpusImporter(language).import_corpus(corpus_name)

	# Only a thesaurus file and a Greek text are requested for this mode
	thesaurus_path, greek_path = xls.get_search_by_phrase_Args()

	# Load the thesaurus from its file
	thesaurus = ths.build_thesaurus(thesaurus_path)

	# Build the search dictionary for the Greek text
	# (third argument is presumably the "use lemmatized text" switch - confirm)
	word_count, search_index, corpus_text = xls.build_search_dictionary(greek_path, "Greek", True)

	return thesaurus, word_count, search_index, corpus_text
コード例 #3
0
def test_build_search_dict(curr_test, filename, words_in_file, language, lemmatized_version=False):
	"""Check the search dictionary built for ``filename`` against the file itself.

	The dictionary is built with ``xls.build_search_dictionary`` and then the
	file is re-tokenised here, character by character, splitting on whitespace
	and separator characters. Every non-empty normalised word must be a key
	of the dictionary, and its running position (0-based) must be contained in
	the value stored under that key.

	Args:
		curr_test: Result object; ``passed`` is set to False and a message is
			appended to ``errors`` for every failed check.
		filename: Path of the text file to index and verify.
		words_in_file: Word count the build is expected to report.
		language: Passed through to ``xls.build_search_dictionary``.
		lemmatized_version: Passed through to ``xls.build_search_dictionary``.

	Returns:
		The same ``curr_test`` object, updated with any failures.
	"""
	from itertools import chain

	word_num, search_dict, _indexed_corpus = xls.build_search_dictionary(filename, language, lemmatized_version)

	if word_num != words_in_file:
		curr_test.passed = False
		curr_test.errors.append("Wrong number of words added (only " + str(word_num) + " out of " + str(words_in_file) + " words added) ")

	# Raw string so that "\p" and "\s" are not invalid string-literal escapes;
	# the regular expression itself is unchanged.
	# NOTE(review): "\p{Z}" is only understood by the third-party "regex"
	# module - presumably this file imports it under the name "re"; confirm.
	separator = re.compile(r"[\p{Z}\t\r\n\v\f\s]")

	# The context manager closes the file even if reading fails.
	with open(filename, 'r') as test_file:
		text = test_file.read()

	i = 0
	curr_word = ""

	# The trailing "" plays the role of the empty read at end of file, so a
	# final word that is not followed by whitespace is still checked.
	for char in chain(text, ("",)):
		if separator.sub("", char) != "":
			# Ordinary character: keep accumulating the current word
			curr_word += char
			continue

		# Separator or end of input: the accumulated word is complete
		curr_word = normalize_word(curr_word)
		if curr_word != "":
			if curr_word in search_dict:
				if i not in search_dict[curr_word]:
					curr_test.passed = False

					error_message = curr_word + " did not have the proper index in the search dict"
					error_message += "\n\t\t\t word number: " + str(i) + " Indices: " + str(search_dict[curr_word])
					curr_test.errors.append(error_message)
			else:
				curr_test.passed = False
				error_message = curr_word + " was not found in the search dict"

				curr_test.errors.append(error_message)

			# Only non-empty normalised words advance the word position
			i += 1
		curr_word = ""

	return curr_test
コード例 #4
0
	def process_corpus(self, filename, language, make_IndexedText=True, use_lemmatized_text=False):
		"""Build the search dictionary for ``filename`` and store it on the instance.

		Sets ``word_num``, ``search_dict`` and ``indexed_corpus`` from the three
		values returned by ``xls.build_search_dictionary``. ``corpus_ready`` is
		False while the build runs and True once the results are stored.
		``make_IndexedText`` is accepted but not used in this method.
		"""
		# Cleared first, so it stays False if the build below raises
		self.corpus_ready = False

		built = xls.build_search_dictionary(filename, language, use_lemmatized_text)
		self.word_num, self.search_dict, self.indexed_corpus = built

		self.corpus_ready = True