def setUp(self):
    """Load the sukun-correction test fixture and precompute per-word char data.

    Reads 'sukuntest.csv' from the module-level `path`, then derives:
      - self.chars_count: char count per word (used to locate chars in words)
      - self.Chars: the words flattened into a list of characters
      - self.Chars_And_Its_Location: each char paired with its position
        (first/middle/last) inside its word
    """
    self.file_name = 'sukuntest.csv'
    self.sukun_words = ExcelHelperMethod.read_csv_file(path + self.file_name)
    # deepcopy on each call so the helpers cannot mutate the fixture words
    self.chars_count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(
        deepcopy(self.sukun_words))
    self.Chars = WordLetterProcessingHelperMethod.convert_list_of_words_to_list_of_chars(
        deepcopy(self.sukun_words))
    self.Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char(
        self.Chars, self.chars_count)
def create_netcdf_target_classes():
    """Build the target-class array for the NetCDF training data.

    For every selected letter (column 7 of each row in the module-level
    `selected_letters_in_this_loop`), looks up its one-hot class id in the
    module-level `listOfDiacritizedCharacter` table and stores the collected
    ids in the module-level global `purified_target_class` (numpy array).
    Prints timing information (Python 2 print statements).
    """
    execute_create_netcdf_target_classes_start_time = datetime.datetime.now()
    letter = []
    diacritics = []
    searchCounter = 0
    targetClass = []
    beforeWhileLoop = datetime.datetime.now()
    for eachItem in range(0, len(selected_letters_in_this_loop)):
        yourLabel = selected_letters_in_this_loop[eachItem][7]
        OneHotTargetClassNotFound = True
        decomposed_letter = WordLetterProcessingHelperMethod.decompose_diac_char_into_char_and_diacritics(
            yourLabel)
        # A char carrying a bare shadda is rewritten as fatha+shadda before
        # the lookup, so it matches the table's composed form.
        if len(decomposed_letter) == 2 and decomposed_letter[1] == u'ّ':
            decomposed_letter[1] = u'َّ'
            letter.append(decomposed_letter[0])
            diacritics.append(decomposed_letter[1])
            # NOTE(review): `letter`/`diacritics` are never cleared between
            # iterations and `[0]` always re-reads the FIRST appended pair, so
            # every shadda after the first resolves to the first one's label —
            # confirm whether `[-1]` (or resetting the lists) was intended.
            yourLabel = WordLetterProcessingHelperMethod.attach_diacritics_to_chars(
                letter, diacritics)[0]
        # Linear scan of the class table for the label.
        while OneHotTargetClassNotFound:
            try:
                if listOfDiacritizedCharacter[searchCounter][1] == yourLabel:
                    OneHotTargetClassNotFound = False
                    targetClass.append(
                        listOfDiacritizedCharacter[searchCounter][0])
                    searchCounter = 0
                else:
                    searchCounter += 1
            except:
                # NOTE(review): bare except swallows the IndexError raised when
                # the label is absent from the table, and the while-loop never
                # terminates in that case — confirm every label is guaranteed
                # to exist, or add an explicit exhaustion break.
                x = 1
    afterWhileLoop = datetime.datetime.now()
    print "While Loop takes : ", afterWhileLoop - beforeWhileLoop
    global purified_target_class
    purified_target_class = []
    purified_target_class = np.array(targetClass)
    execute_create_netcdf_target_class_end_time = datetime.datetime.now()
    print "createNetCDFTargetClasses takes : ", \
        execute_create_netcdf_target_class_end_time - execute_create_netcdf_target_classes_start_time
def do_we_need_to_search_in_dictionary(dictionary, word):
    """Decide whether `word` still needs a dictionary lookup.

    Returns False (no lookup needed) when `word`, after decomposition and
    normalization, already matches some dictionary entry — either exactly
    (letter-set comparison) or differing only in the final letter, whose
    diacritic depends on sentence context. Returns True otherwise.

    Raises ValueError when a dictionary entry normalizes to a different
    length than `word` (both should come from the same undiacritized word).
    """
    # Pass 1: exact match (order-insensitive via sorted letter units).
    for each_word in dictionary:
        decomposed_dict = WordLetterProcessingHelperMethod.decompose_word_into_letters(
            each_word)
        decomposed_act = WordLetterProcessingHelperMethod.decompose_word_into_letters(
            word)
        norm_dict = WordLetterProcessingHelperMethod.normalize(decomposed_dict)
        norm_act = WordLetterProcessingHelperMethod.normalize(decomposed_act)
        if len(norm_dict) != len(norm_act):
            raise ValueError(
                "Bug Found In 'do_we_need_to_search_in_dictionary'")
        if sorted(norm_dict) == sorted(norm_act):
            return False
    # Pass 2: positional match, tolerating a difference in the last letter only.
    for each_word in dictionary:
        decomposed_dict = WordLetterProcessingHelperMethod.decompose_word_into_letters(
            each_word)
        decomposed_act = WordLetterProcessingHelperMethod.decompose_word_into_letters(
            word)
        norm_dict = WordLetterProcessingHelperMethod.normalize(decomposed_dict)
        norm_act = WordLetterProcessingHelperMethod.normalize(decomposed_act)
        for x in range(0, len(norm_act)):
            # compare letters before last letter
            if x < (len(norm_act) - 1):
                if norm_dict[x] != norm_act[x]:
                    # so diff is in first or middle letters
                    break
            else:
                # so diff is in last letter so ignore it
                return False
    return True
def prepare_master_object(selected_sentence, rnn_op, exp_op, location, undiac_words):
    """Build one MasterObject per non-space character of the sentence.

    Each MasterObject records the undiacritized char, the RNN-predicted and
    expected diacritized chars and their isolated diacritics, the char's
    position inside its word and inside the sentence, and prev/next-char
    availability flags. Finally the per-char objects are stamped with the
    reformed RNN word and the expected word they belong to.

    NOTE(review): the chained `len(rnn_op) != len(exp_op) != len(location)`
    never compares rnn_op with location directly — confirm that is intended.
    NOTE(review): `selected_sentence` is consumed with `del` at the end, so
    the caller's list is mutated.
    """
    list_of_master_object = []
    list_of_word_len = []
    total_length = 0
    letter_counter = 0
    if len(rnn_op) != len(exp_op) != len(location):
        raise Exception("bug found in data")
    # Cumulative letter counts per word — used below to map a sentence-level
    # letter index back to its word.
    for each_word in undiac_words:
        if each_word != 'space':
            decomposed_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                each_word)
            total_length += len(decomposed_word)
            list_of_word_len.append(total_length)
    master = MasterObject()
    for (each_rnn_char, each_exp_char, each_location) in (zip(rnn_op, exp_op, location)):
        if each_rnn_char == 'space':
            continue
        master.undiac_char = WordLetterProcessingHelperMethod.remove_diacritics_from_this(
            each_rnn_char)
        master.rnn_diac_char = each_rnn_char
        # Split the RNN char into base letter + 0, 1 or 2 diacritic marks.
        decomposed_result_1 = WordLetterProcessingHelperMethod.decompose_diac_char_into_char_and_diacritics\
            (each_rnn_char)
        if len(decomposed_result_1) == 1:
            master.rnn_diac = ''
        elif len(decomposed_result_1) == 2 and decomposed_result_1[1] != u'ْ':
            master.rnn_diac = decomposed_result_1[1]
        elif len(decomposed_result_1) == 2 and decomposed_result_1[1] == u'ْ':
            # Sukun is treated as "no diacritic": keep the bare letter.
            master.rnn_diac = ''
            master.rnn_diac_char = master.undiac_char
        elif len(decomposed_result_1) == 3:
            master.rnn_diac = decomposed_result_1[1] + decomposed_result_1[2]
        master.exp_diac_char = each_exp_char
        # Same decomposition for the expected (ground-truth) char.
        decomposed_result_2 = WordLetterProcessingHelperMethod.decompose_diac_char_into_char_and_diacritics\
            (each_exp_char)
        if len(decomposed_result_2) == 1:
            master.exp_diac = ''
        elif len(decomposed_result_2) == 2 and decomposed_result_2[1] != u'ْ':
            master.exp_diac = decomposed_result_2[1]
        elif len(decomposed_result_2) == 2 and decomposed_result_2[1] == u'ْ':
            master.exp_diac = ''
            master.exp_diac_char = master.undiac_char
        elif len(decomposed_result_2) == 3:
            master.exp_diac = decomposed_result_2[1] + decomposed_result_2[2]
        master.location_in_word = each_location
        master.location_in_sent = letter_counter
        master.sentence = selected_sentence
        # Find the word this letter belongs to: first cumulative length that
        # covers the current letter index.
        index = 0
        for each_length_index in range(0, len(list_of_word_len)):
            index = each_length_index
            if list_of_word_len[each_length_index] >= (letter_counter + 1):
                break
        master.undiac_word = undiac_words[index]
        # Neighbour flags derived from the char's position in its word.
        if each_location == 'first' and list_of_word_len[index] != 1:
            master.has_next_char = True
            master.has_prev_char = False
        elif each_location == 'first' and len(list(master.undiac_word)) == 1:
            master.has_next_char = False
            master.has_prev_char = False
        elif each_location == 'middle':
            master.has_next_char = True
            master.has_prev_char = True
        elif each_location == 'last':
            master.has_next_char = False
            master.has_prev_char = True
        # deepcopy: `master` is reused across iterations.
        list_of_master_object.append(deepcopy(master))
        letter_counter += 1
    list_of_rnn_words = WordLetterProcessingHelperMethod.reform_word_from_version_2(
        list_of_master_object)
    # Stamp each char object with the reformed RNN word and the expected word,
    # consuming one word per cumulative-length slice.
    st_range = 0
    for each_number in list_of_word_len:
        en_range = each_number
        for index in range(st_range, en_range):
            try:
                list_of_master_object[index].rnn_diac_word = list_of_rnn_words[
                    0]
                list_of_master_object[index].exp_diac_word = selected_sentence[
                    0]
            except:
                # NOTE(review): bare except silently ignores exhausted word
                # lists — confirm this is a deliberate best-effort.
                x = 1
        del list_of_rnn_words[0]
        del selected_sentence[0]
        st_range = en_range
    return list_of_master_object
# Per-sentence post-processing pipeline: decode RNN output into diacritized
# chars, locate each char in its word, then apply sukun / fatha / dictionary
# corrections.
selected_sentence = DBHelperMethod.get_sentence_by(sentence_number)
rnn_output = ExcelHelperMethod.read_rnn_op_csv_file(path + file_name)
# Winning neuron index (and its value) per output row.
neurons_with_highest_probability, neurons_op_value = RNNOPProcessingHelperMethod.get_neurons_numbers_with_highest_output_value(
    rnn_output)
list_of_available_diac_chars = DBHelperMethod.get_available_diacritized_chars(
)
RNN_Predicted_Diac_Chars = RNNOPProcessingHelperMethod.\
    deduce_from_rnn_op_predicted_chars(list_of_available_diac_chars,
                                       neurons_with_highest_probability)
# Expected OP
OP_Diac_Chars = DBHelperMethod.get_diacritized_chars_by(
    sentence_number, type)
RNN_Predicted_Diac_Chars = WordLetterProcessingHelperMethod.check_target_and_output_letters_are_same(
    deepcopy(RNN_Predicted_Diac_Chars), OP_Diac_Chars)
# NOTE(review): debugging leftover — breakpoint anchor for sentence 45.
if sentence_number == 45:
    x = 1
RNN_Predicted_Chars_Count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(
    selected_sentence)
RNN_Predicted_Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char(
    RNN_Predicted_Diac_Chars, RNN_Predicted_Chars_Count)
WordLetterProcessingHelperMethod.append_neuron_op_value(
    RNN_Predicted_Chars_And_Its_Location, neurons_op_value)
# Post Processing (each stage works on a deepcopy so earlier stages'
# outputs are preserved)
RNN_Predicted_Chars_After_Sukun = SukunCorrection.sukun_correction(
    deepcopy(RNN_Predicted_Chars_And_Its_Location))
RNN_Predicted_Chars_After_Fatha = FathaCorrection.fatha_correction(
    deepcopy(RNN_Predicted_Chars_After_Sukun))
RNN_Predicted_Chars_After_Dictionary = DictionaryCorrection.get_diac_version_with_smallest_dist(
    deepcopy(RNN_Predicted_Chars_After_Fatha), sentence_number)
def fatha_correction(list_of_objects_of_chars_and_its_location):
    """Correct the diacritic of the char preceding a fatha-inducing letter.

    Scans the char/location objects; whenever a letter from the module-level
    `letters_of_fatha_correction` set (teh marbuta, alef, alef maksura) is
    found in a non-first position, the PREVIOUS char's diacritic is rewritten
    by the appropriate `correct_*` helper, with several Arabic-specific
    special cases. Returns the corrected list (same objects, mutated in
    place and appended in order).
    """
    counter = 0
    current_index = 0
    actual_letters_after_fatha_correction = []
    prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    prev_prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    next_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    for each_letter_object in list_of_objects_of_chars_and_its_location:
        actual_letters_after_fatha_correction.append(each_letter_object)
        character = remove_diacritics(each_letter_object.letter)
        if (character in letters_of_fatha_correction) and (each_letter_object.location != 'first'):
            letter_caused_fatha_correction = character
            # Capture the surrounding chars (NFC-normalized) when they exist.
            if (counter - 1) >= 0:
                prev_char_object = list_of_objects_of_chars_and_its_location[counter - 1]
                prev_char_object.letter = unicodedata2.normalize('NFC', str(prev_char_object.letter))
            if (counter - 2) >= 0:
                prev_prev_char_object = list_of_objects_of_chars_and_its_location[counter - 2]
                prev_prev_char_object.letter = unicodedata2.normalize('NFC', prev_prev_char_object.letter)
            if ((counter + 1) <= (len(list_of_objects_of_chars_and_its_location) - 1)) and (each_letter_object.location != 'last'):
                next_char_object = list_of_objects_of_chars_and_its_location[counter + 1]
            # Default: leave the previous char unchanged.
            corrected_char = prev_char_object.letter
            if letter_caused_fatha_correction == u'ة':
                corrected_char = correct_teh_marbota_prev_char(prev_char_object)
            elif letter_caused_fatha_correction == u'ا':
                if each_letter_object.location == 'middle':
                    if remove_diacritics(prev_char_object.letter) == u'ب':
                        # e.g. بِاتِّخَاذِكُمُ , وَبِالْآخِرَةِ , بِالْعُدْوَةِ
                        if u'ّ' in next_char_object.letter or\
                                next_char_object.letter == remove_diacritics(next_char_object.letter):
                            corrected_char = correct_alef_prev_char_ba2_maksora(prev_char_object)
                        # e.g. بَالِغَةٌ , بَاسِرَةٌ
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    elif remove_diacritics(prev_char_object.letter) == u'ل':
                        if prev_char_object.location == 'first':
                            # do not handle this case:
                            # special case with no law (these contradict) لَا , لِامْرَأَتِهِ
                            corrected_char = prev_char_object.letter
                        elif prev_prev_char_object.letter == u'ا':
                            # do not handle this case:
                            # special case with no law (these contradict) الِاسْمُ
                            corrected_char = prev_char_object.letter
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    # e.g. مِائَةَ , مِائَتَيْنِ
                    elif remove_diacritics(prev_char_object.letter) == u'م' \
                            and prev_char_object.location == 'first' \
                            and next_char_object.letter == u'ئَ':
                        corrected_char = correct_alef_prev_char_mem(prev_char_object)
                    else:
                        corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                elif each_letter_object.location == 'last' or each_letter_object.location == 'first':
                    corrected_char = prev_char_object.letter
                else:
                    corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
            elif letter_caused_fatha_correction == u'ى':
                # tanween case, e.g. طُوًى , ضُحًى
                if prev_prev_char_object.location == 'first' and u'ُ' in prev_prev_char_object.letter and \
                        each_letter_object.location == 'last':
                    corrected_char = correct_alef_maksora_prev_char_tanween_case(prev_char_object)
                # normal case, e.g. أَبَى
                else:
                    corrected_char = correct_alef_maksora_prev_char_normal_case(prev_char_object)
            actual_letters_after_fatha_correction[counter - 1].letter = corrected_char
            counter += 1
        else:
            counter += 1
        current_index += 1
    return actual_letters_after_fatha_correction
location = loc[start_range:end_range:1] # Post Processing RNN_Predicted_Chars_And_Its_Location = dp.create_letter_location_object( nn_op_letters, location) RNN_Predicted_Chars_After_Sukun = SukunCorrection.sukun_correction( deepcopy(RNN_Predicted_Chars_And_Its_Location)) RNN_Predicted_Chars_After_Fatha = FathaCorrection.fatha_correction( deepcopy(RNN_Predicted_Chars_After_Sukun)) RNN_Predicted_Chars_After_Dictionary = DictionaryCorrection.get_diac_version_with_smallest_dist_no_db_access( RNN_Predicted_Chars_After_Fatha, undiac_words, dic_words_for_selected_sent) # Expected OP OP_Diac_Chars_Count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this( selected_sentence) OP_Diac_Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char( expected_letters, OP_Diac_Chars_Count, True) OP_Diac_Chars_After_Sukun = SukunCorrection.sukun_correction( deepcopy(OP_Diac_Chars_And_Its_Location)) # DER Calculation error = DERCalculationHelperMethod.get_diacritization_error \ (RNN_Predicted_Chars_After_Dictionary, OP_Diac_Chars_After_Sukun, selected_sentence) error_without_last_letter = DERCalculationHelperMethod.get_diacritization_error_without_counting_last_letter \ (RNN_Predicted_Chars_After_Dictionary, OP_Diac_Chars_After_Sukun, selected_sentence) # write error in excel file excel_1 = current_row_1 current_row_1 = ExcelHelperMethod.write_data_into_excel_file(
# For every (RNN output file, sentence) pair: decode predicted diacritics,
# re-attach them to the undiacritized chars, locate each char in its word,
# then run the sukun / fatha / dictionary correction stages.
for file_name, sentence_number in zip(result, list_of_sentence_numbers):
    selected_sentence = DBHelperMethod.get_sentence_by(sentence_number)
    rnn_output = ExcelHelperMethod.read_rnn_op_csv_file(path + file_name)
    neurons_with_highest_probability = RNNOPProcessingHelperMethod.get_neurons_numbers_with_highest_output_value(
        rnn_output)
    # Here the network predicts diacritics only (not full diac chars).
    list_of_available_diacritics = DBHelperMethod.get_all_diacritics()
    RNN_Predicted_diacritics = RNNOPProcessingHelperMethod.\
        deduce_from_rnn_op_predicted_chars(list_of_available_diacritics,
                                           neurons_with_highest_probability)
    IP_Undiacritized_Chars = DBHelperMethod.get_un_diacritized_chars_by(
        sentence_number, type)
    RNN_Predicted_chars = WordLetterProcessingHelperMethod.attach_diacritics_to_chars(
        IP_Undiacritized_Chars, RNN_Predicted_diacritics)
    RNN_Predicted_Chars_Count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(
        selected_sentence)
    RNN_Predicted_Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char(
        RNN_Predicted_chars, RNN_Predicted_Chars_Count)
    # Post Processing (each stage gets a deepcopy so intermediates survive)
    RNN_Predicted_Chars_After_Sukun = SukunCorrection.sukun_correction(
        deepcopy(RNN_Predicted_Chars_And_Its_Location))
    RNN_Predicted_Chars_After_Fatha = FathaCorrection.fatha_correction(
        deepcopy(RNN_Predicted_Chars_After_Sukun))
    RNN_Predicted_Chars_After_Dictionary = DictionaryCorrection.get_diac_version_with_smallest_dist(
        deepcopy(RNN_Predicted_Chars_After_Fatha), sentence_number)
    # Expected OP
def get_diac_version_with_smallest_dist(list_of_objects):
    """Replace each predicted word with its closest dictionary diacritization.

    Reforms words from the char/location objects, fetches every dictionary
    diacritization of the undiacritized word from the DB, and — when a lookup
    is warranted (see `do_we_need_to_search_in_dictionary`) — picks the
    dictionary word with the smallest per-letter diacritic distance. The
    chosen words are split back into chars and written onto the input
    objects, which are returned.

    Raises Exception when word/char counts get out of sync (internal bug).
    """
    list_of_actual_words_after_dictionary_correction = []
    list_of_undiac_objects = WordLetterProcessingHelperMethod.remove_diacritics_from(
        list_of_objects)
    list_of_undiac_words = WordLetterProcessingHelperMethod.reform_word_from(
        list_of_undiac_objects)
    diacritized_rnn_op_words = WordLetterProcessingHelperMethod.reform_word_from(
        list_of_objects)
    if len(diacritized_rnn_op_words) != len(list_of_undiac_words):
        raise Exception(
            "error appeared in get_diac_version_with_smallest_dist")
    for each_corrected_word, each_un_diacritized_word in zip(
            diacritized_rnn_op_words, list_of_undiac_words):
        minimum_error = 100000000
        dictionary_diacritized_words = DBHelperMethod.\
            get_dictionary_all_diacritized_version_of(each_un_diacritized_word)
        # No dictionary entry: fall back to the word as predicted.
        if len(dictionary_diacritized_words) == 0:
            dictionary_diacritized_words.append(each_corrected_word)
        dictionary_diacritized_words_after_sukun_correction = SukunCorrection.\
            sukun_correction_for_list_of_words(dictionary_diacritized_words)
        if do_we_need_to_search_in_dictionary(
                dictionary_diacritized_words_after_sukun_correction,
                each_corrected_word):
            # Score every candidate by diacritic distance; keep the closest.
            for each_word in dictionary_diacritized_words_after_sukun_correction:
                error_count = 0
                decomposed_dic_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_word)
                decomposed_act_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_corrected_word)
                norm_dic_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_dic_word)
                norm_act_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_act_word)
                for each_diacritized_version_letter, each_current_word_letter in zip(
                        norm_dic_word, norm_act_word):
                    # Differing decomposition length => missing/extra marks;
                    # weight by how many marks differ.
                    if (len(each_diacritized_version_letter) - len(each_current_word_letter) == 1) or \
                            ((len(each_diacritized_version_letter) - len(each_current_word_letter) == -1)):
                        error_count += 1
                    elif (len(each_diacritized_version_letter) - len(each_current_word_letter) == 2) or \
                            ((len(each_diacritized_version_letter) - len(each_current_word_letter) == -2)):
                        error_count += 2
                    else:
                        # Same length: compare mark by mark.
                        for each_item_in_diacritized_version, each_item_in_current_word in \
                                zip(each_diacritized_version_letter, each_current_word_letter):
                            if each_item_in_diacritized_version != each_item_in_current_word:
                                error_count += 1
                if error_count < minimum_error:
                    minimum_error = error_count
                    selected_dictionary_word = each_word
            list_of_actual_words_after_dictionary_correction.append(
                selected_dictionary_word)
        else:
            list_of_actual_words_after_dictionary_correction.append(
                each_corrected_word)
    chars_after_dic_correction = WordLetterProcessingHelperMethod.convert_list_of_words_to_list_of_chars(
        list_of_actual_words_after_dictionary_correction)
    if len(list_of_objects) != len(chars_after_dic_correction):
        raise Exception("Error Happened Here")
    # Write the corrected chars back onto the input objects.
    for x in range(0, len(list_of_objects)):
        list_of_objects[x].letter = chars_after_dic_correction[x]
    return list_of_objects
selected_sentence = DBHelperMethod.get_sentence_by(sentence_number) rnn_input = DBHelperMethod.get_un_diacritized_chars_by(sentence_number, type) rnn_output = ExcelHelperMethod.read_rnn_op_csv_file(path + file_name) neurons_with_highest_probability, neurons_op_value = RNNOPProcessingHelperMethod.\ get_neurons_numbers_with_highest_output_value(rnn_output) list_of_available_diac_chars = DBHelperMethod.get_available_diacritics_and_un_diacritized_chars() RNN_Predicted_Diac_Chars = RNNOPProcessingHelperMethod.\ deduce_from_rnn_op_predicted_chars(list_of_available_diac_chars, neurons_with_highest_probability) # Expected OP OP_Diac_Chars = DBHelperMethod.get_diacritized_chars_by(sentence_number, type) # RNN_Predicted_Diac_Chars = WordLetterProcessingHelperMethod.check_target_and_output_letters_are_same(deepcopy(RNN_Predicted_Diac_Chars), OP_Diac_Chars) RNN_Predicted_Chars_Count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(selected_sentence) RNN_Predicted_Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char(RNN_Predicted_Diac_Chars, RNN_Predicted_Chars_Count) WordLetterProcessingHelperMethod.append_neuron_op_value(RNN_Predicted_Chars_And_Its_Location, neurons_op_value) # check below line if counter == 36: x = 1 WordLetterProcessingHelperMethod.append_diacritics_with_un_diacritized_char(RNN_Predicted_Chars_And_Its_Location, rnn_input, list_of_available_diac_chars) # Post Processing RNN_Predicted_Chars_After_Sukun = SukunCorrection.sukun_correction(deepcopy(RNN_Predicted_Chars_And_Its_Location)) RNN_Predicted_Chars_After_Fatha = FathaCorrection.fatha_correction(deepcopy(RNN_Predicted_Chars_After_Sukun)) RNN_Predicted_Chars_After_Dictionary = DictionaryCorrection.get_diac_version_with_smallest_dist(deepcopy(RNN_Predicted_Chars_After_Fatha), sentence_number) # Expected OP OP_Diac_Chars = DBHelperMethod.get_diacritized_chars_by(sentence_number, type) OP_Diac_Chars_Count = 
WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(
def get_diac_version_with_smallest_dist_no_db_access_version_2(
        master_object, dic_words):
    """Dictionary-correct the RNN words of `master_object` without DB access.

    For each word of the sentence (taken in sentence order from
    master_object[0].sentence), looks up all diacritized versions in the
    in-memory `dic_words` array (numpy, undiacritized form matched with
    np.where) and, when a lookup is warranted, selects the version with the
    fewest per-letter diacritic mismatches — the last letter is excluded
    because its diacritic depends on sentence context. The chosen chars,
    diacritics and words are written back onto the MasterObjects.

    Returns the (mutated) `master_object` list.
    Raises Exception when the merged char count disagrees with the number of
    MasterObjects (internal bug).
    """
    undiac_words = []
    rnn_diac_words = []
    for each_word in master_object[0].sentence:
        undiac_words.append(
            WordLetterProcessingHelperMethod.remove_diacritics_from_this_word(
                each_word))
    # Recover the RNN word for each sentence word (first matching object).
    for each_word in undiac_words:
        for each_object in master_object:
            if each_object.undiac_word == each_word:
                rnn_diac_words.append(each_object.rnn_diac_word)
                break
    selected_dictionary_word = ''
    selected_norm_dictionary_word = ''
    output = []
    for each_word, undiac_word in zip(rnn_diac_words, undiac_words):
        # Column 0 of matching rows holds the diacritized versions.
        rows, cols = np.where(dic_words == undiac_word)
        dictionary_diacritized_words = (dic_words[rows, 0]).tolist()
        # no dictionary data found: keep the RNN prediction as-is
        if len(dictionary_diacritized_words) == 0:
            output.append(extract_data_in_req_format(
                WordLetterProcessingHelperMethod.normalize(
                    WordLetterProcessingHelperMethod.decompose_word_into_letters(
                        each_word)),
                each_word))
        else:
            dict_words_after_sukun_correction = SukunCorrection.\
                sukun_correction_for_list_of_words(dictionary_diacritized_words)
            if not (do_we_need_to_search_in_dictionary(
                    dict_words_after_sukun_correction, each_word)):
                # Already matches the dictionary closely enough — keep it.
                output.append(extract_data_in_req_format(
                    WordLetterProcessingHelperMethod.normalize(
                        WordLetterProcessingHelperMethod.decompose_word_into_letters(
                            each_word)),
                    each_word))
            else:
                minimum_error = 100000000
                for each_dic_word in dict_words_after_sukun_correction:
                    error_count = 0
                    norm_dic_word = WordLetterProcessingHelperMethod.normalize(
                        WordLetterProcessingHelperMethod.decompose_word_into_letters(
                            each_dic_word))
                    norm_act_word = WordLetterProcessingHelperMethod.normalize(
                        WordLetterProcessingHelperMethod.decompose_word_into_letters(
                            each_word))
                    # unify last char because it depends on context
                    norm_dic_word[-1] = norm_act_word[-1]
                    for each_dic_letter, each_act_letter in zip(
                            norm_dic_word, norm_act_word):
                        if each_dic_letter[0] != each_act_letter[
                                0] or each_dic_letter[1] != each_act_letter[1]:
                            error_count += 1
                    if error_count < minimum_error:
                        minimum_error = error_count
                        selected_norm_dictionary_word = norm_dic_word
                        selected_dictionary_word = each_dic_word
                output.append(
                    extract_data_in_req_format(selected_norm_dictionary_word,
                                               selected_dictionary_word))
    merged = list(itertools.chain(*output))
    if len(merged) != len(master_object):
        # BUGFIX: the original built Exception("bug found") without raising
        # it, so the guard was a silent no-op.
        raise Exception("bug found")
    # Write the corrected per-char data back onto the MasterObjects.
    for index, each_merged_object in enumerate(merged):
        master_object[index].rnn_diac_char = each_merged_object.letter
        master_object[index].rnn_diac = each_merged_object.diac
        master_object[index].rnn_diac_word = each_merged_object.word
    return master_object