def change_n_random_words(
    self,
    n: int,
    words: List[str],
    except_idxs: List[int] = None,
    shuffle: bool = True,
):
    """Corrupt one diacritic in each of ``n`` words of ``words``.

    Words are visited in random order (or in list order when ``shuffle`` is
    false). For every visited word that is not excluded, one diacritic slot
    returned by ``extract_haraqat`` is picked at random and replaced with the
    value returned by ``get_different_haraqah``; the word is then rebuilt
    with ``combine_txt_and_haraqat``.

    Args:
        n: Number of words that must be changed.
        words: Words to corrupt. The list is modified in place.
        except_idxs: Positions in ``words`` that must be left untouched.
        shuffle: Visit the words in random order when true.

    Returns:
        A ``(words, changed_idxs)`` tuple: the same list object that was
        passed in, and the positions that were changed, in visiting order.

    Raises:
        AssertionError: If the number of changed words is not ``n``.
    """
    word_indices = list(range(len(words)))
    if shuffle:
        random.shuffle(word_indices)

    # Build the exclusion set once so each membership test is O(1).
    excluded = set(except_idxs) if except_idxs else set()

    changed_idxs = []
    for index in word_indices:
        if len(changed_idxs) >= n:
            break
        if index in excluded:
            continue

        _, word_chars, word_diacritics = extract_haraqat(words[index])
        slot_count = len(word_diacritics)
        if slot_count == 0:
            # Nothing to corrupt in this word; picking a random slot from an
            # empty range would raise ValueError, so try the next word.
            continue

        slot = random.randrange(slot_count)
        word_diacritics[slot] = get_different_haraqah(word_diacritics[slot])
        words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)
        changed_idxs.append(index)

    assert len(changed_idxs) == n
    return words, changed_idxs
def change_n_random_core_word(
    self,
    n: int,
    words: List[str],
    except_idxs: List[int] = None,
    shuffle: bool = True,
):
    """Corrupt one diacritic outside the case ending in each of ``n`` words.

    Words are visited in random order (or in list order when ``shuffle`` is
    false). A word is skipped when it is excluded or when
    ``count_diacritics(word, skip_count_equal=2)`` is below 2. For the
    remaining words one diacritic slot is replaced with the value returned
    by ``get_different_haraqah``:

    * if ``get_case_ending_indices_from_un_diacritized_txt`` reports exactly
      one position, the slot is chosen at random among all other positions;
    * otherwise the first slot (position 0) is used.

    Args:
        n: Number of words that must be changed.
        words: Words to corrupt. The list is modified in place.
        except_idxs: Positions in ``words`` that must be left untouched.
        shuffle: Visit the words in random order when true.

    Returns:
        A ``(words, changed_idxs)`` tuple: the same list object that was
        passed in, and the positions that were changed, in visiting order.

    Raises:
        AssertionError: If a change did not alter the result of
            ``get_word_without_case_ending`` for the word, or if the number
            of changed words is not ``n``.
    """
    word_indices = list(range(len(words)))
    if shuffle:
        random.shuffle(word_indices)

    # Build the exclusion set once so each membership test is O(1).
    excluded = set(except_idxs) if except_idxs else set()

    changed_idxs = []
    for index in word_indices:
        if len(changed_idxs) >= n:
            break
        if index in excluded:
            continue

        original_word = words[index]
        if count_diacritics(original_word, skip_count_equal=2) < 2:
            continue

        _, word_chars, word_diacritics = extract_haraqat(original_word)
        indices = get_case_ending_indices_from_un_diacritized_txt(word_chars)
        if len(indices) == 1:
            # Any slot except the single case-ending position.
            choices = [
                val for val in range(len(word_diacritics))
                if val != indices[-1]
            ]
            rand = random.choice(choices)
        else:
            # No unique case-ending position: change the first slot.
            # NOTE(review): presumably position 0 is never a case ending for
            # words that pass the count_diacritics guard - confirm.
            rand = 0

        word_diacritics[rand] = get_different_haraqah(word_diacritics[rand])
        words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

        # The change must be visible once the case ending is stripped.
        assert get_word_without_case_ending(
            words[index]) != get_word_without_case_ending(original_word)
        changed_idxs.append(index)

    assert len(changed_idxs) == n
    return words, changed_idxs
def test_get_different_haraqah(self):
    """Check that ``get_different_haraqah`` never returns its own input.

    Every key of ``ALL_POSSIBLE_HARAQAT`` is passed through
    ``get_different_haraqah``; the test fails as soon as one result compares
    equal to the key it was computed from.
    """
    returned_same = any(
        haraqah == get_different_haraqah(haraqah)
        for haraqah in ALL_POSSIBLE_HARAQAT.keys()
    )
    self.assertTrue(not returned_same)
def test_der_case_and_not_case_ending(self):
    """Check ``calculate_der`` with and without case-ending positions.

    The diacritics of ``self.content`` are extracted, then
    ``self.case_ending_change`` randomly chosen case-ending positions and
    ``self.not_case_ending_change`` randomly chosen other positions are
    replaced via ``get_different_haraqah``. The rate returned by
    ``calculate_der`` for the corrupted text is compared with the value
    computed by ``calculate_rate``, first with default arguments and then
    with ``case_ending=False``.
    """
    _, text, haraqat = extract_haraqat(self.content)
    assert self.number_of_changes <= len(haraqat)

    case_ending_indices = get_case_ending_indices_from_un_diacritized_txt(
        text)
    case_ending_lookup = set(case_ending_indices)
    not_case_ending_indices = [
        position for position in range(len(haraqat))
        if position not in case_ending_lookup
    ]

    def corrupt(positions, how_many):
        # Shuffle in place, then replace the diacritic at the first
        # `how_many` shuffled positions with a different one.
        random.shuffle(positions)
        for k in range(how_many):
            target = positions[k]
            haraqat[target] = get_different_haraqah(haraqat[target])

    corrupt(case_ending_indices, self.case_ending_change)
    corrupt(not_case_ending_indices, self.not_case_ending_change)

    predicted_content = combine_txt_and_haraqat(text, haraqat)

    # Rate over every position.
    actual_rate = calculate_der(self.content, predicted_content)
    expected_rate = calculate_rate(
        len(haraqat) - self.number_of_changes, self.number_of_changes)
    self.assertEqual(expected_rate, actual_rate)

    # Rate with case_ending=False.
    # NOTE(review): the first argument here is the full count of
    # non-case-ending positions, whereas the check above passes the total
    # minus the number of changes - confirm against calculate_rate.
    actual_rate = calculate_der(
        self.content, predicted_content, case_ending=False)
    expected_rate = calculate_rate(
        len(haraqat) - len(case_ending_indices),
        self.not_case_ending_change)
    self.assertEqual(expected_rate, actual_rate)
def change_n_random_words_last_char(
    self,
    n: int,
    words: List[str],
    except_idxs: List[int] = None,
    shuffle: bool = True,
):
    """Corrupt the case-ending diacritic in each of ``n`` words.

    Words are visited in random order (or in list order when ``shuffle`` is
    false). A word is skipped when it is excluded, when
    ``count_diacritics(word, skip_count_equal=2)`` is below 2, or when
    ``get_case_ending_indices_from_un_diacritized_txt`` does not report
    exactly one position for it. For the remaining words the diacritic at
    that single position is replaced with the value returned by
    ``get_different_haraqah``.

    Args:
        n: Number of words that must be changed.
        words: Words to corrupt. The list is modified in place.
        except_idxs: Positions in ``words`` that must be left untouched.
        shuffle: Visit the words in random order when true.

    Returns:
        A ``(words, changed_idxs)`` tuple: the same list object that was
        passed in, and the positions that were changed, in visiting order.

    Raises:
        AssertionError: If the number of changed words is not ``n``.
    """
    word_indices = list(range(len(words)))
    if shuffle:
        random.shuffle(word_indices)

    # Build the exclusion set once so each membership test is O(1).
    excluded = set(except_idxs) if except_idxs else set()

    changed_idxs = []
    for index in word_indices:
        if len(changed_idxs) >= n:
            break
        if index in excluded:
            continue
        # Cheap guard first, so rejected words are never split.
        if count_diacritics(words[index], skip_count_equal=2) < 2:
            continue

        _, word_chars, word_diacritics = extract_haraqat(words[index])
        indices = get_case_ending_indices_from_un_diacritized_txt(word_chars)
        if len(indices) != 1:
            continue

        last_idx = indices[-1]
        word_diacritics[last_idx] = get_different_haraqah(
            word_diacritics[last_idx])
        words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)
        changed_idxs.append(index)

    assert len(changed_idxs) == n
    return words, changed_idxs