コード例 #1
0
    def change_n_random_words(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter one diacritic in each of ``n`` words of ``words``.

        Words are visited in index order, or in a random order when
        ``shuffle`` is true. For every visited word a diacritic slot is drawn
        at random and replaced by whatever ``get_different_haraqah`` returns
        for the current value.

        Args:
            n: Number of words that must be altered.
            words: Words to alter; the list is modified in place.
            except_idxs: Word indices that must be left untouched.
            shuffle: Whether to randomise the visiting order.

        Returns:
            A ``(words, changed_idxs)`` tuple where ``words`` is the same list
            object that was passed in and ``changed_idxs`` lists the altered
            word indices in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered.
        """
        candidates = list(range(len(words)))
        if shuffle:
            random.shuffle(candidates)

        changed_idxs = []
        for position in candidates:
            # Stop as soon as the requested number of words has been altered.
            if len(changed_idxs) >= n:
                break
            if except_idxs and position in except_idxs:
                continue

            _, letters, marks = extract_haraqat(words[position])

            slot: int = random.randint(0, len(marks) - 1)
            marks[slot] = get_different_haraqah(marks[slot])
            words[position] = combine_txt_and_haraqat(letters, marks)

            changed_idxs.append(position)

        # The caller asked for exactly n changes; anything less is an error.
        assert len(changed_idxs) == n

        return words, changed_idxs
コード例 #2
0
    def change_n_random_core_word(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter the first diacritic slot of ``n`` words of ``words``.

        Words are visited in index order, or in a random order when
        ``shuffle`` is true. A word is skipped when its index is in
        ``except_idxs`` or when ``count_diacritics(word, skip_count_equal=2)``
        reports fewer than two. For every remaining word the diacritic at
        slot 0 is replaced by whatever ``get_different_haraqah`` returns for
        the current value.

        Args:
            n: Number of words that must be altered.
            words: Words to alter; the list is modified in place.
            except_idxs: Word indices that must be left untouched.
            shuffle: Whether to randomise the visiting order.

        Returns:
            A ``(words, changed_idxs)`` tuple where ``words`` is the same list
            object that was passed in and ``changed_idxs`` lists the altered
            word indices in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered, or if
                an alteration left the word unchanged once its case ending is
                removed.
        """
        word_indices = list(range(len(words)))
        if shuffle:
            random.shuffle(word_indices)

        # Built once so each membership test is O(1).
        excluded = set(except_idxs) if except_idxs else set()
        changed_idxs = []

        for index in word_indices:
            # Stop as soon as the requested number of words has been altered.
            if len(changed_idxs) >= n:
                break
            if index in excluded:
                continue

            word = words[index]
            if count_diacritics(word, skip_count_equal=2) < 2:
                continue

            _, word_chars, word_diacritics = extract_haraqat(word)

            # NOTE(review): the altered slot is fixed at 0. Randomising it
            # (while avoiding the case-ending slot) would broaden coverage,
            # but must keep the assertion below true -- confirm before
            # changing.
            target = 0
            word_diacritics[target] = get_different_haraqah(
                word_diacritics[target])
            words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

            # The change must be visible outside the case ending.
            assert get_word_without_case_ending(
                words[index]) != get_word_without_case_ending(word)

            changed_idxs.append(index)

        # The caller asked for exactly n changes; anything less is an error.
        assert len(changed_idxs) == n

        return words, changed_idxs
コード例 #3
0
    def test_get_different_haraqah(self):
        """Check that ``get_different_haraqah`` never returns its own input.

        Every key of ``ALL_POSSIBLE_HARAQAT`` is tried; the scan stops at the
        first key for which the returned value equals the input.
        """
        returned_same_value = any(
            haraqah == get_different_haraqah(haraqah)
            for haraqah in ALL_POSSIBLE_HARAQAT.keys())
        self.assertTrue(not returned_same_value)
コード例 #4
0
    def test_der_case_and_not_case_ending(self):
        """Check ``calculate_der`` with and without case-ending positions.

        ``self.case_ending_change`` case-ending diacritics and
        ``self.not_case_ending_change`` other diacritics of ``self.content``
        are replaced, at randomly chosen positions, by the value
        ``get_different_haraqah`` returns. The resulting text is scored
        against the original twice:

        * with default arguments, expecting ``self.number_of_changes`` errors
          out of all diacritic positions;
        * with ``case_ending=False``, expecting only the non-case-ending
          errors out of the non-case-ending positions.
        """
        _, letters, marks = extract_haraqat(self.content)
        assert self.number_of_changes <= len(marks)

        case_ending_indices = get_case_ending_indices_from_un_diacritized_txt(
            letters)

        # Split all positions into case-ending and non-case-ending ones.
        case_ending_lookup = set(case_ending_indices)
        not_case_ending_indices = [
            position for position in range(len(marks))
            if position not in case_ending_lookup
        ]

        # Corrupt the requested number of case-ending diacritics.
        random.shuffle(case_ending_indices)
        for rank in range(self.case_ending_change):
            slot = case_ending_indices[rank]
            marks[slot] = get_different_haraqah(marks[slot])

        # Corrupt the requested number of remaining diacritics.
        random.shuffle(not_case_ending_indices)
        for rank in range(self.not_case_ending_change):
            slot = not_case_ending_indices[rank]
            marks[slot] = get_different_haraqah(marks[slot])

        predicted_content = combine_txt_and_haraqat(letters, marks)

        # All positions counted.
        overall_der = calculate_der(self.content, predicted_content)
        expected_overall = calculate_rate(
            len(marks) - self.number_of_changes, self.number_of_changes)
        self.assertEqual(expected_overall, overall_der)

        # Case-ending positions left out of the score.
        core_der = calculate_der(self.content,
                                 predicted_content,
                                 case_ending=False)
        expected_core = calculate_rate(
            len(marks) - len(case_ending_indices),
            self.not_case_ending_change)
        self.assertEqual(expected_core, core_der)
コード例 #5
0
    def change_n_random_words_last_char(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter the case-ending diacritic of ``n`` words of ``words``.

        Words are visited in index order, or in a random order when
        ``shuffle`` is true. A word is skipped when its index is in
        ``except_idxs``, when ``count_diacritics(word, skip_count_equal=2)``
        reports fewer than two, or when
        ``get_case_ending_indices_from_un_diacritized_txt`` does not return
        exactly one index for it. For every remaining word the diacritic at
        that index is replaced by whatever ``get_different_haraqah`` returns
        for the current value.

        Args:
            n: Number of words that must be altered.
            words: Words to alter; the list is modified in place.
            except_idxs: Word indices that must be left untouched.
            shuffle: Whether to randomise the visiting order.

        Returns:
            A ``(words, changed_idxs)`` tuple where ``words`` is the same list
            object that was passed in and ``changed_idxs`` lists the altered
            word indices in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered.
        """
        word_indices = list(range(len(words)))
        if shuffle:
            random.shuffle(word_indices)

        # Built once so each membership test is O(1).
        excluded = set(except_idxs) if except_idxs else set()
        changed_idxs = []

        for index in word_indices:
            # Stop as soon as the requested number of words has been altered.
            if len(changed_idxs) >= n:
                break
            if index in excluded:
                continue

            word = words[index]
            # Check eligibility before splitting the word into letters and
            # diacritics, so skipped words are never split.
            if count_diacritics(word, skip_count_equal=2) < 2:
                continue

            _, word_chars, word_diacritics = extract_haraqat(word)

            indices = get_case_ending_indices_from_un_diacritized_txt(
                word_chars)
            # Only words with exactly one case-ending index are altered.
            if len(indices) != 1:
                continue

            last_idx = indices[-1]
            word_diacritics[last_idx] = get_different_haraqah(
                word_diacritics[last_idx])
            words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

            changed_idxs.append(index)

        # The caller asked for exactly n changes; anything less is an error.
        assert len(changed_idxs) == n

        return words, changed_idxs