def test_find_lcs_length_calls_required_function(self, mock):
        """
        Check that running find_lcs_length triggers the mocked callable.

        ``mock`` is presumably a patched fill_lcs_matrix injected by a
        decorator that is not visible here - confirm at the class level.
        """
        tokens = ('the', 'dog', 'is', 'running')
        threshold = 0.3

        # The return value is irrelevant; only the side effect on the
        # mock is inspected afterwards.
        find_lcs_length(tokens, tokens, threshold)

        was_called = mock.called
        self.assertTrue(was_called)
def test_till_calculate_plagiarism_score():
    """
    Walk two sample sentences through the LCS pipeline and report each step.

    Prints the tokens, the LCS length, the LCS matrix, the LCS itself and
    the plagiarism score, then returns the score produced by
    main.calculate_plagiarism_score.
    """
    origin_text = 'the big cat is sleeping'
    susp_text = 'the cat is big'
    origin_tokens, susp_tokens = tokenize(origin_text), tokenize(susp_text)

    print(f'Raw text: {origin_text}')
    print(f'Tokenized text: {origin_tokens}\n\n')

    # A threshold of 0.0 is passed so the length is requested without any
    # minimum-similarity cut-off (presumably - confirm in main).
    lcs_length = main.find_lcs_length(
        origin_tokens, susp_tokens, plagiarism_threshold=0.0)
    length_report = (
        'A length of the longest common subsequence for \n\n'
        f'{origin_text} \n\nand \n\n{susp_text}: \n\n{lcs_length} \n'
    )
    print(length_report)

    lcs_matrix = main.fill_lcs_matrix(origin_tokens, susp_tokens)
    print('A matrix:')
    # One matrix row per output line, followed by an empty line.
    print(*lcs_matrix, sep='\n', end='\n\n')

    common_subsequence = main.find_lcs(origin_tokens, susp_tokens, lcs_matrix)
    print(f'The longest common subsequence: {common_subsequence}')

    plagiarism_score = main.calculate_plagiarism_score(lcs_length,
                                                       susp_tokens)
    print(f'The plagiarism score: {plagiarism_score:.2f}\n')
    return plagiarism_score
    def test_find_lcs_length_empty_input(self):
        """
        Check that find_lcs_length yields 0 when either sentence
            is an empty tuple
        """
        no_tokens = ()
        some_tokens = ('a', 'boy', 'plays', 'with', 'ball')
        threshold = 0.3
        expected_length = 0

        # Evaluate both argument orders up front, then verify each result.
        argument_orders = ((no_tokens, some_tokens), (some_tokens, no_tokens))
        outcomes = [find_lcs_length(first, second, threshold)
                    for first, second in argument_orders]

        for outcome in outcomes:
            self.assertEqual(expected_length, outcome)
    def test_find_lcs_length_incorrect_inputs(self):
        """
        Tests that find_lcs_length function
            returns -1 when either sentence argument is malformed

        Every bad value is tried in both argument positions. Each value
        runs inside its own subTest, so a failure names the offending
        input and the remaining values are still exercised.
        """
        expected = -1
        bad_inputs = [[], {}, '', 9.22, -1, 0, -6, None, True, (None, None)]
        patches_sentence = ('the', 'dog', 'is', 'running')
        plagiarism_threshold = 0.3

        for bad_input in bad_inputs:
            # Without subTest the loop would abort on the first failing
            # value and hide which one it was.
            with self.subTest(bad_input=bad_input):
                actual = find_lcs_length(bad_input, patches_sentence,
                                         plagiarism_threshold)
                actual_reversed = find_lcs_length(patches_sentence, bad_input,
                                                  plagiarism_threshold)
                self.assertEqual(expected, actual)
                self.assertEqual(expected, actual_reversed)
    def test_find_lcs_length_reversed_behaviour(self):
        """
        Check that find_lcs_length gives the same length (5)
            regardless of the order of the two sentences
        """
        dog_sentence = ('the', 'dog', 'is', 'running',
                        'inside', 'the', 'house')
        cat_sentence = ('the', 'cat', 'is', 'sleeping',
                        'inside', 'the', 'house')
        threshold = 0.3
        expected_length = 5

        forward = find_lcs_length(dog_sentence, cat_sentence, threshold)
        backward = find_lcs_length(cat_sentence, dog_sentence, threshold)

        for result in (forward, backward):
            self.assertEqual(expected_length, result)
    def test_find_lcs_length_different_sized_inputs(self):
        """
        Check that find_lcs_length copes with sentences of unequal
            length (5 tokens versus 7), in either argument order
        """
        shorter = ('the', 'dog', 'is', 'running', 'inside')
        longer = ('the', 'cat', 'is', 'sleeping', 'inside', 'the', 'house')
        threshold = 0.3
        expected_length = 3

        # Both calls are made before any assertion runs.
        short_first, long_first = (
            find_lcs_length(shorter, longer, threshold),
            find_lcs_length(longer, shorter, threshold),
        )

        self.assertEqual(expected_length, short_first)
        self.assertEqual(expected_length, long_first)
    def test_find_lcs_length_output_check(self):
        """
        Tests that find_lcs_length function
            returns an int for two valid sentences

        Only the type of the result is checked, not its value.
        """
        sentence_first = ('the', 'dog', 'is', 'running', 'here')
        sentence_second = ('a', 'boy', 'plays', 'with', 'ball')
        plagiarism_threshold = 0.3

        actual = find_lcs_length(sentence_first, sentence_second,
                                 plagiarism_threshold)
        # assertIsInstance reports the offending object and the expected
        # type on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(actual, int)
    def test_find_lcs_length_threshold_behaviour(self):
        """
        Check that find_lcs_length does not hand back the raw 2/3 value
            for two three-token sentences and a 0.3 threshold
        """
        dog_tokens = ('the', 'dog', 'is')
        cat_tokens = ('the', 'cat', 'is')
        threshold = 0.3

        # Two of the three tokens coincide; presumably the function compares
        # this ratio with the threshold internally instead of returning it -
        # confirm against find_lcs_length.
        forbidden_result = 2 / 3

        result = find_lcs_length(dog_tokens, cat_tokens, threshold)
        self.assertNotEqual(forbidden_result, result)
    def test_find_lcs_length_incorrect_threshold(self):
        """
        Tests that find_lcs_length function
            returns -1 when the threshold argument is malformed

        Each bad threshold runs inside its own subTest, so a failure
        names the offending value and the remaining values are still
        exercised.
        """
        expected = -1
        bad_inputs = [[], {}, '', -1, -6.34, -6, 1.2, None, True, (None, None)]
        patches_sentence = ('the', 'dog', 'is', 'running')

        for bad_input in bad_inputs:
            # Without subTest the loop would abort on the first failing
            # value and hide which one it was.
            with self.subTest(plagiarism_threshold=bad_input):
                actual = find_lcs_length(patches_sentence, patches_sentence,
                                         bad_input)
                self.assertEqual(expected, actual)
    def test_find_lcs_length_no_diff(self):
        """
        Check that find_lcs_length yields 0 for two sentences
            that share no tokens at all
        """
        dog_tokens = ('the', 'dog', 'is', 'running', 'here')
        boy_tokens = ('a', 'boy', 'plays', 'with', 'ball')
        threshold = 0.3
        expected_length = 0

        self.assertEqual(
            expected_length,
            find_lcs_length(dog_tokens, boy_tokens, threshold),
        )
    def test_find_lcs_length_ideal(self):
        """
        Check that find_lcs_length handles a simple case:
            two four-token sentences are expected to give 2
        """
        dog_tokens = ('the', 'dog', 'is', 'running')
        cat_tokens = ('the', 'cat', 'is', 'sleeping')
        threshold = 0.3
        expected_length = 2

        result = find_lcs_length(dog_tokens, cat_tokens, threshold)

        self.assertEqual(expected_length, result)
    def test_find_lcs_length_complex(self):
        """
        Check that find_lcs_length handles a longer case:
            two seven-token sentences are expected to give 5
        """
        dog_tokens = ('the', 'dog', 'is', 'running',
                      'inside', 'the', 'house')
        cat_tokens = ('the', 'cat', 'is', 'sleeping',
                      'inside', 'the', 'house')
        threshold = 0.3
        expected_length = 5

        self.assertEqual(
            expected_length,
            find_lcs_length(dog_tokens, cat_tokens, threshold),
        )
# Example #13
# 0
import main
# Demo script: runs the functions of `main` one after another on two short
# three-line sample texts and prints every intermediate result.
ORIGINAL_TEXT = 'I have a dog.\nHis name is Nemo.\nI found him yesterday'
SUSPICIOUS_TEXT = 'I have a cat.\nHer name is Anny.\nI met her yesterday'

# Presumably yields one token sequence per text line - confirm in main.
tokenized_orig_text = main.tokenize_by_lines(ORIGINAL_TEXT)
tokenized_susp_text = main.tokenize_by_lines(SUSPICIOUS_TEXT)
print(f"Original text tokens: {tokenized_orig_text}\nSuspicious text tokens: {tokenized_susp_text}\n")

# NOTE(review): the variable names and the printed messages below say
# "first", but index 2 selects the third element of each tokenized text.
# Confirm which one is intended.
orig_first_sent = tokenized_orig_text[2]
susp_first_sent = tokenized_susp_text[2]

# The zero matrix is created here but not used anywhere later in this script.
zero_matrix_first = main.create_zero_matrix(len(orig_first_sent), len(susp_first_sent))
lcs_matrix = main.fill_lcs_matrix(orig_first_sent, susp_first_sent)
print(f"Filled LCS matrix for first sentences: {lcs_matrix}\n")

# 0.3 is passed as the third argument (the plagiarism threshold).
lcs_length = main.find_lcs_length(orig_first_sent, susp_first_sent, 0.3)
print(f"LCS length for first sentences: {lcs_length}\n")

# find_lcs receives the matrix filled above.
lcs = main.find_lcs(orig_first_sent, susp_first_sent, lcs_matrix)
print(f"LCS for first sentences: {lcs}\n")

# Score for the selected sentence pair only.
plagiarism_score = main.calculate_plagiarism_score(lcs_length, susp_first_sent)
print(f"The plagiarism score for first sentences: {plagiarism_score}\n")

# Score computed over the complete tokenized texts.
plagiarism_text = main.calculate_text_plagiarism_score(tokenized_orig_text, tokenized_susp_text)
print(f"The plagiarism score for the text: {plagiarism_text}\n")

# Differences for the selected sentence pair, then statistics for the whole texts.
diff_in_sent = main.find_diff_in_sentence(orig_first_sent, susp_first_sent, lcs)
print(f"Indexes of differences in first sentences: {diff_in_sent}\n")
statistics = main.accumulate_diff_stats(tokenized_orig_text, tokenized_susp_text)
print(f"The main statistics for pairs of sentences in texts:\n{statistics}\n")