def test_find_lcs_length_calls_required_function(self, mock):
    """
    Checks that find_lcs_length delegates to fill_lcs_matrix
    (patched here as `mock`).
    """
    tokens = ('the', 'dog', 'is', 'running')
    find_lcs_length(tokens, tokens, 0.3)
    self.assertTrue(mock.called)
def test_till_calculate_plagiarism_score():
    """
    Runs the pipeline from tokenization through plagiarism scoring,
    printing each intermediate result, and returns the final score.
    """
    original = 'the big cat is sleeping'
    suspect = 'the cat is big'
    original_tokens = tokenize(original)
    suspect_tokens = tokenize(suspect)

    print(f'Raw text: {original}')
    print(f'Tokenized text: {original_tokens}\n\n')

    lcs_length = main.find_lcs_length(original_tokens, suspect_tokens, plagiarism_threshold=0.0)
    print('A length of the longest common subsequence for \n\n'
          f'{original} \n\nand \n\n{suspect}: \n\n{lcs_length} \n')

    matrix = main.fill_lcs_matrix(original_tokens, suspect_tokens)
    print('A matrix:')
    print(*matrix, sep='\n', end='\n\n')

    longest_lcs = main.find_lcs(original_tokens, suspect_tokens, matrix)
    print(f'The longest common subsequence: {longest_lcs}')

    score = main.calculate_plagiarism_score(lcs_length, suspect_tokens)
    print(f'The plagiarism score: {score:.2f}\n')
    return score
def test_find_lcs_length_empty_input(self):
    """
    Checks that find_lcs_length returns 0 when either sentence is empty,
    regardless of argument order.
    """
    filled = ('a', 'boy', 'plays', 'with', 'ball')
    threshold = 0.3
    for first, second in (((), filled), (filled, ())):
        self.assertEqual(0, find_lcs_length(first, second, threshold))
def test_find_lcs_length_incorrect_inputs(self):
    """
    Checks that find_lcs_length returns -1 when either sentence argument
    is malformed, in both argument positions.
    """
    good_sentence = ('the', 'dog', 'is', 'running')
    threshold = 0.3
    for bad in ([], {}, '', 9.22, -1, 0, -6, None, True, (None, None)):
        self.assertEqual(-1, find_lcs_length(bad, good_sentence, threshold))
        self.assertEqual(-1, find_lcs_length(good_sentence, bad, threshold))
def test_find_lcs_length_reversed_behaviour(self):
    """
    Checks that swapping the two input sentences does not change
    the reported LCS length.
    """
    first = ('the', 'dog', 'is', 'running', 'inside', 'the', 'house')
    second = ('the', 'cat', 'is', 'sleeping', 'inside', 'the', 'house')
    self.assertEqual(5, find_lcs_length(first, second, 0.3))
    self.assertEqual(5, find_lcs_length(second, first, 0.3))
def test_find_lcs_length_different_sized_inputs(self):
    """
    Checks that find_lcs_length handles sentences of different lengths,
    in both argument orders.
    """
    shorter = ('the', 'dog', 'is', 'running', 'inside')
    longer = ('the', 'cat', 'is', 'sleeping', 'inside', 'the', 'house')
    self.assertEqual(3, find_lcs_length(shorter, longer, 0.3))
    self.assertEqual(3, find_lcs_length(longer, shorter, 0.3))
def test_find_lcs_length_output_check(self):
    """
    Checks that find_lcs_length returns an int for valid inputs.
    """
    sentence_first = ('the', 'dog', 'is', 'running', 'here')
    sentence_second = ('a', 'boy', 'plays', 'with', 'ball')
    plagiarism_threshold = 0.3
    actual = find_lcs_length(sentence_first, sentence_second, plagiarism_threshold)
    # assertIsInstance reports the offending type on failure,
    # unlike the opaque assertTrue(isinstance(...)) it replaces.
    self.assertIsInstance(actual, int)
def test_find_lcs_length_threshold_behaviour(self):
    """
    Checks that find_lcs_length applies the plagiarism threshold and does
    not leak the raw similarity ratio as its result.
    """
    first = ('the', 'dog', 'is')
    second = ('the', 'cat', 'is')
    # The raw LCS-to-length ratio; the function must never return it directly.
    raw_ratio = 2 / 3
    actual = find_lcs_length(first, second, 0.3)
    self.assertNotEqual(raw_ratio, actual)
def test_find_lcs_length_incorrect_threshold(self):
    """
    Checks that find_lcs_length returns -1 for malformed threshold values.
    """
    sentence = ('the', 'dog', 'is', 'running')
    for bad_threshold in ([], {}, '', -1, -6.34, -6, 1.2, None, True, (None, None)):
        self.assertEqual(-1, find_lcs_length(sentence, sentence, bad_threshold))
def test_find_lcs_length_no_diff(self):
    """
    Checks that two sentences sharing no tokens yield an LCS length of 0.
    """
    first = ('the', 'dog', 'is', 'running', 'here')
    second = ('a', 'boy', 'plays', 'with', 'ball')
    self.assertEqual(0, find_lcs_length(first, second, 0.3))
def test_find_lcs_length_ideal(self):
    """
    Checks that find_lcs_length handles a simple input case.
    """
    first = ('the', 'dog', 'is', 'running')
    second = ('the', 'cat', 'is', 'sleeping')
    self.assertEqual(2, find_lcs_length(first, second, 0.3))
def test_find_lcs_length_complex(self):
    """
    Checks that find_lcs_length handles a longer, partially matching case.
    """
    first = ('the', 'dog', 'is', 'running', 'inside', 'the', 'house')
    second = ('the', 'cat', 'is', 'sleeping', 'inside', 'the', 'house')
    self.assertEqual(5, find_lcs_length(first, second, 0.3))
"""Demo script: walks the full plagiarism-detection pipeline on two short texts."""
import main

ORIGINAL_TEXT = 'I have a dog.\nHis name is Nemo.\nI found him yesterday'
SUSPICIOUS_TEXT = 'I have a cat.\nHer name is Anny.\nI met her yesterday'

tokenized_orig_text = main.tokenize_by_lines(ORIGINAL_TEXT)
tokenized_susp_text = main.tokenize_by_lines(SUSPICIOUS_TEXT)
print(f"Original text tokens: {tokenized_orig_text}\nSuspicious text tokens: {tokenized_susp_text}\n")

# Bug fix: every message below talks about the *first* sentences, but the
# original code indexed [2], selecting the third (last) sentence pair.
# Index 0 makes the data match the variable names and the printed labels.
orig_first_sent = tokenized_orig_text[0]
susp_first_sent = tokenized_susp_text[0]

# Demonstrates create_zero_matrix; fill_lcs_matrix below builds its own
# matrix, so this result is intentionally not reused.
zero_matrix_first = main.create_zero_matrix(len(orig_first_sent), len(susp_first_sent))

lcs_matrix = main.fill_lcs_matrix(orig_first_sent, susp_first_sent)
print(f"Filled LCS matrix for first sentences: {lcs_matrix}\n")

lcs_length = main.find_lcs_length(orig_first_sent, susp_first_sent, 0.3)
print(f"LCS length for first sentences: {lcs_length}\n")

lcs = main.find_lcs(orig_first_sent, susp_first_sent, lcs_matrix)
print(f"LCS for first sentences: {lcs}\n")

plagiarism_score = main.calculate_plagiarism_score(lcs_length, susp_first_sent)
print(f"The plagiarism score for first sentences: {plagiarism_score}\n")

plagiarism_text = main.calculate_text_plagiarism_score(tokenized_orig_text, tokenized_susp_text)
print(f"The plagiarism score for the text: {plagiarism_text}\n")

diff_in_sent = main.find_diff_in_sentence(orig_first_sent, susp_first_sent, lcs)
print(f"Indexes of differences in first sentences: {diff_in_sent}\n")

statistics = main.accumulate_diff_stats(tokenized_orig_text, tokenized_susp_text)
print(f"The main statistics for pairs of sentences in texts:\n{statistics}\n")