def test_calculate_text_plagiarism_score():
    """Demonstrate scoring a suspicious text against an origin text.

    Tokenizes both texts line by line, computes the overall plagiarism
    score via main.calculate_text_plagiarism_score, prints it, and
    returns the score for inspection.
    """
    origin_text = '''the cat is big the sun is beatiful the moon is rising'''
    susp_text = '''the big cat the beatiful sun was rising a blue moon will rise'''
    # Both texts must be tokenized before they can be compared.
    token_lists = (main.tokenize_by_lines(origin_text),
                   main.tokenize_by_lines(susp_text))
    score = main.calculate_text_plagiarism_score(*token_lists)
    print('The text plagiarism score for \n\n'
          f'{origin_text} \n\n and \n\n{susp_text}: \n\n{score:.2f}\n\n')
    return score
def test_accumulated_stat_and_report():
    """Demonstrate accumulated diff statistics and the diff report.

    Tokenizes an origin/suspicious text pair, accumulates per-sentence
    diff statistics via main.accumulate_diff_stats, prints them, builds
    the line-by-line diff report, prints it, and returns the report.
    """
    origin_text = '''the cat is big the sun is beatiful the moon is rising'''
    susp_text = '''the cat is big the beatiful sun was rising a blue moon will rise'''
    # Tokenize each text separately so the intermediate stats can be shown.
    origin_tokens = main.tokenize_by_lines(origin_text)
    susp_tokens = main.tokenize_by_lines(susp_text)
    stat = main.accumulate_diff_stats(origin_tokens, susp_tokens)
    print('The accumulated main statistics for pairs of sentences in texts: \n\n'
          f'{origin_text} \n\n and \n\n{susp_text}: \n')
    # Show each statistic on its own line for readability.
    print(*stat.items(), sep='\n', end='\n\n')
    report = main.create_diff_report(origin_tokens, susp_tokens, stat)
    print(f'A report:\n\n{report}\n')
    return report
""" Longest common subsequence implementation starter """ from main import tokenize_by_lines, accumulate_diff_stats, create_diff_report ORIGINAL_TEXT = "i like small cats" SUSPICIOUS_TEXT = "i prefer small dogs" original = tokenize_by_lines(ORIGINAL_TEXT) suspicious = tokenize_by_lines(SUSPICIOUS_TEXT) diff_stats = accumulate_diff_stats(original, suspicious) RESULT = create_diff_report(original, suspicious, diff_stats) assert RESULT, "LCS_length not working"
""" Longest common subsequence implementation starter """ import main if __name__ == '__main__': with open('lab_2/diff_report_example.txt', 'r', encoding='utf-8') as file: report_example = file.read() ORIGINAL_TEXT = 'I have a cat. \nIts body is covered with bushy white fur.' SUSPICIOUS_TEXT = 'I have a cat. \nIts body is covered with shiny black fur.' print('\tOriginal text:\n' + ORIGINAL_TEXT) print('\n\tSuspicious text:\n' + SUSPICIOUS_TEXT) print('\n\t...tokenizing original text...') original_text_tokens = main.tokenize_by_lines(ORIGINAL_TEXT) print('\t...tokenizing suspicious text...') suspicious_text_tokens = main.tokenize_by_lines(SUSPICIOUS_TEXT) print('\t...creating a line-by-line diff report...\n\n') report = (main.create_diff_report( original_text_tokens, suspicious_text_tokens, main.accumulate_diff_stats(original_text_tokens, suspicious_text_tokens))) print(report) RESULT = report assert RESULT == report_example, 'LCS implementation not working'
""" Longest common subsequence implementation starter """ import main ORIGINAL_TEXT = 'I have a dog.\nHis name is Nemo.\nI found him yesterday' SUSPICIOUS_TEXT = 'I have a cat.\nHer name is Anny.\nI met her yesterday' tokenized_orig_text = main.tokenize_by_lines(ORIGINAL_TEXT) tokenized_susp_text = main.tokenize_by_lines(SUSPICIOUS_TEXT) print(f"Original text tokens: {tokenized_orig_text}\nSuspicious text tokens: {tokenized_susp_text}\n") orig_first_sent = tokenized_orig_text[2] susp_first_sent = tokenized_susp_text[2] zero_matrix_first = main.create_zero_matrix(len(orig_first_sent), len(susp_first_sent)) lcs_matrix = main.fill_lcs_matrix(orig_first_sent, susp_first_sent) print(f"Filled LCS matrix for first sentences: {lcs_matrix}\n") lcs_length = main.find_lcs_length(orig_first_sent, susp_first_sent, 0.3) print(f"LCS length for first sentences: {lcs_length}\n") lcs = main.find_lcs(orig_first_sent, susp_first_sent, lcs_matrix) print(f"LCS for first sentences: {lcs}\n") plagiarism_score = main.calculate_plagiarism_score(lcs_length, susp_first_sent) print(f"The plagiarism score for first sentences: {plagiarism_score}\n") plagiarism_text = main.calculate_text_plagiarism_score(tokenized_orig_text, tokenized_susp_text) print(f"The plagiarism score for the text: {plagiarism_text}\n") diff_in_sent = main.find_diff_in_sentence(orig_first_sent, susp_first_sent, lcs)