def test_calculate_text_plagiarism_score():
    """Demonstrate the whole-text plagiarism score on two small texts.

    Tokenizes a reference text and a suspicious text line by line,
    computes the aggregate plagiarism score, prints it, and returns it.
    """
    # Three-line reference text (spelling of the literals is intentional —
    # they are runtime data, not documentation).
    reference = '''the cat is big
the sun is beatiful
the moon is rising'''

    # Suspicious text paraphrasing the reference.
    candidate = '''the big cat
the beatiful sun was rising 
a blue moon will rise'''

    reference_tokens = main.tokenize_by_lines(reference)
    candidate_tokens = main.tokenize_by_lines(candidate)

    result = main.calculate_text_plagiarism_score(reference_tokens, candidate_tokens)

    print(f'The text plagiarism score for \n\n{reference} \n\n and \n\n{candidate}: \n\n{result:.2f}\n\n')
    return result
def test_accumulated_stat_and_report():
    """Demonstrate accumulated diff statistics and the diff report.

    Tokenizes two small texts, accumulates the per-sentence diff
    statistics, prints them and the generated report, and returns
    the report.
    """
    # Reference text; first line matches the suspicious text exactly.
    reference = '''the cat is big
the sun is beatiful
the moon is rising'''

    candidate = '''the cat is big
the beatiful sun was rising 
a blue moon will rise'''

    reference_tokens = main.tokenize_by_lines(reference)
    candidate_tokens = main.tokenize_by_lines(candidate)

    statistics = main.accumulate_diff_stats(reference_tokens, candidate_tokens)

    print(f'The accumulated main statistics for pairs of sentences in texts: \n\n{reference} \n\n and \n\n{candidate}: \n')
    # One statistics entry per line, followed by a blank line.
    for entry in statistics.items():
        print(entry)
    print()

    report = main.create_diff_report(reference_tokens, candidate_tokens, statistics)
    print(f'A report:\n\n{report}\n')
    return report
Example #3
0
"""
Longest common subsequence implementation starter
"""

from main import tokenize_by_lines, accumulate_diff_stats, create_diff_report

# Demo inputs: two one-line texts sharing half their words.
ORIGINAL_TEXT = "i like small cats"

SUSPICIOUS_TEXT = "i prefer small dogs"

# Tokenize both texts, gather the per-sentence diff statistics,
# then render the human-readable diff report.
orig_tokens = tokenize_by_lines(ORIGINAL_TEXT)
susp_tokens = tokenize_by_lines(SUSPICIOUS_TEXT)
stats = accumulate_diff_stats(orig_tokens, susp_tokens)

RESULT = create_diff_report(orig_tokens, susp_tokens, stats)
assert RESULT, "LCS_length not working"
Example #4
0
"""
Longest common subsequence implementation starter
"""

import main

if __name__ == '__main__':
    # Load the reference report that the generated diff must match exactly.
    with open('lab_2/diff_report_example.txt', 'r', encoding='utf-8') as sample:
        report_example = sample.read()

    # Two-sentence demo texts differing only in the fur description.
    ORIGINAL_TEXT = 'I have a cat. \nIts body is covered with bushy white fur.'
    SUSPICIOUS_TEXT = 'I have a cat. \nIts body is covered with shiny black fur.'

    print('\tOriginal text:\n' + ORIGINAL_TEXT)
    print('\n\tSuspicious text:\n' + SUSPICIOUS_TEXT)

    print('\n\t...tokenizing original text...')
    orig_tokens = main.tokenize_by_lines(ORIGINAL_TEXT)
    print('\t...tokenizing suspicious text...')
    susp_tokens = main.tokenize_by_lines(SUSPICIOUS_TEXT)

    print('\t...creating a line-by-line diff report...\n\n')
    diff_stats = main.accumulate_diff_stats(orig_tokens, susp_tokens)
    report = main.create_diff_report(orig_tokens, susp_tokens, diff_stats)
    print(report)

    RESULT = report
    assert RESULT == report_example, 'LCS implementation not working'
"""
Longest common subsequence implementation starter
"""
import main
# Demo texts: three short sentences each; every line differs in a few words.
ORIGINAL_TEXT = 'I have a dog.\nHis name is Nemo.\nI found him yesterday'
SUSPICIOUS_TEXT = 'I have a cat.\nHer name is Anny.\nI met her yesterday'

# Split each text into per-line token sequences.
tokenized_orig_text = main.tokenize_by_lines(ORIGINAL_TEXT)
tokenized_susp_text = main.tokenize_by_lines(SUSPICIOUS_TEXT)
print(f"Original text tokens: {tokenized_orig_text}\nSuspicious text tokens: {tokenized_susp_text}\n")

# NOTE(review): index 2 selects the THIRD sentence of each text even though
# the variables are named "first_sent" — confirm which sentence is intended.
orig_first_sent = tokenized_orig_text[2]
susp_first_sent = tokenized_susp_text[2]

# Pre-sized zero matrix for the LCS dynamic-programming table.
# NOTE(review): apparently unused in the visible code below (fill_lcs_matrix
# is called with the raw sentences) — confirm it is needed further down.
zero_matrix_first = main.create_zero_matrix(len(orig_first_sent), len(susp_first_sent))
lcs_matrix = main.fill_lcs_matrix(orig_first_sent, susp_first_sent)
print(f"Filled LCS matrix for first sentences: {lcs_matrix}\n")

# LCS length with a 0.3 plagiarism threshold (semantics defined in main).
lcs_length = main.find_lcs_length(orig_first_sent, susp_first_sent, 0.3)
print(f"LCS length for first sentences: {lcs_length}\n")

# Recover the actual common subsequence from the filled matrix.
lcs = main.find_lcs(orig_first_sent, susp_first_sent, lcs_matrix)
print(f"LCS for first sentences: {lcs}\n")

# Per-sentence score: LCS length relative to the suspicious sentence.
plagiarism_score = main.calculate_plagiarism_score(lcs_length, susp_first_sent)
print(f"The plagiarism score for first sentences: {plagiarism_score}\n")

# Aggregate score over all sentence pairs of the two texts.
plagiarism_text = main.calculate_text_plagiarism_score(tokenized_orig_text, tokenized_susp_text)
print(f"The plagiarism score for the text: {plagiarism_text}\n")

# Word-level differences between the pair of sentences, given their LCS.
diff_in_sent = main.find_diff_in_sentence(orig_first_sent, susp_first_sent, lcs)