Exemple #1
0
def check(origin, plagiarized):
    """Print the fingerprint entries and grams shared by two text files.

    Both files are read in full, fingerprinted with a single ``Fingerprint``
    instance whose parameters are derived from the word count of the shorter
    text, and the overlap between the two fingerprints is pretty-printed to
    standard output. Nothing is returned.

    Args:
        origin: Path of the reference text file.
        plagiarized: Path of the text file to compare against ``origin``.

    Raises:
        NotImplementedError: If the shorter of the two texts contains fewer
            than 60 whitespace-separated words.
    """
    with open(origin, "r") as file:
        origin_text = file.read()

    with open(plagiarized, "r") as file:
        plagiarism = file.read()

    # All tuning below is driven by the word count of the shorter text.
    text_length = min(len(origin_text.split()), len(plagiarism.split()))

    if text_length < 60:
        raise NotImplementedError("Compare texts with at least 60 words.")

    # Window grows with the text (never below 3); k-gram is one less.
    window = max(text_length // 21, 3)
    kgram = window - 1
    base = 11 if text_length < 250 else 23 if text_length < 600 else 101
    # Five times the word count rounded to the nearest thousand, at least 1000.
    modulo = max(round(text_length * 5, -3), 1000)

    # NOTE(review): Fingerprint is defined outside this block; its parameter
    # semantics are taken from the keyword names only.
    fprint = Fingerprint(kgram_len=kgram, window_len=window, base=base, modulo=modulo)

    first = fprint.generate(str=origin_text)
    second = fprint.generate(str=plagiarism)

    # Entries of ``first`` that also occur, as whole entries, in ``second``.
    # Kept as a list membership test because the entry type (and therefore
    # its hashability) is not visible from here.
    similar = [x for x in first if x in second]

    # For every first component present on both sides, the number of
    # (first, second) entry pairs sharing it equals the product of its
    # occurrence counts. Computing it from two Counters avoids walking all
    # of ``second`` once per entry of ``first`` while keeping the same keys,
    # counts and insertion order as the pairwise enumeration.
    first_counts = Counter(element[0] for element in first)
    second_counts = Counter(element[0] for element in second)
    similar_grams = Counter({
        gram: occurrences * second_counts[gram]
        for gram, occurrences in first_counts.items()
        if gram in second_counts
    })

    print("Identical substring hashes:")
    pprint(similar)
    print("\nIdentical grams:")
    pprint(similar_grams)
def fingerprint_function(url):
    """Return the fingerprint generated for ``url`` with fixed settings.

    A new ``Fingerprint`` is built on every call with ``kgram_len=4``,
    ``window_len=1``, ``base=10`` and ``modulo=1000``; ``url`` is passed to
    its ``generate`` method through the ``str`` keyword and the result is
    returned unchanged.
    """
    settings = {"kgram_len": 4, "window_len": 1, "base": 10, "modulo": 1000}
    return Fingerprint(**settings).generate(str=url)