def check(origin, plagiarized):
    """Print the fingerprint overlap between two text files.

    Both files are read in full and fingerprinted with a single
    ``Fingerprint`` configuration whose parameters are derived from the word
    count of the shorter text. Two reports are then printed: the fingerprint
    entries of the first text that also occur in the second, and a tally,
    keyed by each entry's first item, of how many (first, second) entry
    pairs share that item.

    Args:
        origin: Path of the reference text file.
        plagiarized: Path of the text file to compare against it.

    Raises:
        NotImplementedError: If the shorter text has fewer than 60
            whitespace-separated words.
        OSError: If either file cannot be opened.
    """
    # NOTE(review): both files are decoded with the platform default
    # encoding; consider an explicit ``encoding=`` if the inputs are known
    # to be UTF-8.
    with open(origin, "r") as file:
        origin_text = file.read()
    with open(plagiarized, "r") as file:
        plagiarism_text = file.read()

    # All parameters below scale with the shorter of the two texts.
    text_length = min(len(origin_text.split()), len(plagiarism_text.split()))
    if text_length < 60:
        raise NotImplementedError("Compare texts with at least 60 words.")

    window = max(text_length // 21, 3)
    kgram = window - 1
    base = 11 if text_length < 250 else 23 if text_length < 600 else 101
    # Word count * 5 rounded to the nearest thousand, never below 1000.
    modulo = max(round(text_length * 5, -3), 1000)

    fprint = Fingerprint(
        kgram_len=kgram, window_len=window, base=base, modulo=modulo
    )
    first = fprint.generate(str=origin_text)
    second = fprint.generate(str=plagiarism_text)

    # Whole entries are compared with a plain membership scan: only their
    # first item is known to be hashable (it is used as a Counter key below).
    similar = [entry for entry in first if entry in second]

    # Count the second text's hashes once instead of rescanning ``second``
    # for every entry of ``first``. Walking ``first`` in order keeps the key
    # insertion order, and therefore the printed tie order, unchanged.
    second_hash_counts = Counter(entry[0] for entry in second)
    similar_grams = Counter()
    for entry in first:
        matches = second_hash_counts.get(entry[0], 0)
        if matches:
            similar_grams[entry[0]] += matches

    print("Identical substring hashes:")
    pprint(similar)
    print("\nIdentical grams:")
    pprint(similar_grams)
def fingerprint_function(url):
    """Return the fingerprints generated for ``url`` with fixed settings.

    A fresh ``Fingerprint`` is built on every call with ``kgram_len=4``,
    ``window_len=1``, ``base=10`` and ``modulo=1000``; the result of its
    ``generate`` call is returned unchanged.
    """
    return Fingerprint(
        kgram_len=4,
        window_len=1,
        base=10,
        modulo=1000,
    ).generate(str=url)