def test_module_level(self):
    """Check that ``module_level=True`` also compares module-level code.

    The same nested-loop computation is written once inside ``main()``
    (called from an ``if __name__`` guard) and once as bare module-level
    statements.  The summarized plagiarism ratio must exceed 0.6 when the
    wrapped version is the reference and 0.85 when the bare version is.
    """
    wrapped_source = """
def main():
    s = 0
    for j in range(10):
        for i in range(10):
            if i > j:
                s += i + j
    print(s)

if __name__ == '__main__':
    main()
    """
    bare_source = """
s = 0
for j in range(10):
    for i in range(10):
        if i > j:
            s += i + j
print(s)
    """

    def overall_ratio(sources):
        # First element of the summary for the first result entry.
        outcome = pycode_similar.detect(sources, module_level=True)
        ratio, *_ = pycode_similar.summarize(outcome[0][1])
        return ratio

    # s1.main vs s2.__main__ AND s1.__main__ vs s2.__main__
    self.assertGreater(overall_ratio([wrapped_source, bare_source]), 0.6)

    # s2.__main__ vs s1.main
    self.assertGreater(overall_ratio([bare_source, wrapped_source]), 0.85)
def test_no_function(self):
    """Check how ``detect`` treats a source that defines no function.

    When the function-less source is the reference (index 0), ``detect``
    is expected to raise an exception whose ``source`` attribute is 0.
    When it is only the candidate, detection succeeds and the first
    reported function has a plagiarism percent of 0.
    """
    s1 = """
def foo(a):
    c = a
    """
    s2 = """
class B(object):
    pass
    """

    # assertRaises makes the test fail when nothing is raised; a plain
    # try/except would let that case pass unnoticed.
    # NOTE(review): presumably the library has a dedicated "no function"
    # exception type; narrow ``Exception`` to it once confirmed.
    with self.assertRaises(Exception) as caught:
        pycode_similar.detect([s2, s1])
    self.assertEqual(caught.exception.source, 0)

    result = pycode_similar.detect([s1, s2])
    self.assertEqual(result[0][1][0].plagiarism_percent, 0)
def test_space_and_comments(self):
    """Check that comments and docstring-like strings do not lower the score.

    The two functions differ in their names, in where comments and
    triple-quoted strings appear, and in the operand order of the
    comparison; the first reported function must still score exactly 1.
    """
    reference = """
def foo(a):
    \"""
    foo comments.
    \"""
    if a >= 1:
        return True
    # this should return False
    return False
    """
    candidate = """
def bar(b):
    # bar comments.
    if 1 <= b:
        \"""
        This should return True
        \"""
        return True
    return False
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertEqual(first_report.plagiarism_percent, 1)
def processpyfiles():
    """Compare a file from ``pypath`` with every file in ``path``.

    ``path`` and ``pypath`` are module-level directory names defined
    outside this function.

    Returns:
        ``"PLAGIARIZED"`` when any comparison scores at least 0.90,
        ``"NOT PLAGIARIZED"`` otherwise (including when ``path`` holds no
        files), ``"SYNTAX ERROR IN FILE"`` when a comparison raises, and
        ``None`` when ``pypath`` is empty.
    """
    files = os.listdir(path)
    # NOTE(review): every path below returns during the first iteration, so
    # only the first entry of ``pypath`` is ever examined - confirm intended.
    for pyfile in os.listdir(pypath):
        with open(os.path.join(pypath, pyfile), encoding="utf8") as fp1:
            txt = fp1.read()

        code = []
        for file in files:
            # The context manager closes each candidate file after reading.
            with open(os.path.join(path, file), encoding="utf8") as fp:
                code.append(fp.read())
        print(code)

        simi_matrix = []
        for candidate in code:
            try:
                # detect() yields (index, reports) pairs; index 1 is the
                # candidate, and its first report carries the score used.
                reports = dict(
                    pycode_similar.detect(
                        [str(txt), str(candidate)],
                        diff_method=pycode_similar.UnifiedDiff)).get(1)
                simi_matrix.append(reports[0].plagiarism_percent)
            except Exception:
                # Any comparison failure is reported with this one verdict;
                # unlike a bare except, KeyboardInterrupt and SystemExit
                # are no longer swallowed.
                return "SYNTAX ERROR IN FILE"

        # An empty score list (no candidate files) cannot be plagiarized.
        if simi_matrix and max(simi_matrix) >= .90:
            return "PLAGIARIZED"
        return "NOT PLAGIARIZED"
def test_expr(self):
    """Check that two single-``yield`` functions score exactly 1.

    The functions differ only in their own name, their parameter name and
    the name being yielded.
    """
    reference = """
def foo(a):
    yield c
    """
    candidate = """
def bar(b):
    yield a
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertEqual(first_report.plagiarism_percent, 1)
def test_strip_print(self):
    """Check that ``print`` calls do not influence the default comparison.

    Both functions contain the same assignment, but the ``print`` call has
    different arguments and sits on the other side of it; the score must
    still be exactly 1.
    """
    reference = """
def foo(a):
    a = b
    print('abc', a)
    """
    candidate = """
def foo(a):
    print('abc', bar())
    a = b
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertEqual(first_report.plagiarism_percent, 1)
def test_strip_import(self):
    """Check that import statements do not influence the comparison.

    Both functions consist only of imports, with different modules and a
    different number of statements; the score must still be exactly 1.
    """
    reference = """
def foo():
    import sys
    from os import path
    """
    candidate = """
def foo():
    import os
    import ast
    from collections import Counter
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertEqual(first_report.plagiarism_percent, 1)
def test_keep_prints(self):
    """Check the effect of ``keep_prints`` on module-level comparison.

    One source fills a list with nested loops, the other with a list
    comprehension; both end with the same two ``print`` calls.  With
    ``keep_prints=False`` the score must stay below 0.2, and with
    ``keep_prints=True`` it must exceed 0.5.
    """
    loop_source = """
a = []
for j in range(10):
    for i in range(10):
        a.append(i - j)
print(a)
print(abs(el) * el for el in a if abs(el) > 2)
    """
    comprehension_source = """
a = [
    i - j
    for j in range(10)
    for i in range(10)
]
print(a)
print(abs(el) * el for el in a if abs(el) > 2)
    """
    sources = [loop_source, comprehension_source]

    without_prints = pycode_similar.detect(sources, module_level=True, keep_prints=False)
    self.assertLess(without_prints[0][1][0].plagiarism_percent, 0.2)

    with_prints = pycode_similar.detect(sources, module_level=True, keep_prints=True)
    self.assertGreater(with_prints[0][1][0].plagiarism_percent, 0.5)
def test_gte_lte(self):
    """Check that ``a >= 1`` and ``1 <= b`` are treated as the same test.

    The two functions differ only in names and in the operand order of the
    comparison; the score must be exactly 1.
    """
    reference = """
def foo(a):
    if a >= 1:
        return True
    return False
    """
    candidate = """
def bar(b):
    if 1 <= b:
        return True
    return False
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertEqual(first_report.plagiarism_percent, 1)
def compare_file_to_others(ref_file, candidate_files):
    """Return the mean per-function plagiarism percent for ``ref_file``.

    The contents of ``ref_file`` and of every entry in ``candidate_files``
    are read with ``read_file_content`` and passed to
    ``pycode_similar.detect`` with the reference first.

    NOTE(review): only ``res[0]`` is read, so presumably only the first
    candidate's report contributes to the result - confirm intended.

    Args:
        ref_file: name of the reference file.
        candidate_files: iterable of candidate file names.

    Returns:
        The average ``plagiarism_percent`` over the per-function reports,
        or 0 when there are no reports.
    """
    files = [ref_file] + list(candidate_files)
    print(files)
    payload = [read_file_content(name) for name in files]
    res = pycode_similar.detect(payload, diff_method=pycode_similar.UnifiedDiff)
    per_function_reports = res[0][1]
    # Without this guard an empty report list divides by zero.
    if not per_function_reports:
        return 0
    total = sum(report.plagiarism_percent for report in per_function_reports)
    return total / len(per_function_reports)
def test_basic_detect(self):
    """Check that a function and a similar method score above 0.5.

    The reference is a standalone function; the candidate is a class whose
    ``bar`` method has the same shape but tests an attribute against a
    different constant.
    """
    reference = """
def foo(a):
    if a > 1:
        return True
    return False
    """
    candidate = """
class A(object):
    def __init__(self, a):
        self._a = a

    def bar(self):
        if self._a > 2:
            return True
        return False
    """
    outcome = pycode_similar.detect([reference, candidate])
    first_report = outcome[0][1][0]
    self.assertGreater(first_report.plagiarism_percent, 0.5)
def main():
    """Compare the two files named on the command line and print a summary.

    ``sys.argv[1]`` is the reference file and ``sys.argv[2]`` the candidate,
    both resolved relative to the current directory.  The printed line
    gives the share of the reference's code structure found in the
    candidate; ``0`` is printed if ``detect`` returns no result entries.
    """
    filepath1 = './' + sys.argv[1]
    filepath2 = './' + sys.argv[2]
    # The context manager closes both files even if reading fails.
    with open(filepath1, 'r') as f1, open(filepath2, 'r') as f2:
        sources = [f1.read(), f2.read()]
    results = pycode_similar.detect(sources)

    result = 0
    for _index, func_ast_diff_list in results:
        sum_total_count = sum(func_diff_info.total_count
                              for func_diff_info in func_ast_diff_list)
        sum_plagiarism_count = sum(func_diff_info.plagiarism_count
                                   for func_diff_info in func_ast_diff_list)
        # A zero total would otherwise raise ZeroDivisionError.
        if sum_total_count:
            percent = sum_plagiarism_count / float(sum_total_count) * 100
        else:
            percent = 0.0
        result = (
            '{:.2f} % ({}/{}) of ref code structure is plagiarized by candidate.'
            .format(percent, sum_plagiarism_count, sum_total_count))
    print(result)
    sys.stdout.flush()
def checkPlagiarismForFiles(config, args, f1, f2):
    """Decide whether files ``f1`` and ``f2`` are plagiarized from each other.

    Args:
        config: passed through to ``checkForPlagiarismByDiff``.
        args: object providing ``threshold`` (percent) and ``verbose``.
        f1: path of the reference file.
        f2: path of the candidate file.

    Returns:
        ``True`` when plagiarism is detected, ``False`` when it is not or
        when ``detect`` yields no result, and ``None`` when ``detect``
        raised and the check could not be performed.  For thresholds above
        99 the result of ``checkForPlagiarismByDiff`` is returned instead.
    """
    if args.threshold > 99:
        return checkForPlagiarismByDiff(config, f1, f2)

    c1 = readFile(f1)
    c2 = readFile(f2)
    if c1 == c2:
        print(f'Plagiarism detected: {f1} and {f2} are identical')
        return True

    try:
        res = pycode_similar.detect([c1, c2],
                                    diff_method=pycode_similar.UnifiedDiff,
                                    keep_prints=True,
                                    module_level=True)
    except Exception as e:
        msg = f'Cannot check plagiarism between {f1} and {f2}: ' + str(e)
        # Syntax errors are only logged verbosely; anything else is a warning.
        if isinstance(e, SyntaxError):
            verbose(msg)
        else:
            warn(msg)
        return

    if len(res) == 0:
        warn(f'No plagiarism result found for {f1} and {f2}')
        # Nothing to evaluate; indexing res[0] below would raise IndexError.
        return False
    if len(res) > 1:
        warn(f'More than one plagiarism result found for {f1} and {f2}')

    (_, x) = res[0]
    sumPlagiarismFactor, sumPlagiarismCount, sumTotalCount = pycode_similar.summarize(x)
    sumPlagiarismPercent = sumPlagiarismFactor * 100
    msg = '{:.2f} % ({}/{}) of ref code structure is plagiarized by candidate.'.format(
        sumPlagiarismPercent,
        sumPlagiarismCount,
        sumTotalCount,
    )
    if sumPlagiarismPercent >= args.threshold:
        print(f'Detected plagiarism between {f1} and {f2}: {msg}')
        if args.verbose:
            shell.run(['diff', '-u', f1, f2], onError='ignore')
        return True
    else:
        verbose(f'No plagiarism between {f1} and {f2}')
        return False
def compare_file_to_others(ref_file, candidate_files):
    """Return the mean per-function plagiarism percent for ``ref_file``.

    Each file is read with ``read_file_content`` and parsed with
    ``ast.parse``; files that raise ``SyntaxError`` are left out of the
    comparison.  The remaining sources go to ``pycode_similar.detect`` and
    the ``plagiarism_percent`` values of the first result entry are
    averaged.

    Args:
        ref_file: name of the reference file.
        candidate_files: iterable of candidate file names.

    Returns:
        The average ``plagiarism_percent``, or 0 when the first result
        entry holds no reports.
    """
    names = [ref_file] + list(candidate_files)
    print(names)

    sources = []
    for name in names:
        try:
            text = read_file_content(name)
            ast.parse(text)  # validation only; the tree itself is unused
        except SyntaxError:
            # NOTE(review): if the reference file is the one skipped here,
            # the first candidate silently takes its place - confirm.
            continue
        sources.append(text)

    outcome = pycode_similar.detect(sources, diff_method=pycode_similar.UnifiedDiff)
    reports = outcome[0][1]
    if not reports:
        return 0

    accumulated = 0
    for report in reports:
        accumulated += report.plagiarism_percent
    return accumulated / len(reports)
def compare(self, a, b):
    """Return the first reported plagiarism percent of ``a`` against ``b``.

    ``a`` is passed to ``pycode_similar.detect`` as the reference and ``b``
    as the only candidate, using the ``UnifiedDiff`` method.
    """
    outcome = pycode_similar.detect([a, b], diff_method=pycode_similar.UnifiedDiff)
    first_entry = outcome[0]
    reports = first_entry[1]
    return reports[0].plagiarism_percent
plt.title('students')
plt.xlabel('time')
plt.ylabel('signal')
plt.show()
# learn to use matplotlib


class SimilarDetector:
    """Thin wrapper that runs ``pycode_similar.detect`` on stored sources."""

    def __init__(self, codes):
        # List of source strings; the first is presumably the reference.
        self.code_list = codes

    def detect(self):
        """Return the ``UnifiedDiff`` detection result for ``code_list``."""
        # The result is returned so callers can actually use it.
        return pycode_similar.detect(self.code_list, diff_method=pycode_similar.UnifiedDiff)


# Ad-hoc experiment: compare two one-line functions with the TreeDiff method
# and print the overall plagiarized share of the reference.
res = pycode_similar.detect(
    ['''
def p():
    print(123)''', '''
def f():
    print(123)'''],
    diff_method=pycode_similar.TreeDiff)
funcInfos = res[0][1]
sum_plagiarism_count = 0
sum_total_count = 0
for func_info in funcInfos:
    sum_plagiarism_count += func_info.plagiarism_count
    sum_total_count += func_info.total_count
# NOTE(review): raises ZeroDivisionError if the total count is 0.
print(sum_plagiarism_count / sum_total_count)
def detect(self):
    """Run ``pycode_similar.detect`` on ``self.code_list`` and return the result.

    The comparison uses the ``UnifiedDiff`` method.
    """
    # Return the result; without this the call has no observable effect.
    return pycode_similar.detect(self.code_list, diff_method=pycode_similar.UnifiedDiff)
OTHER = [] x_old = measurement[0] y_old = measurement[1] else: x_old = OTHER[-1][0] y_old = OTHER[-1][1] x = measurement[0] y = measurement[1] if len(OTHER) >= 3: x_old2 = OTHER[-2][0] y_old2 = OTHER[-2][1] else: x_old2 = x_old y_old2 = y_old bearing = atan2(y - y_old, x - x_old) bearing_old = atan2(y_old - y_old2, x_old - x_old2) theta = bearing - bearing_old d = sqrt((y - y_old) ** 2 + (x - x_old) ** 2) x_new = x + d * cos(theta + bearing) y_new = y + d * sin(theta + bearing) xy_estimate = (x_new, y_new) # You must return xy_estimate (x, y), and OTHER (even if it is None) # in this order for grading purposes. OTHER.append(measurement) return xy_estimate, OTHER""" print(pycode_similar.detect([str1,str2]))
    optional arguments:
      -h, --help          show this help message and exit
      -l L                if AST line of the function >= value then output detail (default: 4)
      -p P                if plagiarism percentage of the function >= value then output detail (default: 0.5)
      -k, --keep-prints   keep print nodes
      -m, --module-level  process module level nodes
    pycode_similar: error: too few arguments

Of course, you can use it as a Python library, too.

.. code-block:: python

    import pycode_similar
    pycode_similar.detect([referenced_code_str, candidate_code_str1, candidate_code_str2, ...], diff_method=pycode_similar.UnifiedDiff, keep_prints=False, module_level=False)


Implementation
--------------
This tool implements two diff methods: a line-based diff (UnifiedDiff) and a tree-edit-distance-based diff (TreeDiff). Both operate at the function AST level.

- UnifiedDiff diffs the normalized function AST string lines; it is naive but efficient.
- TreeDiff diffs the function ASTs; it is very slow and the results are poor for small functions. (depends on `zss <https://pypi.python.org/pypi/zss>`_)

So, when running this tool from the command line, the default diff method is UnifiedDiff. You can switch to TreeDiff when using it as a library.

Testing
--------------
If you have the source code you can run the tests with
import pycode_similar

#sys.path.insert(0, '/home/leonard/Faculdades/ESTACIO/Projetos/IC/2017-2018/Luiz/Codigos/src/modules/Plagiarism-Checker-master')
#rom textpreforrkr import main_method

# UnifiedDiff is only reachable through the module; the bare name is undefined
# here and would raise NameError.
# NOTE(review): the two strings look like file names, but detect() receives
# them as source text (other callers pass file contents) - confirm whether
# the files should be read first.
pycode_similar.detect(['Q1_gabarito','Q1_resposta1'], diff_method=pycode_similar.UnifiedDiff)
#main_method(['Q1_gabarito','Q1_resposta1'],'resp')