def runEntry(entry, students, helpers, assignName, sourceSuffix, resultsSuffix, allowPartners):
    """Compute pairwise edit distances for one entry's primary source file.

    For every unordered pair of students, looks up both students'
    preprocessed files and, when both exist, records the edit distance
    between them in a PairResults store named after the first source.

    Note: allowPartners is accepted for interface parity with the other
    runners but is not referenced in this function.
    """
    # create an empty PairResults object
    resultFilename = common.makeFilenameSafe(entry["sources"][0]) + resultsSuffix
    results = common.PairResults(assignName, resultFilename, helpers)

    # the preprocessed filename is the same for every student, so build it once
    safeFilename = common.makeFilenameSafe(entry["sources"][0]) + sourceSuffix

    # for each unordered pair of students (j < i, so no self-pairs and no duplicates)
    for i in range(len(students)):
        for j in range(i):
            student1 = students[i]
            student2 = students[j]

            # get both file paths
            path1 = helpers.getPreprocessedPath(student1, assignName, safeFilename)
            path2 = helpers.getPreprocessedPath(student2, assignName, safeFilename)

            # both students must have a preprocessed file for a comparison
            if path1 is not None and path2 is not None:
                editDistance = runEditDistance(path1, path2)

                # save the pair result
                results.add(common.PairResult(student1, student2, editDistance))

    # flush results to disk
    results.finish()
    helpers.printf("Finished '{}/{}'!\n".format(assignName, entry["sources"][0]))
def runAssignment(assignment, students, args, helpers):
    """Cluster students whose preprocessed files hash to identical values.

    For each configured file, hashes every student's preprocessed text;
    students sharing a hash land in the same Cluster (score 100, i.e. an
    exact collision). Clusters accumulate across all files and are written
    once per assignment via clustersToStandardJSON, which drops clusters
    with fewer than two members.
    """
    clusters = {}
    allowPartners = assignment.args["allowPartners"]

    # for each file
    files = assignment.args["files"]
    for filename in files:
        # the preprocessed name is identical for every student; build it once
        safeFilename = common.makeFilenameSafe(filename) + args["sourceSuffix"]

        # find collisions
        for student in students:
            studentText = helpers.readFromPreprocessed(student, assignment.name, safeFilename)
            if studentText is not None:
                hashedText = hashText(studentText)

                # one cluster per distinct hash; 100 marks an exact match
                if hashedText not in clusters:
                    clusters[hashedText] = common.Cluster(allowPartners, filename, 100)

                member = common.Member(student, assignment.name, helpers)
                clusters[hashedText].add(member)

    # process the clusters
    clusterArray = list(clusters.values())

    # write the results for this assignment
    # clustersToStandardJSON will make sure that there are at least two people in the cluster
    common.clustersToStandardJSON(clusterArray, assignment.name, args["resultsSuffix"], helpers)

    # say we're done with this assignment
    helpers.printf("Finished assignment '{}'...\n".format(assignment.name))
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Filter one file's pair results by score deviation and write clusters.

    Loads the denormalized pair results for *filename*, computes the mean
    and standard deviation of the scores, filters pairs against the
    assignment's threshold (optionally floored by "minThreshold"), clusters
    the survivors, and flushes the clusters to a standard JSON results file.
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    threshold = assignment.args["threshold"]
    above = args["above"]

    # "minThreshold" is optional ("in" replaces the Python2-only has_key())
    minThreshold = None
    if "minThreshold" in assignment.args:
        minThreshold = assignment.args["minThreshold"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)

        # convert into python objects
        data = []
        for pair in rawData.iterate():
            data.append(pair)

        # get the mean and the deviation of the score distribution
        mean = getMean(data)
        deviation = getDeviation(data, mean)
        helpers.printf("{}/{}: mean {}, deviation {}\n".format(assignName, filename, mean, deviation))

        # filter out data
        filtered = filterData(data, mean, deviation, threshold, above, minThreshold)

        # create the clusters
        clusters = createClusters(filtered, filename, assignName, allowPartners, helpers)

        # flush to disk
        common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}', with {} results!\n".format(assignName, len(clusters)))
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Filter one file's pair results by score deviation and write clusters.

    Reads the pair results stored for *filename*, derives mean/deviation
    statistics, filters with the assignment threshold (and the optional
    "minThreshold" floor), then clusters and serializes the remainder.
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    threshold = assignment.args["threshold"]
    above = args["above"]

    # optional lower bound on the threshold ("in" replaces Python2-only has_key())
    minThreshold = None
    if "minThreshold" in assignment.args:
        minThreshold = assignment.args["minThreshold"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)

        # convert into python objects
        data = []
        for pair in rawData.iterate():
            data.append(pair)

        # get the mean
        mean = getMean(data)

        # get the deviation
        deviation = getDeviation(data, mean)
        helpers.printf("{}/{}: mean {}, deviation {}\n".format(assignName, filename, mean, deviation))

        # filter out data
        filtered = filterData(data, mean, deviation, threshold, above, minThreshold)

        # create the clusters
        clusters = createClusters(filtered, filename, assignName, allowPartners, helpers)

        # flush to disk
        common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}', with {} results!\n".format(assignName, len(clusters)))
def getStats(students, assign, filename, helpers):
    """Collect each student's per-file stats JSON for *filename*.

    Returns a tuple ``(studentDict, array)``: studentDict maps each student
    that has stats on disk to the parsed JSON object, and array holds the
    same objects in student iteration order.
    """
    studentDict = {}
    array = []

    for student in students:
        # stats were preprocessed under a sanitized "<file>stats.json" name
        statsName = common.makeFilenameSafe(filename) + "stats.json"
        statsPath = helpers.getPreprocessedPath(student, assign.name, statsName)
        if statsPath is None:
            # no stats recorded for this student/file; skip
            continue
        parsed = io.readJSON(statsPath)
        studentDict[student] = parsed
        array.append(parsed)

    return (studentDict, array)
def doAssignment(students, assign, helpers):
    """Tokenize every configured file of *assign* for each student."""
    helpers.printf("tokenizing '{}' in parellel...\n".format(assign.name))

    for student in students:
        # each configured file is tokenized independently
        for filename in assign.args["files"]:
            path = helpers.getAssignmentPath(student, assign.name, filename)
            if path is None:
                continue
            # tokenize the file and persist it under a filesystem-safe name
            tokens = tokenizer.simple(path)
            outName = common.makeFilenameSafe(filename) + "tokenized.txt"
            helpers.writeToPreprocessed(tokens, student, assign.name, outName)

    # all done!
    helpers.printf("Finished '{}'!\n".format(assign.name))
def run(students, assignments, args, helpers):
    """Mangle every configured file of every assignment for every student.

    Always returns True.
    """
    for assign in assignments:
        for student in students:
            for filename in assign.args["files"]:
                # pull the raw submission text; None means nothing was submitted
                rawText = helpers.readFromAssignment(student, assign.name, filename)
                if rawText is None:
                    continue
                # mangle and store under a filesystem-safe name
                safeFilename = common.makeFilenameSafe(filename)
                mangle(rawText, student, assign.name, safeFilename, helpers)

    # all done
    return True
def runFile(students, assign, helpers):
    """Extract identifier text from each student's configured files."""
    helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

    for student in students:
        for filename in assign.args["files"]:
            # locate the raw submission on disk
            submissionPath = helpers.getAssignmentPath(student, assign.name, filename)
            if submissionPath is None:
                continue
            # pull the identifiers and write them to the preprocessed store
            identifiers = tokenize(submissionPath)
            destination = common.makeFilenameSafe(filename) + "identifiers.txt"
            helpers.writeToPreprocessed(identifiers, student, assign.name, destination)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def doAssignment(students, assign, helpers, compress):
    """Run the MTED tokenizer over each student's entry point."""
    helpers.printf("processing '{}' in parellel...\n".format(assign.name))

    for student in students:
        for entry in assign.args["entries"]:
            entryPoint = entry["entryPoint"]
            sources = entry["sources"]
            path = helpers.getAssignmentPath(student, assign.name, entryPoint)
            if path is None:
                continue
            # tokenize the entry point along with its listed sources
            result = tokenizer.mted(path, sources, compress)
            # results are filed under the first source's sanitized name
            outName = common.makeFilenameSafe(sources[0]) + "mted.txt"
            helpers.writeToPreprocessed(result, student, assign.name, outName)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def runAssignment(students, assign, helpers):
    """Generate and persist per-file stats JSON for every student."""
    helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

    for student in students:
        for filename in assign.args["files"]:
            path = helpers.getAssignmentPath(student, assign.name, filename)
            if path is None:
                continue
            # compute the stats and serialize them as pretty-printed JSON
            stats = genStats(path, helpers)
            statsName = common.makeFilenameSafe(filename) + "stats.json"
            serialized = io.getJSONString(stats, True)
            helpers.writeToPreprocessed(serialized, student, assign.name, statsName)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def doAssignment(students, assign, helpers):
    """Tokenize the primary source of each entry for VTED comparison."""
    helpers.printf("processing '{}' in parellel...\n".format(assign.name))

    for student in students:
        for entry in assign.args["entries"]:
            sources = entry["sources"]
            # try to read the first (primary) source's text
            text = helpers.readFromAssignment(student, assign.name, sources[0])
            if text is None:
                continue
            # tokenize and store under the primary source's sanitized name
            result = tokenize(text)
            outName = common.makeFilenameSafe(sources[0]) + "vted.txt"
            helpers.writeToPreprocessed(result, student, assign.name, outName)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def runAssignment(assignment, students, args, helpers):
    """Cluster students with identical preprocessed-file hashes.

    Hashes each student's preprocessed text per configured file; identical
    hashes share a Cluster (score 100 = exact collision). Clusters from all
    files are written once per assignment; clustersToStandardJSON drops any
    cluster with fewer than two members.
    """
    clusters = {}
    allowPartners = assignment.args["allowPartners"]

    # for each file
    files = assignment.args["files"]
    for filename in files:
        # preprocessed name is the same for every student; compute once per file
        safeFilename = common.makeFilenameSafe(filename) + args["sourceSuffix"]

        # find collisions
        for student in students:
            studentText = helpers.readFromPreprocessed(student, assignment.name, safeFilename)
            if studentText is not None:
                hashedText = hashText(studentText)

                # lazily create one cluster per distinct hash
                if hashedText not in clusters:
                    clusters[hashedText] = common.Cluster(allowPartners, filename, 100)

                member = common.Member(student, assignment.name, helpers)
                clusters[hashedText].add(member)

    # process the clusters
    clusterArray = list(clusters.values())

    # write the results for this assignment
    # clustersToStandardJSON will make sure that there are at least two people in the cluster
    common.clustersToStandardJSON(clusterArray, assignment.name, args["resultsSuffix"], helpers)

    # say we're done with this assignment
    helpers.printf("Finished assignment '{}'...\n".format(assignment.name))
def doAssignment(students, assign, helpers):
    """Preprocess the primary MIPS source of each entry per student."""
    helpers.printf("processing '{}' in parellel...\n".format(assign.name))

    for student in students:
        for entry in assign.args["entries"]:
            sources = entry["sources"]
            # try to read the primary source's text
            text = helpers.readFromAssignment(student, assign.name, sources[0])
            if text is None:
                continue
            # run the MIPS processor and store the output
            processed = processMIPS(text)
            outName = common.makeFilenameSafe(sources[0]) + "mips.txt"
            helpers.writeToPreprocessed(processed, student, assign.name, outName)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def runFile(students, assign, helpers):
    """Extract identifiers from every configured file of every student."""
    helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

    for student in students:
        files = assign.args["files"]
        for filename in files:
            # get the path to the raw submission
            path = helpers.getAssignmentPath(student, assign.name, filename)
            if path is None:
                continue
            # tokenize out the identifiers and persist them
            text = tokenize(path)
            safeFilename = common.makeFilenameSafe(filename) + "identifiers.txt"
            helpers.writeToPreprocessed(text, student, assign.name, safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def runAssignment(students, assign, helpers):
    """Compute and store stats JSON for each student's configured files."""
    helpers.printf("Processing assignment '{}' in parellel...\n".format(assign.name))

    for student in students:
        files = assign.args["files"]
        for filename in files:
            # locate the raw submission
            path = helpers.getAssignmentPath(student, assign.name, filename)
            if path is None:
                continue
            # generate the stats, serialize as indented JSON, and persist
            stats = genStats(path, helpers)
            jsonText = io.getJSONString(stats, True)
            safeFilename = common.makeFilenameSafe(filename) + "stats.json"
            helpers.writeToPreprocessed(jsonText, student, assign.name, safeFilename)

    # all done
    helpers.printf("Finished '{}'!\n".format(assign.name))
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
    """Score every student pair per file through an inverted index.

    For each configured file: builds a key->students InvertedIndex from the
    preprocessed text, prunes keys shared by more than the threshold number
    of students, weights the remainder, scores each student against all
    others (optionally excluding a declared partner), writes the raw pair
    results, then writes a second copy with scores normalized to 0-100.
    """
    assignName = assignment.name
    files = assignment.args["files"]
    allowPartners = assignment.args["allowPartners"]
    # threshold is a fraction of the class size
    threshold = args["threshold"] * float(len(students))
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]

    helpers.printf("Running assignment '{}' in parellel...\n".format(assignName))

    for filename in files:
        # the preprocessed filename is identical for every student
        safeFilename = common.makeFilenameSafe(filename) + sourceSuffix

        # build the index
        index = InvertedIndex()
        for student in students:
            # try to read the file
            text = helpers.readFromPreprocessed(student, assignName, safeFilename)
            if text is not None:
                # generate the keys and add them to the index
                for key in genKeys(text):
                    index.add(key, student)

        # prune and weight
        index.prune(threshold)
        index.weight(weightFun, len(students))

        # build the denormalized pair results
        resultFilename = common.makeFilenameSafe(filename) + "raw_" + resultsSuffix
        results = common.PairResults(assignName, resultFilename, helpers)

        # a set gives O(1) duplicate checks (was a list with O(n) scans)
        seen = set()

        for student in students:
            # retrieve the keys
            text = helpers.readFromPreprocessed(student, assignName, safeFilename)
            if text is not None:
                keys = genKeys(text)

                # get the member (for the partner)
                member = common.Member(student, assignName, helpers)
                partner = member.partner

                # handle allowPartners
                if not allowPartners:
                    partner = None

                # get the score results
                studentResults = index.scoreStudent(student, partner, keys)

                # add to pair results, skipping students already fully emitted
                for other in studentResults:
                    if other not in seen:
                        results.add(common.PairResult(student, other, studentResults[other]))

            # prevent duplicate pairs in later iterations
            seen.add(student)

        # normalize the scores to range 0-100
        results.finish()

        biggest = 0.0
        for pair in results.iterate():
            if pair.score > biggest:
                biggest = float(pair.score)
        # guard against ZeroDivisionError when every score is zero
        if biggest == 0.0:
            biggest = 1.0

        # flush to disk
        finalResultFilename = common.makeFilenameSafe(filename) + resultsSuffix
        finalResults = common.PairResults(assignName, finalResultFilename, helpers)
        for pair in results.iterate():
            pair.score = (float(pair.score) / biggest) * 100.0
            finalResults.add(pair)
        finalResults.finish()

    # all done
    helpers.printf("Finished '{}'!\n".format(assignName))
def runAssignment(students, assignment, args, helpers, weightFun, genKeys):
    """Score student pairs per file by summing three inverted-index passes.

    Builds one InvertedIndex per data type (tokenized text, identifiers,
    literals), each with its own weight/key-generation function, sums the
    three per-student score maps, writes the raw pair results, then writes
    a normalized (0-100) copy.
    """
    assignName = assignment.name
    files = assignment.args["files"]
    allowPartners = assignment.args["allowPartners"]
    # threshold is a fraction of the class size
    threshold = args["threshold"] * float(len(students))
    sourceSuffixes = ["tokenized.txt", "identifiers.txt", "literals.txt"]
    resultsSuffix = args["resultsSuffix"]

    helpers.printf("Running assignment '{}' in parellel...\n".format(assignName))

    for filename in files:
        indexes = [InvertedIndex(), InvertedIndex(), InvertedIndex()]

        # for each type of data, build/prune/weight its index
        for i in range(3):
            sourceSuffix = sourceSuffixes[i]
            curWeightFun = weightFun[i]
            curGenKeys = genKeys[i]
            index = indexes[i]

            for student in students:
                # try to read the file
                safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
                text = helpers.readFromPreprocessed(student, assignName, safeFilename)
                if text is not None:
                    # generate the keys and add them to the index
                    for key in curGenKeys(text):
                        index.add(key, student)

            # prune and weight
            index.prune(threshold)
            index.weight(curWeightFun, len(students))

        # build the denormalized pair results
        resultFilename = common.makeFilenameSafe(filename) + "raw_" + resultsSuffix
        results = common.PairResults(assignName, resultFilename, helpers)

        # a set gives O(1) duplicate checks (was a list with O(n) scans)
        seen = set()

        for student in students:
            combined = {}

            for i in range(3):
                # retrieve the keys for this data type
                safeFilename = common.makeFilenameSafe(filename) + sourceSuffixes[i]
                text = helpers.readFromPreprocessed(student, assignName, safeFilename)
                index = indexes[i]
                if text is not None:
                    keys = genKeys[i](text)

                    # get the member (for the partner)
                    member = common.Member(student, assignName, helpers)
                    partner = member.partner

                    # handle allowPartners
                    if not allowPartners:
                        partner = None

                    # get the score results and fold them into the combined map
                    studentResults = index.scoreStudent(student, partner, keys)
                    for other in studentResults:
                        combined[other] = combined.get(other, 0) + studentResults[other]

            # add to pair results, skipping students already fully emitted
            for other in combined:
                if other not in seen:
                    results.add(common.PairResult(student, other, combined[other]))

            # prevent duplicate pairs in later iterations
            seen.add(student)

        # normalize the scores to range 0-100
        results.finish()

        biggest = 0.0
        for pair in results.iterate():
            if pair.score > biggest:
                biggest = float(pair.score)
        # guard against ZeroDivisionError when every score is zero
        if biggest == 0.0:
            biggest = 1.0

        # flush to disk
        finalResultFilename = common.makeFilenameSafe(filename) + resultsSuffix
        finalResults = common.PairResults(assignName, finalResultFilename, helpers)
        for pair in results.iterate():
            pair.score = (float(pair.score) / biggest) * 100.0
            finalResults.add(pair)
        finalResults.finish()

    # all done
    helpers.printf("Finished '{}'!\n".format(assignName))
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Take the top/bottom percent of one file's pair results and cluster them.

    Loads and sorts the pair results, reports mean/deviation, then walks the
    sorted (optionally reversed for "top") list taking entries whose pair is
    not an allowed partnership, up to percent-of-data (capped by the optional
    "maxResults"). Survivors are clustered, optionally pair-grouped, and
    written as standard JSON.
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    percent = float(args["percent"]) / 100.0
    top = args["top"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects, collecting garbage periodically to bound RAM
        i = 0
        for pair in rawData.iterate():
            data.append(pair)
            i += 1
            if i % 100000 == 0:
                gc.collect()

        # sort them (sortFun is a cmp-style comparator — Python 2 calling convention)
        data.sort(sortFun)

        # calculate and print stats
        mean = getMean(data)
        dev = getDeviation(data, mean)
        helpers.printf("{}/{} mean: {}, std. devation: {}\n".format(assignName, filename, mean, dev))

        # take the top/bottom percent, optionally capped by "maxResults"
        takeNum = math.floor(float(len(data)) * percent)
        if "maxResults" in args:
            takeNum = min(args["maxResults"], takeNum)

        if top:
            data = data[::-1]  # conveniently reverse

        results = []
        taken = 0
        index = 0

        # bound by len(data): skipping partner pairs can exhaust the list before
        # takeNum entries are taken (previously this raised IndexError)
        while taken < takeNum and index < len(data):
            current = data[index]
            member1 = common.Member(current.pair[0], assignName, helpers)
            member2 = common.Member(current.pair[1], assignName, helpers)
            cluster = common.Cluster(allowPartners, filename, current.score)
            cluster.add(member1)
            cluster.add(member2)

            if not cluster.hasCheating():
                # students are partners, ignore
                index += 1
                continue

            # take this entry
            taken += 1
            index += 1
            results.append(current)

            if index % 50000 == 0:
                gc.collect()

        # create the clusters
        clusters = createClusters(results, filename, assignName, allowPartners, helpers)

        # group pairs if necessary ("in" replaces the Python2-only has_key())
        if "groupPairs" in args and args["groupPairs"] == True:
            clusters = common.groupPairClusters(clusters, top)

        # free up RAM
        gc.collect()

        # flush to disk
        common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}/{}', with {} results!\n".format(assignName, filename, len(clusters)))
def run(students, assignments, args, helpers):
    # Copies each student's newest archive submission into the student's
    # working directory, extracts it ("tar" is the only supported input
    # type), mangles every configured file, then removes the copied
    # archives and restores the working directory. Always returns True.
    #
    # NOTE(review): relies on Python 2 (`print` statement) and shell=True
    # subprocess strings built from student names/args — assumes those are
    # trusted; verify upstream.
    # for each assignment
    for assign in assignments:
        # for each student
        for student in students:
            # remember where we started so we can chdir back at the end
            cwd = os.getcwd()
            helpers.makeStudentPath(student, assign.name)
            os.chdir(helpers.getStudentPath(student, assign.name))
            # copy all of this student's archives into the student dir
            command = "cp ../{0}*{1} .".format(student, args['input'])
            p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            o, e = p.communicate()
            newest = ""
            newestNum = 0
            for f in os.listdir(os.getcwd()):
                m = re.match("{0}".format(student), f)
                # Given the set of student submissions, pick the newest
                if (m):
                    # NOTE(review): the "_"-delimited token is kept as a
                    # string, so the comparison below against an int (and
                    # between strings) is lexicographic, not numeric —
                    # e.g. "9" > "10"; presumably int() was intended.
                    num = f.split('_')[1]
                    if (num > newestNum):
                        newestNum = num
                        newest = f
            if (newest != ""):
                if (args['input'] == "tar"):
                    command = "tar xf {0}".format(newest)
                else:
                    # NOTE(review): on an unsupported input type, `command`
                    # still holds the earlier cp line and is re-executed below.
                    print "INPUT TYPE NOT SUPPORTED - {0}".format(args['input'])
                p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                o, e = p.communicate()
                # for each specificied file
                files = assign.args["files"]
                for filename in files:
                    # read the raw text
                    rawText = helpers.readFromAssignment(student, assign.name, filename)
                    if rawText != None:
                        # make a friendly filename for saving
                        safeFilename = common.makeFilenameSafe(filename)
                        # mangle it, write the mangled text
                        mangle(rawText, student, assign.name, safeFilename, helpers)
                # Delete the other input files then go back
                command = "rm {0}*{1}".format(student, args['input'])
                p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                o, e = p.communicate()
            os.chdir(cwd)
    # all done
    return True
def runEntry(filename, students, helpers, assignment, args, allowPartners):
    """Cluster the top/bottom percent of one file's pair results.

    Sorts the stored pair results, logs mean/deviation, then takes up to
    percent-of-data entries (capped by optional "maxResults"), skipping
    pairs that are legitimate partnerships. The taken entries are turned
    into clusters, optionally pair-grouped, and flushed as standard JSON.
    """
    # get the data
    assignName = assignment.name
    sourceSuffix = args["sourceSuffix"]
    resultsSuffix = args["resultsSuffix"]
    percent = float(args["percent"]) / 100.0
    top = args["top"]

    safeFilename = common.makeFilenameSafe(filename) + sourceSuffix
    filepath = helpers.getProcessedPath(assignName, safeFilename)

    if filepath is not None:
        rawData = common.PairResults(assignName, safeFilename, helpers)
        data = []

        # convert into python objects; periodic gc keeps peak memory down
        i = 0
        for pair in rawData.iterate():
            data.append(pair)
            i += 1
            if i % 100000 == 0:
                gc.collect()

        # sort them (sortFun is a cmp-style comparator — Python 2 calling convention)
        data.sort(sortFun)

        # calculate and print stats
        mean = getMean(data)
        dev = getDeviation(data, mean)
        helpers.printf("{}/{} mean: {}, std. devation: {}\n".format(assignName, filename, mean, dev))

        # take the top/bottom percent, optionally capped by "maxResults"
        takeNum = math.floor(float(len(data)) * percent)
        if "maxResults" in args:
            takeNum = min(args["maxResults"], takeNum)

        if top:
            data = data[::-1]  # conveniently reverse

        results = []
        taken = 0
        index = 0

        # bound by len(data): skipping partner pairs can exhaust the list before
        # takeNum entries are taken (previously this raised IndexError)
        while taken < takeNum and index < len(data):
            current = data[index]
            member1 = common.Member(current.pair[0], assignName, helpers)
            member2 = common.Member(current.pair[1], assignName, helpers)
            cluster = common.Cluster(allowPartners, filename, current.score)
            cluster.add(member1)
            cluster.add(member2)

            if not cluster.hasCheating():
                # students are partners, ignore
                index += 1
                continue

            # take this entry
            taken += 1
            index += 1
            results.append(current)

            if index % 50000 == 0:
                gc.collect()

        # create the clusters
        clusters = createClusters(results, filename, assignName, allowPartners, helpers)

        # group pairs if necessary ("in" replaces the Python2-only has_key())
        if "groupPairs" in args and args["groupPairs"] == True:
            clusters = common.groupPairClusters(clusters, top)

        # free up RAM
        gc.collect()

        # flush to disk
        common.clustersToStandardJSON(clusters, assignName, common.makeFilenameSafe(filename) + resultsSuffix, helpers)

        # all done!
        helpers.printf("Finished '{}/{}', with {} results!\n".format(assignName, filename, len(clusters)))