def plot_score_distribution(summary_df: pd.DataFrame):
    logger.debug('plot score distributions')

    summary_df = summary_df.sort_values(by='score')
    max_score = summary_df['max_score'].max()

    plt.clf()
    ax = plt.gca()
    try:
        sns.distplot(
            summary_df[~summary_df['student_id'].duplicated(keep='first')]['score'],
            rug=True, fit=norm, bins=int(max_score), ax=ax
        )
    except LinAlgError as error:
        logger.warning(f'unable to plot score distribution: {error}')

    ax.set_xlim(0, max_score)
    ax.set_xlabel('score')
    ax.set_ylabel('share')
    ax.set_title('score distribution without duplicates (takes lower score)')
    plt.tight_layout()

    with io.BytesIO() as buffer:
        plt.savefig(buffer, format='svg', transparent=False)
        return buffer.getvalue()

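# Usage sketch (not part of the original module): plot_score_distribution returns raw
# SVG bytes, so a caller may either base64-embed them (as cmd_summary does via b64str)
# or write them straight to disk. The helper name and output path below are hypothetical.
def _demo_export_score_distribution(summary_df: pd.DataFrame, out='score_distribution.svg'):
    svg_bytes = plot_score_distribution(summary_df)
    Path(out).write_bytes(svg_bytes)
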
def _apply_cases(self, state: Dict) -> List[Result]:
    state = state.copy()
    results = []

    # prepare import filter
    if_regex, if_blacklist = self._variables.get('IMPORT_FILTER', (None, None))

    for i, case in enumerate(self._cases.values(), start=1):
        logger.debug(f'[{i}/{len(self._cases)}] execute {case}')

        with io.StringIO() as stdout, io.StringIO() as stderr:
            with ExitStack() as es:
                if if_regex is not None:
                    es.enter_context(import_filter(if_regex, blacklist=if_blacklist))
                es.enter_context(capture_output(stdout, stderr))

                achieved, msg = case(state)

            results.append(Result(
                id=case.id,
                label=case.label,
                target=case.targets,
                score=achieved,
                score_max=case.score,
                messages=[msg],
                stdout=stdout.getvalue(),
                stderr=stderr.getvalue()
            ))

    logger.debug('testing completed')
    return results

def summarize_results(results, filenames=None, include_tasks=False) -> pd.DataFrame:
    task_names = []
    if include_tasks:
        task_keys = []
        logger.debug(f'result fields: {list(results[0].results[0].__dict__.keys())}')
        for res in results[0].results:
            task_keys.append(res.id)
            if len(res.label) > 0:
                task_names.append(str(res.label))
            else:
                task_names.append(str(res.target))

    logger.debug(f'summarize {len(results)} results')

    header = ['student_id', 'last_name', 'first_name', 'score', 'max_score',
              'patches', 'checksum', *task_names, 'filename']

    if filenames is None:
        filenames = ["" for _ in range(len(results))]

    def row_factory():
        for r, filename in zip(results, filenames):
            stuff = []
            if include_tasks:
                for key in task_keys:
                    for task in r.results:
                        if task.id == key:
                            stuff.append(task.score)
                            break
                    else:
                        stuff.append(0)

            for member in r.team_members:
                s = r.summary()
                yield (member.student_id, member.last_name, member.first_name,
                       s.score, s.score_max, len(r.applied_patches), r.checksum,
                       *stuff, filename)

    summary_df = pd.DataFrame(row_factory(), columns=header).sort_values(by='last_name')
    logger.debug(f'summary columns: {list(summary_df.columns)}')

    summary_df['multiple_submissions'] = summary_df['student_id'].duplicated(keep=False)

    if not math.isclose(summary_df['max_score'].std(), 0):
        logger.warning('max scores seem not to be consistent!')

    return summary_df

def execute(self, args=None):
    """
    Commandline interface for notebook test. Call with `--help` flag to get further information.

    :param args: optional arguments, uses `sys.argv` by default
    :return: number of failed tests
    """
    parser = argparse.ArgumentParser(description='run tests on jupyter notebook')
    parser.add_argument('notebook', type=str, help='the jupyter notebook to test')
    parser.add_argument('-t', '--target', type=str, metavar='', help='where to store results')
    parser.add_argument('-c', '--context', type=str, metavar='', help='context directory')
    parser.add_argument('-v', '--verbose', action='count', default=0, help='verbosity level')
    args = parser.parse_args(args)

    logger.setLevel(loglevel(args.verbose))
    logger.debug(f'args: {args}')

    results = self._grade_notebook(
        Path(args.notebook).absolute(),
        target_dir=Path(args.target).absolute() if args.target else None,
        context=Path(args.context).absolute() if args.context else None
    )

    return results.summary().failed

def inject_patch(results: Results, path='.', prefix: str = 'results') -> Path:
    """Store results as patch in mounted results archive"""
    path = Path(path)
    ct = len(list(path.glob(f'{prefix}_patch*.json')))

    with cd(path):
        with open(f'{prefix}_patch_{ct + 1:02d}.json', mode='wt') as f:
            json.dump(results.to_dict(), f, indent=4)

        # update report if it exists
        if Path('report.html').exists():
            results = load_patched()
            logger.debug(f'update report for {results.checksum}')

            with open('report.html', mode='wt') as f:
                f.write(render(
                    'report.html',
                    title='report',
                    id=results.checksum,
                    results={results.checksum: results},
                    summary=results.summary()
                ))

    return path

def run(path_nb_):
    if args.backend is None:
        cmd = [
            'python', f'"{path_tst}"', f'"{path_nb_}"',
            '-t', f'"{path_tgt}"',
            *(('-c', f'"{path_cxt}"') if path_cxt else ()),
            *(('-' + 'v' * args.verbose,) if args.verbose > 0 else ())
        ]
    elif 'docker' in args.backend:
        cmd = [
            'docker', 'run',
            '-v', f'"{path_tst}:/autograde/test.py"',
            '-v', f'"{path_nb_}:/autograde/notebook.ipynb"',
            '-v', f'"{path_tgt}:/autograde/target"',
            *(('-v', f'"{path_cxt}:/autograde/context:ro"') if path_cxt else ()),
            *(('-u', str(os.geteuid())) if 'rootless' not in args.backend else ()),
            args.tag,
            *(('-' + 'v' * args.verbose,) if args.verbose > 0 else ())
        ]
    elif args.backend == 'podman':
        cmd = [
            'podman', 'run',
            '-v', f'"{path_tst}:/autograde/test.py"',
            '-v', f'"{path_nb_}:/autograde/notebook.ipynb"',
            '-v', f'"{path_tgt}:/autograde/target"',
            *(('-v', f'"{path_cxt}:/autograde/context"') if path_cxt else ()),
            args.tag,
            *(('-' + 'v' * args.verbose,) if args.verbose > 0 else ())
        ]
    else:
        raise ValueError(f'unknown backend: {args.backend}')

    logger.info(f'test: {path_nb_}')
    logger.debug('run: ' + ' '.join(cmd))

    # both the plain python and the container commands are executed through the shell
    return subprocess.call(' '.join(cmd), shell=True)

def plot_fraud_matrix(sources: Dict[str, str]) -> bytes:
    logger.debug('apply fraud detection')

    hashes = sorted(sources)
    diffs = pd.DataFrame(np.NaN, index=hashes, columns=hashes)

    for h in hashes:
        diffs.loc[h][h] = 1.

    for (ha, ca), (hb, cb) in combinations(sources.items(), 2):
        diffs.loc[ha][hb] = diffs.loc[hb][ha] = SequenceMatcher(a=ca, b=cb).ratio()

    plt.clf()
    ax = sns.heatmap(diffs, vmin=0., vmax=1., xticklabels=True, yticklabels=True)
    ax.set_title('similarity of notebook code')

    with io.BytesIO() as buffer:
        plt.savefig(buffer, format='svg', transparent=False)
        return buffer.getvalue()

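# Usage sketch (not part of the original module): plot_fraud_matrix expects a mapping
# from notebook checksum to the executed source code, as cmd_summary collects from each
# archive's code.py. The checksums and snippets below are made up for illustration.
def _demo_fraud_matrix() -> bytes:
    sources = {
        'aaaa1111': 'import math\nprint(math.pi)\n',
        'bbbb2222': 'import math\nprint(math.pi)  # nearly identical submission\n',
        'cccc3333': 'print("something entirely different")\n',
    }
    return plot_fraud_matrix(sources)
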
def cmd_summary(args):
    """Generate human & machine readable summary of results"""
    from autograde.cli.util import (load_patched, render, list_results, merge_results, b64str,
                                    plot_fraud_matrix, plot_score_distribution, summarize_results)

    path = Path(args.result or Path.cwd()).expanduser().absolute()
    assert path.is_dir(), f'{path} is not a directory'

    include_similarities = args.similarities

    results = list()
    sources = dict()
    filenames = []
    for path_ in list_results(path):
        logger.debug(f'read {path_}')
        filenames.append(path_.absolute())

        with mount_tar(path_) as tar, cd(tar):
            r = load_patched()
            results.append(r)

            with open('code.py', mode='rt') as f:
                sources[r.checksum] = f.read()

    # merge results
    results_df = merge_results(results)
    logger.debug('store raw.csv')
    results_df.to_csv(path.joinpath('raw.csv'), index=False)

    # summarize results
    summary_df = summarize_results(results, filenames, include_tasks=True)
    logger.debug('store summary.csv')
    summary_df.to_csv(path.joinpath('summary.csv'), index=False)

    # the similarity (fraud) matrix is expensive, so it is only rendered on demand;
    # without --similarities the template is expected to skip the missing plot
    if include_similarities:
        plots = dict(distribution=b64str(plot_score_distribution(summary_df)),
                     similarities=b64str(plot_fraud_matrix(sources)))
    else:
        plots = dict(distribution=b64str(plot_score_distribution(summary_df)),
                     similarities=None)

    logger.info('render summary.html')
    with open(path.joinpath('summary.html'), mode='wt') as f:
        f.write(render('summary.html', title='summary', summary=summary_df, plots=plots))

    return 0

def cmd_build(args):
    """Build autograde container image for specified backend"""
    if args.backend is None:
        logger.warning('no backend specified')
        return 1

    if args.requirements:
        with Path(args.requirements).open(mode='rt') as f:
            requirements = list(filter(lambda s: s, map(str.strip, f.read().split('\n'))))
    else:
        requirements = []

    with TemporaryDirectory() as tmp:
        logger.debug(f'copy source to {tmp}')
        shutil.copytree('.', tmp, dirs_exist_ok=True)

        if requirements:
            logger.info(f'add additional requirements: {requirements}')
            with Path(tmp).joinpath('requirements.txt').open(mode='w') as f:
                logger.debug('add additional requirements: ' + ' '.join(requirements))
                f.write('\n'.join(requirements))

        if 'docker' in args.backend:
            cmd = ['docker', 'build', '-t', args.tag, tmp]
        elif args.backend == 'podman':
            cmd = ['podman', 'build', '-t', args.tag, '--cgroup-manager=cgroupfs', tmp]
        else:
            raise ValueError(f'unknown backend: {args.backend}')

        logger.debug('run: ' + ' '.join(cmd))
        return subprocess.run(cmd, capture_output=args.quiet).returncode

def route_settings():
    settings.update(**request.form)
    logger.debug(f'update settings: {settings}')
    return redirect(request.referrer)

def cmd_audit(args):
    """Launch a web interface for manually auditing test results"""
    import logging
    from flask import Flask, redirect, request
    import flask.cli as flask_cli
    from werkzeug.exceptions import HTTPException, InternalServerError

    from autograde.util import logger, parse_bool, timestamp_utc_iso, mount_tar

    with ExitStack() as exit_stack:
        # settings
        settings = AuditSettings()

        # mount & index all results
        mounts = OrderedDict()
        sources = dict()
        results = dict()
        for path in list_results(args.result):
            mount_path = Path(exit_stack.enter_context(mount_tar(path, mode='a')))

            r = load_patched(mount_path)
            results[r.checksum] = r
            mounts[r.checksum] = mount_path

            with mount_path.joinpath('code.py').open(mode='rt') as f:
                sources[r.checksum] = f.read()

        patched = set()
        next_ids = dict(zip(mounts, list(mounts)[1:]))
        prev_ids = dict(((b, a) for a, b in next_ids.items()))

        # create actual flask application
        app = Flask('autograde - audit')

        # monkey patching for nicer cli output
        flask_cli.show_server_banner = lambda *_, **__: logger.debug('suppress flask banner')
        app.logger = logger
        logging.root = logger

        @app.errorhandler(Exception)
        def handle_error(error):
            logger.warning(f'{type(error)}: {error}')
            error = error if isinstance(error, HTTPException) else InternalServerError()
            return render('error.html', title='Oooops', error=error), error.code

        @app.route('/')
        def route_root():
            return redirect('/audit')

        @app.route('/settings', methods=('POST',))
        def route_settings():
            settings.update(**request.form)
            logger.debug(f'update settings: {settings}')
            return redirect(request.referrer)

        @app.route('/audit', strict_slashes=False)
        @app.route('/audit/<string:id>')
        def route_audit(id=None):
            return render('audit.html', title='audit', settings=settings, results=results, id=id,
                          prev_id=prev_ids.get(id), next_id=next_ids.get(id),
                          patched=patched, mounts=mounts)

        @app.route('/patch', methods=('POST',))
        def route_patch():
            if (rid := request.form.get('id')) and (mount := mounts.get(rid)):
                scores = dict()
                comments = dict()
                r = deepcopy(results[rid])
                r.title = 'manual audit'
                r.timestamp = timestamp_utc_iso()

                # extract form data
                for key, value in request.form.items():
                    if key.startswith('score:'):
                        scores[key.split(':')[-1]] = math.nan if value == '' else float(value)
                    elif key.startswith('comment:'):
                        comments[key.split(':')[-1]] = value

                # update results
                modification_flag = False
                for result in r.results:
                    score = scores.get(result.id)
                    if score is not None and not math.isclose(score, result.score):
                        logger.debug(f'update score of result {result.id[:8]}')
                        result.score = score
                        modification_flag = True

                    if comment := comments.get(result.id):
                        logger.debug(f'update messages of result {result.id[:8]}')
                        result.messages.append(settings.format_comment(comment))
                        modification_flag = True

def route_stop():
    if func := request.environ.get('werkzeug.server.shutdown'):
        logger.debug('shutdown werkzeug server')
        func()

                # patch results back
                if modification_flag:
                    # update state & persist patch
                    inject_patch(r, mount)
                    results[rid] = results[rid].patch(r)
                    patched.add(rid)
                else:
                    logger.debug('no modifications were made')

                if next_id := next_ids.get(rid):
                    return redirect(f'/audit/{next_id}#edit')

            return redirect('/audit')

        @app.route('/report/<string:id>')
        def route_report(id):
            return render('report.html', title='report (preview)', id=id, results=results,
                          summary=results[id].summary())

        @app.route('/source/<string:id>')

# exec_notebook is consumed as a context manager (see _grade_notebook, which enters it via
# ExitStack.enter_context), hence the contextlib.contextmanager decorator on this generator
@contextmanager
def exec_notebook(notebook, file: TextIO = sys.stdout, cell_timeout: float = 0.,
                  ignore_errors: bool = False, variables: Dict = None):
    """
    Extract source code from jupyter notebook and execute it.

    :param notebook: file like with notebook data
    :param file: where to send stdout
    :param cell_timeout: timeout for cell execution, 0=∞
    :param ignore_errors: whether errors are raised or ignored
    :param variables: variables to be inserted into initial state
    :return: the state mutated by executed code
    """
    state = dict()
    variables = variables or {}
    state.update(deepcopy(variables))

    try:
        logger.debug('parse notebook')

        # when executed within a docker container, some minor warnings occur that we filter here
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            notebook = read(notebook, 4)
            shell = InteractiveShell.instance()

        # extract comment cells
        md_cells = [c.source for c in filter(lambda c: c.cell_type == 'markdown', notebook.cells)]

        # prepare code cells for execution
        def _code_cells():
            yield 'injected: setup', INJECT_BEFORE, 0

            for i, cell in enumerate(filter(lambda c: c.cell_type == 'code', notebook.cells)):
                # render code
                source = shell.input_transformer_manager.transform_cell(cell.source)
                yield (f'nb-{i + 1}',
                       f'{source.strip()}\n\n# injected by test\ndump_figure()',
                       cell_timeout)

            yield 'injected: teardown', INJECT_AFTER, 0

        code_cells = list(_code_cells())

    except Exception as error:
        logger.error(f'unable to parse notebook: {error}')
        raise ValueError(error)

    # prepare import filter
    if_regex, if_blacklist = variables.get('IMPORT_FILTER', (None, None))

    # the log is supposed to be a valid, standalone python script
    print('#!/usr/bin/env python3', file=file)

    # actual code execution
    with ExitStack() as shadow_stack:
        for i, (label, code, timeout_) in enumerate(code_cells, start=1):
            state.update({'__LABEL__': deepcopy(label), '__PLOT_REGISTRY__': []})

            with io.StringIO() as stdout, io.StringIO() as stderr:
                logger.debug(f'[{i}/{len(code_cells)}] execute cell ("{label}")')
                stopwatch = StopWatch()

                try:
                    with capture_output(stdout, stderr):
                        # actual execution that extends state
                        with ExitStack() as es:
                            if if_regex is not None and i > 1:
                                es.enter_context(import_filter(if_regex, blacklist=if_blacklist))
                            es.enter_context(timeout(timeout_))
                            es.enter_context(stopwatch)

                            shadow_stack.enter_context(shadowed_exec(code, state))

                except Exception as error:
                    # extend log with some meaningful error message
                    traceback.print_exception(type(error), error, error.__traceback__, file=stderr)

                    if not ignore_errors:
                        raise error

                finally:
                    # log code and output
                    with capture_output(file):
                        _label = f' CODE CELL {label} '
                        print(f'# {_label:-^78}')
                        print(str(code).strip())
                        print(f"\n# EXECUTED IN {stopwatch.duration_rel()[-1]:.3}s")

                        stdout_s = stdout.getvalue()
                        if stdout_s:
                            print('# STDOUT')
                            print(as_py_comment(stdout_s, 4))

                        stderr_s = stderr.getvalue()
                        if stderr_s:
                            print('# STDERR')
                            print(as_py_comment(stderr_s, 4))

                        print('\n')

    # add markdown comments to state
    state['__COMMENTS__'] = md_cells

    # add artifact loader
    state['__ARTIFACTS__'] = ArtifactLoader()

    logger.debug('execution completed')
    yield state

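# Usage sketch (not part of the original module): exec_notebook is entered as a context
# manager that yields the mutated execution state, mirroring how _grade_notebook uses it;
# the notebook path and the 30 s cell timeout below are hypothetical.
def _demo_exec_notebook(nb_path='notebook.ipynb'):
    with open(nb_path, mode='rt') as nb, open('code.py', mode='wt') as log:
        with exec_notebook(nb, file=log, cell_timeout=30., ignore_errors=True) as state:
            # e.g. inspect the markdown cells collected during execution
            return state.get('__COMMENTS__', [])
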
def cli(args=None):
    # environment variables
    verbosity = int(os.environ.get('AG_VERBOSITY', 0))
    container_backend = os.environ.get('AG_BACKEND', None)
    container_tag = os.environ.get('AG_TAG', 'autograde')

    # command line arguments
    parser = argparse.ArgumentParser(
        description='utility for grading jupyter notebooks',
        epilog='autograde on github: https://github.com/cssh-rwth/autograde',
        prog='autograde',
    )

    # global flags
    parser.add_argument('-v', '--verbose', action='count', default=verbosity,
                        help='verbosity level')
    parser.add_argument('--backend', type=str, default=container_backend,
                        choices=['docker', 'rootless-docker', 'podman'], metavar='',
                        help=f'container backend to use, default is {container_backend}')
    parser.add_argument('--tag', type=str, default=container_tag, metavar='',
                        help=f'container tag, default: "{container_tag}"')
    parser.set_defaults(func=cmd_version)

    subparsers = parser.add_subparsers(help='sub command help')

    # build sub command
    bld_parser = subparsers.add_parser('build', help=cmd_build.__doc__)
    bld_parser.add_argument('-r', '--requirements', type=Path, default=None,
                            help='additional requirements to install')
    bld_parser.add_argument('-q', '--quiet', action='store_true', help='mute output')
    bld_parser.set_defaults(func=cmd_build)

    # test sub command
    tst_parser = subparsers.add_parser('test', help=cmd_tst.__doc__)
    tst_parser.add_argument('test', type=str, help='autograde test script')
    tst_parser.add_argument('notebook', type=str, help='the jupyter notebook(s) to be tested')
    tst_parser.add_argument('-t', '--target', type=str, metavar='', help='where to store results')
    tst_parser.add_argument('-c', '--context', type=str, metavar='', help='context directory')
    tst_parser.set_defaults(func=cmd_tst)

    # patch sub command
    ptc_parser = subparsers.add_parser('patch', help=cmd_patch.__doc__)
    ptc_parser.add_argument('result', type=str, help='result archive(s) to be patched')
    ptc_parser.add_argument('patch', type=str, help='result archive(s) for patching')
    ptc_parser.set_defaults(func=cmd_patch)

    # audit sub command
    adt_parser = subparsers.add_parser('audit', help=cmd_audit.__doc__)
    adt_parser.add_argument('result', type=str, help='result archive(s) to audit')
    adt_parser.add_argument('-b', '--bind', type=str, default='127.0.0.1', help='host to bind to')
    adt_parser.add_argument('-p', '--port', type=int, default=5000, help='port')
    adt_parser.set_defaults(func=cmd_audit)

    # report sub command
    rpt_parser = subparsers.add_parser('report', help=cmd_report.__doc__)
    rpt_parser.add_argument('result', type=str, help='result archive(s) for creating the report')
    rpt_parser.set_defaults(func=cmd_report)

    # summary sub command
    sum_parser = subparsers.add_parser('summary', help=cmd_summary.__doc__)
    sum_parser.add_argument('--similarities', action='store_true',
                            help='activates notebook similarity computation')
    sum_parser.add_argument('result', type=str, help='result archives to summarize')
    sum_parser.set_defaults(func=cmd_summary)

    # version sub command
    vrs_parser = subparsers.add_parser('version', help=cmd_version.__doc__)
    vrs_parser.set_defaults(func=cmd_version)

    args = parser.parse_args(args)

    logger.setLevel(loglevel(args.verbose))
    logger.debug(f'default encoding: {sys.getdefaultencoding()}')
    logger.debug(f'args: {args}')

    return args.func(args)

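# Usage sketch (not part of the original module): cli accepts an explicit argument list,
# so the sub commands can also be driven programmatically; the results directory below is
# hypothetical and would have to contain result archives. The call is equivalent to running
# `autograde -vv summary --similarities results` on the command line.
def _demo_cli_summary(result_dir='results'):
    return cli(['-vv', 'summary', '--similarities', result_dir])
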
def _grade_notebook(self, nb_path, target_dir=None, context=None):
    target_dir = target_dir or os.getcwd()

    # prepare notebook
    with open(nb_path, mode='rb') as f:
        nb_data = f.read()

    nb_hash = sha256(nb_data).hexdigest()
    nb_hash_short = nb_hash[:8]

    with cd(target_dir):
        archive = Path(f'results_{nb_hash_short}.tar.xz')

        if archive.exists():
            logger.debug(f'remove existing {archive}')
            archive.unlink()

        with ExitStack() as exec_test_stack:
            tar = exec_test_stack.enter_context(mount_tar(archive, mode='w:xz'))
            exec_test_stack.enter_context(cd(tar))

            # store copy of notebook
            logger.debug('dump copy of original notebook')
            with open('notebook.ipynb', mode='wb') as f:
                f.write(nb_data)

            # prepare context and execute notebook
            with open('code.py', mode='wt') as c, cd('artifacts', mkdir=True):
                # prepare execution context in file system
                if context is not None:
                    logger.debug(f'copy context files from: {context}')
                    shutil.copytree(context, '.', dirs_exist_ok=True)

                # build index of all files known before execution
                index = set()
                for path in Path('.').glob('**/*'):
                    if path.is_file():
                        with path.open(mode='rb') as f:
                            index.add(sha256(f.read()).hexdigest())

                # actual notebook execution
                try:
                    logger.debug('execute notebook')
                    state = exec_test_stack.enter_context(exec_notebook(
                        io.StringIO(nb_data.decode('utf-8')),
                        file=c,
                        ignore_errors=True,
                        cell_timeout=self._cell_timeout,
                        variables=self._variables
                    ))
                except ValueError:
                    state = {}

                # remove files that haven't changed
                artifacts = []
                artifacts_excluded = []
                for path in Path('.').glob('**/*'):
                    if path.is_file():
                        delete_flag = False
                        with path.open(mode='rb') as f:
                            if sha256(f.read()).hexdigest() in index:
                                artifacts_excluded.append(str(path))
                                delete_flag = True
                            else:
                                artifacts.append(str(path))

                        if delete_flag:
                            path.unlink()

            # infer meta information
            group = list(map(lambda m: TeamMember(**m), state.get('team_members', [])))

            if not group:
                logger.warning(f'Couldn\'t find valid information about team members in "{nb_path}"')

            # execute tests
            logger.debug('execute tests')
            results = Results(
                title=self._title,
                notebook=str(nb_path),
                checksum=nb_hash,
                team_members=group,
                artifacts=sorted(artifacts),
                excluded_artifacts=sorted(artifacts_excluded),
                results=self._apply_cases(state)
            )

            # store results as json
            logger.debug('dump results as json')
            with open('results.json', mode='wt') as f:
                json.dump(results.to_dict(), fp=f, indent=4)

        # infer new, more readable name
        names = results.format_members(separator=',')
        archive_name_new = Path(f'results_[{names}]_{nb_hash_short}.tar.xz')

        if archive_name_new.exists():
            logger.debug(f'remove existing {archive_name_new}')
            archive_name_new.unlink()

        archive.rename(archive_name_new)

    return results