def test_wide_deep(notebooks, tmp):
    """Smoke-run the wide & deep notebook on MovieLens 100k for one epoch.

    ``tmp`` is passed to the notebook as both MODEL_DIR and EXPORT_DIR_BASE.
    Only loose bounds on the recorded metrics are asserted.
    """
    notebook_params = dict(
        MOVIELENS_DATA_SIZE="100k",
        EPOCHS=1,
        EVALUATE_WHILE_TRAINING=False,
        MODEL_DIR=tmp,
        EXPORT_DIR_BASE=tmp,
        RATING_METRICS=["rmse", "mae"],
        RANKING_METRICS=["ndcg_at_k", "precision_at_k"],
    )
    pm.execute_notebook(
        notebooks["wide_deep"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=notebook_params,
    )
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe
    results = recorded.set_index("name")["value"]

    # With one epoch on a small dataset the scores depend heavily on the
    # random initial weights, so the smoke test only checks that training
    # produced metrics within very loose limits.
    for error_metric in ("rmse", "mae"):
        assert results[error_metric] < 2.0
    for ranking_metric in ("ndcg_at_k", "precision_at_k"):
        assert results[ranking_metric] > 0.0
def test_notebook_dkn(notebooks):
    """Run the DKN quickstart notebook for one epoch and check AUC / accuracy."""
    pm.execute_notebook(
        notebooks["dkn_quickstart"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters={"epoch": 1},
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # The notebook records both metrics under the single "res" entry.
    for metric, expected in (("auc", 0.4707), ("acc", 0.5725)):
        assert results["res"][metric] == pytest.approx(expected, rel=TOL, abs=ABS_TOL)
def test_template_runs(notebooks):
    """Execute the template notebook and verify its two recorded values."""
    pm.execute_notebook(
        notebooks["template"],
        OUTPUT_NOTEBOOK,
        parameters={"PM_VERSION": pm.__version__},
        kernel_name=KERNEL_NAME,
    )
    records = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe

    # Exactly two rows are expected in the recorded dataframe.
    assert records.shape[0] == 2

    is_version_row = records["name"] == "checked_version"
    check_version = records.loc[is_version_row, "value"].values[0]
    assert check_version is True
def test_fastai_integration(notebooks, size, epochs, expected_values):
    """Run the fastai notebook with the given settings and compare its metrics.

    Args:
        notebooks: mapping of notebook key to notebook path; "fastai" is used.
        size: value passed to the notebook as MOVIELENS_DATA_SIZE.
        epochs: value passed to the notebook as EPOCHS.
        expected_values: mapping of recorded metric name to expected value,
            compared with ``pytest.approx`` using TOL / ABS_TOL.
    """
    notebook_path = notebooks["fastai"]
    # The notebook is executed once, with the test parameters. A previous
    # extra run without parameters wrote to the same OUTPUT_NOTEBOOK and was
    # overwritten before being read, so it only added execution time.
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs),
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    for key, value in expected_values.items():
        assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
def test_mmlspark_lightgbm_criteo_smoke(notebooks):
    """Smoke-run the MMLSpark LightGBM Criteo notebook and check the AUC."""
    smoke_params = {
        "DATA_SIZE": "sample",
        "NUM_ITERATIONS": 50,
        "EARLY_STOPPING_ROUND": 10,
    }
    pm.execute_notebook(
        notebooks["mmlspark_lightgbm_criteo"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=smoke_params,
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
    assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL)
def test_ncf_smoke(notebooks):
    """Smoke-run the NCF notebook (100k, one epoch) and bound its ranking metrics."""
    pm.execute_notebook(
        notebooks["ncf"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters={
            "TOP_K": 10,
            "MOVIELENS_DATA_SIZE": "100k",
            "EPOCHS": 1,
            "BATCH_SIZE": 256,
        },
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # Results vary too much between runs for an approximate-equality check,
    # so each metric is only compared against an upper limit.
    upper_limits = (
        ("map", 0.05),
        ("ndcg", 0.20),
        ("precision", 0.17),
        ("recall", 0.10),
    )
    for metric, limit in upper_limits:
        assert results[metric] < limit
def test_is_jupyter():
    """Check is_jupyter / is_databricks both in this process and inside a notebook."""
    # Called directly from the test process (terminal).
    assert is_jupyter() is False
    assert is_databricks() is False

    # Called from within an executed Jupyter notebook.
    path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
    pm.execute_notebook(path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
    records = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe

    def recorded(record_name):
        # First value recorded by the notebook under ``record_name``.
        return records.loc[records["name"] == record_name, "value"].values[0]

    assert recorded("is_jupyter") is True
    assert recorded("is_databricks") is False
def test_fastai(notebooks):
    """Run the fastai notebook on MovieLens 100k for one epoch and check all metrics."""
    pm.execute_notebook(
        notebooks["fastai"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters={"TOP_K": 10, "MOVIELENS_DATA_SIZE": "100k", "EPOCHS": 1},
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # (metric name, expected value) pairs, checked in this order.
    expected = (
        ("rmse", 0.959352),
        ("mae", 0.766504),
        ("rsquared", 0.287902),
        ("exp_var", 0.289008),
        ("map", 0.024379),
        ("ndcg", 0.148380),
        ("precision", 0.138494),
        ("recall", 0.058747),
    )
    for metric, target in expected:
        assert results[metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_wide_deep(notebooks, size, epochs, expected_values, tmp):
    """Run the wide & deep notebook with the given size / epochs and compare metrics.

    ``tmp`` is passed to the notebook as both MODEL_DIR and EXPORT_DIR_BASE;
    ``expected_values`` maps recorded metric names to their expected values.
    """
    notebook_params = dict(
        MOVIELENS_DATA_SIZE=size,
        EPOCHS=epochs,
        EVALUATE_WHILE_TRAINING=False,
        MODEL_DIR=tmp,
        EXPORT_DIR_BASE=tmp,
        RATING_METRICS=["rmse", "mae", "rsquared", "exp_var"],
        RANKING_METRICS=["ndcg_at_k", "map_at_k", "precision_at_k", "recall_at_k"],
    )
    pm.execute_notebook(
        notebooks["wide_deep"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=notebook_params,
    )
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe
    results = recorded.set_index("name")["value"]

    for metric, target in expected_values.items():
        assert results[metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_notebook_xdeepfm(notebooks):
    """Run the xDeepFM quickstart notebook and check AUC / logloss for both runs."""
    run_params = {
        "EPOCHS_FOR_SYNTHETIC_RUN": 20,
        "EPOCHS_FOR_CRITEO_RUN": 1,
        "BATCH_SIZE_SYNTHETIC": 128,
        "BATCH_SIZE_CRITEO": 512,
    }
    pm.execute_notebook(
        notebooks["xdeepfm_quickstart"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=run_params,
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # (record name, metric name, expected value), checked in this order.
    expected = (
        ("res_syn", "auc", 0.982),
        ("res_syn", "logloss", 0.2306),
        ("res_real", "auc", 0.628),
        ("res_real", "logloss", 0.5589),
    )
    for record, metric, target in expected:
        assert results[record][metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_als_pyspark_smoke(notebooks):
    """Smoke-run the ALS PySpark notebook on MovieLens 100k and check its metrics."""
    pm.execute_notebook(
        notebooks["als_pyspark"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters={"TOP_K": 10, "MOVIELENS_DATA_SIZE": "100k"},
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # The Spark session is stopped before the assertions run.
    start_or_get_spark("ALS PySpark").stop()

    expected = (
        ("map", 0.0052),
        ("ndcg", 0.0463),
        ("precision", 0.0487),
        ("recall", 0.0177),
        ("rmse", 0.9636),
        ("mae", 0.7508),
        ("exp_var", 0.2672),
        ("rsquared", 0.2611),
    )
    for metric, target in expected:
        assert results[metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_als_pyspark_integration(notebooks):
    """Run the ALS PySpark notebook on MovieLens 1m and check its metrics."""
    pm.execute_notebook(
        notebooks["als_pyspark"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters={"TOP_K": 10, "MOVIELENS_DATA_SIZE": "1m"},
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]

    # The Spark session is stopped before the assertions run.
    start_or_get_spark("ALS PySpark").stop()

    expected = (
        ("map", 0.00201),
        ("ndcg", 0.02516),
        ("precision", 0.03172),
        ("recall", 0.009302),
        ("rmse", 0.8621),
        ("mae", 0.68023),
        ("exp_var", 0.4094),
        ("rsquared", 0.4038),
    )
    for metric, target in expected:
        assert results[metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_lightgcn_deep_dive_integration(notebooks, yaml_file, data_path, size, epochs, batch_size, expected_values, seed):
    """Run the LightGCN deep-dive notebook and compare recorded metrics.

    The embedding output files are placed under ``data_path``;
    ``expected_values`` maps recorded metric names to their expected values.
    """
    user_embeddings = os.path.join(data_path, r"user_embeddings")
    item_embeddings = os.path.join(data_path, r"item_embeddings")
    notebook_params = {
        "TOP_K": 10,
        "MOVIELENS_DATA_SIZE": size,
        "EPOCHS": epochs,
        "BATCH_SIZE": batch_size,
        "SEED": seed,
        "yaml_file": yaml_file,
        "user_file": user_embeddings,
        "item_file": item_embeddings,
    }
    pm.execute_notebook(
        notebooks["lightgcn_deep_dive"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=notebook_params,
    )
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe
    results = recorded.set_index("name")["value"]

    for metric, target in expected_values.items():
        assert results[metric] == pytest.approx(target, rel=TOL, abs=ABS_TOL)
def test_npa_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
    """Run the NPA quickstart notebook and compare the four ranking metrics.

    ``expected_values`` maps a recorded result name to a dict holding
    "group_auc", "mean_mrr", "ndcg@5" and "ndcg@10".
    """
    pm.execute_notebook(
        notebooks["npa_quickstart"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=dict(epochs=epochs, seed=seed, MIND_type=MIND_type),
    )
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe
    results = recorded.set_index("name")["value"]

    metric_names = ("group_auc", "mean_mrr", "ndcg@5", "ndcg@10")
    for key, value in expected_values.items():
        for metric in metric_names:
            assert results[key][metric] == pytest.approx(value[metric], rel=TOL, abs=ABS_TOL)
def test_slirec_quickstart_integration(notebooks, yaml_file, data_path, epochs, batch_size, expected_values, seed):
    """Run the SLi-Rec quickstart notebook and compare AUC / logloss.

    ``expected_values`` maps a recorded result name to a dict holding
    "auc" and "logloss".
    """
    notebook_params = dict(
        yaml_file=yaml_file,
        data_path=data_path,
        EPOCHS=epochs,
        BATCH_SIZE=batch_size,
        RANDOM_SEED=seed,
    )
    pm.execute_notebook(
        notebooks["slirec_quickstart"],
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=notebook_params,
    )
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe
    results = recorded.set_index("name")["value"]

    for key, value in expected_values.items():
        for metric in ("auc", "logloss"):
            assert results[key][metric] == pytest.approx(value[metric], rel=TOL, abs=ABS_TOL)
def _t_fn(info, inputs):
    """Execute the notebook with papermill and yield its declared outputs.

    The output notebook is written under
    ``/tmp/dagstermill/<run_id>/output_notebooks/``. For every output
    definition of the solid whose name appears in the executed notebook's
    data, a ``Result`` is yielded. The output notebook is deleted afterwards
    when ``do_cleanup`` is truthy.

    ``notebook_path``, ``name`` and ``do_cleanup`` are not defined in this
    function; they are read from the surrounding scope.
    """
    run_dir = '/tmp/dagstermill/{run_id}/'.format(run_id=info.context.run_id)
    output_notebook_dir = os.path.join(run_dir, 'output_notebooks/')
    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)

    # A random uuid keeps output file names distinct.
    out_file_name = '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4()))
    temp_path = os.path.join(output_notebook_dir, out_file_name)

    try:
        pm.execute_notebook(
            notebook_path,
            temp_path,
            parameters=dict(dm_context=serialize_dm_context(info, inputs)),
        )
        output_nb = pm.read_notebook(temp_path)

        completion_message = 'Notebook execution complete for {name}. Data is {data}'.format(
            name=name, data=output_nb.data)
        info.context.debug(completion_message)

        for output_def in info.solid_def.output_defs:
            # Outputs the notebook did not record are skipped.
            if output_def.name not in output_nb.data:
                continue
            value = unmarshal_value(output_def.runtime_type,
                                    output_nb.data[output_def.name])
            yield Result(value, output_def.name)
    finally:
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def _t_fn(transform_context, inputs):
    """Run the notebook through the ``papermill`` CLI and yield its outputs.

    The source notebook is loaded, its parameters are replaced, and the result
    is written to an intermediate ``*-inter.ipynb`` file which papermill then
    executes into a ``*-out.ipynb`` file. Both live under
    ``/tmp/dagstermill/<run_id>/output_notebooks/``.

    Yields a ``Materialization`` for the output notebook, then a ``Result``
    for every output definition whose name appears in the executed notebook's
    data.

    Raises:
        DagstermillError: if the papermill process exits with a non-zero code.

    ``notebook_path``, ``name`` and ``do_cleanup`` are not defined in this
    function; they are read from the surrounding scope.
    """
    check.inst_param(transform_context, 'transform_context', TransformExecutionContext)
    check.param_invariant(
        isinstance(transform_context.environment_dict, dict),
        'context',
        'SystemTransformExecutionContext must have valid environment_dict',
    )
    system_transform_context = transform_context.get_system_context()
    base_dir = '/tmp/dagstermill/{run_id}/'.format(
        run_id=transform_context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)
    # Random uuid prefixes keep the output / intermediate file names distinct.
    temp_path = os.path.join(
        output_notebook_dir,
        '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))
    output_log_path = os.path.join(base_dir, 'run.log')
    try:
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            system_transform_context,
            nb,
            get_papermill_parameters(system_transform_context, inputs, output_log_path),
        )
        intermediate_path = os.path.join(
            output_notebook_dir,
            '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)
        # Opening in append mode creates the log file if it is missing without
        # truncating existing content; the explicit close() is redundant
        # inside the ``with`` block.
        with open(output_log_path, 'a') as f:
            f.close()
        process = subprocess.Popen(
            [
                'papermill', '--log-output', '--log-level', 'ERROR',
                intermediate_path, temp_path
            ],
            stderr=subprocess.PIPE,
        )
        # NOTE(review): communicate() waits for the process to terminate, so
        # poll() should already be non-None when the loop below is reached and
        # the log-forwarding body is not expected to run. Confirm whether
        # events were meant to be forwarded while the notebook executes.
        _stdout, stderr = process.communicate()
        while process.poll() is None:  # while subprocess alive
            if system_transform_context.event_callback:
                with open(output_log_path, 'r') as ff:
                    current_time = os.path.getmtime(output_log_path)
                    while process.poll() is None:
                        # A changed mtime is used as the signal that a new
                        # line may be available in the log file.
                        new_time = os.path.getmtime(output_log_path)
                        if new_time != current_time:
                            line = ff.readline()
                            if not line:
                                # NOTE(review): leaving the inner loop reopens
                                # the file from the start on the next outer
                                # iteration - verify this is intended.
                                break
                            # Each log line is parsed as a JSON object holding
                            # the EventRecord constructor arguments.
                            event_record_dict = json.loads(line)
                            system_transform_context.event_callback(
                                EventRecord(**event_record_dict))
                            current_time = new_time
        if process.returncode != 0:
            raise DagstermillError(
                'There was an error when Papermill tried to execute the notebook. '
                'The process stderr is \'{stderr}\''.format(stderr=stderr))
        output_nb = pm.read_notebook(temp_path)
        system_transform_context.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.
            format(name=name, data=output_nb.data))
        yield Materialization(
            '{name} output notebook'.format(
                name=transform_context.solid.name), temp_path)
        # Only outputs that the notebook actually recorded are yielded.
        for output_def in system_transform_context.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = read_value(output_def.runtime_type,
                                   output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        # Only the output notebook is removed here; the intermediate notebook
        # and run.log are left in place.
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
# Cast the grouping columns to str so they can be used as group keys.
# The builtin ``str`` is used instead of ``np.str``: the latter was only an
# alias for it, deprecated in NumPy 1.20 and removed in NumPy 1.24.
for cname in GB_ATTRS:
    sum_df[cname] = sum_df[cname].astype(str)

# Number of experiments sharing the same combination of grouping attributes.
sum_df['exp_n'] = sum_df.groupby(GB_ATTRS)['RES_PATH'].transform('count')

# XXX read the result from res_path
# Show each experiment row together with the output recorded in its notebook.
for i, row in sum_df.iterrows():
    display(sum_df[i:i + 1])
    res_nb_path = os.path.join(row['RES_PATH'], 'script.ipynb')
    # Read the executed notebook and display its 'XXXX' output.
    res_nb = pm.read_notebook(res_nb_path)
    res_nb.display_output('XXXX')

g_sum_df = sum_df.groupby(GB_ATTRS).mean()

# Check the effect of every parameter: mean and count of the result columns
# grouped by each attribute on its own.
for attr in GB_ATTRS:
    print(attr, '=' * 20)
    display(sum_df.groupby(attr).mean().loc[:, 'pretrain_group5_model_ar':'ft_group5_wr'])
    display(sum_df.groupby(attr).count().loc[:, 'pretrain_group5_model_ar':'ft_group5_wr'])

# END exp_summary.py --------------------------------------------------
# BEGIN jupyter.ipynb -----------------------------------------------
# https://github.com/nteract/papermill
def test_bad_file_ext(self):
    """read_notebook raises PapermillException for a '.py' path."""
    self.assertRaises(PapermillException, read_notebook, 'result_notebook.py')
# Command-line driver: runs SAT_book.ipynb through papermill with the chosen
# split heuristic and input file, then writes the recorded solution to
# solution.txt, one value per line followed by " 0".

# Maps the numeric command-line choice to the heuristic name that is passed
# to the notebook.
argMapping = {1: "random", 2: "moms", 3: "jerow", 4: "logreg"}

parser = argparse.ArgumentParser(prog='SAT')
# nargs="?" makes the positionals optional so the declared defaults actually
# apply; without it argparse ignores ``default`` on a positional argument.
# ``choices`` turns an unknown heuristic number into a usage error instead of
# a KeyError further down.
parser.add_argument("heuristic", type=int, nargs="?", choices=sorted(argMapping),
                    help="[1] DP basic, [2] MOMs, [3] JeroSloWang, [4] Logistic regression",
                    default=1)
parser.add_argument("infile", type=str, nargs="?",
                    help="DIMACS file including all constraints to satisfy",
                    default="constraint_problem.txt")
args = parser.parse_args()

print("Finding satisfying interpretation of", args.infile, "using", argMapping[args.heuristic], "splits...")

pm.execute_notebook(
    'SAT_book.ipynb',
    'output.ipynb',
    parameters=dict(heuristic=argMapping[args.heuristic], infile=args.infile)
)

nb = pm.read_notebook('output.ipynb')
df = nb.dataframe
# First value recorded under the name 'solution'; None when the notebook
# recorded no such entry (previously this raised StopIteration).
solution = next(iter(df[df['name'] == 'solution']['value'].values), None)

# The file is always (re)written; it is left empty when there is no solution.
# Treating a missing/None solution as empty avoids a TypeError while iterating.
with open("solution.txt", "w") as outfile:
    for s in (solution if solution else []):
        outfile.write(str(s) + ' 0\n')

# Report only after the file has been written.
if solution:
    print("Saved solution to file: solution.txt")
else:
    print("No solution found")
paramaters if os.environ["ACCEPT_PARAMETERS"] else None, progress_bar=False, log_output=False, report_mode=False) ## Get return type for notebook from 'Http_Accept' header ## if none return empty string (stdout) notebook_return_type = os.environ.get('Http_Accept', "").split(",")[0] ## Get all the cell outputs from notebook ## iterate backwards and retrieve output that ## meets 'Http_Accept' header, if none match ## return last output ## ## If 'Http_Accept' is */* or "" default to stdout output_notebook = papermill.read_notebook(os.environ["OUTPUT_NOTEBOOK"]) outputs=sum([list(c["outputs"]) \ if "outputs" in c.keys() else [] \ for c in output_notebook.node.cells],[]) for cell in outputs[::-1]: if (cell["output_type"] == "error"): print(cell["traceback"]) break elif (cell["output_type"] == "stream") and \ (notebook_return_type in ["*/*", ""]): print(cell["text"]) break elif (cell["output_type"] in ["display_data", "execute_result"]):
def _t_fn(info, inputs):
    """Run the notebook through the ``papermill`` CLI and yield its outputs.

    The source notebook is loaded, its parameters are replaced, and the result
    is written to an intermediate ``*-inter.ipynb`` file which papermill then
    executes into a ``*-out.ipynb`` file. Both live under
    ``/tmp/dagstermill/<run_id>/output_notebooks/``. While the subprocess is
    alive, lines appended to ``run.log`` are parsed as JSON and passed to
    ``info.context.event_callback`` as ``EventRecord`` objects.

    Yields a ``Result`` for every output definition whose name appears in the
    executed notebook's data.

    ``notebook_path``, ``name`` and ``do_cleanup`` are not defined in this
    function; they are read from the surrounding scope.
    """
    check.param_invariant(
        isinstance(info.context.environment_config, dict),
        'info',
        'TransformExecutionInfo must have valid environment_config',
    )
    base_dir = '/tmp/dagstermill/{run_id}/'.format(
        run_id=info.context.run_id)
    output_notebook_dir = os.path.join(base_dir, 'output_notebooks/')
    if not os.path.exists(output_notebook_dir):
        os.makedirs(output_notebook_dir)
    # Random uuid prefixes keep the output / intermediate file names distinct.
    temp_path = os.path.join(
        output_notebook_dir,
        '{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4())))
    output_log_path = os.path.join(base_dir, 'run.log')
    try:
        nb = load_notebook_node(notebook_path)
        nb_no_parameters = replace_parameters(
            info, nb,
            get_papermill_parameters(info, inputs, output_log_path))
        intermediate_path = os.path.join(
            output_notebook_dir,
            '{prefix}-inter.ipynb'.format(prefix=str(uuid.uuid4())))
        write_ipynb(nb_no_parameters, intermediate_path)
        # Opening in append mode creates the log file if it is missing without
        # truncating existing content; the explicit close() is redundant
        # inside the ``with`` block.
        with open(output_log_path, 'a') as f:
            f.close()
        # info.log.info("Output log path is {}".format(output_log_path))
        # info.log.info("info.context.event_callback {}".format(info.context.event_callback))
        process = subprocess.Popen(
            ["papermill", intermediate_path, temp_path])
        # _source_nb = pm.execute_notebook(intermediate_path, temp_path)
        # NOTE(review): when event_callback is falsy this outer loop spins
        # without sleeping until the subprocess exits.
        while process.poll() is None:  # while subprocess alive
            if info.context.event_callback:
                with open(output_log_path, 'r') as ff:
                    current_time = os.path.getmtime(output_log_path)
                    while process.poll() is None:
                        # A changed mtime is used as the signal that a new
                        # line may be available in the log file.
                        new_time = os.path.getmtime(output_log_path)
                        if new_time != current_time:
                            line = ff.readline()
                            if not line:
                                # NOTE(review): leaving the inner loop reopens
                                # the file from the start on the next outer
                                # iteration - verify this is intended.
                                break
                            event_record_dict = json.loads(line)
                            # The serialized event type is converted back to
                            # an EventType before building the record.
                            event_record_dict['event_type'] = EventType(
                                event_record_dict['event_type'])
                            info.context.event_callback(
                                EventRecord(**event_record_dict))
                            current_time = new_time
        if process.returncode != 0:
            # Throw event that is an execution error!
            # NOTE(review): Popen was created without stderr=PIPE, so
            # process.stderr is None here and only 'stderr was None' can be
            # logged. exit() raises SystemExit from inside this generator
            # rather than reporting a notebook-specific error - confirm.
            info.log.debug("There was an error in Papermill!")
            info.log.debug('stderr was None' if process.stderr is None else process.stderr)
            exit()
        output_nb = pm.read_notebook(temp_path)
        info.log.debug(
            'Notebook execution complete for {name}. Data is {data}'.
            format(name=name, data=output_nb.data))
        info.log.info(
            "Output notebook path is {}".format(output_notebook_dir))
        # Only outputs that the notebook actually recorded are yielded.
        for output_def in info.solid_def.output_defs:
            if output_def.name in output_nb.data:
                value = read_value(output_def.runtime_type,
                                   output_nb.data[output_def.name])
                yield Result(value, output_def.name)
    finally:
        # Only the output notebook is removed here; the intermediate notebook
        # and run.log are left in place.
        if do_cleanup and os.path.exists(temp_path):
            os.remove(temp_path)
def get_outputs_df(nbname):
    """Return the 'record' rows of a notebook's papermill dataframe.

    The rows are indexed by their 'name' column, and the 'type' and
    'filename' columns are dropped.
    """
    frame = pm.read_notebook(nbname).dataframe
    is_record = frame['type'] == 'record'
    records = frame.loc[is_record].set_index('name')
    return records.drop(['type', 'filename'], axis=1)
def get_output_files(dirname):
    """Read every ``*.ipynb`` file directly inside ``dirname`` with papermill.

    Returns the list of objects returned by ``pm.read_notebook``, in the
    order produced by ``glob``.
    """
    pattern = '%s/*.ipynb' % dirname
    loaded = []
    for nb_path in glob(pattern):
        loaded.append(pm.read_notebook(nb_path))
    return loaded
if outdir is "": outdir = os.getcwd() if not os.path.isdir(outdir): print("output directory", outdir, "does not exist. Creating.") os.makedirs(outdir) notebooks = {"hello_world": args.input} notebook_path = notebooks["hello_world"] pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=dict(x=args.x, y=args.y)) nb = pm.read_notebook(OUTPUT_NOTEBOOK) ## Now log thing via AML try: from azureml.core import Run run = Run.get_context() except ImportError: run = None print('*** run value is:', run) def _log(metric, value): if run is not None: print('logging variables with AML logging functions.') if type(value) == list and len(value) > 0 and (type(
# Validate the run parameters before launching the notebook; the checks run
# in this order and the first failing one raises ValueError.
if params['TOP_K'] <= 0:
    raise ValueError("Top K should be larger than 0")
if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}:
    raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
if params['DATA_DIR'] is None:
    raise ValueError("Datastore path should be given")

# Log every input parameter through _log.
print("Args:")
for k, v in params.items():
    _log(k, v)

# Execute the notebook with the validated parameters.
print("Run", NOTEBOOK_NAME)
pm.execute_notebook(
    NOTEBOOK_NAME,
    OUTPUT_NOTEBOOK,
    parameters=params,
    kernel_name='python3'
)

# Log every value the notebook recorded.
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
for m, v in nb.data.items():
    _log(m, v)

# clean-up
# NOTE(review): this is reached only when execution and logging succeed; on
# failure the output notebook and MODEL_DIR are left behind - confirm whether
# that is intended (e.g. for debugging).
os.remove(OUTPUT_NOTEBOOK)
shutil.rmtree(params['MODEL_DIR'], ignore_errors=True)