def test_optimize_criteo(tmpdir):
    """Exercise the scaling-criteo 01-Download-Convert notebook on synthetic data.

    A small random criteo-format file stands in for the real download, and the
    notebook is rewired (via line substitution) to use the shared test CUDA
    cluster instead of spawning its own.
    """
    data_dir = str(tmpdir.mkdir("input"))
    _get_random_criteo_data(1000).to_csv(
        os.path.join(data_dir, "day_0"), sep="\t", header=False
    )
    os.environ["INPUT_DATA_DIR"] = data_dir
    os.environ["OUTPUT_DATA_DIR"] = str(tmpdir.mkdir("output"))

    with get_cuda_cluster() as cuda_cluster:
        # NOTE: despite the notebook variable name, this is a full scheduler
        # address ("tcp://host:port"), not just a port number.
        scheduler_addr = cuda_cluster.scheduler_address

        # Skip the real criteo download and point the notebook at the
        # cuda_cluster "fixture" rather than letting it deploy a
        # LocalCUDACluster inside the subprocess.
        substitutions = (
            ("download_criteo = True", "download_criteo = False"),
            ("cluster = None", f"cluster = '{scheduler_addr}'"),
        )

        def _nb_modify(line):
            for old, new in substitutions:
                line = line.replace(old, new)
            return line

        notebook_path = os.path.join(
            dirname(TEST_PATH),
            "examples/scaling-criteo/",
            "01-Download-Convert.ipynb",
        )
        _run_notebook(tmpdir, notebook_path, _nb_modify)
def test_multigpu_dask_example(tmpdir):
    """Exercise the multi-gpu_dask example notebook against a tiny dataset.

    The notebook source is rewritten line-by-line so it targets the shared
    test CUDA cluster and generates a much smaller "toy" dataset, keeping
    the run fast.
    """
    with get_cuda_cluster() as cuda_cluster:
        os.environ["BASE_DIR"] = str(tmpdir)
        # Full scheduler address of the cuda_cluster "fixture" — substituted
        # in so the notebook does not deploy its own LocalCUDACluster.
        scheduler_addr = cuda_cluster.scheduler_address

        substitutions = (
            ("cluster = None", f"cluster = '{scheduler_addr}'"),
            # Shrink the generated dataset...
            ("write_count = 25", "write_count = 4"),
            ('freq = "1s"', 'freq = "1h"'),
            # ...and size the partitions/output to match the smaller data.
            ("part_mem_fraction=0.1", "part_size=1_000_000"),
            ("out_files_per_proc=8", "out_files_per_proc=1"),
        )

        def _nb_modify(line):
            for old, new in substitutions:
                line = line.replace(old, new)
            return line

        notebook_path = os.path.join(
            dirname(TEST_PATH), "examples", "multi-gpu_dask.ipynb"
        )
        _run_notebook(tmpdir, notebook_path, _nb_modify)