def run_scene_optimizer() -> None:
    """Run the SceneOptimizer on the "set1_lund_door" image set.

    The optimizer is instantiated from ``default_lund_door_set1_config.yaml``,
    its computation graph is built from an ``OlssonLoader`` and evaluated on a
    local Dask cluster (2 workers, 4 threads each). A Dask performance report
    is written to ``dask-report.html``.

    Raises:
        AssertionError: If the computed result is not a ``GtsfmData``.
    """
    with initialize_config_module(config_module="gtsfm.configs"):
        # The config module is resolved relative to the gtsfm package.
        config = compose(config_name="default_lund_door_set1_config.yaml")
        optimizer: SceneOptimizer = instantiate(config.SceneOptimizer)

        dataset_dir = os.path.join(DATA_ROOT, "set1_lund_door")
        loader = OlssonLoader(dataset_dir, image_extension="JPG")

        # Gather the graph inputs in the same order the optimizer receives them.
        num_images = len(loader)
        pair_indices = loader.get_valid_pairs()
        image_graph = loader.create_computation_graph_for_images()
        intrinsics_graph = loader.create_computation_graph_for_intrinsics()
        pose_graph = loader.create_computation_graph_for_poses()

        result_graph = optimizer.create_computation_graph(
            num_images=num_images,
            image_pair_indices=pair_indices,
            image_graph=image_graph,
            camera_intrinsics_graph=intrinsics_graph,
            gt_pose_graph=pose_graph,
        )

        # Local Dask cluster on which the graph is evaluated.
        local_cluster = LocalCluster(n_workers=2, threads_per_worker=4)

        with Client(local_cluster), performance_report(filename="dask-report.html"):
            sfm_result = result_graph.compute()

        assert isinstance(sfm_result, GtsfmData)
def run(client, args, n_workers, write_profile=None):
    """Time a merge of two random Dask dataframes.

    Args:
        client: Unused by this function.
        args: Namespace providing ``chunk_size``, ``base_chunks``,
            ``other_chunks``, ``frac_match``, ``backend`` and ``profile``.
        n_workers: Unused by this function.
        write_profile: When not ``None``, a performance report is written to
            ``args.profile`` while the merge runs.

    Returns:
        Tuple ``(data_processed, elapsed)``: rows times the summed dtype item
        sizes over both inputs, and the ``perf_counter`` time of the merge.
    """
    # Build and materialise both inputs before timing starts.
    base = get_random_ddf(
        args.chunk_size, args.base_chunks, args.frac_match, "build", args
    ).persist()
    other = get_random_ddf(
        args.chunk_size, args.other_chunks, args.frac_match, "other", args
    ).persist()
    wait(base)
    wait(other)

    assert len(base.dtypes) == 2
    assert len(other.dtypes) == 2

    data_processed = 0
    for frame in (base, other):
        data_processed += len(frame) * sum(t.itemsize for t in frame.dtypes)

    # Each context defaults to a no-op and is replaced only when requested.
    if args.backend == "explicit-comms":
        comms_ctx = dask.config.set(explicit_comms=True)
    else:
        comms_ctx = contextlib.nullcontext()
    if write_profile is not None:
        profile_ctx = performance_report(filename=args.profile)
    else:
        profile_ctx = contextlib.nullcontext()

    with comms_ctx, profile_ctx:
        started = perf_counter()
        merge(args, base, other)
        finished = perf_counter()

    return (data_processed, finished - started)
def run(client, args, n_workers, write_profile=None):
    """Time a shuffle of a single-column random Dask dataframe.

    Args:
        client: Unused by this function.
        args: Namespace providing ``partition_size``, ``in_parts``, ``type``,
            ``backend`` and ``profile``.
        n_workers: Unused by this function.
        write_profile: When not ``None``, a performance report is written to
            ``args.profile`` while the shuffle runs.

    Returns:
        Tuple ``(data_processed, elapsed)``: rows times the summed dtype item
        sizes, and the ``clock`` time of the shuffle.
    """
    # partition_size is in bytes; each float64 element takes 8 of them.
    rows_per_chunk = args.partition_size // 8
    total_rows = rows_per_chunk * args.in_parts

    values = da.random.random((total_rows,), chunks=(rows_per_chunk,))
    df = dask.dataframe.from_dask_array(values, columns="data").to_frame()

    if args.type == "gpu":
        import cudf

        df = df.map_partitions(cudf.from_pandas)

    df = df.persist()
    wait(df)
    data_processed = len(df) * sum(t.itemsize for t in df.dtypes)

    if write_profile is not None:
        ctx = performance_report(filename=args.profile)
    else:
        ctx = contextlib.nullcontext()

    with ctx:
        started = clock()
        do_shuffle = shuffle_dask if args.backend == "dask" else shuffle_explicit_comms
        do_shuffle(df)
        finished = clock()

    return (data_processed, finished - started)
def run(args, write_profile=None):
    """Time an inner merge on ``"key"`` of two random Dask dataframes.

    Args:
        args: Namespace providing ``chunk_size``, ``n_workers``,
            ``frac_match``, ``set_index`` and ``profile``.
        write_profile: When not ``None``, a performance report is written to
            ``args.profile`` while the merge runs.

    Returns:
        Tuple ``(data_processed, took)``: rows times the summed dtype item
        sizes over both inputs, and the ``clock`` time to persist the join.
    """
    # Build and materialise both inputs before timing starts.
    base = get_random_ddf(
        args.chunk_size, args.n_workers, args.frac_match, "build", args
    ).persist()
    other = get_random_ddf(
        args.chunk_size, args.n_workers, args.frac_match, "other", args
    ).persist()
    wait(base)
    wait(other)

    assert len(base.dtypes) == 2
    assert len(other.dtypes) == 2

    data_processed = 0
    for frame in (base, other):
        data_processed += len(frame) * sum(t.itemsize for t in frame.dtypes)

    # The join is lazy; nothing runs until it is persisted below.
    joined = base.merge(other, on=["key"], how="inner")
    if args.set_index:
        joined = joined.set_index("key")

    def _persist_and_time():
        started = clock()
        wait(joined.persist())
        return clock() - started

    if write_profile is None:
        took = _persist_and_time()
    else:
        with performance_report(filename=args.profile):
            took = _persist_and_time()

    return (data_processed, took)
def example_function():
    """Compute a reduction over two random Dask arrays under a performance report.

    Two random arrays of shape (100_000, 100_000, 10) are combined as
    ``arcsin(x) + arccos(y)`` and summed over axes 1 and 2. The report is
    written to ``dask-report_mpi.html``. Nothing is returned.
    """
    shape = (100_000, 100_000, 10)
    chunking = (10_000, 10_000, 5)

    x = da.random.random(shape, chunks=chunking)
    y = da.random.random(shape, chunks=chunking)
    reduced = (da.arcsin(x) + da.arccos(y)).sum(axis=(1, 2))

    with performance_report(filename="dask-report_mpi.html"):
        # The computed value is not used; only the run itself matters here.
        reduced.compute()
def run_scene_optimizer(args) -> None:
    """Run GTSFM over images from an Argoverse vehicle log.

    Args:
        args: Namespace providing ``dataset_dir``, ``log_id``, ``stride``,
            ``max_num_imgs``, ``max_lookahead_sec`` and ``camera_name``, all
            forwarded to ``ArgoverseDatasetLoader``.

    Raises:
        AssertionError: If the computed result is not a ``GtsfmData``.
    """
    with initialize_config_module(config_module="gtsfm.configs"):
        # The config module is resolved relative to the gtsfm package.
        config = compose(config_name="default_lund_door_set1_config.yaml")
        optimizer: SceneOptimizer = instantiate(config.SceneOptimizer)

        loader = ArgoverseDatasetLoader(
            dataset_dir=args.dataset_dir,
            log_id=args.log_id,
            stride=args.stride,
            max_num_imgs=args.max_num_imgs,
            max_lookahead_sec=args.max_lookahead_sec,
            camera_name=args.camera_name,
        )

        # Gather the graph inputs in the same order the optimizer receives them.
        num_images = len(loader)
        pair_indices = loader.get_valid_pairs()
        image_graph = loader.create_computation_graph_for_images()
        intrinsics_graph = loader.create_computation_graph_for_intrinsics()
        pose_graph = loader.create_computation_graph_for_poses()

        result_graph = optimizer.create_computation_graph(
            num_images,
            pair_indices,
            image_graph,
            intrinsics_graph,
            use_intrinsics_in_verification=True,
            gt_pose_graph=pose_graph,
        )

        # Local Dask cluster on which the graph is evaluated.
        local_cluster = LocalCluster(n_workers=2, threads_per_worker=4)

        with Client(local_cluster), performance_report(filename="dask-report.html"):
            sfm_result = result_graph.compute()

        assert isinstance(sfm_result, GtsfmData)

        avg_error = sfm_result.get_scene_avg_reprojection_error()
        rounded_error = str(np.round(avg_error, 3))
        logger.info('Scene avg reproj error: {}'.format(rounded_error))
async def _run(client, args):
    """Time ``map_overlap(mean_filter, ...)`` over a random square Dask array.

    Args:
        client: Client used to persist the filtered array.
        args: Namespace providing ``type``, ``size``, ``chunk_size``,
            ``kernel_size`` and ``profile``.

    Returns:
        Tuple ``(took, npartitions)``: the ``clock`` time of the filter and
        the number of partitions of the input array.
    """
    # "gpu" selects the CuPy random state; anything else selects NumPy's.
    array_module = cp if args.type == "gpu" else np
    rs = da.random.RandomState(RandomState=array_module.random.RandomState)

    x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
    kernel_shape = 2 * (2 * args.kernel_size + 1,)
    await wait(x)

    async def _filter_and_time():
        started = clock()
        filtered = x.map_overlap(mean_filter, args.kernel_size, shape=kernel_shape)
        await wait(client.persist(filtered))
        return clock() - started

    if args.profile is None:
        took = await _filter_and_time()
    else:
        async with performance_report(filename=args.profile):
            took = await _filter_and_time()

    return (took, x.npartitions)
def shuffle_dask(args, df, write_profile):
    """Time a task-based shuffle of ``df`` on its ``"data"`` column.

    Args:
        args: Namespace providing ``profile``, the report filename.
        df: Dataframe to shuffle.
        write_profile: When not ``None``, a performance report is written to
            ``args.profile`` while the shuffle runs.

    Returns:
        The ``clock`` time taken to persist the shuffled dataframe.
    """

    def _shuffle_and_time():
        started = clock()
        shuffled = shuffle(df, index="data", shuffle="tasks")
        wait(shuffled.persist())
        return clock() - started

    if write_profile is None:
        return _shuffle_and_time()

    with performance_report(filename=args.profile):
        took = _shuffle_and_time()
    return took
def run(self) -> None:
    """Run the SceneOptimizer.

    Computes the image pair indices with the retriever, builds the scene
    optimizer's computation graph from the loader, and evaluates it on a local
    Dask cluster sized by ``parsed_args.num_workers`` and
    ``parsed_args.threads_per_worker``. A single performance report covering
    both phases is written to ``dask-report.html``, and the total wall time is
    logged in minutes.

    Raises:
        AssertionError: If the computed result is not a ``GtsfmData``.
    """
    start_time = time.time()

    # Create the local dask cluster.
    cluster = LocalCluster(
        n_workers=self.parsed_args.num_workers,
        threads_per_worker=self.parsed_args.threads_per_worker,
    )

    pairs_graph = self.retriever.create_computation_graph(self.loader)

    # One client and one report span both the retrieval and the SfM phases.
    # Opening a second report with the same filename would overwrite the
    # profile of the retrieval phase.
    with Client(cluster), performance_report(filename="dask-report.html"):
        image_pair_indices = pairs_graph.compute()

        delayed_sfm_result, delayed_io = self.scene_optimizer.create_computation_graph(
            num_images=len(self.loader),
            image_pair_indices=image_pair_indices,
            image_graph=self.loader.create_computation_graph_for_images(),
            all_intrinsics=self.loader.get_all_intrinsics(),
            image_shapes=self.loader.get_image_shapes(),
            relative_pose_priors=self.loader.get_relative_pose_priors(image_pair_indices),
            absolute_pose_priors=self.loader.get_absolute_pose_priors(),
            cameras_gt=self.loader.get_gt_cameras(),
            gt_wTi_list=self.loader.get_gt_poses(),
            matching_regime=ImageMatchingRegime(self.parsed_args.matching_regime),
        )

        # The I/O tasks are computed together with the result; their outputs
        # are not used afterwards.
        sfm_result, *io = dask.compute(delayed_sfm_result, *delayed_io)

    assert isinstance(sfm_result, GtsfmData)

    end_time = time.time()
    duration_sec = end_time - start_time
    logger.info(
        "GTSFM took %.2f minutes to compute sparse multi-view result.",
        duration_sec / 60,
    )
def merge(args, ddf1, ddf2, write_profile):
    """Time an inner merge of ``ddf1`` and ``ddf2`` on the ``"key"`` column.

    Args:
        args: Namespace providing ``set_index`` and ``profile``.
        ddf1: Left dataframe.
        ddf2: Right dataframe.
        write_profile: When not ``None``, a performance report is written to
            ``args.profile`` while the merge runs.

    Returns:
        The ``clock`` time taken to persist the joined dataframe.
    """
    # The join is lazy; nothing runs until it is persisted below.
    joined = ddf1.merge(ddf2, on=["key"], how="inner")
    if args.set_index:
        joined = joined.set_index("key")

    def _persist_and_time():
        started = clock()
        wait(joined.persist())
        return clock() - started

    if write_profile is None:
        return _persist_and_time()

    with performance_report(filename=args.profile):
        took = _persist_and_time()
    return took
async def _run(client, args):
    """Time ``(x + x.T).sum()`` on a random square array built with CuPy.

    Args:
        client: Client used to compute the reduction.
        args: Namespace providing ``size``, ``chunk_size`` and ``profile``.

    Returns:
        Tuple ``(took, npartitions)``: the ``clock`` time of the computation
        and the number of partitions of the input array.
    """
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
    await wait(x)

    async def _compute_and_time():
        started = clock()
        await client.compute((x + x.T).sum())
        return clock() - started

    if args.profile is None:
        took = await _compute_and_time()
    else:
        async with performance_report(filename=args.profile):
            took = await _compute_and_time()

    return (took, x.npartitions)
def run(self) -> None:
    """Run the Structure-from-Motion (SfM) pipeline.

    Image pairs are retrieved, the ground-truth scene mesh is broadcast to the
    workers, and the scene optimizer's graph is computed on a local Dask
    cluster. A performance report is written to ``dask-report.html`` and the
    total wall time is logged in minutes.

    Raises:
        AssertionError: If the computed result is not a ``GtsfmData``.
    """
    started_at = time.time()

    # Local Dask cluster sized from the parsed command-line arguments.
    options = self.parsed_args
    local_cluster = LocalCluster(
        n_workers=options.num_workers,
        threads_per_worker=options.threads_per_worker,
    )

    with Client(local_cluster) as client, performance_report(filename="dask-report.html"):
        retrieval_graph = self.retriever.create_computation_graph(self.loader)
        image_pair_indices = retrieval_graph.compute()

        # Scatter surface mesh across all nodes to preserve computation time and memory.
        mesh_future = client.scatter(self.loader.gt_scene_trimesh, broadcast=True)

        # Prepare computation graph.
        delayed_result, delayed_io = self.scene_optimizer.create_computation_graph(
            num_images=len(self.loader),
            image_pair_indices=image_pair_indices,
            image_graph=self.loader.create_computation_graph_for_images(),
            all_intrinsics=self.loader.get_all_intrinsics(),
            image_shapes=self.loader.get_image_shapes(),
            gt_scene_mesh=mesh_future,
            cameras_gt=self.loader.get_gt_cameras(),
            gt_wTi_list=self.loader.get_gt_poses(),
            matching_regime=ImageMatchingRegime(options.matching_regime),
            absolute_pose_priors=self.loader.get_absolute_pose_priors(),
            relative_pose_priors=self.loader.get_relative_pose_priors(image_pair_indices),
        )

        # Run SfM pipeline.
        sfm_result, *io = dask.compute(delayed_result, *delayed_io)

    assert isinstance(sfm_result, GtsfmData)

    elapsed_minutes = (time.time() - started_at) / 60
    logger.info(
        "GTSFM took %.2f minutes to compute sparse multi-view result.",
        elapsed_minutes,
    )
def profiled(*args, **kwargs):
    """Call the wrapped ``func`` and record how long it took.

    ``func``, ``dask_profile`` and ``csv`` come from the enclosing scope. When
    ``dask_profile`` is truthy the call runs under a performance report written
    to ``profiled-<name>.html``. The timing is written to
    ``benchmarked_<name>.csv`` when ``csv`` is truthy, otherwise printed.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.
    """
    func_name = func.__name__
    started = time.time()

    if not dask_profile:
        result = func(*args, **kwargs)
    else:
        with performance_report(filename=f"profiled-{func_name}.html"):
            result = func(*args, **kwargs)

    timing = {
        "elapsed_time_seconds": time.time() - started,
        "function_name": func_name,
    }

    # A single-row frame whose columns are the timing keys.
    report = pd.DataFrame.from_dict(timing, orient="index").T
    if csv:
        report.to_csv(f"benchmarked_{func_name}.csv", index=False)
    else:
        print(report)

    return result
def benchmark(func, *args, **kwargs):
    """Call ``func`` and record how long it took.

    Three control options are popped from ``kwargs`` before the call; all
    remaining arguments are forwarded to ``func``.

    Args:
        func: Callable to benchmark.
        *args: Positional arguments for ``func``.
        **kwargs: Keyword arguments for ``func``, plus:
            csv (default True): write the timing to ``benchmarked_<name>.csv``
                instead of printing it.
            dask_profile (default False): run the call under a performance
                report written to ``profiled-<name>.html``.
            compute_result (default False): also time computing the length of
                every partition of the result.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.
    """
    write_csv = kwargs.pop("csv", True)
    use_dask_profile = kwargs.pop("dask_profile", False)
    time_compute = kwargs.pop("compute_result", False)

    func_name = func.__name__
    started = time.time()
    if not use_dask_profile:
        result = func(*args, **kwargs)
    else:
        with performance_report(filename=f"profiled-{func_name}.html"):
            result = func(*args, **kwargs)

    timing = {
        "elapsed_time_seconds": time.time() - started,
        "function_name": func_name,
    }

    if time_compute:
        import dask_cudf

        # A single dataframe is treated as a one-element collection of frames.
        frames = [result] if isinstance(result, dask_cudf.DataFrame) else result
        len_tasks = [
            dask.delayed(len)(part)
            for frame in frames
            for part in frame.to_delayed()
        ]

        compute_started = time.time()
        dask.compute(*len_tasks)
        timing["compute_time_seconds"] = time.time() - compute_started

    # A single-row frame whose columns are the timing keys.
    report = pd.DataFrame.from_dict(timing, orient="index").T
    if write_csv:
        report.to_csv(f"benchmarked_{func_name}.csv", index=False)
    else:
        print(report)

    return result
def main():
    """Prepare the run directories and tiling, then run the simulation in rounds.

    Each round ("reboot") spins up a cluster, calls ``run`` under a performance
    report and a memory sampler, writes the memory samples to CSV and closes
    the cluster. Rounds repeat for as long as ``run`` returns a truthy value.
    """
    # Run directory tree.
    logging.info("1 - create_dir_tree")
    dirs = pa.create_dir_tree(root_dir, run_name, overwrite=overwrite)

    # Tiling.
    logging.info("2 - generate_tiling")
    tiling = generate_tiling(dirs, overwrite)

    # Per-tile run directories; existing ones are handled via `overwrite`.
    logging.info("3 - create_tile_run_tree")
    tiling.create_tile_run_tree(dirs["run"], overwrite=overwrite)

    logging.info("4 - start main loop")
    reboot = 0
    keep_running = True
    while keep_running:
        logging.info("--- reboot %d", reboot)

        # Spin up the cluster for this round.
        cluster, client = spin_up_cluster(dask_jobs)
        print(cluster)

        # Run the simulation while recording performance and memory usage.
        sampler = MemorySampler()
        with performance_report(filename=f"dask-report-{reboot}.html"), sampler.sample(f"reboot{reboot}"):
            keep_running = run(dirs, tiling, cluster, client)

        # https://distributed.dask.org/en/latest/diagnosing-performance.html#analysing-memory-usage-over-time
        sampler.to_pandas().to_csv(f"dask-memory-report-{reboot}.csv")

        # Close dask before the next round.
        close_dask(cluster, client)
        reboot += 1
def run_scene_optimizer(args: argparse.Namespace) -> None:
    """Run GTSFM over images from an Argoverse vehicle log.

    Args:
        args: Parsed arguments providing ``config_name``, the loader settings
            (``dataset_dir``, ``log_id``, ``stride``, ``max_num_imgs``,
            ``max_lookahead_sec``, ``camera_name``, ``max_resolution``) and
            the cluster size (``num_workers``, ``threads_per_worker``).

    Raises:
        AssertionError: If the computed result is not a ``GtsfmData``.
    """
    with hydra.initialize_config_module(config_module="gtsfm.configs"):
        # The config module is resolved relative to the gtsfm package.
        config = hydra.compose(config_name=args.config_name)
        optimizer: SceneOptimizer = instantiate(config.SceneOptimizer)

        loader = ArgoverseDatasetLoader(
            dataset_dir=args.dataset_dir,
            log_id=args.log_id,
            stride=args.stride,
            max_num_imgs=args.max_num_imgs,
            max_lookahead_sec=args.max_lookahead_sec,
            camera_name=args.camera_name,
            max_resolution=args.max_resolution,
        )

        delayed_result, delayed_io = optimizer.create_computation_graph(
            num_images=len(loader),
            image_pair_indices=loader.get_valid_pairs(),
            image_graph=loader.create_computation_graph_for_images(),
            all_intrinsics=loader.get_all_intrinsics(),
            image_shapes=loader.get_image_shapes(),
            cameras_gt=loader.get_gt_cameras(),
        )

        # Local Dask cluster sized from the command-line arguments.
        local_cluster = LocalCluster(
            n_workers=args.num_workers,
            threads_per_worker=args.threads_per_worker,
        )

        with Client(local_cluster), performance_report(filename="dask-report.html"):
            sfm_result, *io = dask.compute(delayed_result, *delayed_io)

        assert isinstance(sfm_result, GtsfmData)

        avg_error = sfm_result.get_avg_scene_reprojection_error()
        logger.info("Scene avg reproj error: %.3f", avg_error)
def main(args):
    """Run the Dask-NVTabular DLRM/Criteo preprocessing benchmark.

    Builds per-column Categorify options from ``args``, starts a
    ``LocalCUDACluster``, applies an NVTabular ``Workflow`` to the parquet
    dataset at ``args.data_path`` and prints a summary including the measured
    runtime.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``data_path``, ``out_path``, ``freq_limit``, ``splits``,
            ``protocol``, ``cont_names``, ``cat_names``, ``cat_splits``,
            ``cat_cache``, ``n_workers``, ``device_limit_frac``,
            ``device_pool_frac``, ``part_mem_frac``, ``devs``,
            ``dask_workspace``, ``no_rmm_pool``, ``cat_on_host``, ``profile``
            and ``worker_shuffle``.
    """
    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        # Overwrites any UCX_TLS value already present in the environment.
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    # Defaults are I1..I13 (continuous) and C1..C26 (categorical).
    cont_names = (args.cont_names.split(",") if args.cont_names else
                  ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",") if args.cat_names else
                 ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # tree_width maps each categorical column to an integer passed to
    # Categorify below.
    if args.cat_splits:
        # zip() silently truncates if fewer splits than columns are given.
        tree_width = {
            name: int(s)
            for name, s in zip(cat_names, args.cat_splits.split(","))
        }
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            # A single option applies to every categorical column.
            cat_cache = cat_cache[0]
        else:
            # If user is specifying a list of options,
            # they must specify an option for every cat column
            assert len(cat_names) == len(cat_cache)
    # Normalise cat_cache into a {column: location} dict.
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = {name: c for name, c in zip(cat_names, cat_cache)}
    else:
        # Criteo/DLRM Defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use total device size to calculate args.device_limit_frac
    # The three *_frac arguments are fractions of the total device size.
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster
    # The two branches differ only in `enable_nvlink=True` for non-tcp protocols.
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    client = Client(cluster)

    # Setup RMM pool
    if not args.no_rmm_pool:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=out_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            on_host=args.cat_on_host,
        ))
    processor.finalize()
    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    # `runtime` first holds the start timestamp and is then replaced by the
    # elapsed time. Both branches make the same apply() call; only the first
    # wraps it in a performance report.
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle="full" if args.worker_shuffle else "partial",
                out_files_per_proc=out_files_per_proc,
                output_path=out_path,
            )
    else:
        processor.apply(
            dataset,
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size | {part_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.no_rmm_pool)}")
    print(f"out_files_per_proc | {args.splits}")
    print(f"worker-shuffle | {args.worker_shuffle}")
    print("======================================")
    print(f"Runtime[s] | {runtime}")
    print("======================================\n")

    # NOTE(review): only the client is closed here; the cluster is not
    # explicitly closed in this function.
    client.close()
async def run(args):
    """Benchmark ``(x + x.T).sum()`` on a local CUDA cluster and print bandwidths.

    A random square array is created with the CuPy random state, the reduction
    is timed, and the workers' incoming transfer logs are aggregated into
    per-worker-pair bandwidth quartiles and total bytes, which are printed.

    Args:
        args: Parsed arguments providing ``protocol``, ``devs``
            (comma-separated device list), ``size``, ``chunk_size``,
            ``profile`` and ``ignore_size``.
    """
    # Set up workers on the local machine
    # One worker per entry in the comma-separated device list.
    async with LocalCUDACluster(
        protocol=args.protocol,
        n_workers=len(args.devs.split(",")),
        CUDA_VISIBLE_DEVICES=args.devs,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((args.size, args.size),
                          chunks=args.chunk_size).persist()
            # Make sure the input is materialised before timing starts.
            await wait(x)

            # Execute the operations to benchmark
            # Both branches time the same computation; only the first also
            # writes a performance report.
            if args.profile is not None:
                async with performance_report(filename=args.profile):
                    t1 = clock()
                    await client.compute((x + x.T).sum())
                    took = clock() - t1
            else:
                t1 = clock()
                await client.compute((x + x.T).sum())
                took = clock() - t1

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log
            )
            # Keyed by (receiving worker, d["who"]); transfers smaller than
            # args.ignore_size are skipped.
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])
            # Re-key by the scheduler's worker names and reduce each list to
            # its 25%/50%/75% quantiles, formatted as bytes per second.
            bandwidths = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): [
                    "%s/s" % format_bytes(x)
                    for x in np.quantile(v, [0.25, 0.50, 0.75])
                ]
                for (w1, w2), v in bandwidths.items()
            }
            # Same re-keying; each list is reduced to its formatted sum.
            total_nbytes = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): format_bytes(sum(nb))
                for (w1, w2), nb in total_nbytes.items()
            }

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size | {args.size}*{args.size}")
            print(f"Chunk-size | {args.chunk_size}")
            print(f"Ignore-size | {format_bytes(args.ignore_size)}")
            print(f"Protocol | {args.protocol}")
            print(f"Device(s) | {args.devs}")
            print(f"npartitions | {x.npartitions}")
            print("==========================")
            print(f"Total time | {format_time(took)}")
            print("==========================")
            print("(w1,w2) | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            # NOTE(review): "%02d" assumes worker names are integers — confirm.
            for (d1, d2), bw in sorted(bandwidths.items()):
                print(
                    "(%02d,%02d) | %s %s %s (%s)"
                    % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)])
                )
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model. The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/`


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance. To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a UCX_TLS value already set in the environment; otherwise use
        # the default transport list.
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    # WARNING: any existing workdir/output/stats directory under the output
    # path is deleted and recreated empty.
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    # Defaults are I1..I13 (continuous) and C1..C26 (categorical).
    cont_names = (args.cont_names.split(",") if args.cont_names else
                  ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",") if args.cat_names else
                 ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    # Columns listed in --high-cards get the "high" settings; all others get
    # tree_width 1 and the "low" cache setting.
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    # The three *_frac arguments are fractions of the total device size.
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    # Any value other than "PER_WORKER" / "PER_PARTITION" leaves shuffle as None.
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        # Total size minus free size, divided by 1e9 and reported as GB.
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    # The two branches differ only in `enable_nvlink=True` for non-tcp protocols.
    # NOTE(review): `":" + dashboard_port` requires dashboard_port to be a
    # str — confirm against the argument parser.
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    # A pool fraction of 0.01 or less skips the pool setup.
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature(
            [ops.FillMissing(),
             ops.Clip(min_value=0),
             ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        ))
    processor.finalize()
    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    # `runtime` first holds the start timestamp and is then replaced by the
    # elapsed time. Both branches pass the same keyword arguments to apply();
    # only the first wraps the call in a performance report.
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size | {part_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devices}")
    print(f"rmm-pool-frac | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads | {args.num_io_threads}")
    print(f"shuffle | {args.shuffle}")
    print(f"cats-on-device | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s] | {runtime}")
    print("======================================\n")

    # NOTE(review): only the client is closed here; the cluster is not
    # explicitly closed in this function.
    client.close()
worker.random_seed = seed worker.sample_seed = seed + 1000 performance_filename = os.path.join( OUTPUT_PATH, "perf_" + str(ratio) + "_" + str(seed)) #https://stackoverflow.com/questions/4789837/how-to-terminate-a-python-subprocess-launched-with-shell-true cmd = "python -m inferelator.utils.profiler -p {pid} -o {pfn}".format( pid=os.getpid(), pfn=performance_filename + "_mem.tsv") memory_monitor = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid) start_time = time.time() with performance_report(filename=performance_filename + ".html"): result = worker.run() csv_row = [ str(ratio), str(seed), str(worker._num_obs), '%.1f' % (time.time() - start_time) ] csv_row += [result.all_scores[n] for n in result.all_names] csv_handler.writerow(csv_row) del worker del result
async def run(args):
    """Benchmark ``(x + x.T).sum()`` on a local CUDA cluster with RMM and print bandwidths.

    RMM is reinitialised on every worker and on the scheduler, a random square
    array is created with the CuPy random state, the reduction is timed, and
    the workers' incoming transfer logs are aggregated into per-worker-pair
    bandwidth quartiles and total bytes, which are printed.

    Args:
        args: Parsed arguments providing ``protocol``, ``devs``
            (comma-separated device list), ``no_rmm_pool``, ``size``,
            ``chunk_size``, ``profile`` and ``ignore_size``.
    """
    # Set up workers on the local machine
    # One worker per entry in the comma-separated device list.
    async with LocalCUDACluster(
        protocol=args.protocol,
        n_workers=len(args.devs.split(",")),
        CUDA_VISIBLE_DEVICES=args.devs,
        ucx_net_devices="auto",
        enable_infiniband=True,
        enable_nvlink=True,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:

            def _worker_setup(size=None):
                """Reinitialise RMM and make it CuPy's allocator in this process."""
                # Imported here so the import happens in the process that
                # executes this function.
                import rmm

                rmm.reinitialize(
                    pool_allocator=not args.no_rmm_pool,
                    devices=0,
                    initial_pool_size=size,
                )
                cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

            # Workers call the setup with the default size=None.
            await client.run(_worker_setup)
            # Create an RMM pool on the scheduler due to occasional deserialization
            # of CUDA objects. May cause issues with InfiniBand otherwise.
            await client.run_on_scheduler(_worker_setup, 1e9)

            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((args.size, args.size),
                          chunks=args.chunk_size).persist()
            # Make sure the input is materialised before timing starts.
            await wait(x)

            # Execute the operations to benchmark
            # Both branches time the same computation; only the first also
            # writes a performance report.
            if args.profile is not None:
                async with performance_report(filename=args.profile):
                    t1 = clock()
                    await client.compute((x + x.T).sum())
                    took = clock() - t1
            else:
                t1 = clock()
                await client.compute((x + x.T).sum())
                took = clock() - t1

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log)
            # Keyed by (receiving worker, d["who"]); transfers smaller than
            # args.ignore_size are skipped.
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])
            # Re-key by the scheduler's worker names and reduce each list to
            # its 25%/50%/75% quantiles, formatted as bytes per second.
            bandwidths = {(
                cluster.scheduler.workers[w1].name,
                cluster.scheduler.workers[w2].name,
            ): [
                "%s/s" % format_bytes(x)
                for x in np.quantile(v, [0.25, 0.50, 0.75])
            ]
                          for (w1, w2), v in bandwidths.items()}
            # Same re-keying; each list is reduced to its formatted sum.
            total_nbytes = {(
                cluster.scheduler.workers[w1].name,
                cluster.scheduler.workers[w2].name,
            ): format_bytes(sum(nb))
                            for (w1, w2), nb in total_nbytes.items()}

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size | {args.size}*{args.size}")
            print(f"Chunk-size | {args.chunk_size}")
            print(f"Ignore-size | {format_bytes(args.ignore_size)}")
            print(f"Protocol | {args.protocol}")
            print(f"Device(s) | {args.devs}")
            print(f"npartitions | {x.npartitions}")
            print("==========================")
            print(f"Total time | {format_time(took)}")
            print("==========================")
            print("(w1,w2) | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            # NOTE(review): "%02d" assumes worker names are integers — confirm.
            for (d1, d2), bw in sorted(bandwidths.items()):
                print("(%02d,%02d) | %s %s %s (%s)" %
                      (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
def _predict(ms, stack, **kw):
    """Degrid a FITS model image into model visibilities and write them out.

    The keyword arguments are wrapped in a struct-mode ``OmegaConf`` config.
    Resource options left as ``None`` (``nthreads``, ``mem_limit``,
    ``nworkers``, ``nthreads_per_worker``, ``ngridder_threads``) are filled in
    with local defaults; ``nthreads`` and ``mem_limit`` must be given
    explicitly when ``host_address`` is set.  For every dataset of every
    measurement set in ``ms`` the model is degridded with the wgridder and the
    result is assigned to ``args.model_column``.  A task-graph visualisation
    is always written; the writes are only executed when ``args.mock`` is
    falsy.

    Parameters
    ----------
    ms : iterable of str
        Measurement set paths (converted to a list).
    stack : object
        Passed through unchanged to ``pfb.set_client``.
    **kw
        Options; every key is echoed to the log.

    Raises
    ------
    ValueError
        If ``nthreads`` or ``mem_limit`` is missing while ``host_address`` is
        set, or if the dataset type of ``ms[0]`` is neither ``'casa'`` nor
        ``'zarr'``.
    """
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)
    pyscilog.log_to_file(args.output_filename + '.log')
    pyscilog.enable_memory_logging(level=3)

    # number of threads per worker
    if args.nthreads is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify nthreads when using a distributed scheduler"
            )
        import multiprocessing
        nthreads = multiprocessing.cpu_count()
        args.nthreads = nthreads
    else:
        nthreads = args.nthreads

    if args.mem_limit is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify mem-limit when using a distributed scheduler"
            )
        import psutil
        # 100% of memory by default
        mem_limit = int(psutil.virtual_memory()[0] / 1e9)
        args.mem_limit = mem_limit
    else:
        mem_limit = args.mem_limit

    nband = args.nband
    if args.nworkers is None:
        nworkers = nband
        args.nworkers = nworkers
    else:
        nworkers = args.nworkers

    if args.nthreads_per_worker is None:
        nthreads_per_worker = 1
        args.nthreads_per_worker = nthreads_per_worker
    else:
        nthreads_per_worker = args.nthreads_per_worker

    # the number of chunks being read in simultaneously is equal to
    # the number of dask threads
    nthreads_dask = nworkers * nthreads_per_worker

    if args.ngridder_threads is None:
        if args.host_address is not None:
            ngridder_threads = nthreads // nthreads_per_worker
        else:
            ngridder_threads = nthreads // nthreads_dask
        args.ngridder_threads = ngridder_threads
    else:
        ngridder_threads = args.ngridder_threads

    ms = list(ms)
    print('Input Options:', file=log)
    for key in kw.keys():
        print(' %25s = %s' % (key, args[key]), file=log)

    # numpy imports have to happen after this step
    from pfb import set_client
    set_client(nthreads, mem_limit, nworkers, nthreads_per_worker,
               args.host_address, stack, log)

    import numpy as np
    from pfb.utils.misc import chan_to_band_mapping
    import dask
    from dask.distributed import performance_report
    from dask.graph_manipulation import clone
    from daskms import xds_from_storage_ms as xds_from_ms
    from daskms import xds_from_storage_table as xds_from_table
    from daskms.utils import dataset_type
    mstype = dataset_type(ms[0])
    if mstype == 'casa':
        from daskms import xds_to_table
    elif mstype == 'zarr':
        from daskms.experimental.zarr import xds_to_zarr as xds_to_table
    else:
        # Without a writer the function would only fail much later with a
        # NameError on xds_to_table; fail early with a clear message instead.
        raise ValueError(
            "Unsupported measurement set type '%s' (expected 'casa' or 'zarr')"
            % mstype)
    import dask.array as da
    from africanus.constants import c as lightspeed
    from africanus.gridding.wgridder.dask import model as im2vis
    from pfb.utils.fits import load_fits
    from pfb.utils.misc import restore_corrs, plan_row_chunk
    from astropy.io import fits

    # always returns 4D
    # gridder expects freq axis
    model = load_fits(args.model).squeeze()
    if model.ndim == 2:
        # np.atleast_3d would append the new axis ((nx, ny) -> (nx, ny, 1));
        # the band axis has to come first for the unpacking below.
        model = model[None, :, :]
    else:
        model = np.atleast_3d(model)
    nband, nx, ny = model.shape
    hdr = fits.getheader(args.model)
    cell_d = np.abs(hdr['CDELT1'])
    cell_rad = np.deg2rad(cell_d)

    # chan <-> band mapping
    freqs, freq_bin_idx, freq_bin_counts, freq_out, band_mapping, chan_chunks = chan_to_band_mapping(
        ms, nband=nband)

    # degridder memory budget
    max_chan_chunk = 0
    for ims in ms:
        for spw in freqs[ims]:
            counts = freq_bin_counts[ims][spw].compute()
            max_chan_chunk = np.maximum(max_chan_chunk, counts.max())

    # assumes number of correlations are the same across MS/SPW
    xds = xds_from_ms(ms[0])
    ncorr = xds[0].dims['corr']
    nrow = xds[0].dims['row']
    if args.output_type is not None:
        output_type = np.dtype(args.output_type)
    else:
        output_type = np.result_type(np.dtype(args.real_type), np.complex64)
    data_bytes = output_type.itemsize
    bytes_per_row = max_chan_chunk * ncorr * data_bytes
    memory_per_row = bytes_per_row  # model
    memory_per_row += 3 * 8  # uvw

    if mstype == 'zarr':
        if args.model_column in xds[0].keys():
            model_chunks = getattr(xds[0], args.model_column).data.chunks
        else:
            model_chunks = xds[0].DATA.data.chunks
            print('Chunking model same as data')

    # get approx image size
    # this is not a conservative estimate when multiple SPW's map to a single
    # imaging band
    pixel_bytes = np.dtype(args.output_type).itemsize
    band_size = nx * ny * pixel_bytes

    if args.host_address is None:
        # full image on single node
        row_chunk = plan_row_chunk(mem_limit / nworkers, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)
    else:
        # single band per node
        row_chunk = plan_row_chunk(mem_limit, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)

    if args.row_chunks is not None:
        row_chunk = int(args.row_chunks)
        if row_chunk == -1:
            row_chunk = nrow

    print(
        "nrows = %i, row chunks set to %i for a total of %i chunks per node" %
        (nrow, row_chunk, int(np.ceil(nrow / row_chunk))),
        file=log)

    chunks = {}
    for ims in ms:
        chunks[ims] = []  # xds_from_ms expects a list per ds
        for spw in freqs[ims]:
            chunks[ims].append({
                'row': row_chunk,
                'chan': chan_chunks[ims][spw]['chan']
            })

    model = da.from_array(model.astype(args.real_type),
                          chunks=(1, nx, ny),
                          name=False)
    writes = []
    radec = None  # assumes we are only imaging field 0 of first MS
    for ims in ms:
        # ('UVW') is just the string 'UVW'; pass a real one-element tuple.
        xds = xds_from_ms(ims, chunks=chunks[ims], columns=('UVW',))

        # subtables
        ddids = xds_from_table(ims + "::DATA_DESCRIPTION")
        fields = xds_from_table(ims + "::FIELD")
        spws = xds_from_table(ims + "::SPECTRAL_WINDOW")
        pols = xds_from_table(ims + "::POLARIZATION")

        # subtable data
        ddids = dask.compute(ddids)[0]
        fields = dask.compute(fields)[0]
        spws = dask.compute(spws)[0]
        pols = dask.compute(pols)[0]

        out_data = []
        for ds in xds:
            field = fields[ds.FIELD_ID]
            radec = field.PHASE_DIR.data.squeeze()

            # check fields match
            # NOTE(review): radec was just reassigned from this dataset's own
            # field, so neither check below can currently trigger. Left as-is
            # because skipping datasets would change which rows are written.
            if radec is None:
                radec = field.PHASE_DIR.data.squeeze()

            if not np.array_equal(radec, field.PHASE_DIR.data.squeeze()):
                continue

            spw = ds.DATA_DESC_ID  # this is not correct, need to use spw

            uvw = clone(ds.UVW.data)

            bands = band_mapping[ims][spw]
            # Slice into a separate name: rebinding `model` here would make
            # every later dataset index into an already-sliced cube.
            band_model = model[list(bands), :, :]
            vis = im2vis(uvw,
                         freqs[ims][spw],
                         band_model,
                         freq_bin_idx[ims][spw],
                         freq_bin_counts[ims][spw],
                         cell_rad,
                         nthreads=ngridder_threads,
                         epsilon=args.epsilon,
                         do_wstacking=args.wstack)

            model_vis = restore_corrs(vis, ncorr)

            if mstype == 'zarr':
                model_vis = model_vis.rechunk(model_chunks)
                uvw = uvw.rechunk((model_chunks[0], 3))

            out_ds = ds.assign(
                **{
                    args.model_column: (("row", "chan", "corr"), model_vis),
                    'UVW': (("row", "three"), uvw)
                })
            out_data.append(out_ds)

        writes.append(xds_to_table(out_data, ims, columns=[args.model_column]))

    dask.visualize(*writes,
                   filename=args.output_filename + '_predict_graph.pdf',
                   optimize_graph=False,
                   collapse_outputs=True)

    if not args.mock:
        with performance_report(filename=args.output_filename +
                                '_predict_per.html'):
            dask.compute(writes, optimize_graph=False)

    print("All done here.", file=log)
def main():
    """Time four Dask workloads and return the per-iteration wall times.

    A local client with 10 single-threaded workers is started and closed
    again before returning.  Each workload runs ``iterations`` (10) times
    inside its own ``performance_report``:

    * ``simple``: element-wise arithmetic on a timeseries dataframe + sum
    * ``shuffle``: ``set_index("id")`` followed by ``persist``
    * ``rand_access``: ``head()`` on the last shuffled dataframe
    * ``anom_mean``: an xarray group-by anomaly mean

    Returns
    -------
    dict
        Keys ``simple``, ``shuffle``, ``rand_access`` and ``anom_mean``, each
        mapping to a NumPy array of elapsed seconds.
    """
    # Using the client as a context manager shuts the client (and the local
    # cluster it created) down even if one of the workloads raises.
    with Client(n_workers=10, threads_per_worker=1) as client:
        print(client)
        df = dask.datasets.timeseries(
            start="2000-01-01",
            end="2000-01-31",
            # end="2000-12-31",
            partition_freq="1h",
            freq="60s",
        )
        df = df.persist()
        wait(df)
        iterations = 10

        with performance_report(filename=f"{today}-simple-scheduler.html"):
            simple = []
            for i in range(iterations):
                start = time.time()
                z = df.x + 1 + 2 - df.y
                z.sum().compute()
                stop = time.time()
                simple.append(stop - start)
            simple = np.array(simple)

        df2 = None
        with performance_report(filename=f"{today}-shuffle-scheduler.html"):
            shuffle_t = []
            for i in range(iterations):
                # Drop the previous iteration's result so each shuffle starts
                # cold; nothing to cancel on the first pass.
                if df2 is not None:
                    client.cancel(df2)
                start = time.time()
                df2 = df.set_index("id").persist()
                wait(df2)
                stop = time.time()
                shuffle_t.append(stop - start)
            shuffle_t = np.array(shuffle_t)

        with performance_report(filename=f"{today}-rand-access-scheduler.html"):
            rand_access = []
            for i in range(iterations):
                start = time.time()
                df2.head()
                stop = time.time()
                rand_access.append(stop - start)
            rand_access = np.array(rand_access)

        data = dsa.random.random((10000, 1000000), chunks=(1, 1000000))
        da = xr.DataArray(data,
                          dims=['time', 'x'],
                          coords={'day': ('time', np.arange(10000) % 100)})
        clim = da.groupby('day').mean(dim='time')
        anom = da.groupby('day') - clim
        anom_mean = anom.mean(dim='time')
        with performance_report(filename=f"{today}-anom-mean-scheduler.html"):
            anom_mean_t = []
            for i in range(iterations):
                start = time.time()
                anom_mean.compute()
                stop = time.time()
                anom_mean_t.append(stop - start)
            anom_mean_t = np.array(anom_mean_t)

        return dict(simple=simple,
                    shuffle=shuffle_t,
                    rand_access=rand_access,
                    anom_mean=anom_mean_t)
# Interactive driver: prompt for Dask resource limits, start the environment,
# then run every ingestion job inside a Dask performance report.

# Each prompt falls back to its default when the user just presses Enter
# (an empty string is falsy, so the `or` picks the default).
mem_limit = str(input('Input Max Ram (GB) [Default=8]: \n') or '8')
procs = str(input('Input Number of Processes [Default=1]: \n') or '1')
threads = str(
    input('Input Threads per Process [Default=2]: \n') or '2')
env = DaskEnv(mem_limit=mem_limit, nprocs=procs, nthreads=threads)
print(env.client.dashboard_link)
jobs = [ingestion.to_parquet]
schema = 'paymaster/schema/perf_schema.json'
schema_name = 'perf'
ingest_dir = '/mnt/data/fnma-data/sf/perf/raw'
# The directory listing is only used for the status message printed below.
files = os.listdir(ingest_dir)
# NOTE(review): mods and names are only referenced by the commented-out
# modnames line in this excerpt — confirm nothing further down uses them.
mods = [job.__module__.split('.')[-1] for job in jobs]
names = [job.__name__ for job in jobs]
# modnames = ['.'.join([mods[i], names[i]]) for i in range(len(mods))]
print('{client} \n queuing {jobs} in {files}'.format(client=repr(
    env.client), jobs=jobs, files=files))
# A single timestamp is shared by the log file name and the report file name.
ts = datetime.now().strftime('%Y%m%d%H%M%S')
logfile = LOG_ROOT + '{}-{}.log'.format(__name__, ts)
fn = ('dark-performance-report-{}-{}-{}.html'.format(
    __name__, schema_name, ts))
report_path = REPORT_ROOT + fn
with performance_report(report_path):
    try:
        for job in jobs:
            res = job(schema=schema, log_path=logfile)
    except Exception as e:
        # Shut the Dask environment down before propagating the failure.
        # On success env.shutdown() is not called here.
        env.shutdown()
        raise e
async def _run(client, args):
    """Benchmark one array operation on random data and report its timing.

    Parameters
    ----------
    client : distributed.Client
        Asynchronous client used to compute the operation.
    args : namespace
        Reads ``type`` ("gpu" selects CuPy, anything else NumPy),
        ``operation`` (one of "transpose_sum", "dot", "svd", "fft"),
        ``size``, ``second_size`` (svd only), ``chunk_size`` and ``profile``
        (performance-report file name, or ``None``).

    Returns
    -------
    dict
        ``took`` (seconds), ``npartitions``, ``shape`` and ``chunksize`` of
        the first input array.

    Raises
    ------
    ValueError
        If ``args.operation`` is not one of the supported operations.
    """
    supported = ("transpose_sum", "dot", "svd", "fft")
    # Reject unknown operations up front; otherwise the function would only
    # fail later with an UnboundLocalError on `x`.
    if args.operation not in supported:
        raise ValueError(
            f"Unknown operation {args.operation!r}; expected one of {supported}"
        )

    if args.type == "gpu":
        import cupy as xp
    else:
        import numpy as xp

    # Create a simple random array
    rs = da.random.RandomState(RandomState=xp.random.RandomState)

    if args.operation == "transpose_sum":
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        func_args = (x, )
        func = lambda x: (x + x.T).sum()
    elif args.operation == "dot":
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        y = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        await wait(y)
        func_args = (x, y)
        func = lambda x, y: x.dot(y)
    elif args.operation == "svd":
        x = rs.random(
            (args.size, args.second_size),
            chunks=(int(args.chunk_size), args.second_size),
        ).persist()
        await wait(x)
        func_args = (x, )
        func = lambda x: np.linalg.svd(x)
    else:  # "fft"
        x = rs.random((args.size, args.size),
                      chunks=(args.size, args.chunk_size)).persist()
        await wait(x)
        func_args = (x, )
        func = lambda x: np.fft.fft(x, axis=0)

    shape = x.shape
    chunksize = x.chunksize

    async def _timed_operation():
        # One code path for both modes: gather copes with the tuple that svd
        # returns, and GPU work is synchronised before the clock is stopped.
        t1 = clock()
        res = client.compute(func(*func_args))
        await client.gather(res)
        if args.type == "gpu":
            # Synchronise on each worker's own device.
            await client.run(lambda xp: xp.cuda.Device().synchronize(), xp)
        return clock() - t1

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            took = await _timed_operation()
    else:
        took = await _timed_operation()

    return {
        "took": took,
        "npartitions": x.npartitions,
        "shape": shape,
        "chunksize": chunksize,
    }
def gridsearch_wfv(self, params):
    """Grid-search LightGBM hyperparameters with walk-forward validation.

    Every combination in the Cartesian product of the value lists in
    ``params`` is used to build a ``DaskLGBMRegressor``; the model is fitted
    and scored on each split produced by ``self.train_test_time_split()``.
    Per-combination results are collected in ``self.all_params_combs``, the
    combination with the lowest ``monthly_avg_rmse_`` is stored in
    ``self.best_params_`` / ``self.best_score_``, and the full results table
    is written to ``all_params_combs.csv`` and uploaded to S3 on a
    best-effort basis (upload failures are logged, not raised).

    Parameters
    ----------
    params : dict
        Maps a hyperparameter name to a sequence of candidate values.

    Raises
    ------
    ValueError
        If no combination was evaluated (some value list is empty).
    """
    self.all_params_combs = list()
    # determine if there is more than one combination of hyperparameters
    # if only one combination, set get_stats_ flag to True
    # (default=1: an empty params dict yields exactly one, empty, combination)
    self.get_stats_ = (
        max((len(values) for values in params.values()), default=1) == 1)

    for values in product(*params.values()):
        params_comb_dict = dict(zip(params.keys(), values))
        self.params_comb_dict = params_comb_dict.copy()
        self.params_comb_dict["rmse_list_"] = list()
        self.params_comb_dict["monthly_rmse_list_"] = list()
        self.params_comb_dict["fit_times_list_"] = list()

        try:
            self.model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **params_comb_dict,
            )
        except Exception:
            logging.exception(
                "Exception occurred while initializing Dask model.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # call method that loops over train-validation sets
        with performance_report(
                filename=f"dask_report_{self.curr_dt_time}.html"):
            for train, test, get_stats in self.train_test_time_split():
                self.fit(train).predict(test).rmse_all_folds(
                    test, get_stats)

        self.params_comb_dict["avg_rmse_"] = mean(
            self.params_comb_dict["rmse_list_"])
        self.params_comb_dict["monthly_avg_rmse_"] = mean(
            self.params_comb_dict["monthly_rmse_list_"])

        self.all_params_combs.append(self.params_comb_dict)

    # Check for an empty result list before calling min(); previously min()
    # ran first, so the empty case surfaced as an opaque min() error and the
    # debug message below was unreachable.
    if not self.all_params_combs:
        logging.debug(
            "List of parameter-result dictionaries is empty and was not converted to CSV!"
        )
        raise ValueError(
            "No hyperparameter combinations were evaluated; every entry in "
            "params needs at least one candidate value.")

    best_params = min(self.all_params_combs,
                      key=lambda x: x["monthly_avg_rmse_"])
    self.best_score_ = best_params["monthly_avg_rmse_"]
    # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
    self.best_params_ = {
        k: v
        for k, v in best_params.items() if k in params
    }

    # save list of parameter-result dictionaries to dataframe and then to CSV
    all_params_combs_df = pd.DataFrame(self.all_params_combs)
    output_csv = "all_params_combs.csv"
    all_params_combs_df.to_csv(output_csv, index=False)

    try:
        key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
        s3_client = boto3.client("s3")
        s3_client.upload_file(output_csv, "sales-demand-data", key)
        logging.info(
            "Name of CSV uploaded to S3 and containing all parameter combinations "
            f"and results is: {key}")
    except (ClientError, boto3.exceptions.S3UploadFailedError):
        # upload_file reports a failed transfer as S3UploadFailedError, which
        # is not a ClientError subclass; catch both so the upload stays
        # best-effort.
        logging.exception(
            "CSV file with LightGBM parameter combinations and results was not copied to S3."
        )
memory='4GB', disk='4GB', env_extra=env_extra, ) if args.executor == 'dask/casa': client = Client("tls://localhost:8786") import shutil shutil.make_archive("workflows", "zip", base_dir="workflows") client.upload_file("workflows.zip") else: cluster.adapt(minimum=args.scaleout) client = Client(cluster) print("Waiting for at least one worker...") client.wait_for_workers(1) with performance_report(filename="dask-report.html"): output = processor.run_uproot_job( sample_dict, treename='Events', processor_instance=processor_instance, executor=processor.dask_executor, executor_args={ 'client': client, 'skipbadfiles': args.skipbadfiles, 'schema': processor.NanoAODSchema, 'retries': 3, }, chunksize=args.chunk, maxchunks=args.max) save(output, args.output)
def _correct_errors(ra, err_rate, p_value=0.05):
    """Correct RMT (UMI) errors in a read array in parallel with Dask.

    Reads are grouped by cell, the groups are split into chunks, and each
    chunk is processed by ``_correct_errors_by_cell_group_chunks`` on a local
    Dask cluster.  The returned (read index, donor index) pairs are then used
    to overwrite ``ra.data["rmt"]`` in place and to flag the corrected reads
    with the ``rmt_error`` status bit.

    Side effects: writes ``pre-correction-ra.pickle`` and ``dask-report.html``
    to the working directory and updates two ``dask.config`` settings.

    Parameters
    ----------
    ra : ReadArray
        Read array to correct (modified in place).
    err_rate : passed through to the per-chunk worker function.
    p_value : float
        Passed through to the per-chunk worker function.

    Returns
    -------
    pandas.DataFrame
        Unique pre-/post-correction records with columns ``CB``, ``UR``
        and ``UB``.

    Raises
    ------
    Exception
        If any submitted task is reported as not completed.
    """
    # True: use Dask's broadcast (ra transfer via inproc/tcp)
    # False: each worker reads ra.pickle from disk
    use_dask_broadcast = False

    log.debug(
        "Available CPU / RAM: {} / {} GB".format(
            _get_cpu_count(), int(_get_available_memory() / 1024 ** 3)
        ),
        module_name="rmt_correction",
    )

    n_workers = _calc_max_workers(ra)

    log.debug(
        "Estimated optimum n_workers: {}".format(n_workers),
        module_name="rmt_correction",
    )

    max_workers_override = int(os.environ.get("SEQC_MAX_WORKERS", 0))
    if max_workers_override > 0:
        n_workers = max_workers_override
        log.debug(
            "n_workers overridden with SEQC_MAX_WORKERS: {}".format(n_workers),
            module_name="rmt_correction",
        )

    # configure dask.distributed
    # memory_terminate_fraction doesn't work for some reason
    # https://github.com/dask/distributed/issues/3519
    # https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
    # https://docs.dask.org/en/latest/scheduling.html#local-threads
    worker_kwargs = {
        "n_workers": n_workers,
        "threads_per_worker": 1,
        "processes": True,
        "memory_limit": "64G",
        "memory_target_fraction": 0.95,
        "memory_spill_fraction": 0.99,
        "memory_pause_fraction": False,
        # "memory_terminate_fraction": False,
    }

    # do not kill worker at 95% memory level
    dask.config.set({"distributed.worker.memory.terminate": False})
    dask.config.set({"distributed.scheduler.allowed-failures": 50})

    # setup Dask distributed client
    cluster = LocalCluster(**worker_kwargs)
    client = Client(cluster)

    # Everything that uses the cluster runs inside try/finally so the worker
    # processes are shut down even when a task or the bookkeeping fails.
    try:
        # debug message
        log.debug(
            "Dask processes={} threads={}".format(
                len(client.nthreads().values()),
                np.sum(list(client.nthreads().values())),
            ),
            module_name="rmt_correction",
        )
        log.debug(
            "Dask worker_kwargs "
            + " ".join([f"{k}={v}" for k, v in worker_kwargs.items()]),
            module_name="rmt_correction",
        )
        log.debug(
            "Dask Dashboard=" + client.dashboard_link, module_name="rmt_correction"
        )

        # group by cells (same cell barcodes as one group)
        log.debug("Grouping...", module_name="rmt_correction")
        indices_grouped_by_cells = ra.group_indices_by_cell()

        if use_dask_broadcast:
            # send readarray in advance to all workers (i.e. broadcast=True)
            # this way, we reduce the serialization time
            log.debug("Scattering ReadArray...", module_name="rmt_correction")
            [future_ra] = client.scatter([ra], broadcast=True)
        else:
            # write ra to pickle which will be used later to parallel process rmt correction
            with open("pre-correction-ra.pickle", "wb") as fout:
                pickle.dump(ra, fout, protocol=4)

        # correct errors per cell group in parallel
        log.debug("Submitting jobs to Dask...", module_name="rmt_correction")
        with performance_report(filename="dask-report.html"):
            futures = []

            # distribute chunks to workers evenly
            n_chunks = math.ceil(len(indices_grouped_by_cells) / n_workers)
            chunks = partition_all(n_chunks, indices_grouped_by_cells)

            for chunk in tqdm(chunks, disable=None):
                future = client.submit(
                    _correct_errors_by_cell_group_chunks,
                    future_ra if use_dask_broadcast else None,
                    chunk,
                    err_rate,
                    p_value,
                )
                futures.append(future)

            # wait until all done
            log.debug(
                "Waiting untill all tasks complete...", module_name="rmt_correction"
            )
            completed, not_completed = wait(futures)

        # Any unfinished task is an error (the old `> 1` check let exactly
        # one unfinished task slip through).
        if not_completed:
            raise Exception("There are uncompleted tasks!")

        # gather the results and release
        log.debug(
            "Collecting the task results from the workers...",
            module_name="rmt_correction",
        )
        results = []
        for future in tqdm(completed, disable=None):
            # this returns a list of a list
            # len(result) should be the number of chunks e.g. 50
            result = future.result()

            # remove empty lists
            result = list(filter(lambda x: len(x) > 0, result))

            # aggregate and release
            results.extend(result)
            future.release()

        # clean up
        del futures
        del completed
        del not_completed
    finally:
        client.shutdown()
        client.close()

    # iterate through the list of returned read indices and donor rmts
    # create a mapping table of pre-/post-correction
    mapping = set()
    for result in results:
        for idx, idx_corrected_rmt in result:
            # record pre-/post-correction; the set drops duplicates itself
            mapping.add(
                (
                    ra.data["cell"][idx],
                    ra.data["rmt"][idx],
                    ra.data["rmt"][idx_corrected_rmt],
                )
            )

    # iterate through the list of returned read indices and donor rmts
    # actually, update the read array object with corrected UMI
    for result in results:
        for idx, idx_corrected_rmt in result:

            # skip if it's already marked as rmt error
            if ra.data["status"][idx_corrected_rmt] & ra.filter_codes["rmt_error"]:
                continue

            # correct
            ra.data["rmt"][idx] = ra.data["rmt"][idx_corrected_rmt]

            # report error
            ra.data["status"][idx] |= ra.filter_codes["rmt_error"]

    # pandas refuses to build a DataFrame from an (unordered) set, so hand it
    # a list of the unique records instead.
    return pd.DataFrame(list(mapping), columns=["CB", "UR", "UB"])
async def _run(client, args):
    """Benchmark one array operation on random data and report its timing.

    Input arrays are created and persisted inside a "make array(s)" profiling
    range; the operation itself is persisted, waited on and (for GPU runs)
    followed by a device synchronisation on every worker inside a range named
    after the operation.

    Parameters
    ----------
    client : distributed.Client
        Asynchronous client used to persist the operation.
    args : namespace
        Reads ``type`` ("gpu" selects CuPy, anything else NumPy),
        ``operation``, ``size``, ``second_size`` (svd and col_gather only),
        ``chunk_size`` and ``profile`` (performance-report file name, or
        ``None``).

    Returns
    -------
    dict
        ``took`` (seconds), ``npartitions``, ``shape`` and ``chunksize`` of
        the first input array.

    Raises
    ------
    ValueError
        If ``args.operation`` is not one of the supported operations.
    """
    supported = (
        "transpose_sum", "dot", "svd", "fft", "sum", "mean", "slice",
        "col_sum", "col_mask", "col_gather",
    )
    # Reject unknown operations up front; otherwise the function would only
    # fail later with an UnboundLocalError on `x`.
    if args.operation not in supported:
        raise ValueError(
            f"Unknown operation {args.operation!r}; expected one of {supported}"
        )

    if args.type == "gpu":
        import cupy as xp
    else:
        import numpy as xp

    # Create a simple random array
    rs = da.random.RandomState(RandomState=xp.random.RandomState)

    def _square():
        # args.size x args.size uniform random array, persisted immediately.
        return rs.random((args.size, args.size),
                         chunks=args.chunk_size).persist()

    def _column():
        # 1-D normal(10, 1) array of length args.size, persisted immediately.
        return rs.normal(10, 1, (args.size, ),
                         chunks=args.chunk_size).persist()

    operation = args.operation
    rng = start_range(message="make array(s)", color="green")
    if operation == "transpose_sum":
        func_args = (_square(), )
        func = lambda x: (x + x.T).sum()
    elif operation == "dot":
        func_args = (_square(), _square())
        func = lambda x, y: x.dot(y)
    elif operation == "svd":
        func_args = (rs.random(
            (args.size, args.second_size),
            chunks=(int(args.chunk_size), args.second_size),
        ).persist(), )
        func = lambda x: np.linalg.svd(x)
    elif operation == "fft":
        func_args = (rs.random((args.size, args.size),
                               chunks=(args.size, args.chunk_size)).persist(), )
        func = lambda x: np.fft.fft(x, axis=0)
    elif operation == "sum":
        func_args = (_square(), )
        func = lambda x: x.sum()
    elif operation == "mean":
        func_args = (_square(), )
        func = lambda x: x.mean()
    elif operation == "slice":
        func_args = (_square(), )
        func = lambda x: x[::3].copy()
    elif operation == "col_sum":
        func_args = (_column(), _column())
        func = lambda x, y: x + y
    elif operation == "col_mask":
        func_args = (_column(), _column())
        func = lambda x, y: x[y > 10]
    else:  # "col_gather"
        values = _column()
        idx = rs.randint(0,
                         len(values), (args.second_size, ),
                         chunks=args.chunk_size).persist()
        func_args = (values, idx)
        func = lambda x, idx: x[idx]

    # All inputs are persisted first and then awaited in argument order.
    for array in func_args:
        await wait(array)
    end_range(rng)

    x = func_args[0]
    shape = x.shape
    chunksize = x.chunksize

    async def _timed_operation():
        # Shared by the profiled and unprofiled paths so both measure exactly
        # the same work.
        rng = start_range(message=args.operation, color="purple")
        t1 = clock()
        await wait(client.persist(func(*func_args)))
        if args.type == "gpu":
            await client.run(lambda xp: xp.cuda.Device().synchronize(), xp)
        took = clock() - t1
        end_range(rng)
        return took

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            took = await _timed_operation()
    else:
        took = await _timed_operation()

    return {
        "took": took,
        "npartitions": x.npartitions,
        "shape": shape,
        "chunksize": chunksize,
    }
def run_trait_gwas(
    ds: Dataset,
    trait_group_id: int,
    trait_name: str,
    batch_index: int,
    min_samples: int,
    retries: int = 3,
) -> pd.DataFrame:
    """Run a linear-regression GWAS for one trait group and tabulate it.

    Samples with a null value in any trait are dropped first.  When fewer
    than ``min_samples`` complete cases remain, a warning is logged and
    ``None`` is returned.  Otherwise ``sg.gwas_linear_regression`` is applied,
    a fixed set of result variables is selected and computed (optionally
    under a Dask performance report and task stream when the
    ``GENERATE_PERFORMANCE_REPORT`` environment variable is "true"), and the
    result is returned as a data frame with an added ``sample_size`` column.

    Args:
        ds: Dataset holding exactly one trait group id and one trait name.
        trait_group_id: Identifier used in log messages and report names.
        trait_name: Trait name used in log messages.
        batch_index: Batch number used in report file names.
        min_samples: Minimum number of complete cases required.
        retries: Passed to ``compute`` as the retry count.

    Returns:
        The GWAS results as a data frame, or ``None`` when the sample size
        is too small.
    """
    for id_variable in ("sample_trait_group_id", "sample_trait_name"):
        assert ds[id_variable].to_series().nunique() == 1

    # Filter to complete cases
    filter_started = time.perf_counter()
    n_total = ds.dims["samples"]
    complete_mask = ds["sample_trait"].notnull().all(dim="traits").values
    complete = ds.isel(samples=complete_mask)
    filter_elapsed = time.perf_counter() - filter_started
    sample_size = complete.dims["samples"]
    logger.info(
        f"Found {sample_size} complete cases of {n_total} for '{trait_name}' (id={trait_group_id}) in {filter_elapsed:.1f} seconds"
    )

    # Bypass if sample size too small
    if sample_size < min_samples:
        logger.warning(
            f"Sample size ({sample_size}) too small (<{min_samples}) for trait '{trait_name}' (id={trait_group_id})"
        )
        return None

    logger.info(
        f"Running GWAS for '{trait_name}' (id={trait_group_id}) with {sample_size} samples, {complete.dims['traits']} traits"
    )

    gwas_started = time.perf_counter()
    logger.debug(
        f"Input dataset for trait '{trait_name}' (id={trait_group_id}) GWAS:\n{complete}"
    )
    regressed = sg.gwas_linear_regression(
        complete,
        dosage="call_dosage",
        covariates="sample_covariate",
        traits="sample_trait",
        add_intercept=True,
        merge=True,
    )

    # Project and convert to data frame for convenience
    # in downstream analysis/comparisons
    result_variables = [
        "sample_trait_id",
        "sample_trait_name",
        "sample_trait_group_id",
        "sample_trait_code_id",
        "variant_id",
        "variant_contig",
        "variant_contig_name",
        "variant_position",
        "variant_p_value",
        "variant_beta",
    ]
    projected = regressed[result_variables]

    report_requested = (
        os.getenv("GENERATE_PERFORMANCE_REPORT", "").lower() == "true"
    )
    if not report_requested:
        computed = projected.compute(retries=retries)
    else:
        with performance_report(
            f"logs/reports/pr_{trait_group_id}_{batch_index}.html"
        ), get_task_stream(
            plot="save",
            filename=f"logs/reports/ts_{trait_group_id}_{batch_index}.html",
        ):
            computed = projected.compute(retries=retries)

    df = computed.to_dataframe().reset_index()
    df = df.assign(sample_size=sample_size)
    df = df.rename(
        columns={"traits": "trait_index", "variants": "variant_index"}
    )
    gwas_elapsed = time.perf_counter() - gwas_started
    logger.info(
        f"GWAS for '{trait_name}' (id={trait_group_id}) complete in {gwas_elapsed:.1f} seconds"
    )
    return df