Example #1
def run_scene_optimizer() -> None:
    """ """
    with initialize_config_module(config_module="gtsfm.configs"):
        # config is relative to the gtsfm module
        cfg = compose(config_name="default_lund_door_set1_config.yaml")
        scene_optimizer: SceneOptimizer = instantiate(cfg.SceneOptimizer)

        loader = OlssonLoader(os.path.join(DATA_ROOT, "set1_lund_door"),
                              image_extension="JPG")

        sfm_result_graph = scene_optimizer.create_computation_graph(
            num_images=len(loader),
            image_pair_indices=loader.get_valid_pairs(),
            image_graph=loader.create_computation_graph_for_images(),
            camera_intrinsics_graph=loader.create_computation_graph_for_intrinsics(),
            gt_pose_graph=loader.create_computation_graph_for_poses(),
        )

        # create dask client
        cluster = LocalCluster(n_workers=2, threads_per_worker=4)

        with Client(cluster), performance_report(filename="dask-report.html"):
            sfm_result = sfm_result_graph.compute()

        assert isinstance(sfm_result, GtsfmData)
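A minimal distillation of the pattern shared by this and the following examples (names and sizes here are illustrative, not from GTSFM): performance_report profiles whatever work the scheduler executes inside the with-block and writes the result to an HTML file.

import dask.array as da
from dask.distributed import Client, LocalCluster, performance_report

if __name__ == "__main__":
    cluster = LocalCluster(n_workers=2, threads_per_worker=4)
    with Client(cluster), performance_report(filename="dask-report.html"):
        x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
        result = x.mean().compute()  # only work triggered here is profiled
    print(result)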
Example #2
def run(client, args, n_workers, write_profile=None):
    # Generate random Dask dataframes
    ddf_base = get_random_ddf(args.chunk_size, args.base_chunks,
                              args.frac_match, "build", args).persist()
    ddf_other = get_random_ddf(args.chunk_size, args.other_chunks,
                               args.frac_match, "other", args).persist()
    wait(ddf_base)
    wait(ddf_other)

    assert len(ddf_base.dtypes) == 2
    assert len(ddf_other.dtypes) == 2
    data_processed = len(ddf_base) * sum([t.itemsize for t in ddf_base.dtypes])
    data_processed += len(ddf_other) * sum(
        [t.itemsize for t in ddf_other.dtypes])

    # Get contexts to use (defaults to null contexts that do nothing)
    ctx1, ctx2 = contextlib.nullcontext(), contextlib.nullcontext()
    if args.backend == "explicit-comms":
        ctx1 = dask.config.set(explicit_comms=True)
    if write_profile is not None:
        ctx2 = performance_report(filename=args.profile)

    with ctx1:
        with ctx2:
            t1 = perf_counter()
            merge(args, ddf_base, ddf_other)
            t2 = perf_counter()

    return (data_processed, t2 - t1)
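The null-context idiom above is worth isolating: it makes profiling a toggle instead of two duplicated code paths. A sketch (the helper name is ours, not from the benchmark):

import contextlib
from dask.distributed import performance_report

def maybe_report(profile_path):
    """Return a performance_report context if a path is given, else a no-op."""
    if profile_path is None:
        return contextlib.nullcontext()
    return performance_report(filename=profile_path)

# usage: with maybe_report(args.profile): run_benchmark()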
Example #3
def run(client, args, n_workers, write_profile=None):
    # Generate random Dask dataframe
    chunksize = args.partition_size // 8  # partition size in bytes -> number of float64 elements
    nchunks = args.in_parts
    totalsize = chunksize * nchunks
    x = da.random.random((totalsize, ), chunks=(chunksize, ))
    df = dask.dataframe.from_dask_array(x, columns="data").to_frame()

    if args.type == "gpu":
        import cudf

        df = df.map_partitions(cudf.from_pandas)

    df = df.persist()
    wait(df)
    data_processed = len(df) * sum([t.itemsize for t in df.dtypes])

    if write_profile is None:
        ctx = contextlib.nullcontext()
    else:
        ctx = performance_report(filename=args.profile)

    with ctx:
        t1 = clock()
        if args.backend == "dask":
            shuffle_dask(df)
        else:
            shuffle_explicit_comms(df)
        t2 = clock()

    return (data_processed, t2 - t1)
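Examples #2 and #3 return (data_processed, seconds) pairs; turning that into a human-readable throughput figure is a one-liner with dask.utils.format_bytes. A sketch:

from dask.utils import format_bytes

def throughput(data_processed: int, took: float) -> str:
    """Render bytes-per-second in human-readable form, e.g. '1.25 GiB/s'."""
    return f"{format_bytes(data_processed / took)}/s"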
Example #4
def run(args, write_profile=None):
    # Generate random Dask dataframes
    ddf_base = get_random_ddf(args.chunk_size, args.n_workers, args.frac_match,
                              "build", args).persist()
    ddf_other = get_random_ddf(args.chunk_size, args.n_workers,
                               args.frac_match, "other", args).persist()
    wait(ddf_base)
    wait(ddf_other)

    assert len(ddf_base.dtypes) == 2
    assert len(ddf_other.dtypes) == 2
    data_processed = len(ddf_base) * sum([t.itemsize for t in ddf_base.dtypes])
    data_processed += len(ddf_other) * sum(
        [t.itemsize for t in ddf_other.dtypes])

    # Lazy merge/join operation
    ddf_join = ddf_base.merge(ddf_other, on=["key"], how="inner")
    if args.set_index:
        ddf_join = ddf_join.set_index("key")

    # Execute the operations to benchmark
    if write_profile is not None:
        with performance_report(filename=args.profile):
            t1 = clock()
            wait(ddf_join.persist())
            took = clock() - t1
    else:
        t1 = clock()
        wait(ddf_join.persist())
        took = clock() - t1
    return (data_processed, took)
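The persist-then-wait timing used throughout these benchmarks measures execution only, not graph construction. Distilled into a helper (the name is ours; it assumes a distributed Client is active, as in the examples):

from time import perf_counter
from dask.distributed import wait

def timed_persist(collection):
    """Persist a Dask collection, block until finished, return (result, seconds)."""
    t0 = perf_counter()
    persisted = collection.persist()
    wait(persisted)  # blocks until all underlying futures finish
    return persisted, perf_counter() - t0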
Example #5
def example_function():
    x = da.random.random((100_000, 100_000, 10), chunks=(10_000, 10_000, 5))
    y = da.random.random((100_000, 100_000, 10), chunks=(10_000, 10_000, 5))
    z = (da.arcsin(x) + da.arccos(y)).sum(axis=(1, 2))

    with performance_report(filename="dask-report_mpi.html"):
        result = z.compute()
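The report name suggests this function ran under dask-mpi, whose setup the snippet does not show. A sketch of that setup, assuming the dask_mpi package (launched with e.g. mpirun):

from dask_mpi import initialize
from dask.distributed import Client, performance_report

initialize()       # rank 0 becomes the scheduler, rank 1 runs this script,
client = Client()  # remaining ranks become workers; Client() connects to them

with performance_report(filename="dask-report_mpi.html"):
    ...  # build and compute graphs here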
Example #6
def run_scene_optimizer(args) -> None:
    """Run GTSFM over images from an Argoverse vehicle log."""
    with initialize_config_module(config_module="gtsfm.configs"):
        # config is relative to the gtsfm module
        cfg = compose(config_name="default_lund_door_set1_config.yaml")
        scene_optimizer: SceneOptimizer = instantiate(cfg.SceneOptimizer)

        loader = ArgoverseDatasetLoader(
            dataset_dir=args.dataset_dir,
            log_id=args.log_id,
            stride=args.stride,
            max_num_imgs=args.max_num_imgs,
            max_lookahead_sec=args.max_lookahead_sec,
            camera_name=args.camera_name,
        )

        sfm_result_graph = scene_optimizer.create_computation_graph(
            len(loader),
            loader.get_valid_pairs(),
            loader.create_computation_graph_for_images(),
            loader.create_computation_graph_for_intrinsics(),
            use_intrinsics_in_verification=True,
            gt_pose_graph=loader.create_computation_graph_for_poses(),
        )

        # create dask client
        cluster = LocalCluster(n_workers=2, threads_per_worker=4)

        with Client(cluster), performance_report(filename="dask-report.html"):
            sfm_result = sfm_result_graph.compute()

        assert isinstance(sfm_result, GtsfmData)
        scene_avg_reproj_error = sfm_result.get_scene_avg_reprojection_error()
        logger.info("Scene avg reproj error: %.3f", scene_avg_reproj_error)
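Both run_scene_optimizer variants above (Examples #1 and #6) leave the LocalCluster running at exit. Client and LocalCluster are themselves context managers, so a leak-free arrangement (a sketch, not from GTSFM) is:

from dask.distributed import Client, LocalCluster, performance_report

with LocalCluster(n_workers=2, threads_per_worker=4) as cluster, \
        Client(cluster), \
        performance_report(filename="dask-report.html"):
    pass  # e.g. sfm_result = sfm_result_graph.compute()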
Example #7
async def _run(client, args):
    # Create a simple random array
    if args.type == "gpu":
        rs = da.random.RandomState(RandomState=cp.random.RandomState)
    else:
        rs = da.random.RandomState(RandomState=np.random.RandomState)
    x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
    ks = 2 * (2 * args.kernel_size + 1, )  # square kernel shape: (2k+1, 2k+1)
    await wait(x)

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            t1 = clock()
            await wait(
                client.persist(
                    x.map_overlap(mean_filter, args.kernel_size, shape=ks)))
            took = clock() - t1
    else:
        t1 = clock()
        await wait(
            client.persist(
                x.map_overlap(mean_filter, args.kernel_size, shape=ks)))
        took = clock() - t1

    return (took, x.npartitions)
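performance_report also works as an async context manager, as above. A self-contained version of the async pattern (sizes are illustrative):

import asyncio
import dask.array as da
from dask.distributed import Client, performance_report

async def main():
    async with Client(asynchronous=True, processes=False) as client:
        x = da.random.random((4_000, 4_000), chunks=(1_000, 1_000))
        async with performance_report(filename="async-report.html"):
            await client.compute(x.sum())  # awaiting the future yields the result

asyncio.run(main())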
Example #8
def shuffle_dask(args, df, write_profile):
    # Execute the operations to benchmark
    if write_profile is not None:
        with performance_report(filename=args.profile):
            t1 = clock()
            wait(shuffle(df, index="data", shuffle="tasks").persist())
            took = clock() - t1
    else:
        t1 = clock()
        wait(shuffle(df, index="data", shuffle="tasks").persist())
        took = clock() - t1
    return took
Example #9
    def run(self) -> None:
        """Run the SceneOptimizer."""
        start_time = time.time()

        # create dask client
        cluster = LocalCluster(
            n_workers=self.parsed_args.num_workers,
            threads_per_worker=self.parsed_args.threads_per_worker)

        pairs_graph = self.retriever.create_computation_graph(self.loader)
        with Client(cluster), performance_report(filename="dask-report.html"):
            image_pair_indices = pairs_graph.compute()

        delayed_sfm_result, delayed_io = self.scene_optimizer.create_computation_graph(
            num_images=len(self.loader),
            image_pair_indices=image_pair_indices,
            image_graph=self.loader.create_computation_graph_for_images(),
            all_intrinsics=self.loader.get_all_intrinsics(),
            image_shapes=self.loader.get_image_shapes(),
            relative_pose_priors=self.loader.get_relative_pose_priors(
                image_pair_indices),
            absolute_pose_priors=self.loader.get_absolute_pose_priors(),
            cameras_gt=self.loader.get_gt_cameras(),
            gt_wTi_list=self.loader.get_gt_poses(),
            matching_regime=ImageMatchingRegime(
                self.parsed_args.matching_regime),
        )

        with Client(cluster), performance_report(filename="dask-report.html"):
            sfm_result, *io = dask.compute(delayed_sfm_result, *delayed_io)

        assert isinstance(sfm_result, GtsfmData)

        end_time = time.time()
        duration_sec = end_time - start_time
        logger.info(
            "GTSFM took %.2f minutes to compute sparse multi-view result.",
            duration_sec / 60)
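The unpacking of dask.compute above runs the SfM result and all I/O tasks in a single scheduler pass, so shared intermediates are computed once. The same shape in miniature (a sketch):

import dask
from dask import delayed

@delayed
def inc(x):
    return x + 1

main_result, *side_results = dask.compute(inc(1), inc(2), inc(3))
# main_result == 2; side_results == [3, 4]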
Example #10
def merge(args, ddf1, ddf2, write_profile):
    # Lazy merge/join operation
    ddf_join = ddf1.merge(ddf2, on=["key"], how="inner")
    if args.set_index:
        ddf_join = ddf_join.set_index("key")

    # Execute the operations to benchmark
    if write_profile is not None:
        with performance_report(filename=args.profile):
            t1 = clock()
            wait(ddf_join.persist())
            took = clock() - t1
    else:
        t1 = clock()
        wait(ddf_join.persist())
        took = clock() - t1
    return took
Example #11
async def _run(client, args):
    # Create a simple random array
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
    await wait(x)

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            t1 = clock()
            await client.compute((x + x.T).sum())
            took = clock() - t1
    else:
        t1 = clock()
        await client.compute((x + x.T).sum())
        took = clock() - t1

    return (took, x.npartitions)
Example #12
    def run(self) -> None:
        """Run Structure-from-Motion (SfM) pipeline."""
        start_time = time.time()
        # Create dask client.
        cluster = LocalCluster(
            n_workers=self.parsed_args.num_workers,
            threads_per_worker=self.parsed_args.threads_per_worker)

        with Client(cluster) as client, performance_report(
                filename="dask-report.html"):
            pairs_graph = self.retriever.create_computation_graph(self.loader)
            image_pair_indices = pairs_graph.compute()

            # Scatter surface mesh across all nodes to save computation time and memory.
            gt_scene_trimesh_future = client.scatter(
                self.loader.gt_scene_trimesh, broadcast=True)

            # Prepare computation graph.
            delayed_sfm_result, delayed_io = self.scene_optimizer.create_computation_graph(
                num_images=len(self.loader),
                image_pair_indices=image_pair_indices,
                image_graph=self.loader.create_computation_graph_for_images(),
                all_intrinsics=self.loader.get_all_intrinsics(),
                image_shapes=self.loader.get_image_shapes(),
                gt_scene_mesh=gt_scene_trimesh_future,
                cameras_gt=self.loader.get_gt_cameras(),
                gt_wTi_list=self.loader.get_gt_poses(),
                matching_regime=ImageMatchingRegime(
                    self.parsed_args.matching_regime),
                absolute_pose_priors=self.loader.get_absolute_pose_priors(),
                relative_pose_priors=self.loader.get_relative_pose_priors(
                    image_pair_indices),
            )

            # Run SfM pipeline.
            sfm_result, *io = dask.compute(delayed_sfm_result, *delayed_io)

        assert isinstance(sfm_result, GtsfmData)
        logger.info(
            "GTSFM took %.2f minutes to compute sparse multi-view result.",
            (time.time() - start_time) / 60)
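client.scatter(..., broadcast=True), used above for the ground-truth mesh, ships one copy of a large object to every worker up front; tasks then reference a lightweight future instead of re-serializing the object per task. A sketch (assumes a running client):

from dask.distributed import Client

client = Client(processes=False)
mesh = {"vertices": list(range(1_000_000))}   # stand-in for a large object
[mesh_future] = client.scatter([mesh], broadcast=True)
# pass mesh_future, not mesh, into submit()/delayed graphs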
Example #13
        def profiled(*args, **kwargs):
            name = func.__name__
            t0 = time.time()
            if dask_profile:
                with performance_report(filename=f"profiled-{name}.html"):
                    result = func(*args, **kwargs)
            else:
                result = func(*args, **kwargs)
            elapsed_time = time.time() - t0

            logging_info = {}
            logging_info["elapsed_time_seconds"] = elapsed_time
            logging_info["function_name"] = name

            logdf = pd.DataFrame.from_dict(logging_info, orient="index").T

            if csv:
                logdf.to_csv(f"benchmarked_{name}.csv", index=False)
            else:
                print(logdf)
            return result
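Example #13 shows only the inner wrapper of a decorator; a plausible outer shell (an assumption, the source does not show it) looks like:

import functools

def benchmarked(csv=True, dask_profile=False):
    """Decorator factory whose inner `profiled` matches Example #13."""
    def decorator(func):
        @functools.wraps(func)
        def profiled(*args, **kwargs):
            # timing / performance_report / CSV logic from Example #13 goes here
            return func(*args, **kwargs)
        return profiled
    return decorator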
Example #14
def benchmark(func, *args, **kwargs):
    csv = kwargs.pop("csv", True)
    dask_profile = kwargs.pop("dask_profile", False)
    compute_result = kwargs.pop("compute_result", False)
    name = func.__name__
    t0 = time.time()
    if dask_profile:
        with performance_report(filename=f"profiled-{name}.html"):
            result = func(*args, **kwargs)
    else:
        result = func(*args, **kwargs)
    elapsed_time = time.time() - t0

    logging_info = {}
    logging_info["elapsed_time_seconds"] = elapsed_time
    logging_info["function_name"] = name
    if compute_result:
        import dask_cudf

        if isinstance(result, dask_cudf.DataFrame):
            len_tasks = [dask.delayed(len)(df) for df in result.to_delayed()]
        else:
            len_tasks = []
            for read_df in result:
                len_tasks += [
                    dask.delayed(len)(df) for df in read_df.to_delayed()
                ]

        compute_st = time.time()
        results = dask.compute(*len_tasks)
        compute_et = time.time()
        logging_info["compute_time_seconds"] = compute_et - compute_st

    logdf = pd.DataFrame.from_dict(logging_info, orient="index").T

    if csv:
        logdf.to_csv(f"benchmarked_{name}.csv", index=False)
    else:
        print(logdf)
    return result
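The compute_result branch above counts rows without materializing any partition: one delayed len() per partition, gathered in a single pass. The same idea with plain dask.dataframe:

import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=4)
len_tasks = [dask.delayed(len)(part) for part in ddf.to_delayed()]
total_rows = sum(dask.compute(*len_tasks))  # 100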
Example #15
def main():

    # create run directory tree
    logging.info("1 - create_dir_tree")
    dirs = pa.create_dir_tree(root_dir, run_name, overwrite=overwrite)

    # create tiling
    logging.info("2 - generate_tiling")
    tl = generate_tiling(dirs, overwrite)

    # create run directories, erase them if need be
    logging.info("3 - create_tile_run_tree")
    tl.create_tile_run_tree(dirs["run"], overwrite=overwrite)

    logging.info("4 - start main loop")

    flag = True
    reboot = 0
    while flag:

        logging.info("--- reboot %d", reboot)

        # spin up cluster
        cluster, client = spin_up_cluster(dask_jobs)
        print(cluster)

        # run parcels simulation
        ms = MemorySampler()
        with performance_report(filename=f"dask-report-{reboot}.html"), ms.sample(f"reboot{reboot}"):
            flag = run(dirs, tl, cluster, client)
        # https://distributed.dask.org/en/latest/diagnosing-performance.html#analysing-memory-usage-over-time
        ms.to_pandas().to_csv(f"dask-memory-report-{reboot}.csv")

        # close dask
        close_dask(cluster, client)

        reboot += 1
Example #16
def run_scene_optimizer(args: argparse.Namespace) -> None:
    """Run GTSFM over images from an Argoverse vehicle log"""
    with hydra.initialize_config_module(config_module="gtsfm.configs"):
        # config is relative to the gtsfm module
        cfg = hydra.compose(config_name=args.config_name)
        scene_optimizer: SceneOptimizer = instantiate(cfg.SceneOptimizer)

        loader = ArgoverseDatasetLoader(
            dataset_dir=args.dataset_dir,
            log_id=args.log_id,
            stride=args.stride,
            max_num_imgs=args.max_num_imgs,
            max_lookahead_sec=args.max_lookahead_sec,
            camera_name=args.camera_name,
            max_resolution=args.max_resolution,
        )

        delayed_sfm_result, delayed_io = scene_optimizer.create_computation_graph(
            num_images=len(loader),
            image_pair_indices=loader.get_valid_pairs(),
            image_graph=loader.create_computation_graph_for_images(),
            all_intrinsics=loader.get_all_intrinsics(),
            image_shapes=loader.get_image_shapes(),
            cameras_gt=loader.get_gt_cameras(),
        )

        # create dask client
        cluster = LocalCluster(n_workers=args.num_workers,
                               threads_per_worker=args.threads_per_worker)

        with Client(cluster), performance_report(filename="dask-report.html"):
            sfm_result, *io = dask.compute(delayed_sfm_result, *delayed_io)

        assert isinstance(sfm_result, GtsfmData)
        scene_avg_reproj_error = sfm_result.get_avg_scene_reprojection_error()
        logger.info("Scene avg reproj error: %.3f", scene_avg_reproj_error)
Example #17
def main(args):

    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    if args.cat_splits:
        tree_width = {
            name: int(s)
            for name, s in zip(cat_names, args.cat_splits.split(","))
        }
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            cat_cache = cat_cache[0]
        else:
            # If user is specifying a list of options,
            # they must specify an option for every cat column
            assert len(cat_names) == len(cat_cache)
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = {name: c for name, c in zip(cat_names, cat_cache)}
    else:
        # Criteo/DLRM Defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    client = Client(cluster)

    # Setup RMM pool
    if not args.no_rmm_pool:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=out_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            on_host=args.cat_on_host,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle="full" if args.worker_shuffle else "partial",
                out_files_per_proc=out_files_per_proc,
                output_path=out_path,
            )
    else:
        processor.apply(
            dataset,
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devs}")
    print(f"rmm-pool           | {(not args.no_rmm_pool)}")
    print(f"out_files_per_proc | {args.splits}")
    print(f"worker-shuffle     | {args.worker_shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
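The tcp and ucx branches above differ only by enable_nvlink; they can be collapsed into one constructor call (a sketch, assuming dask_cuda):

from dask_cuda import LocalCUDACluster

def make_cluster(args, device_limit):
    kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devs,
        device_memory_limit=device_limit,
        local_directory=args.dask_workspace,
        dashboard_address=":3787",
    )
    if args.protocol != "tcp":
        kwargs["enable_nvlink"] = True  # NVLink only applies to the ucx protocol
    return LocalCUDACluster(**kwargs)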
Example #18
async def run(args):

    # Set up workers on the local machine
    async with LocalCUDACluster(
        protocol=args.protocol,
        n_workers=len(args.devs.split(",")),
        CUDA_VISIBLE_DEVICES=args.devs,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:

            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
            await wait(x)

            # Execute the operations to benchmark
            if args.profile is not None:
                async with performance_report(filename=args.profile):
                    t1 = clock()
                    await client.compute((x + x.T).sum())
                    took = clock() - t1
            else:
                t1 = clock()
                await client.compute((x + x.T).sum())
                took = clock() - t1

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log
            )
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])
            bandwidths = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): [
                    "%s/s" % format_bytes(x) for x in np.quantile(v, [0.25, 0.50, 0.75])
                ]
                for (w1, w2), v in bandwidths.items()
            }
            total_nbytes = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): format_bytes(sum(nb))
                for (w1, w2), nb in total_nbytes.items()
            }

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size        | {args.size}*{args.size}")
            print(f"Chunk-size  | {args.chunk_size}")
            print(f"Ignore-size | {format_bytes(args.ignore_size)}")
            print(f"Protocol    | {args.protocol}")
            print(f"Device(s)   | {args.devs}")
            print(f"npartitions | {x.npartitions}")
            print("==========================")
            print(f"Total time  | {format_time(took)}")
            print("==========================")
            print("(w1,w2)     | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            for (d1, d2), bw in sorted(bandwidths.items()):
                print(
                    "(%02d,%02d)     | %s %s %s (%s)"
                    % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)])
                )
Example #19
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag).

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py \
                        --data-path /path/to/criteo_parquet --out-path /out/dir/


    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`).

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature(
            [ops.FillMissing(),
             ops.Clip(min_value=0),
             ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
Example #20
                worker.random_seed = seed
                worker.sample_seed = seed + 1000

                performance_filename = os.path.join(
                    OUTPUT_PATH, "perf_" + str(ratio) + "_" + str(seed))

                # https://stackoverflow.com/questions/4789837/how-to-terminate-a-python-subprocess-launched-with-shell-true
                cmd = "python -m inferelator.utils.profiler -p {pid} -o {pfn}".format(
                    pid=os.getpid(), pfn=performance_filename + "_mem.tsv")
                memory_monitor = subprocess.Popen(cmd,
                                                  stdout=subprocess.PIPE,
                                                  shell=True,
                                                  preexec_fn=os.setsid)

                start_time = time.time()
                with performance_report(filename=performance_filename + ".html"):
                    result = worker.run()

                csv_row = [
                    str(ratio),
                    str(seed),
                    str(worker._num_obs),
                    '%.1f' % (time.time() - start_time)
                ]
                csv_row += [result.all_scores[n] for n in result.all_names]

                csv_handler.writerow(csv_row)

                del worker
                del result
Example #21
async def run(args):

    # Set up workers on the local machine
    async with LocalCUDACluster(
            protocol=args.protocol,
            n_workers=len(args.devs.split(",")),
            CUDA_VISIBLE_DEVICES=args.devs,
            ucx_net_devices="auto",
            enable_infiniband=True,
            enable_nvlink=True,
            asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:

            def _worker_setup(size=None):
                import rmm

                rmm.reinitialize(
                    pool_allocator=not args.no_rmm_pool,
                    devices=0,
                    initial_pool_size=size,
                )
                cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

            await client.run(_worker_setup)
            # Create an RMM pool on the scheduler due to occasional deserialization
            # of CUDA objects. May cause issues with InfiniBand otherwise.
            await client.run_on_scheduler(_worker_setup, 1e9)

            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((args.size, args.size),
                          chunks=args.chunk_size).persist()
            await wait(x)

            # Execute the operations to benchmark
            if args.profile is not None:
                async with performance_report(filename=args.profile):
                    t1 = clock()
                    await client.compute((x + x.T).sum())
                    took = clock() - t1
            else:
                t1 = clock()
                await client.compute((x + x.T).sum())
                took = clock() - t1

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log)
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])
            bandwidths = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): [
                    "%s/s" % format_bytes(x)
                    for x in np.quantile(v, [0.25, 0.50, 0.75])
                ]
                for (w1, w2), v in bandwidths.items()
            }
            total_nbytes = {
                (
                    cluster.scheduler.workers[w1].name,
                    cluster.scheduler.workers[w2].name,
                ): format_bytes(sum(nb))
                for (w1, w2), nb in total_nbytes.items()
            }

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size        | {args.size}*{args.size}")
            print(f"Chunk-size  | {args.chunk_size}")
            print(f"Ignore-size | {format_bytes(args.ignore_size)}")
            print(f"Protocol    | {args.protocol}")
            print(f"Device(s)   | {args.devs}")
            print(f"npartitions | {x.npartitions}")
            print("==========================")
            print(f"Total time  | {format_time(took)}")
            print("==========================")
            print("(w1,w2)     | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            for (d1, d2), bw in sorted(bandwidths.items()):
                print("(%02d,%02d)     | %s %s %s (%s)" %
                      (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
Example #22
def _predict(ms, stack, **kw):
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)
    pyscilog.log_to_file(args.output_filename + '.log')
    pyscilog.enable_memory_logging(level=3)

    # number of threads per worker
    if args.nthreads is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify nthreads when using a distributed scheduler"
            )
        import multiprocessing
        nthreads = multiprocessing.cpu_count()
        args.nthreads = nthreads
    else:
        nthreads = args.nthreads

    if args.mem_limit is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify mem-limit when using a distributed scheduler"
            )
        import psutil
        mem_limit = int(psutil.virtual_memory()[0] /
                        1e9)  # 100% of memory by default
        args.mem_limit = mem_limit
    else:
        mem_limit = args.mem_limit

    nband = args.nband
    if args.nworkers is None:
        nworkers = nband
        args.nworkers = nworkers
    else:
        nworkers = args.nworkers

    if args.nthreads_per_worker is None:
        nthreads_per_worker = 1
        args.nthreads_per_worker = nthreads_per_worker
    else:
        nthreads_per_worker = args.nthreads_per_worker

    # the number of chunks being read in simultaneously is equal to
    # the number of dask threads
    nthreads_dask = nworkers * nthreads_per_worker

    if args.ngridder_threads is None:
        if args.host_address is not None:
            ngridder_threads = nthreads // nthreads_per_worker
        else:
            ngridder_threads = nthreads // nthreads_dask
        args.ngridder_threads = ngridder_threads
    else:
        ngridder_threads = args.ngridder_threads

    ms = list(ms)
    print('Input Options:', file=log)
    for key in kw.keys():
        print('     %25s = %s' % (key, args[key]), file=log)

    # numpy imports have to happen after this step
    from pfb import set_client
    set_client(nthreads, mem_limit, nworkers, nthreads_per_worker,
               args.host_address, stack, log)

    import numpy as np
    from pfb.utils.misc import chan_to_band_mapping
    import dask
    from dask.distributed import performance_report
    from dask.graph_manipulation import clone
    from daskms import xds_from_storage_ms as xds_from_ms
    from daskms import xds_from_storage_table as xds_from_table
    from daskms.utils import dataset_type
    mstype = dataset_type(ms[0])
    if mstype == 'casa':
        from daskms import xds_to_table
    elif mstype == 'zarr':
        from daskms.experimental.zarr import xds_to_zarr as xds_to_table
    import dask.array as da
    from africanus.constants import c as lightspeed
    from africanus.gridding.wgridder.dask import model as im2vis
    from pfb.utils.fits import load_fits
    from pfb.utils.misc import restore_corrs, plan_row_chunk
    from astropy.io import fits

    # always returns 4D
    # gridder expects freq axis
    model = np.atleast_3d(load_fits(args.model).squeeze())
    nband, nx, ny = model.shape
    hdr = fits.getheader(args.model)
    cell_d = np.abs(hdr['CDELT1'])
    cell_rad = np.deg2rad(cell_d)

    # chan <-> band mapping
    freqs, freq_bin_idx, freq_bin_counts, freq_out, band_mapping, chan_chunks = chan_to_band_mapping(
        ms, nband=nband)

    # degridder memory budget
    max_chan_chunk = 0
    for ims in ms:
        for spw in freqs[ims]:
            counts = freq_bin_counts[ims][spw].compute()
            max_chan_chunk = np.maximum(max_chan_chunk, counts.max())

    # assumes the number of correlations is the same across MS/SPW
    xds = xds_from_ms(ms[0])
    ncorr = xds[0].dims['corr']
    nrow = xds[0].dims['row']
    if args.output_type is not None:
        output_type = np.dtype(args.output_type)
    else:
        output_type = np.result_type(np.dtype(args.real_type), np.complex64)
    data_bytes = output_type.itemsize
    bytes_per_row = max_chan_chunk * ncorr * data_bytes
    memory_per_row = bytes_per_row  # model
    memory_per_row += 3 * 8  # uvw

    if mstype == 'zarr':
        if args.model_column in xds[0].keys():
            model_chunks = getattr(xds[0], args.model_column).data.chunks
        else:
            model_chunks = xds[0].DATA.data.chunks
            print('Chunking model same as data')

    # get approx image size
    # this is not a conservative estimate when multiple SPWs map to a single
    # imaging band
    pixel_bytes = np.dtype(args.output_type).itemsize
    band_size = nx * ny * pixel_bytes

    if args.host_address is None:
        # full image on single node
        row_chunk = plan_row_chunk(mem_limit / nworkers, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)

    else:
        # single band per node
        row_chunk = plan_row_chunk(mem_limit, band_size, nrow, memory_per_row,
                                   nthreads_per_worker)

    if args.row_chunks is not None:
        row_chunk = int(args.row_chunks)
        if row_chunk == -1:
            row_chunk = nrow

    print(
        "nrows = %i, row chunks set to %i for a total of %i chunks per node" %
        (nrow, row_chunk, int(np.ceil(nrow / row_chunk))),
        file=log)

    chunks = {}
    for ims in ms:
        chunks[ims] = []  # xds_from_ms expects a list per ds
        for spw in freqs[ims]:
            chunks[ims].append({
                'row': row_chunk,
                'chan': chan_chunks[ims][spw]['chan']
            })

    model = da.from_array(model.astype(args.real_type),
                          chunks=(1, nx, ny),
                          name=False)
    writes = []
    radec = None  # assumes we are only imaging field 0 of first MS
    for ims in ms:
        xds = xds_from_ms(ims, chunks=chunks[ims], columns=('UVW',))

        # subtables
        ddids = xds_from_table(ims + "::DATA_DESCRIPTION")
        fields = xds_from_table(ims + "::FIELD")
        spws = xds_from_table(ims + "::SPECTRAL_WINDOW")
        pols = xds_from_table(ims + "::POLARIZATION")

        # subtable data
        ddids = dask.compute(ddids)[0]
        fields = dask.compute(fields)[0]
        spws = dask.compute(spws)[0]
        pols = dask.compute(pols)[0]

        out_data = []
        for ds in xds:
            field = fields[ds.FIELD_ID]
            # set the reference phase direction from the first field seen
            if radec is None:
                radec = field.PHASE_DIR.data.squeeze()

            # check fields match; skip datasets pointing elsewhere
            if not np.array_equal(radec, field.PHASE_DIR.data.squeeze()):
                continue

            spw = ds.DATA_DESC_ID  # this is not correct, need to use spw

            uvw = clone(ds.UVW.data)

            bands = band_mapping[ims][spw]
            # slice out only the bands for this SPW without overwriting the
            # full model, which later iterations still need
            model_b = model[list(bands), :, :]
            vis = im2vis(uvw,
                         freqs[ims][spw],
                         model_b,
                         freq_bin_idx[ims][spw],
                         freq_bin_counts[ims][spw],
                         cell_rad,
                         nthreads=ngridder_threads,
                         epsilon=args.epsilon,
                         do_wstacking=args.wstack)

            model_vis = restore_corrs(vis, ncorr)
            if mstype == 'zarr':
                model_vis = model_vis.rechunk(model_chunks)
                uvw = uvw.rechunk((model_chunks[0], 3))

            out_ds = ds.assign(
                **{
                    args.model_column: (("row", "chan", "corr"), model_vis),
                    'UVW': (("row", "three"), uvw)
                })
            # out_ds = ds.assign(**{args.model_column: (("row", "chan", "corr"), model_vis)})
            out_data.append(out_ds)

        writes.append(xds_to_table(out_data, ims, columns=[args.model_column]))

    dask.visualize(*writes,
                   filename=args.output_filename + '_predict_graph.pdf',
                   optimize_graph=False,
                   collapse_outputs=True)

    if not args.mock:
        with performance_report(filename=args.output_filename +
                                '_predict_per.html'):
            dask.compute(writes, optimize_graph=False)

    print("All done here.", file=log)
Example #23
def main():
    client = Client(n_workers=10, threads_per_worker=1)
    print(client)

    df = dask.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-31",
        # end="2000-12-31",
        partition_freq="1h",
        freq="60s",
    )
    df = df.persist()
    wait(df)
    iterations = 10

    with performance_report(filename=f"{today}-simple-scheduler.html"):
        simple = []
        # print('start simple: ', flush=True)
        for i in range(iterations):
            start = time.time()
            z = df.x + 1 + 2 - df.y
            z.sum().compute()
            stop = time.time()
            simple.append(stop - start)
        simple = np.array(simple)

    df2 = None
    with performance_report(filename=f"{today}-shuffle-scheduler.html"):
        shuffle_t = []
        # print('start shuffle: ', flush=True)
        for i in range(iterations):
            client.cancel(df2)
            start = time.time()
            # shuffle(df, "id", shuffle="tasks")
            df2 = df.set_index("id").persist()
            wait(df2)
            stop = time.time()
            shuffle_t.append(stop - start)
        shuffle_t = np.array(shuffle_t)

    with performance_report(filename=f"{today}-rand-access-scheduler.html"):
        rand_access = []
        for i in range(iterations):
            start = time.time()
            df2.head()
            stop = time.time()
            rand_access.append(stop - start)
        rand_access = np.array(rand_access)

    # set up the anomaly-mean workload (xarray groupby on a dask array)
    data = dsa.random.random((10000, 1000000), chunks=(1, 1000000))
    da = xr.DataArray(data,
                      dims=['time', 'x'],
                      coords={'day': ('time', np.arange(10000) % 100)})
    clim = da.groupby('day').mean(dim='time')
    anom = da.groupby('day') - clim
    anom_mean = anom.mean(dim='time')
    with performance_report(filename=f"{today}-anom-mean-scheduler.html"):
        anom_mean_t = []
        for i in range(iterations):
            start = time.time()
            anom_mean.compute()
            stop = time.time()
            anom_mean_t.append(stop - start)

        anom_mean_t = np.array(anom_mean_t)

    return dict(simple=simple,
                shuffle=shuffle_t,
                rand_access=rand_access,
                anom_mean=anom_mean_t)
Example #24
        mem_limit = str(input('Input Max Ram (GB) [Default=8]: \n') or '8')
        procs = str(input('Input Number of Processes [Default=1]: \n') or '1')
        threads = str(
            input('Input Threads per Process [Default=2]: \n') or '2')
        env = DaskEnv(mem_limit=mem_limit, nprocs=procs, nthreads=threads)
        print(env.client.dashboard_link)
    jobs = [ingestion.to_parquet]
    schema = 'paymaster/schema/perf_schema.json'
    schema_name = 'perf'
    ingest_dir = '/mnt/data/fnma-data/sf/perf/raw'
    files = os.listdir(ingest_dir)
    mods = [job.__module__.split('.')[-1] for job in jobs]
    names = [job.__name__ for job in jobs]
    # modnames = ['.'.join([mods[i], names[i]]) for i in range(len(mods))]
    print('{client} \n queuing {jobs} in {files}'.format(
        client=repr(env.client), jobs=jobs, files=files))
    ts = datetime.now().strftime('%Y%m%d%H%M%S')
    logfile = LOG_ROOT + '{}-{}.log'.format(__name__, ts)
    fn = ('dask-performance-report-{}-{}-{}.html'.format(
        __name__, schema_name, ts))
    report_path = REPORT_ROOT + fn
    with performance_report(report_path):
        try:
            for job in jobs:
                res = job(schema=schema, log_path=logfile)
        except Exception as e:
            env.shutdown()
            raise e
Example #25
async def _run(client, args):
    if args.type == "gpu":
        import cupy as xp
    else:
        import numpy as xp

    # Create a simple random array
    rs = da.random.RandomState(RandomState=xp.random.RandomState)

    if args.operation == "transpose_sum":
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        func_args = (x, )

        func = lambda x: (x + x.T).sum()
    elif args.operation == "dot":
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        y = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        await wait(y)

        func_args = (x, y)

        func = lambda x, y: x.dot(y)
    elif args.operation == "svd":
        x = rs.random(
            (args.size, args.second_size),
            chunks=(int(args.chunk_size), args.second_size),
        ).persist()
        await wait(x)

        func_args = (x, )

        func = lambda x: np.linalg.svd(x)
    elif args.operation == "fft":
        x = rs.random((args.size, args.size),
                      chunks=(args.size, args.chunk_size)).persist()
        await wait(x)

        func_args = (x, )

        func = lambda x: np.fft.fft(x, axis=0)

    shape = x.shape
    chunksize = x.chunksize

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            t1 = clock()
            await client.compute(func(*func_args))
            took = clock() - t1
    else:
        t1 = clock()
        res = client.compute(func(*func_args))
        await client.gather(res)
        if args.type == "gpu":
            await client.run(xp.cuda.Device().synchronize)
        took = clock() - t1

    return {
        "took": took,
        "npartitions": x.npartitions,
        "shape": shape,
        "chunksize": chunksize,
    }
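The if/elif dispatch above pairs each operation name with a lambda; a dict keeps names and operations together and fails loudly on unknown names (a sketch; building the operands is unchanged):

OPERATIONS = {
    "transpose_sum": lambda x: (x + x.T).sum(),
    "dot": lambda x, y: x.dot(y),
}

def get_operation(name):
    try:
        return OPERATIONS[name]
    except KeyError:
        raise ValueError(f"unknown operation: {name!r}")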
Example #26
    def gridsearch_wfv(self, params):
        # self.hyperparameters = hyperparameters
        # self.rmse_results = defaultdict(list) # replace this variable by creating a key-value in
        # the self.hyper_dict dictionary with value containing list of RMSE values
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters
        # if only one combination, set get_stats_ flag to True
        self.get_stats_ = (
            len(params[max(params, key=lambda x: len(params[x]))]) == 1)
        for params_comb_dict in (dict(zip(params.keys(), v))
                                 for v in product(*params.values())):
            # for self.hyper_dict in hyperparameters:
            # self.params_combs_list.append(params_comb_dict)
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call method that loops over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                # global s3_client
                s3_client = boto3.client("s3")
                response = s3_client.upload_file(output_csv,
                                                 "sales-demand-data", key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError as e:
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )

        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )
Example #27
                memory='4GB',
                disk='4GB',
                env_extra=env_extra,
            )

        if args.executor == 'dask/casa':
            client = Client("tls://localhost:8786")
            import shutil
            shutil.make_archive("workflows", "zip", base_dir="workflows")
            client.upload_file("workflows.zip")
        else:
            cluster.adapt(minimum=args.scaleout)
            client = Client(cluster)
            print("Waiting for at least one worker...")
            client.wait_for_workers(1)
        with performance_report(filename="dask-report.html"):
            output = processor.run_uproot_job(
                sample_dict,
                treename='Events',
                processor_instance=processor_instance,
                executor=processor.dask_executor,
                executor_args={
                    'client': client,
                    'skipbadfiles': args.skipbadfiles,
                    'schema': processor.NanoAODSchema,
                    'retries': 3,
                },
                chunksize=args.chunk,
                maxchunks=args.max)

    save(output, args.output)
Example #28
def _correct_errors(ra, err_rate, p_value=0.05):

    # True: use Dask's broadcast (ra transferred via inproc/tcp)
    # False: each worker reads ra.pickle from disk
    use_dask_broadcast = False

    log.debug(
        "Available CPU / RAM: {} / {} GB".format(
            _get_cpu_count(), int(_get_available_memory() / 1024 ** 3)
        ),
        module_name="rmt_correction",
    )

    n_workers = _calc_max_workers(ra)

    log.debug(
        "Estimated optimum n_workers: {}".format(n_workers),
        module_name="rmt_correction",
    )

    if int(os.environ.get("SEQC_MAX_WORKERS", 0)) > 0:
        n_workers = int(os.environ.get("SEQC_MAX_WORKERS"))
        log.debug(
            "n_workers overridden with SEQC_MAX_WORKERS: {}".format(n_workers),
            module_name="rmt_correction",
        )

    # n_workers = 1
    # p_value = 0.005

    # configure dask.distributed
    # memory_terminate_fraction doesn't work for some reason
    # https://github.com/dask/distributed/issues/3519
    # https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
    # https://docs.dask.org/en/latest/scheduling.html#local-threads
    worker_kwargs = {
        "n_workers": n_workers,
        "threads_per_worker": 1,
        "processes": True,
        "memory_limit": "64G",
        "memory_target_fraction": 0.95,
        "memory_spill_fraction": 0.99,
        "memory_pause_fraction": False,
        # "memory_terminate_fraction": False,
    }

    # do not kill worker at 95% memory level
    dask.config.set({"distributed.worker.memory.terminate": False})
    dask.config.set({"distributed.scheduler.allowed-failures": 50})

    # setup Dask distributed client
    cluster = LocalCluster(**worker_kwargs)
    client = Client(cluster)

    # debug message
    log.debug(
        "Dask processes={} threads={}".format(
            len(client.nthreads().values()), np.sum(list(client.nthreads().values()))
        ),
        module_name="rmt_correction",
    )
    log.debug(
        "Dask worker_kwargs "
        + " ".join([f"{k}={v}" for k, v in worker_kwargs.items()]),
        module_name="rmt_correction",
    )
    log.debug("Dask Dashboard=" + client.dashboard_link, module_name="rmt_correction")

    # group by cells (same cell barcodes as one group)
    log.debug("Grouping...", module_name="rmt_correction")
    indices_grouped_by_cells = ra.group_indices_by_cell()

    if use_dask_broadcast:
        # send readarray in advance to all workers (i.e. broadcast=True)
        # this way, we reduce the serialization time
        log.debug("Scattering ReadArray...", module_name="rmt_correction")
        [future_ra] = client.scatter([ra], broadcast=True)
    else:
        # write ra to a pickle that workers read from disk during parallel RMT correction
        with open("pre-correction-ra.pickle", "wb") as fout:
            pickle.dump(ra, fout, protocol=4)

    # correct errors per cell group in parallel
    log.debug("Submitting jobs to Dask...", module_name="rmt_correction")
    with performance_report(filename="dask-report.html"):
        futures = []

        # split the cell groups into ~n_workers evenly sized chunks
        # (partition_all takes the chunk *size*, not the number of chunks)
        chunk_size = math.ceil(len(indices_grouped_by_cells) / n_workers)
        chunks = partition_all(chunk_size, indices_grouped_by_cells)

        for chunk in tqdm(chunks, disable=None):

            future = client.submit(
                _correct_errors_by_cell_group_chunks,
                future_ra if use_dask_broadcast else None,
                chunk,
                err_rate,
                p_value,
            )
            futures.append(future)

        # wait until all done
        log.debug("Waiting untill all tasks complete...", module_name="rmt_correction")
        completed, not_completed = wait(futures)

    if len(not_completed) > 0:
        raise Exception("Some tasks did not complete!")

    # gather the results and release the futures
    log.debug(
        "Collecting the task results from the workers...", module_name="rmt_correction"
    )
    results = []
    for future in tqdm(completed, disable=None):
        # each future returns a list of lists;
        # len(result) should equal the number of cell groups in the chunk
        result = future.result()

        # remove empty lists
        result = list(filter(lambda x: len(x) > 0, result))

        # aggregate and release
        results.extend(result)
        future.release()

    # clean up
    del futures
    del completed
    del not_completed

    client.shutdown()
    client.close()

    # iterate through the returned read indices and donor RMTs and
    # build a mapping table of pre-/post-correction RMTs
    mapping = set()
    for result in results:
        for idx, idx_corrected_rmt in result:

            # record the pre-/post-correction triple,
            # skipping triples already present in the mapping
            if (
                ra.data["cell"][idx],
                ra.data["rmt"][idx],
                ra.data["rmt"][idx_corrected_rmt],
            ) in mapping:
                continue

            mapping.add(
                (
                    ra.data["cell"][idx],
                    ra.data["rmt"][idx],
                    ra.data["rmt"][idx_corrected_rmt],
                )
            )

    # iterate through the returned read indices and donor RMTs again,
    # this time updating the read array with the corrected UMIs
    for result in results:
        for idx, idx_corrected_rmt in result:

            # skip if it's already marked as rmt error
            if ra.data["status"][idx_corrected_rmt] & ra.filter_codes["rmt_error"]:
                continue

            # correct
            ra.data["rmt"][idx] = ra.data["rmt"][idx_corrected_rmt]

            # report error
            ra.data["status"][idx] |= ra.filter_codes["rmt_error"]

    return pd.DataFrame(mapping, columns=["CB", "UR", "UB"])
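
Stripped of the application logic, the Dask pattern above is scatter (optionally broadcast) -> submit one task per chunk -> wait -> gather. A minimal sketch with toy data, all names illustrative:

from dask.distributed import Client, LocalCluster, wait


def work(shared, chunk):
    # placeholder computation using the broadcast object
    return [shared["scale"] * v for v in chunk]


cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

# replicate one large object to every worker up front
[shared] = client.scatter([{"scale": 10}], broadcast=True)

futures = [client.submit(work, shared, chunk) for chunk in ([1, 2], [3, 4])]
done, not_done = wait(futures)  # block until all futures finish

results = [f.result() for f in done]
client.close()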
Ejemplo n.º 29
0
async def _run(client, args):
    if args.type == "gpu":
        import cupy as xp
    else:
        import numpy as xp

    # Create a simple random array
    rs = da.random.RandomState(RandomState=xp.random.RandomState)

    if args.operation == "transpose_sum":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: (x + x.T).sum()
    elif args.operation == "dot":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        y = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        await wait(y)
        end_range(rng)

        func_args = (x, y)

        func = lambda x, y: x.dot(y)
    elif args.operation == "svd":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random(
            (args.size, args.second_size),
            chunks=(int(args.chunk_size), args.second_size),
        ).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: np.linalg.svd(x)
    elif args.operation == "fft":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size),
                      chunks=(args.size, args.chunk_size)).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: np.fft.fft(x, axis=0)
    elif args.operation == "sum":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: x.sum()
    elif args.operation == "mean":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: x.mean()
    elif args.operation == "slice":
        rng = start_range(message="make array(s)", color="green")
        x = rs.random((args.size, args.size), chunks=args.chunk_size).persist()
        await wait(x)
        end_range(rng)

        func_args = (x, )

        func = lambda x: x[::3].copy()
    elif args.operation == "col_sum":
        rng = start_range(message="make array(s)", color="green")
        x = rs.normal(10, 1, (args.size, ), chunks=args.chunk_size).persist()
        y = rs.normal(10, 1, (args.size, ), chunks=args.chunk_size).persist()
        await wait(x)
        await wait(y)
        end_range(rng)

        func_args = (x, y)

        func = lambda x, y: x + y
    elif args.operation == "col_mask":
        rng = start_range(message="make array(s)", color="green")
        x = rs.normal(10, 1, (args.size, ), chunks=args.chunk_size).persist()
        y = rs.normal(10, 1, (args.size, ), chunks=args.chunk_size).persist()
        await wait(x)
        await wait(y)
        end_range(rng)

        func_args = (x, y)

        func = lambda x, y: x[y > 10]
    elif args.operation == "col_gather":
        rng = start_range(message="make array(s)", color="green")
        x = rs.normal(10, 1, (args.size, ), chunks=args.chunk_size).persist()
        idx = rs.randint(0,
                         len(x), (args.second_size, ),
                         chunks=args.chunk_size).persist()
        await wait(x)
        await wait(idx)
        end_range(rng)

        func_args = (x, idx)

        func = lambda x, idx: x[idx]
    else:
        # fail fast on an unknown operation; x would otherwise be undefined below
        raise ValueError(f"Unknown operation: {args.operation}")

    shape = x.shape
    chunksize = x.chunksize

    # Execute the operations to benchmark
    if args.profile is not None:
        async with performance_report(filename=args.profile):
            rng = start_range(message=args.operation, color="purple")
            t1 = clock()
            await wait(client.persist(func(*func_args)))
            if args.type == "gpu":
                await client.run(lambda xp: xp.cuda.Device().synchronize(), xp)
            took = clock() - t1
            end_range(rng)
    else:
        rng = start_range(message=args.operation, color="purple")
        t1 = clock()
        await wait(client.persist(func(*func_args)))
        if args.type == "gpu":
            await client.run(lambda xp: xp.cuda.Device().synchronize(), xp)
        took = clock() - t1
        end_range(rng)

    return {
        "took": took,
        "npartitions": x.npartitions,
        "shape": shape,
        "chunksize": chunksize,
    }
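
A coroutine like _run above needs an asynchronous client; a minimal driver sketch, assuming args has already been parsed elsewhere:

import asyncio

from dask.distributed import Client


async def main(args):
    # asynchronous=True makes Client usable as an async context manager
    async with Client(asynchronous=True, processes=False) as client:
        print(await _run(client, args))


asyncio.run(main(args))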
Ejemplo n.º 30
0
def run_trait_gwas(
    ds: Dataset,
    trait_group_id: int,
    trait_name: str,
    batch_index: int,
    min_samples: int,
    retries: int = 3,
) -> Optional[pd.DataFrame]:  # None when the sample size is below min_samples
    assert ds["sample_trait_group_id"].to_series().nunique() == 1
    assert ds["sample_trait_name"].to_series().nunique() == 1

    # Filter to complete cases
    start = time.perf_counter()
    n = ds.dims["samples"]
    ds = ds.isel(samples=ds["sample_trait"].notnull().all(dim="traits").values)
    stop = time.perf_counter()
    sample_size = ds.dims["samples"]
    logger.info(
        f"Found {sample_size} complete cases of {n} for '{trait_name}' (id={trait_group_id}) in {stop - start:.1f} seconds"
    )

    # Bypass if sample size too small
    if sample_size < min_samples:
        logger.warning(
            f"Sample size ({sample_size}) too small (<{min_samples}) for trait '{trait_name}' (id={trait_group_id})"
        )
        return None

    logger.info(
        f"Running GWAS for '{trait_name}' (id={trait_group_id}) with {sample_size} samples, {ds.dims['traits']} traits"
    )

    start = time.perf_counter()
    logger.debug(
        f"Input dataset for trait '{trait_name}' (id={trait_group_id}) GWAS:\n{ds}"
    )

    ds = sg.gwas_linear_regression(
        ds,
        dosage="call_dosage",
        covariates="sample_covariate",
        traits="sample_trait",
        add_intercept=True,
        merge=True,
    )
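    # merge=True returns the input dataset with the regression outputs
    # (betas, p-values) merged in, so they can be projected together below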

    # Project and convert to data frame for convenience
    # in downstream analysis/comparisons
    ds = ds[[
        "sample_trait_id",
        "sample_trait_name",
        "sample_trait_group_id",
        "sample_trait_code_id",
        "variant_id",
        "variant_contig",
        "variant_contig_name",
        "variant_position",
        "variant_p_value",
        "variant_beta",
    ]]

    if os.getenv("GENERATE_PERFORMANCE_REPORT", "").lower() == "true":
        with performance_report(
                f"logs/reports/pr_{trait_group_id}_{batch_index}.html"
        ), get_task_stream(
                plot="save",
                filename=f"logs/reports/ts_{trait_group_id}_{batch_index}.html"
        ):
            ds = ds.compute(retries=retries)
    else:
        ds = ds.compute(retries=retries)
    df = (ds.to_dataframe().reset_index().assign(
        sample_size=sample_size).rename(columns={
            "traits": "trait_index",
            "variants": "variant_index"
        }))
    stop = time.perf_counter()
    logger.info(
        f"GWAS for '{trait_name}' (id={trait_group_id}) complete in {stop - start:.1f} seconds"
    )
    return df
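
A hypothetical call, assuming ds has already been subset to a single trait group:

df = run_trait_gwas(
    ds,
    trait_group_id=50,             # illustrative trait group
    trait_name="standing height",  # illustrative trait name
    batch_index=0,
    min_samples=100,
)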