Example #1
def test_create_rapids_cluster_sync():
    skip_without_credentials()
    cluster = GCPCluster(
        source_image="projects/nv-ai-infra/global/images/packer-1607527229",
        network="dask-gcp-network-test",
        zone="us-east1-c",
        machine_type="n1-standard-1",
        filesystem_size=50,
        ngpus=2,
        gpu_type="nvidia-tesla-t4",
        docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.8",
        worker_class="dask_cuda.CUDAWorker",
        worker_options={"rmm_pool_size": "15GB"},
        asynchronous=False,
        bootstrap=False,
    )

    cluster.scale(1)

    client = Client(cluster)  # noqa
    client.wait_for_workers(2)

    def gpu_mem():
        from pynvml.smi import nvidia_smi

        nvsmi = nvidia_smi.getInstance()
        return nvsmi.DeviceQuery("memory.free, memory.total")

    results = client.run(gpu_mem)
    for w, res in results.items():
        assert "total" in res["gpu"][0]["fb_memory_usage"].keys()
        print(res)
    cluster.close()
Example #2
def setup(dask_scheduler_file=None, rmm_pool_size=None):
    if dask_scheduler_file:
        cluster = None
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=False,
            enable_rdmacm=False,
            #net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)

    else:
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name,
                                   rmm_pool_size=rmm_pool_size)
        client = Client(cluster)
        # add the obj to the client so it doesn't get deleted until
        # the 'client' obj gets cleaned up
        client.tempdir_object = tempdir_object
        client.wait_for_workers(len(get_visible_devices()))

    Comms.initialize(p2p=True)
    return (client, cluster)
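
The example above has no teardown counterpart; a minimal sketch of one, reusing the same Comms module, might look like this (the helper name and argument order are assumptions, not part of the original):

def teardown(client, cluster=None):
    """Hypothetical helper that tears down what setup() created."""
    Comms.destroy()           # release the resources created by Comms.initialize()
    client.close()
    if cluster is not None:   # only the LocalCUDACluster branch returns a cluster object
        cluster.close()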
Example #3
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda
    Parameters:
    -----------
    number_of_devices : int
        Number of devices to use, verification must be done prior to call to
        ensure that there are enough devices available. If not specified, the
        cluster will be initialized to use all visible devices.
    rmm_managed_memory : bool
        True to enable managed memory (UVM) in RMM as part of the
        cluster. Default is False.
    p2p : bool
        Initialize UCX endpoints if True. Default is False.
    """
    def __init__(self,
                 number_of_devices=None,
                 rmm_managed_memory=False,
                 p2p=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._client = None
        self._p2p = p2p
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    @property
    def client(self):
        return self._client

    @property
    def cluster(self):
        return self._cluster

    def __enter__(self):
        self._prepare_mg()
        return self

    def _prepare_mg(self):
        self._prepare_client()
        self._prepare_comms()

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize(p2p=self._p2p)

    def _close(self):
        Comms.destroy()
        if self._client is not None:
            self._client.close()
        if self._cluster is not None:
            self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
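
A usage sketch for this context manager; the device count and the toy computation below are illustrative assumptions, not part of the original example:

# Hypothetical usage of MGContext; assumes at least two visible GPUs.
with MGContext(number_of_devices=2, rmm_managed_memory=False, p2p=True) as mg:
    futures = mg.client.map(lambda x: x + 1, range(10))
    print(mg.client.gather(futures))
# Comms, the client and the cluster are torn down automatically on exit.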
Example #4
def fit(
    dset: Path,
    output_file: Path,
    *,
    model_type: ModelType = ModelType.linear,
    n_iter: int = 50,
    n_workers: int = 1,
    cores_per_worker: int = 4,
    dask_folder: Path = Path.cwd() / "dask",
    mem_per_worker: str = "2GB",
    walltime: str = "0-00:30",
    use_slurm: bool = False,
) -> BaseEstimator:
    """Fit a model

    :param dset: CAS dataset
    :param output_file: output .pickle file
    :param model_type: type of model to use
    :param n_iter: budget for hyper-parameter optimization
    :param n_workers: number of workers to use (maximum number for the Slurm backend)
    :param cores_per_worker: number of cores per worker
    :param dask_folder: folder to keep workers' temporary data
    :param mem_per_worker: maximum RAM per worker, only for the Slurm backend
    :param walltime: maximum walltime for workers, only for the Slurm backend
    :param use_slurm: use Slurm backend for the Dask cluster
    :returns: fitted model
    """
    dset = pd.read_csv(dset)
    X = dset[dset.fold == "train"].drop(columns="fold")
    y = X.pop("injuryCrash")

    # find function to fit the model in the global namespace
    model_func = globals()["fit_" + model_type.name]

    # start a Dask cluster, local by default, use a configuration file for Slurm
    if use_slurm:
        client = slurm_cluster(
            n_workers=n_workers,
            cores_per_worker=cores_per_worker,
            mem_per_worker=mem_per_worker,
            walltime=walltime,
            dask_folder=dask_folder,
        )
    else:
        client = Client(
            n_workers=n_workers,
            threads_per_worker=cores_per_worker,
            local_directory=dask_folder,
        )

    client.wait_for_workers(1)
    model = model_func(X, y, n_iter=n_iter)

    with output_file.open("wb") as fd:
        pickle.dump(model, fd)

    return model
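
A call sketch for a local (non-Slurm) run; the file names below are placeholders, and the fit_linear helper looked up via globals() is assumed to exist elsewhere in the module:

# Hypothetical invocation of fit(); paths and parameter values are illustrative only.
model = fit(
    Path("cas_dataset.csv"),
    Path("model.pickle"),
    model_type=ModelType.linear,
    n_iter=10,
    n_workers=2,
    cores_per_worker=2,
)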
Example #5
def setup_local_dask_cluster(p2p=True):
    """
    Performs steps to setup a Dask cluster using LocalCUDACluster and returns
    the LocalCUDACluster and corresponding client instance.
    """
    cluster = LocalCUDACluster()
    client = Client(cluster)
    client.wait_for_workers(len(get_visible_devices()))
    Comms.initialize(p2p=p2p)

    return (cluster, client)
Example #6
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    Parameters:
    -----------

    number_of_devices : int
        Number of devices to use, verification must be done prior to call
        to ensure that there are enough devices available.
    """
    def __init__(self, number_of_devices=None, rmm_managed_memory=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._cluster = None
        self._client = None

    @property
    def client(self):
        return self._client

    @property
    def cluster(self):
        return self._cluster

    def __enter__(self):
        self._prepare_mg()
        return self

    def _prepare_mg(self):
        self._prepare_cluster()
        self._prepare_client()
        self._prepare_comms()

    def _prepare_cluster(self):
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize()

    def _close(self):
        Comms.destroy()
        if self._client is not None:
            self._client.close()
        if self._cluster is not None:
            self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
Example #7
def start_client(Scheduler_file=None,
                 local_directory=None,
                 ncpu=None,
                 n_workers=1,
                 threads_per_worker=None,
                 worker_kwargs=worker_kwargs,
                 LocalCluster_kwargs={},
                 dashboard_address=8801,
                 memory_limit='120gb',
                 processes=False):
    """
        Start a dask client. If no schduler is passed, a new local cluster is started.
    """
    LC = None
    if local_directory is None:
        local_directory = './temp_skylens/'
    local_directory += 'pid' + str(os.getpid()) + '/'
    try:
        os.makedirs(local_directory)
    except Exception as error:
        print('error in creating local directory: ', local_directory, error)
    if threads_per_worker is None:
        if ncpu is None:
            ncpu = multiprocessing.cpu_count() - 1
        threads_per_worker = ncpu
    if n_workers is None:
        n_workers = 1
    if Scheduler_file is None:
        print('Start_client: No scheduler file, will start local cluster at ',
              local_directory)
        #     dask_initialize(nthreads=27,local_directory=dask_dir)
        #     client = Client()
        #         dask.config.set(scheduler='threads')
        LC = LocalCluster(n_workers=n_workers,
                          processes=processes,
                          threads_per_worker=threads_per_worker,
                          local_directory=local_directory,
                          dashboard_address=dashboard_address,
                          memory_limit=memory_limit,
                          **LocalCluster_kwargs,
                          **worker_kwargs)
        client = Client(LC)
    else:
        print('Start_client: Using scheduler file', Scheduler_file)
        client = Client(scheduler_file=Scheduler_file, processes=False)
    client.wait_for_workers(n_workers=1)
    scheduler_info = client.scheduler_info()
    scheduler_info['file'] = Scheduler_file
    return LC, scheduler_info  # client can be obtained from client_get
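
The function returns the cluster and scheduler info rather than the client; a hedged sketch of retrieving the client afterwards via distributed's standard Client.current() (the argument values below are illustrative):

# Hypothetical follow-up: the client created inside start_client() is registered as
# the default client, so it can be retrieved with Client.current().
LC, scheduler_info = start_client(n_workers=2, threads_per_worker=2)
client = Client.current()
print(scheduler_info['file'], len(scheduler_info['workers']))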
Example #8
def Start_Client(gpu_name):

    hostname = socket.gethostname()
    n_workers = 1
    n_cores = 1

    wks2 = "wn-wks2.fe.hhi.de"
    gpu1 = "wn-gpu1.fe.hhi.de"
    gpu2 = "wn-gpu-104-01.fe.hhi.de"

    if hostname == wks2:
        path = "/data/cluster/projects/infineon-radar/daq_x-har/3_Walking_converted/recording-2020-01-28_11-31-55"
        mem = "20G"  # Allocated memory is critical. For this example it must be at least 16GB
        q = "wn-37.q"  # Check current queue status on https://hpc-management.fe.hhi.de/wn/phpqstat/

        cluster = SGECluster(
            n_workers=n_workers,
            cores=n_cores,
            memory=mem,
            resource_spec=f"h_vmem={mem}",
            host=hostname,
            queue=q,
            job_extra=[
                "-v MKL_NUM_THREADS=1,NUMEXPR_NUM_THREADS=1,OMP_NUM_THREADS=1"
            ])
    elif hostname in (gpu1, gpu2):
        os.environ[
            "CUDA_VISIBLE_DEVICES"] = gpu_name  # Check current status with nvidia-smi and pick GPU from 0-3
        cluster = LocalCluster(n_workers=n_workers,
                               threads_per_worker=n_cores,
                               host=hostname)
    else:
        raise ValueError(
            f"{hostname} is not a supported host. Please run this example on {wks2}, {gpu1} or {gpu2}."
        )

    client = Client(cluster)
    client.wait_for_workers(n_workers=n_workers)
    print(client)

    return client
Example #9
def dask_client():
    dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
    cluster = None
    client = None
    tempdir_object = None

    if dask_scheduler_file:
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=True,
            enable_rdmacm=True,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)
        print("\ndask_client fixture: client created using "
              f"{dask_scheduler_file}")
    else:
        # The tempdir created by tempdir_object should be cleaned up once
        # tempdir_object goes out-of-scope and is deleted.
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name)
        client = Client(cluster)
        client.wait_for_workers(len(get_visible_devices()))
        print("\ndask_client fixture: client created using LocalCUDACluster")

    Comms.initialize(p2p=True)

    yield client

    Comms.destroy()
    # Shut down the connected scheduler and workers so that MNMG runs
    # no longer rely on killing the dask cluster ID
    client.shutdown()
    if cluster:
        cluster.close()
    print("\ndask_client fixture: client.close() called")
Example #10
def start_dask_lsfcluster(cluster_size=5):
    """Start a dask cluster."""
    if cluster_size < 4:
        raise Exception('Too small of a cluster')
    # Settings for Sanger farm
    memory_in_gb = 20
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory='{}/dask_logs'.format(os.getcwd()),
        cores=4,
        memory='{} Gb'.format(memory_in_gb),
        mem=memory_in_gb * 1e+9,  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152', '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                int(memory_in_gb * 1e+3), int(memory_in_gb * 1e+3))
        ],
        use_stdin=True)

    # View the job submission from Dask
    # cluster.job_script()

    # Scale cluster
    cluster.scale(cluster_size)

    # auto-scale between 10 and 100 jobs
    # cluster.adapt(
    #     minimum_jobs=int(cluster_size/4),
    #     maximum_jobs=cluster_size
    # )
    # cluster.adapt(maximum_memory="10 TB")  # use core/memory limits

    client = Client(cluster, timeout=120)
    client.wait_for_workers(n_workers=cluster_size)
    # print(client.scheduler_info()['services'])

    return cluster, client
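
A call-and-cleanup sketch; the queue, group and resource settings in the function above are specific to the Sanger farm, and the cleanup order below is an assumption:

# Hypothetical usage of start_dask_lsfcluster(); shut the client down before the cluster.
cluster, client = start_dask_lsfcluster(cluster_size=5)
try:
    print(len(client.scheduler_info()["workers"]), "workers ready")
finally:
    client.close()
    cluster.close()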
Example #11
import numpy as np
import dask
import dask.dataframe as dd
from dask.distributed import wait
import warnings
import s3fs
from dask_saturn import SaturnCluster
from dask.distributed import Client

cluster = SaturnCluster(
    scheduler_size='medium',
    worker_size='xlarge',
    n_workers=5,
    nthreads=4,
)
client = Client(cluster)
client.wait_for_workers(3)

s3 = s3fs.S3FileSystem(anon=True)

files_2019 = 's3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv'
taxi = dd.read_csv(
    files_2019,
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options={'anon': True},
    assume_missing=True,
)

# specify feature and label column names
raw_features = [
    'tpep_pickup_datetime', 
    'passenger_count', 
Example #12
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {(scheduler_workers[w1].name, scheduler_workers[w2].name): [
        "%s/s" % format_bytes(x)
        for x in numpy.quantile(v, [0.25, 0.50, 0.75])
    ]
                  for (w1, w2), v in bandwidths.items()}
    total_nbytes = {(
        scheduler_workers[w1].name,
        scheduler_workers[w2].name,
    ): format_bytes(sum(nb))
                    for (w1, w2), nb in total_nbytes.items()}

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Shuffle benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"partition-size | {format_bytes(args.partition_size)}")
    print(f"in-parts       | {args.in_parts}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    if args.device_memory_limit:
        print(f"memory-limit   | {format_bytes(args.device_memory_limit)}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)        | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)        | %s %s %s (%s)" if args.multi_node or
                   args.sched_addr else "(%02d,%02d)        | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.benchmark_json:
        bandwidths_json = {
            f"bandwidth_({d1},{d2})_{i}" if args.multi_node or args.sched_addr
            else "(%02d,%02d)_%s" % (d1, d2, i): parse_bytes(v.rstrip("/s"))
            for (d1, d2), bw in sorted(bandwidths.items()) for i, v in zip(
                ["25%", "50%", "75%", "total_nbytes"],
                [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
            )
        }

        with open(args.benchmark_json, "a") as fp:
            for data_processed, took in took_list:
                fp.write(
                    dumps(
                        dict(
                            {
                                "backend": args.backend,
                                "partition_size": args.partition_size,
                                "in_parts": args.in_parts,
                                "protocol": args.protocol,
                                "devs": args.devs,
                                "device_memory_limit":
                                args.device_memory_limit,
                                "rmm_pool": not args.disable_rmm_pool,
                                "tcp": args.enable_tcp_over_ucx,
                                "ib": args.enable_infiniband,
                                "nvlink": args.enable_nvlink,
                                "data_processed": data_processed,
                                "wall_clock": took,
                                "throughput": data_processed / took,
                            },
                            **bandwidths_json,
                        )) + "\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #13
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """
    persist call: data = self.client.persist(data)
    (assignment replaces old lazy array, as persist does not change the
    input in-place)

    To reduce the risk of hitting memory limits,
    consider restarting each worker process before running any data loading or training code.
    self.client.restart()
        - This function will restart each of the worker processes, clearing out anything
        they’re holding in memory. This function does NOT restart the actual machines of
        your cluster, so it runs very quickly.
        - should the workers just be killed regardless of whether the whole process
        was successful or unsuccessful (sort of a clean up action)? can restarting
        be that cleanup action?

    loop over hyperparameter values (method that accepts hyperparameters as a dictionary -
        initializes self.model = DaskLGBMRegressor() with each set of parameters and
        calls the method that loops over train-validation sets)
    loop over train-validation sets
    run model's fit method and compute predicted values and RMSE
    """
    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")
        # self.pca_transformed = ___ # call PCA code that returns numpy array here
        # (rename self.pca_transformed to self.full_dataset)
        # numpy array can also be created from the saved (pickle) file

        # for data:
        # instead of first looping over hyperparameter values and then over different
        # train-validation sets, is it better to do it in the opposite order
        # to allow for one set of train-validation data to be created only once?

        try:
            # this commented out code did not work without the meta= argument,
            # meta= was not tried as it needs all other columns listed, in
            # addition to the ones being recast
            # self.full_dataset = self.client.persist(
            #     dd.read_parquet(
            #         s3_path, index=False, engine="pyarrow"
            #     )
            #     .sample(frac=self.frac, random_state=42)
            #     .map_partitions(
            #         self.cast_types,
            #         meta={
            #             'sid_shop_item_qty_sold_day': 'i2',
            #             **{f'cat{n}': 'i2' for n in range(1,23)}
            #         }
            #     )
            #     .map_partitions(self.drop_neg_qty_sold)
            #     .set_index(
            #         "sale_date", sorted=False, npartitions="auto"
            #     )
            #     .repartition(partition_size="100MB")
            # )

            # create Dask dataframe from partitioned Parquet dataset on S3 and persist it to cluster
            self.full_dataset = dd.read_parquet(s3_path,
                                                index=False,
                                                engine="pyarrow").sample(
                                                    frac=self.frac,
                                                    random_state=42)
            self.full_dataset["sale_date"] = self.full_dataset[
                "sale_date"].astype("datetime64[ns]")
            self.full_dataset[
                "sid_shop_item_qty_sold_day"] = self.full_dataset[
                    "sid_shop_item_qty_sold_day"].astype("int16")
            for col in self.full_dataset:
                if col.startswith("cat"):
                    self.full_dataset[col] = self.full_dataset[col].astype(
                        "int16")

            logging.debug(
                f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
            )
            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]
            # call dataframe.set_index(), then repartition, then persist
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            # set_index(sorted=False, npartitions='auto')
            # df = df.repartition(npartitions=df.npartitions // 100)

            # self.full_dataset = self.client.persist(self.full_dataset)
            # _ = wait([self.full_dataset])

            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.repartition.html
            # self.full_dataset = self.full_dataset.repartition(partition_size="100MB")
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )
            # partition_size for set_index: int, optional, desired size of
            # each partition in bytes (to be used with npartitions='auto')

            self.full_dataset = self.cull_empty_partitions(self.full_dataset)

            self.full_dataset = self.client.persist(self.full_dataset)
            _ = wait([self.full_dataset])
            logging.debug(
                f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
            )
            logging.debug(
                f"Earliest and latest dates in full dataframe are : {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
            )
            logging.debug(
                f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
            )

        except Exception:
            logging.exception(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # finally:
        #     self.client.restart()
        #     sys.exit(1)

        # https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # Parquet datasets can be saved into separate files.
        # Each file may contain separate row groups.
        # Dask Dataframe reads each Parquet row group into a separate partition.

        # I DON'T WANT TO KEEP THE NUMPY ARRAY IN MEMORY, SO IT NEEDS TO BE
        # DELETED AFTER DASK ARRAY IS CREATED
        # MIGHT BE BETTER TO CREATE DASK ARRAY FROM FILE ON S3, TO AVOID
        # HAVING BOTH NUMPY ARRAY AND PERSISTED DASK ARRAY IN MEMORY
        # I ALSO WANT TO SPLIT THAT NUMPY ARRAY INTO MULTIPLE TRAIN AND VALIDATION
        # SETS, SO WHAT'S THE BEST WAY TO DO THAT?
        # SEND THE ENTIRE ARRAY TO THE CLUSTER AT ONCE - PROBABLY NOT, OR
        # SEND TRAIN AND VALIDATION SETS ONE BY ONE AND DELETE?
        # BUT THAT WILL REQUIRE SENDING DATA TO THE CLUSTER MULTIPLE TIMES -
        # NOT IF THE DATA BEING SENT ARE DIFFERENT EACH TIME
        # THEY ARE NOT GOING TO BE COMPLETELY DIFFERENT BECAUSE TRAIN DATA WILL
        # JUST CONTINUE TO MERGE WITH VALIDATION SETS AND GROW
        # CREATE FIRST DASK ARRAY AND SEND TO CLUSTER, THEN APPEND TO IT?
        # IT DOES NOT LOOK LIKE DASK WOULD ALLOW THAT (SEE
        # https://github.com/dask/distributed/issues/1676 -
        # "You should also be aware that the task/data model underlying dask
        # arrays is immutable. You should never try to modify memory in-place.")
        # SO PROBABLY SEND ALL OF THE DATA TO THE CLUSTER AT THE BEGINNING,
        # THEN TAKE CHUNKS OF IT FOR WALK-FORWARD VALIDATION

        # PROBABLY SHOULD RELY ON LOADING DATA FROM FILE USING DELAYED /
        # FROM_DELAYED
        # SEE https://stackoverflow.com/questions/45941528/how-to-efficiently-send-a-large-numpy-array-to-the-cluster-with-dask-array)

        # can I use a function to read multiple files into one Dask array?

        # either figure out how to read multiple files (saved on S3) into one
        # Dask array, or
        # figure out how to save one array of PCA results to S3 (need disk space
        # to save it locally before transfer to S3 and need a method that can
        # handle transfer of more than 5GB - multipart transfer to S3)

        # try to write PCA-transformed data directly to zarr array (stored in memory)
        # then upload it to S3 (directly from memory)
        # then create dask array from that zarr array in S3

        # try to write PCA-transformed data to xarray then upload it to S3 as zarr

        # save numpy array to parquet file, upload that file to S3 (using upload_file),
        # then read that file into a Dask dataframe
        # write data to parquet on S3 from pandas dataframe and append to it using awswrangler library?
        # (https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/004%20-%20Parquet%20Datasets.ipynb)
        # df = dd.read_parquet('s3://bucket/my-parquet-data')
        # (https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet)
        # from above link:
        # engine argument: If ‘pyarrow’ or ‘pyarrow-dataset’ is specified, the ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used.
        # read partitioned parquet dataset with Dask:
        # https://stackoverflow.com/questions/67222212/read-partitioned-parquet-dataset-written-by-spark-using-dask-and-pyarrow-dataset

    # def cast_types(self, df):
    #     df = df.copy()
    #     df['sale_date'] = df["sale_date"].astype(
    #         "datetime64[ns]"
    #     )
    #     for col in df:
    #         if col.startswith("cat") or (col == "sid_shop_item_qty_sold_day"):
    #             df[col] = df[col].astype("int16")
    #     return df
    #
    # def drop_neg_qty_sold(self, df):
    #     return df[df.sid_shop_item_qty_sold_day >= 0].copy()

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        ll = list(ddf.map_partitions(len).compute())
        ddf_delayed = ddf.to_delayed()
        ddf_delayed_new = list()
        pempty = None
        for ix, n in enumerate(ll):
            if 0 == n:
                pempty = ddf.get_partition(ix)
            else:
                ddf_delayed_new.append(ddf_delayed[ix])
        if pempty is not None:
            ddf = dd.from_delayed(ddf_delayed_new, meta=pempty)
        return ddf

    def gridsearch_wfv(self, params):
        # self.hyperparameters = hyperparameters
        # self.rmse_results = defaultdict(list) # replace this variable by creating a key-value in
        # the self.hyper_dict dictionary with value containing list of RMSE values
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters
        # if only one combination, set get_stats_ flag to True
        self.get_stats_ = (len(params[max(params,
                                          key=lambda x: len(params[x]))]) == 1)
        for params_comb_dict in (dict(
                zip(params.keys(),
                    v)) for v in list(product(*list(params.values())))):
            # for self.hyper_dict in hyperparameters:
            # self.params_combs_list.append(params_comb_dict)
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call method that loops over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                # global s3_client
                s3_client = boto3.client("s3")
                response = s3_client.upload_file(output_csv,
                                                 "sales-demand-data", key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError as e:
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )

        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )

            # probably do the opposite:
            # loop over train-validation splits (persisting that data in memory)
            # and run different models on one
            # split, saving the results that can later be aggregated

            # is it possible to read the full range of dates needed for time
            # series validation and then drop/delete rows from array or
            # move some rows to another array:
            # start with July-September (train) + October (validation),
            # then remove October and move September from train to validation

    # def time_split(self):
    #     return (
    #         self.full_dataset.loc[:self.end_date],
    #         self.full_dataset.loc[self.end_date + timedelta(days=1):self.end_date + relativedelta(months=self.n_months_in_val_set, day=31)]
    #         # self.full_dataset[date > self.end_date & date <= self.end_date + relativedelta(months=n_months_in_val_set, day=31)]
    #         # less than or equal to last day of month currently used for validation
    #     )

    def train_test_time_split(self):
        # first (earliest) month: July 2015
        # number of months in first train set: 1
        # number of months in validation set: 2
        #
        # number of months between Oct 2015 and July 2015: 3
        # 3 - (2 - 1) = 2 (two 2-month intervals inside a 3-month interval)
        # (where 2 is the number of months in validation set)

        # (3 - n_months_in_first_train_set + 1) - (2 - 1)
        n_val_sets = (
            month_counter(
                self.startmonth)  # self.startmonth is e.g. July 1, 2015
            - self.n_months_in_first_train_set +
            1) - (self.n_months_in_val_set - 1)

        for m in range(n_val_sets):
            end_date = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            if self.get_stats_:
                get_stats = m == n_val_sets - 1
            else:
                get_stats = False
            yield (self.full_dataset.loc[:end_date], self.full_dataset.
                   loc[end_date + timedelta(days=1):end_date +
                       relativedelta(months=self.n_months_in_val_set, day=31)],
                   get_stats)
            # self.train, self.test = self.time_split(self.full_dataset, self.end_date)

    def get_sample_weights(self, train):
        weights_arr = train["sid_shop_item_qty_sold_day"].to_dask_array(
            lengths=True).astype('float32')
        weights_arr = da.where(weights_arr == 0,
                               self.params_comb_dict['weight_for_zeros'], 1.)
        return weights_arr

    def fit(self, train):
        try:
            start_time = time.perf_counter()
            logging.debug(
                f"train X dtypes are {train[[col for col in train if col.startswith(('pc','cat'))]].dtypes}"
            )
            logging.debug(
                f"train y type is {train['sid_shop_item_qty_sold_day'].dtype}")
            self.model.fit(
                train[[col for col in train if col.startswith(("pc", "cat"))
                       ]].to_dask_array(lengths=True),
                train["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True),
                sample_weight=self.get_sample_weights(train),
                feature_name=[
                    col for col in train if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in train if col.startswith("cat")
                ],
            )
            assert self.model.fitted_
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)

            return self

        except Exception:
            logging.exception(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def predict(self, test):
        try:
            self.y_pred = self.model.predict(
                test[[col for col in test if col.startswith(("pc", "cat"))]])
            return self
        except Exception:
            logging.exception(
                "Exception occurred while computing predicted values on the test data."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def rmse_all_folds(self, test, get_stats):
        try:
            # logging.debug(f"Data type of test['sid_shop_item_qty_sold_day'] is: {type(test['sid_shop_item_qty_sold_day'])}")
            # logging.debug(f"Data type of self.y_pred is: {type(self.y_pred)}")
            # logging.debug(f"Shape of test['sid_shop_item_qty_sold_day'] is: {test['sid_shop_item_qty_sold_day'].compute().shape}")
            # logging.debug(f"Shape of self.y_pred is: {self.y_pred.compute().shape}")
            self.params_comb_dict["rmse_list_"].append(
                calc_rmse(
                    test["sid_shop_item_qty_sold_day"].to_dask_array(
                        lengths=True),
                    self.y_pred.compute_chunk_sizes(),
                    get_stats,
                ))
            # self.rmse_results[json.dumps(self.hyper_dict)].append(calc_rmse(test[["sid_shop_item_qty_sold_day"]], self.y_pred))

            self.params_comb_dict["monthly_rmse_list_"].append(
                calc_monthly_rmse(
                    test[["shop_id", "item_id", "sid_shop_item_qty_sold_day"]],
                    self.y_pred,
                ))

        except Exception:
            logging.exception(
                "Exception occurred while computing RMSE on the test data.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def refit_and_save(self, model_path):
        """
        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )
            self.best_model.fit(
                self.full_dataset[[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ]].to_dask_array(lengths=True),
                self.full_dataset["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True, ),
                sample_weight=self.get_sample_weights(self.full_dataset),
                feature_name=[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in self.full_dataset if col.startswith("cat")
                ],
            )
            output_txt = str(model_path).split("/")[-1]
            booster = self.best_model.booster_.save_model(output_txt)

            # output_txt = str(model_path).split('/')[-1]
            # global s3_client
            s3_client = boto3.client("s3")
            response = s3_client.upload_file(output_txt, "sales-demand-data",
                                             output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")

        except (Exception, ClientError):
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
Example #14
def run_benchmark(args):
    logging.info('run_benchmark: BEGIN')

    client = Client(address=args.scheduler_address)
    logging.info('client=%s' % str(client))

    logging.info('Waiting for %d Dask workers' % args.num_workers)
    client.wait_for_workers(args.num_workers)

    if args.single_batch:
        # This branch is obsolete.
        input_files = [
            f for p in args.input_file for f in sorted(glob.glob(p))
        ]
        logging.info('len(input_files)=%d' % len(input_files))
        logging.debug('input_files=%s' % str(input_files))

        input_file_sizes = [os.path.getsize(f) for f in input_files]

        perf_ddf = gpu_load_performance_data(input_files,
                                             engine=args.cudf_engine)
        logging.debug('perf_ddf=%s' % str(perf_ddf.head()))

        t0 = time.time()
        perf_ddf = perf_ddf.persist()
        wait(perf_ddf)
        t1 = time.time()
        persist_sec = t1 - t0
        logging.info('persist_sec=%f' % persist_sec)

        logging.info('perf_ddf=%s' % str(perf_ddf))

        compute_sec_list = []

        for i in range(3):
            t0 = time.time()
            computed = perf_ddf.groupby(['servicer'
                                         ])['interest_rate'].max().compute()
            t1 = time.time()
            compute_sec = t1 - t0
            compute_sec_list += [compute_sec]
            logging.info('compute_sec=%f' % compute_sec)
            logging.info('len(computed)=%s' % len(computed))
            logging.debug('computed=%s' % str(computed))

        logging.info('compute_sec_list=%s' % str(compute_sec_list))
        logging.info('len(perf_ddf)=%s' % len(perf_ddf))

        checksum = int(perf_ddf['loan_id'].sum().compute())

        results = dict(
            checksum=checksum,
            compute_sec_list=compute_sec_list,
            num_input_files=len(input_files),
            input_file_sizes=input_file_sizes,
            persist_sec=persist_sec,
            dask_cudf_version=dask_cudf.__version__,
        )
    else:
        logging.info('Getting input file list')
        glob_t0 = time.time()
        input_files = [sorted(glob.glob(p)) for p in args.input_file]
        glob_sec = time.time() - glob_t0
        for f, spec in zip(input_files, args.input_file):
            assert len(f) > 0, 'No files match %s' % spec

        logging.info('Getting file sizes')
        getsize_t0 = time.time()
        input_file_sizes = [[os.path.getsize(f) for f in batch_files]
                            for batch_files in input_files]
        getsize_sec = time.time() - getsize_t0

        logging.info('Creating distributed data frames')
        create_ddf_t0 = time.time()
        perf_ddfs = [
            gpu_load_performance_data(batch_files, engine=args.cudf_engine)
            for batch_files in input_files
        ]
        create_ddf_sec = time.time() - create_ddf_t0

        compute_sec_list = []
        for batch, perf_ddf in enumerate(perf_ddfs):
            logging.info('Computing batch %d' % batch)
            compute_t0 = time.time()
            computed = perf_ddf.groupby(['servicer'
                                         ])['interest_rate'].max().compute()
            logging.info('len(computed)=%s' % len(computed))
            logging.debug('computed=%s' % str(computed))
            del perf_ddf
            compute_sec = time.time() - compute_t0
            compute_sec_list += [compute_sec]
            logging.info('compute_sec=%f' % compute_sec)

        results = dict(
            create_ddf_sec=create_ddf_sec,
            compute_sec_list=compute_sec_list,
            dask_cudf_version=dask_cudf.__version__,
            getsize_sec=getsize_sec,
            glob_sec=glob_sec,
            input_files=input_files,
            input_file_sizes=input_file_sizes,
        )

    logging.info('FINAL RESULTS JSON: ' + json.dumps(results))
    logging.info('run_benchmark: END')
Example #15
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    args.base_chunks = args.base_chunks or n_workers
    args.other_chunks = args.other_chunks or n_workers

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {(scheduler_workers[w1].name, scheduler_workers[w2].name): [
        "%s/s" % format_bytes(x)
        for x in numpy.quantile(v, [0.25, 0.50, 0.75])
    ]
                  for (w1, w2), v in bandwidths.items()}
    total_nbytes = {(
        scheduler_workers[w1].name,
        scheduler_workers[w2].name,
    ): format_bytes(sum(nb))
                    for (w1, w2), nb in total_nbytes.items()}

    broadcast = (False if args.shuffle_join else
                 (True if args.broadcast_join else "default"))

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"merge type     | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks    | {args.base_chunks}")
    print(f"other-chunks   | {args.other_chunks}")
    print(f"broadcast      | {broadcast}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)     | %s %s %s (%s)" if args.multi_node
                   or args.sched_addr else "(%02d,%02d)     | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #16
def start_client(cluster_file, n_workers):
    client = Client(scheduler_file=cluster_file)
    client.wait_for_workers(n_workers)
    return client
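
Usage is a one-liner; the scheduler-file path below is a placeholder:

# Hypothetical call; "scheduler.json" stands in for the file written by the running scheduler.
client = start_client("scheduler.json", n_workers=4)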
Example #17
                           memory=args.memory,
                           walltime=args.walltime,
                           interface=args.interface,
                           nanny=True,
                           death_timeout='600s',
                           local_directory=args.local_directory,
                           shebang='#!/usr/bin/env bash',
                           env_extra=["export TBB_CXX_TYPE=gcc"],
                           job_extra=args.job_extra.split(','),
                           queue=args.queue)
    print(args)
    print(cluster.job_script())
    cluster.scale(jobs=args.nodes)
    client = Client(cluster)
    print(client)
    client.wait_for_workers(args.nodes)
    time.sleep(60)

    print(client.get_versions(check=True))

    table = load_table(args.biom_table)
    counts = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                          index=table.ids(),
                          columns=table.ids(axis='observation'))
    metadata = pd.read_table(args.metadata_file, index_col=0)
    replicates = metadata[args.replicates]
    batches = metadata[args.batches]
    # match everything up
    idx = list(set(counts.index) & set(replicates.index) & set(batches.index))
    counts, replicates, batches = [x.loc[idx] for x in
                                   (counts, replicates, batches)]