def test_create_rapids_cluster_sync():
    """Start a synchronous GPU-enabled ``GCPCluster`` and query GPU memory.

    The test is skipped by ``skip_without_credentials()`` when it decides so.
    One VM is requested with ``ngpus=2`` and the ``dask_cuda.CUDAWorker``
    worker class, and the test waits for two workers before running
    ``nvidia_smi`` on each of them.

    The cluster is always closed, even when waiting for workers or one of
    the assertions fails, so that no cloud resources are left behind.
    """
    skip_without_credentials()

    cluster = GCPCluster(
        source_image="projects/nv-ai-infra/global/images/packer-1607527229",
        network="dask-gcp-network-test",
        zone="us-east1-c",
        machine_type="n1-standard-1",
        filesystem_size=50,
        ngpus=2,
        gpu_type="nvidia-tesla-t4",
        docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.8",
        worker_class="dask_cuda.CUDAWorker",
        worker_options={"rmm_pool_size": "15GB"},
        asynchronous=False,
        bootstrap=False,
    )

    try:
        cluster.scale(1)

        client = Client(cluster)  # noqa
        client.wait_for_workers(2)

        def gpu_mem():
            # Imported inside the function because it executes on the workers.
            from pynvml.smi import nvidia_smi

            nvsmi = nvidia_smi.getInstance()
            return nvsmi.DeviceQuery("memory.free, memory.total")

        results = client.run(gpu_mem)
        # ``client.run`` returns a mapping keyed by worker; only the per-worker
        # query result is needed here.
        for res in results.values():
            assert "total" in res["gpu"][0]["fb_memory_usage"]
            print(res)
    finally:
        cluster.close()
def setup(dask_scheduler_file=None, rmm_pool_size=None):
    """Create a Dask client, optionally a local CUDA cluster, and start Comms.

    Parameters
    ----------
    dask_scheduler_file : str, optional
        When truthy, UCX is initialized and the client connects through this
        scheduler file; no cluster object is created.
    rmm_pool_size : optional
        Forwarded to ``LocalCUDACluster`` when a local cluster is started.

    Returns
    -------
    tuple
        ``(client, cluster)``; ``cluster`` is ``None`` when a scheduler file
        was used.
    """
    if not dask_scheduler_file:
        scratch_dir = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(
            local_directory=scratch_dir.name,
            rmm_pool_size=rmm_pool_size,
        )
        client = Client(cluster)
        # add the obj to the client so it doesn't get deleted until
        # the 'client' obj gets cleaned up
        client.tempdir_object = scratch_dir
        # NOTE(review): the original layout was flattened; the wait is assumed
        # to belong to the local-cluster branch only - confirm.
        client.wait_for_workers(len(get_visible_devices()))
    else:
        cluster = None
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=False,
            enable_rdmacm=False,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)

    Comms.initialize(p2p=True)
    return (client, cluster)
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    The ``CUDACluster`` is created in the constructor; the client and Comms
    are set up on ``__enter__`` and everything is torn down on ``__exit__``.

    Parameters:
    -----------

    number_of_devices : int
        Number of devices to use, verification must be done prior to call to
        ensure that there are enough devices available. If not specified, the
        cluster will be initialized to use all visible devices.
    rmm_managed_memory : bool
        True to enable managed memory (UVM) in RMM as part of the cluster.
        Default is False.
    p2p : bool
        Initialize UCX endpoints if True. Default is False.
    """

    def __init__(self,
                 number_of_devices=None,
                 rmm_managed_memory=False,
                 p2p=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._client = None
        self._p2p = p2p
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    @property
    def client(self):
        """The Dask client, or None before the context is entered."""
        return self._client

    @property
    def cluster(self):
        """The cluster created by the constructor."""
        return self._cluster

    def __enter__(self):
        self._prepare_mg()
        return self

    def _prepare_mg(self):
        """Connect the client first, then initialize Comms."""
        self._prepare_client()
        self._prepare_comms()

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize(p2p=self._p2p)

    def _close(self):
        """Destroy Comms, then close the client and the cluster.

        Each step runs even if the previous one raised, so a failing
        ``Comms.destroy()`` can no longer leak the client or the cluster.
        The first exception raised still propagates to the caller.
        """
        try:
            Comms.destroy()
        finally:
            try:
                if self._client is not None:
                    self._client.close()
            finally:
                if self._cluster is not None:
                    self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
def fit(
    dset: Path,
    output_file: Path,
    *,
    model_type: ModelType = ModelType.linear,
    n_iter: int = 50,
    n_workers: int = 1,
    cores_per_worker: int = 4,
    dask_folder: Path = Path.cwd() / "dask",
    mem_per_worker: str = "2GB",
    walltime: str = "0-00:30",
    use_slurm: bool = False,
) -> BaseEstimator:
    """Fit a model

    :param dset: CAS dataset
    :param output_file: output .pickle file
    :param model_type: type of model to use
    :param n_iter: budget for hyper-parameters optimization
    :param n_workers: number of workers to use, maximum number for Slurm backend
    :param cores_per_worker: number of cores per worker
    :param dask_folder: folder to keep workers temporary data
    :param mem_per_worker: maximum of RAM for workers, only for Slurm backend
    :param walltime: maximum time for workers, only for Slurm backend
    :param use_slurm: use Slurm backend for the Dask cluster
    :returns: fitted model
    """
    data = pd.read_csv(dset)

    # Only rows of the "train" fold are used; the target column is split off.
    X = data[data.fold == "train"].drop(columns="fold")
    y = X.pop("injuryCrash")

    # find function to fit the model in the global namespace
    model_func = globals()["fit_" + model_type.name]

    # start a Dask cluster, local by default, use a configuration file for Slurm
    if use_slurm:
        client = slurm_cluster(
            n_workers=n_workers,
            cores_per_worker=cores_per_worker,
            mem_per_worker=mem_per_worker,
            walltime=walltime,
            dask_folder=dask_folder,
        )
    else:
        client = Client(
            n_workers=n_workers,
            threads_per_worker=cores_per_worker,
            local_directory=dask_folder,
        )

    # Make sure at least one worker is up before fitting.
    client.wait_for_workers(1)

    model = model_func(X, y, n_iter=n_iter)

    with output_file.open("wb") as fd:
        pickle.dump(model, fd)

    # The signature and docstring promise the fitted model; the original
    # implementation fell off the end and returned None.
    return model
def setup_local_dask_cluster(p2p=True):
    """
    Start a LocalCUDACluster, connect a client to it, wait for one worker
    per visible device, initialize Comms with the given ``p2p`` flag, and
    return the ``(cluster, client)`` pair.
    """
    local_cluster = LocalCUDACluster()
    dask_client = Client(local_cluster)

    expected_workers = len(get_visible_devices())
    dask_client.wait_for_workers(expected_workers)

    Comms.initialize(p2p=p2p)
    return (local_cluster, dask_client)
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    Nothing is started by the constructor; the cluster, the client and Comms
    are created on ``__enter__`` and torn down on ``__exit__``.

    Parameters:
    -----------

    number_of_devices : int
        Number of devices to use, verification must be done prior to call
        to ensure that there are enough devices available.
    rmm_managed_memory : bool
        Forwarded unchanged to ``CUDACluster(rmm_managed_memory=...)``.
        Default is False.
    """

    def __init__(self, number_of_devices=None, rmm_managed_memory=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._cluster = None
        self._client = None

    @property
    def client(self):
        """The Dask client, or None before the context is entered."""
        return self._client

    @property
    def cluster(self):
        """The cluster, or None before the context is entered."""
        return self._cluster

    def __enter__(self):
        self._prepare_mg()
        return self

    def _prepare_mg(self):
        """Create the cluster, connect the client, then initialize Comms."""
        self._prepare_cluster()
        self._prepare_client()
        self._prepare_comms()

    def _prepare_cluster(self):
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize()

    def _close(self):
        """Destroy Comms, then close the client and the cluster.

        Each step runs even if the previous one raised, so a failing
        ``Comms.destroy()`` can no longer leak the client or the cluster.
        The first exception raised still propagates to the caller.
        """
        try:
            Comms.destroy()
        finally:
            try:
                if self._client is not None:
                    self._client.close()
            finally:
                if self._cluster is not None:
                    self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
def start_client(Scheduler_file=None, local_directory=None, ncpu=None,
                 n_workers=1, threads_per_worker=None,
                 worker_kwargs=worker_kwargs, LocalCluster_kwargs=None,
                 dashboard_address=8801, memory_limit='120gb',
                 processes=False):
    """
    Start a dask client. If no scheduler file is passed, a new local cluster
    is started.

    Parameters
    ----------
    Scheduler_file : str, optional
        Scheduler file of an existing cluster to connect to.
    local_directory : str, optional
        Base directory for the local cluster; defaults to './temp_skylens/'.
        A 'pid<PID>/' component is always appended to it.
    ncpu : int, optional
        Used as ``threads_per_worker`` when that is not given; defaults to
        ``multiprocessing.cpu_count() - 1``.
    n_workers, threads_per_worker, dashboard_address, memory_limit, processes
        Forwarded to ``LocalCluster``.
    worker_kwargs, LocalCluster_kwargs : dict
        Extra keyword arguments expanded into the ``LocalCluster`` call.
        ``LocalCluster_kwargs=None`` means no extra arguments.

    Returns
    -------
    tuple
        ``(LC, scheduler_info)`` where ``LC`` is the ``LocalCluster`` (None
        when a scheduler file was used) and ``scheduler_info`` is
        ``client.scheduler_info()`` with an added ``'file'`` entry.
    """
    # Avoid a shared mutable default argument.
    if LocalCluster_kwargs is None:
        LocalCluster_kwargs = {}

    LC = None
    if local_directory is None:
        local_directory = './temp_skylens/'
    local_directory += 'pid' + str(os.getpid()) + '/'
    try:
        # An already existing directory is fine; only real failures are
        # reported (best effort, as before: the error is printed, not raised).
        os.makedirs(local_directory, exist_ok=True)
    except OSError as error:
        print('error in creating local directory: ', local_directory, error)

    if threads_per_worker is None:
        if ncpu is None:
            ncpu = multiprocessing.cpu_count() - 1
        threads_per_worker = ncpu
    if n_workers is None:
        n_workers = 1

    if Scheduler_file is None:
        print('Start_client: No scheduler file, will start local cluster at ',
              local_directory)
        LC = LocalCluster(n_workers=n_workers, processes=processes,
                          threads_per_worker=threads_per_worker,
                          local_directory=local_directory,
                          dashboard_address=dashboard_address,
                          memory_limit=memory_limit,
                          **LocalCluster_kwargs, **worker_kwargs)
        client = Client(LC)
    else:
        print('Start_client: Using scheduler file', Scheduler_file)
        client = Client(scheduler_file=Scheduler_file, processes=False)

    client.wait_for_workers(n_workers=1)
    scheduler_info = client.scheduler_info()
    scheduler_info['file'] = Scheduler_file
    return LC, scheduler_info  # client can be obtained from client_get
def Start_Client(gpu_name):
    """Create a Dask client on a cluster chosen from the current hostname.

    On ``wn-wks2`` an ``SGECluster`` is started; on the two GPU hosts
    ``CUDA_VISIBLE_DEVICES`` is set to ``gpu_name`` and a ``LocalCluster`` is
    started.

    :param gpu_name: value written to ``CUDA_VISIBLE_DEVICES`` on GPU hosts
    :returns: the connected ``Client``, after one worker has joined
    :raises ValueError: if the current host is not one of the supported hosts
    """
    hostname = socket.gethostname()
    n_workers = 1
    n_cores = 1

    wks2 = "wn-wks2.fe.hhi.de"
    gpu1 = "wn-gpu1.fe.hhi.de"
    gpu2 = "wn-gpu-104-01.fe.hhi.de"

    if hostname == wks2:
        # Not used within this function.
        path = "/data/cluster/projects/infineon-radar/daq_x-har/3_Walking_converted/recording-2020-01-28_11-31-55"
        mem = "20G"  # Allocated memory is critical. For this example it must be at least 16GB
        q = "wn-37.q"  # Check current queue status on https://hpc-management.fe.hhi.de/wn/phpqstat/

        cluster = SGECluster(
            n_workers=n_workers,
            cores=n_cores,
            memory=mem,
            resource_spec=f"h_vmem={mem}",
            host=hostname,
            queue=q,
            job_extra=[
                "-v MKL_NUM_THREADS=1,NUMEXPR_NUM_THREADS=1,OMP_NUM_THREADS=1"
            ])
    elif hostname in (gpu1, gpu2):
        os.environ[
            "CUDA_VISIBLE_DEVICES"] = gpu_name  # Check current status with nvidia-smi and pick GPU from 0-3
        cluster = LocalCluster(n_workers=n_workers,
                               threads_per_worker=n_cores,
                               host=hostname)
    else:
        # The original message referenced an undefined name (``wks``), which
        # turned this intended ValueError into a NameError.
        raise ValueError(
            f"{hostname} is not a supported host. Please run this example on {wks2}, {gpu1} or {gpu2}."
        )

    client = Client(cluster)
    client.wait_for_workers(n_workers=n_workers)
    print(client)
    return client
def dask_client():
    """Yield a Dask client with Comms initialized, then tear everything down.

    If the ``SCHEDULER_FILE`` environment variable is set, UCX is initialized
    and the client connects through that file; otherwise a
    ``LocalCUDACluster`` backed by a temporary directory is started.
    After the consumer is done, Comms is destroyed, the client shuts the
    scheduler and workers down, and a locally created cluster is closed.
    """
    scheduler_file = os.environ.get("SCHEDULER_FILE")
    cluster = None
    client = None
    tempdir_object = None

    if not scheduler_file:
        # The tempdir created by tempdir_object should be cleaned up once
        # tempdir_object goes out-of-scope and is deleted.
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name)
        client = Client(cluster)
        # NOTE(review): the original layout was flattened; the wait and this
        # print are assumed to belong to the local-cluster branch - confirm.
        client.wait_for_workers(len(get_visible_devices()))
        print("\ndask_client fixture: client created using LocalCUDACluster")
    else:
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=True,
            enable_rdmacm=True,
            # net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=scheduler_file)
        print("\ndask_client fixture: client created using "
              f"{scheduler_file}")

    Comms.initialize(p2p=True)

    yield client

    Comms.destroy()
    # Shut down the connected scheduler and workers
    # therefore we will no longer rely on killing the dask cluster ID
    # for MNMG runs
    client.shutdown()
    if cluster:
        cluster.close()
    print("\ndask_client fixture: client.close() called")
def start_dask_lsfcluster(cluster_size=5):
    """Start a dask cluster.

    An ``LSFCluster`` is created with 4 cores and 20 GB per job, scaled to
    ``cluster_size``, and a client is connected that waits until
    ``cluster_size`` workers have joined.

    :param cluster_size: value passed to ``cluster.scale`` and
        ``client.wait_for_workers``; must be at least 4
    :returns: ``(cluster, client)``
    :raises ValueError: if ``cluster_size`` is smaller than 4
    """
    if cluster_size < 4:
        # ValueError is still caught by callers handling ``Exception``.
        raise ValueError('Too small of a cluster')

    # Settings for Sanger farm
    memory_in_gb = 20
    # The LSF resource string below is built from memory_in_gb * 1e+3.
    memory_in_mb = int(memory_in_gb * 1e+3)
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory='{}/dask_logs'.format(os.getcwd()),
        cores=4,
        memory='{} Gb'.format(memory_in_gb),
        mem=memory_in_gb * 1e+9,  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152',
            '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                memory_in_mb, memory_in_mb)
        ],
        use_stdin=True)

    # View the job submission from Dask
    # cluster.job_script()

    # Scale cluster
    cluster.scale(cluster_size)

    # Alternative: auto-scale instead of a fixed size, e.g.
    # cluster.adapt(minimum_jobs=int(cluster_size/4), maximum_jobs=cluster_size)
    # cluster.adapt(maximum_memory="10 TB")  # use core/memory limits

    client = Client(cluster, timeout=120)
    client.wait_for_workers(n_workers=cluster_size)
    # print(client.scheduler_info()['services'])

    return cluster, client
import numpy as np import dask import dask.dataframe as dd from dask.distributed import wait import warnings from dask_saturn import SaturnCluster from dask.distributed import Client cluster = SaturnCluster( scheduler_size='medium', worker_size='xlarge', n_workers=5, nthreads=4, ) client = Client(cluster) client.wait_for_workers(3) s3 = s3fs.S3FileSystem(anon=True) files_2019 = 's3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv' taxi = dd.read_csv( files_2019, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], storage_options={'anon': True}, assume_missing=True, ) # specify feature and label column names raw_features = [ 'tpep_pickup_datetime', 'passenger_count',
def main(args):
    """Run the shuffle benchmark and report timings and transfer rates.

    Steps: connect to (or create) a cluster, optionally set up RMM pools,
    execute ``args.runs`` runs (only the last one is profiled), print a
    summary table, optionally plot, print worker-to-worker bandwidths for the
    "dask" backend, and optionally append one JSON line per run to
    ``args.benchmark_json``.

    :param args: parsed command-line options; only the attributes read below
        are required
    """
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    # Re-key both mappings by worker name and reduce them to quartiles and
    # total bytes, respectively.
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x)
            for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (
            scheduler_workers[w1].name,
            scheduler_workers[w2].name,
        ): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Shuffle benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"partition-size | {format_bytes(args.partition_size)}")
    print(f"in-parts | {args.in_parts}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    if args.device_memory_limit:
        print(f"memory-limit | {format_bytes(args.device_memory_limit)}")
    print(f"rmm-pool | {(not args.disable_rmm_pool)}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        # Only the numeric part of the formatted throughput is kept.
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    # Worker names are strings for remote/multi-node clusters; the other
    # format assumes integer names.
    named_workers = args.multi_node or args.sched_addr

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s) | %s %s %s (%s)"
                   if named_workers else "(%02d,%02d) | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.benchmark_json:
        # The first key template was a plain string in the original, so every
        # entry collapsed into the single literal key
        # "bandwidth_({d1},{d2})_{i}"; it must be an f-string.
        bandwidths_json = {
            f"bandwidth_({d1},{d2})_{i}"
            if named_workers else "(%02d,%02d)_%s" % (d1, d2, i):
            parse_bytes(v.rstrip("/s"))
            for (d1, d2), bw in sorted(bandwidths.items())
            for i, v in zip(
                ["25%", "50%", "75%", "total_nbytes"],
                [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
            )
        }

        with open(args.benchmark_json, "a") as fp:
            for data_processed, took in took_list:
                fp.write(
                    dumps(
                        dict(
                            {
                                "backend": args.backend,
                                "partition_size": args.partition_size,
                                "in_parts": args.in_parts,
                                "protocol": args.protocol,
                                "devs": args.devs,
                                "device_memory_limit": args.device_memory_limit,
                                "rmm_pool": not args.disable_rmm_pool,
                                "tcp": args.enable_tcp_over_ucx,
                                "ib": args.enable_infiniband,
                                "nvlink": args.enable_nvlink,
                                "data_processed": data_processed,
                                "wall_clock": took,
                                "throughput": data_processed / took,
                            },
                            **bandwidths_json,
                        )) + "\n")

    # NOTE(review): the original layout was flattened; both calls are assumed
    # to be guarded by ``args.multi_node`` - confirm.
    if args.multi_node:
        client.shutdown()
        client.close()
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """
    Grid search with walk-forward validation for ``DaskLGBMRegressor`` on a
    ``LocalCluster``.

    Workflow:
    - ``__init__`` starts the cluster, loads the Parquet dataset from S3,
      casts/filters it, indexes it by ``sale_date`` and persists it.
    - ``gridsearch_wfv`` loops over hyperparameter combinations; for each one
      it loops over train-validation sets, fits the model, predicts and
      records RMSE values.
    - ``refit_and_save`` refits with the best parameters on the full dataset
      and uploads the booster to S3.

    Notes:
    - persist call: data = self.client.persist(data) (assignment replaces old
      lazy array, as persist does not change the input in-place).
    - On any failure the methods log the exception, call
      ``self.client.restart()`` (kill all active work, delete all data on the
      network, and restart the worker processes) and exit the interpreter
      with status 1. restart() does NOT restart the actual machines of the
      cluster, so it runs very quickly.
    - Open design question kept from the original notes: it may be better to
      loop over train-validation splits first and over hyperparameters
      second, so each split is created only once.
    - Row-count and date-range debug messages trigger full Dask computations,
      so they are produced only when DEBUG logging is enabled on the root
      logger.
    """

    # Name of the target column and the S3 bucket used for uploads.
    _TARGET = "sid_shop_item_qty_sold_day"
    _BUCKET = "sales-demand-data"

    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        """Start the local cluster and load, clean and persist the dataset.

        :param curr_dt_time: used in names of the report and uploaded CSV
        :param n_workers: number of ``LocalCluster`` workers to start/wait for
        :param s3_path: path passed to ``dd.read_parquet``
        :param startmonth: first month of the data (e.g. July 1, 2015)
        :param n_months_in_first_train_set: months in the first train set
        :param n_months_in_val_set: months in each validation set
        :param frac: sampling fraction of the dataset; ``None`` means 1.0
        """
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        # Keep a handle on the cluster so callers are able to close it.
        self.cluster = cluster
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")

        # len() and min()/max() on a Dask dataframe run a computation over
        # the data; skip them entirely unless the messages would be emitted.
        debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

        try:
            # create Dask dataframe from partitioned Parquet dataset on S3
            # and persist it to cluster
            # (a map_partitions-based variant was tried by the original
            # author and did not work without the meta= argument)
            self.full_dataset = dd.read_parquet(
                s3_path, index=False,
                engine="pyarrow").sample(frac=self.frac, random_state=42)
            self.full_dataset["sale_date"] = self.full_dataset[
                "sale_date"].astype("datetime64[ns]")
            self.full_dataset[self._TARGET] = self.full_dataset[
                self._TARGET].astype("int16")
            for col in self.full_dataset:
                if col.startswith("cat"):
                    self.full_dataset[col] = self.full_dataset[col].astype(
                        "int16")

            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
                )

            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]

            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            # partition_size for set_index: int, optional, desired size of
            # each partition in bytes (to be used with npartitions='auto')
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )

            self.full_dataset = self.cull_empty_partitions(self.full_dataset)

            self.full_dataset = self.client.persist(self.full_dataset)
            _ = wait([self.full_dataset])

            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
                )
                logging.debug(
                    f"Earliest and latest dates in full dataframe are : {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
                )
                logging.debug(
                    f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
                )

        except Exception:
            logging.exception(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # Background for the loading strategy (condensed from original notes):
        # - Dask Dataframe reads each Parquet row group into a separate
        #   partition:
        #   https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # - The task/data model underlying dask collections is immutable
        #   (https://github.com/dask/distributed/issues/1676), so all data is
        #   sent to the cluster once and sliced for walk-forward validation.
        # - https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet
        # - https://stackoverflow.com/questions/67222212/read-partitioned-parquet-dataset-written-by-spark-using-dask-and-pyarrow-dataset

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        """Return ``ddf`` without its zero-length partitions.

        If no partition is empty, ``ddf`` is returned unchanged. Otherwise a
        new dataframe is built from the non-empty delayed partitions, using
        an empty partition as ``meta``.
        """
        partition_lengths = list(ddf.map_partitions(len).compute())
        delayed_parts = ddf.to_delayed()
        kept_parts = []
        pempty = None
        for ix, n in enumerate(partition_lengths):
            if n == 0:
                pempty = ddf.get_partition(ix)
            else:
                kept_parts.append(delayed_parts[ix])
        if pempty is not None:
            ddf = dd.from_delayed(kept_parts, meta=pempty)
        return ddf

    def gridsearch_wfv(self, params):
        """Run walk-forward validation for every combination in ``params``.

        :param params: dict mapping parameter name to a list of values

        Sets ``all_params_combs`` (one dict per combination with the
        parameters and the collected RMSE/fit-time lists and averages),
        ``best_score_`` and ``best_params_`` (lowest ``monthly_avg_rmse_``),
        writes ``all_params_combs.csv`` and uploads it to S3.
        """
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters
        # if only one combination, set get_stats_ flag to True
        self.get_stats_ = (len(params[max(params,
                                          key=lambda x: len(params[x]))]) == 1)
        for values in product(*params.values()):
            params_comb_dict = dict(zip(params.keys(), values))
            # Working copy that also accumulates the results for this
            # combination; the model only receives the raw parameters.
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call method that loops over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])

            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                s3_client = boto3.client("s3")
                s3_client.upload_file(output_csv, self._BUCKET, key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError:
                # Best effort: a failed upload is logged but not fatal.
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )
        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )

    def train_test_time_split(self):
        """Yield ``(train, test, get_stats)`` for each walk-forward step.

        Worked example from the original notes: first (earliest) month July
        2015, 1 month in the first train set, 2 months in the validation
        set, 3 months between Oct 2015 and July 2015:
        (3 - 1 + 1) - (2 - 1) = 2 validation sets.

        ``get_stats`` is True only for the last step, and only when a single
        hyperparameter combination is being evaluated (``self.get_stats_``,
        which is set by ``gridsearch_wfv``).
        """
        n_val_sets = (
            month_counter(
                self.startmonth)  # self.startmonth is e.g. July 1, 2015
            - self.n_months_in_first_train_set + 1) - (
                self.n_months_in_val_set - 1)

        for m in range(n_val_sets):
            # day=31 is applied through relativedelta in both offsets below.
            end_date = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            if self.get_stats_:
                get_stats = m == n_val_sets - 1
            else:
                get_stats = False
            val_end = end_date + relativedelta(
                months=self.n_months_in_val_set, day=31)
            yield (
                self.full_dataset.loc[:end_date],
                self.full_dataset.loc[end_date + timedelta(days=1):val_end],
                get_stats,
            )

    def get_sample_weights(self, train):
        """Return a Dask array of sample weights for ``train``.

        Rows whose target equals 0 get ``params_comb_dict['weight_for_zeros']``;
        all other rows get 1.
        """
        weights_arr = train[self._TARGET].to_dask_array(
            lengths=True).astype('float32')
        weights_arr = da.where(weights_arr == 0,
                               self.params_comb_dict['weight_for_zeros'], 1.)
        return weights_arr

    def fit(self, train):
        """Fit ``self.model`` on ``train`` and record the fit time.

        Feature columns are those starting with "pc" or "cat"; columns
        starting with "cat" are passed as categorical features.
        Returns ``self`` so calls can be chained.
        """
        try:
            start_time = time.perf_counter()
            # Column selections are computed once instead of once per use.
            feature_cols = [
                col for col in train if col.startswith(("pc", "cat"))
            ]
            categorical_cols = [
                col for col in train if col.startswith("cat")
            ]
            logging.debug(
                f"train X dtypes are {train[feature_cols].dtypes}")
            logging.debug(
                f"train y type is {train['sid_shop_item_qty_sold_day'].dtype}")
            self.model.fit(
                train[feature_cols].to_dask_array(lengths=True),
                train[self._TARGET].to_dask_array(lengths=True),
                sample_weight=self.get_sample_weights(train),
                feature_name=feature_cols,
                categorical_feature=categorical_cols,
            )
            assert self.model.fitted_
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)

            return self

        except Exception:
            logging.exception(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def predict(self, test):
        """Store predictions for ``test`` in ``self.y_pred``; return ``self``."""
        try:
            feature_cols = [
                col for col in test if col.startswith(("pc", "cat"))
            ]
            self.y_pred = self.model.predict(test[feature_cols])
            return self
        except Exception:
            logging.exception(
                "Exception occurred while computing predicted values on the test data."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def rmse_all_folds(self, test, get_stats):
        """Append the RMSE and monthly RMSE of the current fold.

        Uses ``self.y_pred`` produced by ``predict`` and appends to the
        ``rmse_list_`` and ``monthly_rmse_list_`` entries of
        ``self.params_comb_dict``.
        """
        try:
            self.params_comb_dict["rmse_list_"].append(
                calc_rmse(
                    test[self._TARGET].to_dask_array(lengths=True),
                    self.y_pred.compute_chunk_sizes(),
                    get_stats,
                ))
            self.params_comb_dict["monthly_rmse_list_"].append(
                calc_monthly_rmse(
                    test[["shop_id", "item_id", self._TARGET]],
                    self.y_pred,
                ))

        except Exception:
            logging.exception(
                "Exception occurred while computing RMSE on the test data.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def refit_and_save(self, model_path):
        """
        Refit with ``self.best_params_`` on the full dataset, save the
        booster to a local file named after the last component of
        ``model_path`` and upload it to S3 under the same name.

        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            feature_cols = [
                col for col in self.full_dataset
                if col.startswith(("pc", "cat"))
            ]
            categorical_cols = [
                col for col in self.full_dataset if col.startswith("cat")
            ]
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )
            self.best_model.fit(
                self.full_dataset[feature_cols].to_dask_array(lengths=True),
                self.full_dataset[self._TARGET].to_dask_array(lengths=True, ),
                sample_weight=self.get_sample_weights(self.full_dataset),
                feature_name=feature_cols,
                categorical_feature=categorical_cols,
            )
            output_txt = str(model_path).split("/")[-1]
            self.best_model.booster_.save_model(output_txt)

            s3_client = boto3.client("s3")
            s3_client.upload_file(output_txt, self._BUCKET, output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")

        # ``Exception`` already covers ClientError, so one clause suffices.
        except Exception:
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
def _run_single_batch(args):
    """Time the group-by on all input files loaded as a single frame.

    run_benchmark marks this path as obsolete. Returns the results dict.
    """
    input_files = [f for p in args.input_file for f in sorted(glob.glob(p))]
    logging.info('len(input_files)=%d', len(input_files))
    logging.debug('input_files=%s', input_files)
    input_file_sizes = [os.path.getsize(f) for f in input_files]
    perf_ddf = gpu_load_performance_data(input_files, engine=args.cudf_engine)
    logging.debug('perf_ddf=%s', perf_ddf.head())

    persist_t0 = time.time()
    perf_ddf = perf_ddf.persist()
    wait(perf_ddf)
    persist_sec = time.time() - persist_t0
    logging.info('persist_sec=%f', persist_sec)
    logging.info('perf_ddf=%s', perf_ddf)

    compute_sec_list = []
    for _ in range(3):
        compute_t0 = time.time()
        computed = perf_ddf.groupby(['servicer'
                                     ])['interest_rate'].max().compute()
        compute_sec = time.time() - compute_t0
        compute_sec_list.append(compute_sec)
        logging.info('compute_sec=%f', compute_sec)
        logging.info('len(computed)=%s', len(computed))
        logging.debug('computed=%s', computed)
    logging.info('compute_sec_list=%s', compute_sec_list)
    logging.info('len(perf_ddf)=%s', len(perf_ddf))
    checksum = int(perf_ddf['loan_id'].sum().compute())
    return dict(
        checksum=checksum,
        compute_sec_list=compute_sec_list,
        num_input_files=len(input_files),
        input_file_sizes=input_file_sizes,
        persist_sec=persist_sec,
        dask_cudf_version=dask_cudf.__version__,
    )


def _run_batches(args):
    """Time the group-by once per input pattern (one batch per pattern).

    Returns the results dict, including the time spent globbing, reading
    file sizes and creating the data frames.
    """
    logging.info('Getting input file list')
    glob_t0 = time.time()
    input_files = [sorted(glob.glob(p)) for p in args.input_file]
    glob_sec = time.time() - glob_t0
    for batch_files, spec in zip(input_files, args.input_file):
        assert len(batch_files) > 0, 'No files match %s' % spec

    logging.info('Getting file sizes')
    getsize_t0 = time.time()
    input_file_sizes = [[os.path.getsize(f) for f in batch_files]
                        for batch_files in input_files]
    getsize_sec = time.time() - getsize_t0

    logging.info('Creating distributed data frames')
    create_ddf_t0 = time.time()
    perf_ddfs = [
        gpu_load_performance_data(batch_files, engine=args.cudf_engine)
        for batch_files in input_files
    ]
    create_ddf_sec = time.time() - create_ddf_t0

    compute_sec_list = []
    for batch, perf_ddf in enumerate(perf_ddfs):
        logging.info('Computing batch %d', batch)
        compute_t0 = time.time()
        computed = perf_ddf.groupby(['servicer'
                                     ])['interest_rate'].max().compute()
        logging.info('len(computed)=%s', len(computed))
        # Lazy argument: the result is only rendered to text when debug
        # logging is enabled, so it no longer inflates the timing otherwise.
        logging.debug('computed=%s', computed)
        # NOTE(review): perf_ddfs still references this frame, so this only
        # drops the loop name; kept as in the original.
        del perf_ddf
        compute_sec = time.time() - compute_t0
        compute_sec_list.append(compute_sec)
        logging.info('compute_sec=%f', compute_sec)

    return dict(
        create_ddf_sec=create_ddf_sec,
        compute_sec_list=compute_sec_list,
        dask_cudf_version=dask_cudf.__version__,
        getsize_sec=getsize_sec,
        glob_sec=glob_sec,
        input_files=input_files,
        input_file_sizes=input_file_sizes,
    )


def run_benchmark(args):
    """Run the group-by benchmark and log the results as JSON.

    Connects a ``Client`` to ``args.scheduler_address``, waits for
    ``args.num_workers`` workers, runs either the single-batch path (when
    ``args.single_batch`` is truthy) or the per-pattern batch path, and logs
    the results dict on a 'FINAL RESULTS JSON: ' line.

    Args:
        args: Namespace providing ``scheduler_address``, ``num_workers``,
            ``single_batch``, ``input_file`` (a list of glob patterns) and
            ``cudf_engine``.

    Raises:
        AssertionError: In the batch path, if a pattern matches no files.
    """
    logging.info('run_benchmark: BEGIN')
    client = Client(address=args.scheduler_address)
    try:
        logging.info('client=%s', client)
        logging.info('Waiting for %d Dask workers', args.num_workers)
        client.wait_for_workers(args.num_workers)
        if args.single_batch:
            # This branch is obsolete.
            results = _run_single_batch(args)
        else:
            results = _run_batches(args)
        logging.info('FINAL RESULTS JSON: %s', json.dumps(results))
        logging.info('run_benchmark: END')
    finally:
        # Release the scheduler connection even when the benchmark fails.
        client.close()
def main(args):
    """Run the merge benchmark configured by ``args`` and print a report.

    Steps, in order:

    1. Connect a ``Client``: to ``args.sched_addr`` when given, otherwise to
       a cluster built from ``get_cluster_options(args)``.
    2. For ``args.type == "gpu"``, set up the memory pool on the workers and
       on the scheduler.
    3. Call ``run`` ``args.runs`` times; only the last call is given
       ``args.profile``.
    4. Aggregate the workers' incoming transfer logs into per-pair bandwidth
       quartiles and byte totals.
    5. Print the configuration, the per-run wall-clock/throughput table and,
       for the "dask" backend, the worker-to-worker transfer rates.

    Side effects: fills in ``args.base_chunks`` and ``args.other_chunks``
    with the worker count when they are falsy.
    """
    options = get_cluster_options(args)
    Cluster = options["class"]
    cluster_args = options["args"]
    cluster_kwargs = options["kwargs"]
    scheduler_addr = options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings(
            "ignore",
            message=".*NVLink.*rmm_pool_size.*",
            category=UserWarning,
        )
        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)
            client = Client(scheduler_addr)
        else:
            client = Client(cluster)

    if args.type == "gpu":
        shared_pool_kwargs = dict(
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            **shared_pool_kwargs,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            **shared_pool_kwargs,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    if not args.base_chunks:
        args.base_chunks = n_workers
    if not args.other_chunks:
        args.other_chunks = n_workers

    if args.all_to_all:
        all_to_all(client)

    took_list = [
        run(client, args, n_workers, write_profile=None)
        for _ in range(args.runs - 1)
    ]
    # Only profiling the last run
    took_list.append(run(client, args, n_workers, write_profile=args.profile))

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    raw_bandwidths = defaultdict(list)
    raw_totals = defaultdict(list)
    for receiver, transfers in incoming_logs.items():
        for entry in transfers:
            if d_total_too_small(entry, args.ignore_size):
                continue
            pair = (receiver, entry["who"])
            raw_bandwidths[pair].append(entry["bandwidth"])
            raw_totals[pair].append(entry["total"])

    bandwidths = {}
    for (w1, w2), samples in raw_bandwidths.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        quartiles = numpy.quantile(samples, [0.25, 0.50, 0.75])
        bandwidths[names] = ["%s/s" % format_bytes(q) for q in quartiles]

    total_nbytes = {}
    for (w1, w2), sizes in raw_totals.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        total_nbytes[names] = format_bytes(sum(sizes))

    if args.shuffle_join:
        broadcast = False
    elif args.broadcast_join:
        broadcast = True
    else:
        broadcast = "default"

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"merge type | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks | {args.base_chunks}")
    print(f"other-chunks | {args.other_chunks}")
    print(f"broadcast | {broadcast}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.disable_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        wall_clock = format_time(took).ljust(15)
        throughput_text = format_bytes(throughput)
        print(f"{wall_clock}| {throughput_text}/s")
        # Keeps only the numeric part of the formatted string; the unit
        # suffix after the space is discarded.
        t_runs[idx] = float(throughput_text.split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        # The row format does not depend on the row, so pick it once.
        if args.multi_node or args.sched_addr:
            fmt = "(%s,%s) | %s %s %s (%s)"
        else:
            fmt = "(%02d,%02d) | %s %s %s (%s)"
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()


def d_total_too_small(entry, ignore_size):
    """Return True when a transfer-log entry is below the size threshold."""
    return not entry["total"] >= ignore_size
def start_client(cluster_file, n_workers):
    """Connect to a scheduler via its scheduler file and wait for workers.

    Args:
        cluster_file: Passed to ``Client`` as ``scheduler_file``.
        n_workers: Number of workers to wait for before returning.

    Returns:
        The connected ``Client``.
    """
    client = Client(scheduler_file=cluster_file)
    client.wait_for_workers(n_workers)
    return client
memory=args.memory, walltime=args.walltime, interface=args.interface, nanny=True, death_timeout='600s', local_directory=args.local_directory, shebang='#!/usr/bin/env bash', env_extra=["export TBB_CXX_TYPE=gcc"], job_extra=args.job_extra.split(','), queue=args.queue) print(args) print(cluster.job_script()) cluster.scale(jobs=args.nodes) client = Client(cluster) print(client) client.wait_for_workers(args.nodes) time.sleep(60) print(client.get_versions(check=True)) table = load_table(args.biom_table) counts = pd.DataFrame(np.array(table.matrix_data.todense()).T, index=table.ids(), columns=table.ids(axis='observation')) metadata = pd.read_table(args.metadata_file, index_col=0) replicates = metadata[args.replicates] batches = metadata[args.batches] # match everything up idx = list(set(counts.index) & set(replicates.index) & set(batches.index)) counts, replicates, batches = [x.loc[idx] for x in (counts, replicates, batches)]