def test_Client_kwargs(loop):
    """Keyword arguments given to Client shape the cluster it creates.

    Two in-process workers are requested; the cluster must hold exactly two
    Worker instances and report 'closed' after the client context exits.
    """
    with Client(loop=loop, processes=False, n_workers=2) as c:
        assert len(c.cluster.workers) == 2
        worker_checks = (isinstance(w, Worker) for w in c.cluster.workers)
        assert all(worker_checks)
    final_status = c.cluster.status
    assert final_status == 'closed'
initialize_ray() num_cpus = ray.cluster_resources()["CPU"] elif execution_engine == "Dask": # pragma: no cover from distributed.client import get_client import warnings if threading.current_thread().name == "MainThread": warnings.warn("The Dask Engine for Modin is experimental.") try: client = get_client() except ValueError: from distributed import Client num_cpus = os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() client = Client(n_workers=int(num_cpus)) elif execution_engine != "Python": raise ImportError( "Unrecognized execution engine: {}.".format(execution_engine)) DEFAULT_NPARTITIONS = max(4, int(num_cpus)) __all__ = [ "DataFrame", "Series", "read_csv", "read_parquet", "read_json", "read_html", "read_clipboard", "read_excel",
ray.register_custom_serializer(types.MethodType, use_pickle=True) if execution_engine == "Ray": initialize_ray() num_cpus = ray.global_state.cluster_resources()["CPU"] elif execution_engine == "Dask": # pragma: no cover from distributed.client import _get_global_client if threading.current_thread().name == "MainThread": # initialize the dask client client = _get_global_client() if client is None: from distributed import Client client = Client() num_cpus = sum(client.ncores().values()) elif execution_engine != "Python": raise ImportError("Unrecognized execution engine: {}.".format(execution_engine)) DEFAULT_NPARTITIONS = max(4, int(num_cpus)) __all__ = [ "DataFrame", "Series", "read_csv", "read_parquet", "read_json", "read_html", "read_clipboard", "read_excel",
def create_client_and_cluster(n_jobs, dask_kwargs, entityset_size):
    """Create (or reuse) a dask cluster and connect a client to it.

    Args:
        n_jobs: Requested number of workers; converted to the actual worker
            count by ``n_jobs_to_workers``.
        dask_kwargs (dict): Extra options. If it contains ``'cluster'``, that
            object is used as-is. Otherwise ``'diagnostics_port'`` and
            ``'memory_limit'`` are consumed here and every remaining entry is
            forwarded to ``LocalCluster``. The caller's dict is left
            unmodified.
        entityset_size: Size of the EntitySet, compared against the
            ``memory_limit`` reported for each worker.

    Returns:
        tuple: ``(client, cluster)``.

    Raises:
        ValueError: If a worker's memory limit is smaller than
            ``entityset_size``.
    """
    Client, LocalCluster = get_client_cluster()

    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # Work on a shallow copy so the keys consumed below are not removed
        # from the dict owned by the caller.
        cluster_kwargs = dict(dask_kwargs)

        # diagnostics_port sets the default port to launch bokeh web interface
        # if it is set to None web interface will not be launched
        diagnostics_port = cluster_kwargs.pop('diagnostics_port', None)

        workers = n_jobs_to_workers(n_jobs)
        if n_jobs != -1 and workers < n_jobs:
            warning_string = "{} workers requested, but only {} workers created."
            warning_string = warning_string.format(n_jobs, workers)
            warnings.warn(warning_string)

        # Distributed default memory_limit for worker is 'auto'. It calculates worker
        # memory limit as total virtual memory divided by the number
        # of cores available to the workers (always 1 for featuretools setup).
        # This means reducing the number of workers does not increase the memory
        # limit for other workers. Featuretools default is to calculate memory limit
        # as total virtual memory divided by number of workers. To use distributed
        # default memory limit, set dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in cluster_kwargs:
            memory_limit = cluster_kwargs.pop('memory_limit')
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **cluster_kwargs)

        # if cluster has bokeh port, notify user if unexpected port number
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)

    # Warn at most once, however many workers fall into the 1x-2x range.
    warned_of_memory = False
    for worker in list(client.scheduler_info()['workers'].values()):
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            logger.warning(
                "Worker memory is between 1 to 2 times the memory"
                " size of the EntitySet. If errors occur that do"
                " not occur with n_jobs equals 1, this may be the "
                "cause. See https://featuretools.alteryx.com/en/stable/guides/performance.html#parallel-feature-computation"
                " for more information.")
            warned_of_memory = True

    return client, cluster
def test_rabit_ops():
    """Run the rabit operations check on a three-worker local cluster."""
    from distributed import Client, LocalCluster

    worker_count = 3
    with LocalCluster(n_workers=worker_count) as cluster, Client(cluster) as client:
        run_rabit_ops(client, worker_count)
def client_secondary(loop, cluster_fixture):
    """Yield a Client connected to the scheduler address of ``cluster_fixture``.

    ``cluster_fixture`` must unpack into ``(scheduler, workers)``; the client
    is closed when the generator is resumed after the yield.
    """
    scheduler_info, _workers = cluster_fixture
    address = scheduler_info["address"]
    with Client(address, loop=loop) as secondary:
        yield secondary
n_runs = 200
seed_start = 1000


def _has_finished(path):
    """Return whether the run stored at *path* reached ``epochs - 5`` epochs.

    The parquet file is read exactly once; files without an "epochs" column
    count as unfinished.
    """
    frame = pd.read_parquet(path)
    return "epochs" in frame.columns and frame.epochs.max() >= epochs - 5


# Names of the result files whose runs are already (nearly) complete.
already_finished = [f.name for f in DATA_DIR.glob("*.parquet") if _has_finished(f)]

with open("tuned-hyperparameters.json", "r") as f:
    params = json.load(f)

# Ask for confirmation before connecting to the scheduler.
print("n_runs =", n_runs)
cont = input("Ok? y/n : ")
if cont.lower() == "n":
    sys.exit(1)

client = Client("localhost:8786")


def submit(seed, **kwargs):
    """Run one training job for *seed* and return the result of ``train.main``.

    The imports happen inside the function so that they are resolved where
    the function executes; the version assertions fail the job when that
    environment has a different ``train`` or ``adadamp`` installed.
    Extra keyword arguments are forwarded to ``train.main``.
    """
    import train
    assert train.__version__ == "0.1"
    import adadamp
    assert adadamp.__version__ == "0.1.4"
    return train.main(epochs=epochs, verbose=False, seed=seed, tuning=False, **kwargs)


futures = []
def test_Client_twice(loop):
    """Two clients created back to back expose schedulers on different ports."""
    options = dict(loop=loop, silence_logs=False, dashboard_address=None)
    with Client(**options) as first, Client(**options) as second:
        first_port = first.cluster.scheduler.port
        second_port = second.cluster.scheduler.port
        assert first_port != second_port
def test_blocks_until_full(loop):
    """``nthreads()`` is non-empty as soon as the client context is entered."""
    with Client(loop=loop) as c:
        thread_counts = c.nthreads()
        assert len(thread_counts) > 0
def test_Client_solo(loop):
    """The client's cluster reports "closed" after the client context exits."""
    with Client(loop=loop, silence_logs=False) as solo:
        pass  # entering and leaving the context is the whole exercise
    final_status = solo.cluster.status
    assert final_status == "closed"
def test_Client_kwargs(loop):
    """Keyword arguments given to Client shape the cluster it creates.

    Two in-process workers are requested; the cluster's ``workers`` mapping
    must hold exactly two Worker instances, and the cluster must report
    "closed" after the client context exits.
    """
    options = dict(loop=loop, processes=False, n_workers=2, silence_logs=False)
    with Client(**options) as c:
        assert len(c.cluster.workers) == 2
        worker_checks = (isinstance(w, Worker) for w in c.cluster.workers.values())
        assert all(worker_checks)
    final_status = c.cluster.status
    assert final_status == "closed"
def create_dask_client():
    """Start a LocalCUDACluster and return a Client attached to it.

    A notice is printed before the cluster is created and the client's
    representation is printed afterwards.
    """
    print("Creating local cuda cluster as no dask scheduler is provided.")
    dask_client = Client(LocalCUDACluster())
    print(dask_client)
    return dask_client
def get_dask_client(self):
    """Return a new Client constructed from ``self.scheduler``."""
    scheduler = self.scheduler
    return Client(scheduler)
def test_Client_twice(loop):
    """Two clients created back to back expose schedulers on different ports."""
    with Client(loop=loop) as first, Client(loop=loop) as second:
        first_port = first.cluster.scheduler.port
        second_port = second.cluster.scheduler.port
        assert first_port != second_port
import time
import numpy as np
from netCDF4 import Dataset
from datetime import datetime
import sys
import os

"""
Get the radars from given time.
Input the range of dates and time wanted for the collection of images
"""
# Command line: start_year start_month start_day end_year end_month end_day
start_year = int(sys.argv[1])
start_month = int(sys.argv[2])
start_day = int(sys.argv[3])
end_year = int(sys.argv[4])
end_month = int(sys.argv[5])
end_day = int(sys.argv[6])

# serial = 0 run in parallel, = 1 run in serial
serial = 0

times = time_procedures.get_radar_times_cpol(start_year, start_month, start_day,
                                             19, 0,
                                             end_year, end_month, end_day,
                                             0, 2)

if serial == 0:
    # Initialize the cluster. Adjust the number of workers to your liking.
    # The context managers close the client and the cluster once all tasks
    # have finished, also when mapping or waiting raises.
    with LocalCluster(n_workers=4, processes=False) as Cluster, Client(Cluster) as client:
        # Map the calls to multidop onto the workers
        the_futures = client.map(do_multidop_for_time, times[0])
        wait(the_futures)
else:
    for timer in times[0]:
        do_multidop_for_time(timer)
def test_client_cluster_synchronous(loop):
    """Neither the client nor its cluster is flagged as asynchronous."""
    with clean(threads=False), Client(loop=loop, processes=False) as c:
        client_is_async = c.asynchronous
        assert not client_is_async
        cluster_is_async = c.cluster.asynchronous
        assert not cluster_is_async
def coro():
    """Start a cluster, run ``func`` against it, clean up and return its result.

    Written as a generator-based coroutine: every asynchronous step is
    ``yield``-ed and the final value is delivered with ``gen.Return``.
    All configuration (``config``, ``ncores``, ``scheduler``, ``client``,
    ``timeout``, ``func``, ...) is read from the enclosing scope.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source -- confirm which statements belong to the
    ``finally`` clause before relying on this layout.
    """
    with dask.config.set(config):
        # ``s`` stays False until one of the (at most five) start attempts
        # succeeds.
        s = False
        for i in range(5):
            try:
                s, ws = yield start_cluster(
                    ncores,
                    scheduler,
                    loop,
                    security=security,
                    Worker=Worker,
                    scheduler_kwargs=scheduler_kwargs,
                    worker_kwargs=worker_kwargs,
                )
            except Exception as e:
                logger.error(
                    "Failed to start gen_cluster, retrying",
                    exc_info=True,
                )
            else:
                # Slice assignment updates the outer ``workers`` list in place.
                workers[:] = ws
                args = [s] + workers
                break
        if s is False:
            raise Exception("Could not start cluster")
        if client:
            c = yield Client(s.address, loop=loop, security=security, asynchronous=True, **client_kwargs)
            # When a client is requested it becomes the first argument of func.
            args = [c] + args
        try:
            future = func(*args)
            if timeout:
                future = gen.with_timeout(
                    timedelta(seconds=timeout), future)
            result = yield future
            if s.validate:
                s.validate_state()
        finally:
            # Teardown runs whether or not func raised.
            if client and c.status not in ("closing", "closed"):
                yield c._close(fast=s.status == "closed")
            yield end_cluster(s, workers)
            yield gen.with_timeout(
                timedelta(seconds=1), cleanup_global_workers())

        # presumably default_client() raises ValueError when no client is
        # registered -- verify; any client it returns is closed here.
        try:
            c = yield default_client()
        except ValueError:
            pass
        else:
            yield c._close(fast=True)

        # Poll up to five times (50 ms apart) for all comms to be closed; the
        # ``else`` branch only runs when the loop was never broken out of.
        for i in range(5):
            if all(c.closed() for c in Comm._instances):
                break
            else:
                yield gen.sleep(0.05)
        else:
            L = [
                c for c in Comm._instances if not c.closed()
            ]
            Comm._instances.clear()
            # raise ValueError("Unclosed Comms", L)
            print("Unclosed Comms", L)

        raise gen.Return(result)
def _correct_errors(ra, err_rate, p_value=0.05):
    """Correct RMT errors of a read array in parallel on a local Dask cluster.

    The reads are grouped by cell, the groups are split into chunks and each
    chunk is submitted to ``_correct_errors_by_cell_group_chunks``. The
    returned ``(read index, donor read index)`` pairs are then applied to
    ``ra`` in place.

    Side effects: sets two ``dask.config`` options, writes
    ``dask-report.html`` and (unless broadcasting is enabled)
    ``pre-correction-ra.pickle`` to the working directory, and updates
    ``ra.data["rmt"]`` / ``ra.data["status"]``.

    :param ra: read array; must provide ``group_indices_by_cell()``,
      ``data`` and ``filter_codes``
    :param err_rate: forwarded unchanged to the per-chunk worker function
    :param p_value: forwarded unchanged to the per-chunk worker function
    :return: ``pd.DataFrame`` with columns ``CB``, ``UR``, ``UB`` holding the
      unique (cell, pre-correction rmt, post-correction rmt) triples
    :raises Exception: if any submitted task did not complete
    """
    # True: use Dask's broadcast (ra transfer via inproc/tcp)
    # False: each worker reads ra.pickle from disk
    use_dask_broadcast = False

    log.debug(
        "Available CPU / RAM: {} / {} GB".format(
            _get_cpu_count(), int(_get_available_memory() / 1024 ** 3)
        ),
        module_name="rmt_correction",
    )

    n_workers = _calc_max_workers(ra)

    log.debug(
        "Estimated optimum n_workers: {}".format(n_workers),
        module_name="rmt_correction",
    )

    # A positive SEQC_MAX_WORKERS environment variable overrides the estimate.
    if int(os.environ.get("SEQC_MAX_WORKERS", 0)) > 0:
        n_workers = int(os.environ.get("SEQC_MAX_WORKERS"))
        log.debug(
            "n_workers overridden with SEQC_MAX_WORKERS: {}".format(n_workers),
            module_name="rmt_correction",
        )

    # configure dask.distributed
    # memory_terminate_fraction doesn't work for some reason
    # https://github.com/dask/distributed/issues/3519
    # https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
    # https://docs.dask.org/en/latest/scheduling.html#local-threads
    worker_kwargs = {
        "n_workers": n_workers,
        "threads_per_worker": 1,
        "processes": True,
        "memory_limit": "64G",
        "memory_target_fraction": 0.95,
        "memory_spill_fraction": 0.99,
        "memory_pause_fraction": False,
        # "memory_terminate_fraction": False,
    }

    # do not kill worker at 95% memory level
    dask.config.set({"distributed.worker.memory.terminate": False})
    dask.config.set({"distributed.scheduler.allowed-failures": 50})

    # setup Dask distributed client
    cluster = LocalCluster(**worker_kwargs)
    client = Client(cluster)

    # debug message
    log.debug(
        "Dask processes={} threads={}".format(
            len(client.nthreads().values()), np.sum(list(client.nthreads().values()))
        ),
        module_name="rmt_correction",
    )
    log.debug(
        "Dask worker_kwargs "
        + " ".join([f"{k}={v}" for k, v in worker_kwargs.items()]),
        module_name="rmt_correction",
    )
    log.debug("Dask Dashboard=" + client.dashboard_link, module_name="rmt_correction")

    # group by cells (same cell barcodes as one group)
    log.debug("Grouping...", module_name="rmt_correction")
    indices_grouped_by_cells = ra.group_indices_by_cell()

    if use_dask_broadcast:
        # send readarray in advance to all workers (i.e. broadcast=True)
        # this way, we reduce the serialization time
        log.debug("Scattering ReadArray...", module_name="rmt_correction")
        [future_ra] = client.scatter([ra], broadcast=True)
    else:
        # write ra to pickle which will be used later to parallel process rmt correction
        with open("pre-correction-ra.pickle", "wb") as fout:
            pickle.dump(ra, fout, protocol=4)

    # correct errors per cell group in parallel
    log.debug("Submitting jobs to Dask...", module_name="rmt_correction")
    with performance_report(filename="dask-report.html"):
        futures = []

        # distribute chunks to workers evenly
        n_chunks = math.ceil(len(indices_grouped_by_cells) / n_workers)
        chunks = partition_all(n_chunks, indices_grouped_by_cells)

        for chunk in tqdm(chunks, disable=None):
            future = client.submit(
                _correct_errors_by_cell_group_chunks,
                future_ra if use_dask_broadcast else None,
                chunk,
                err_rate,
                p_value,
            )
            futures.append(future)

        # wait until all done
        log.debug("Waiting untill all tasks complete...", module_name="rmt_correction")
        completed, not_completed = wait(futures)

    # Even a single unfinished task means results are missing (the previous
    # ``> 1`` comparison let exactly one unfinished task slip through).
    if not_completed:
        raise Exception("There are uncompleted tasks!")

    # gather the results and release
    log.debug(
        "Collecting the task results from the workers...", module_name="rmt_correction"
    )
    results = []
    for future in tqdm(completed, disable=None):
        # this returns a list of a list
        # len(result) should be the number of chunks e.g. 50
        result = future.result()

        # remove empty lists
        result = list(filter(lambda x: len(x) > 0, result))

        # aggregate and release
        results.extend(result)
        future.release()

    # clean up
    del futures
    del completed
    del not_completed

    client.shutdown()
    client.close()

    # iterate through the list of returned read indices and donor rmts
    # create a mapping table of pre-/post-correction
    # This must happen before ``ra`` is updated below, because it reads the
    # pre-correction rmt values. The set de-duplicates repeated triples.
    mapping = set()
    for result in results:
        for idx, idx_corrected_rmt in result:
            mapping.add(
                (
                    ra.data["cell"][idx],
                    ra.data["rmt"][idx],
                    ra.data["rmt"][idx_corrected_rmt],
                )
            )

    # iterate through the list of returned read indices and donor rmts
    # actually, update the read array object with corrected UMI
    for result in results:
        for idx, idx_corrected_rmt in result:

            # skip if it's already marked as rmt error
            if ra.data["status"][idx_corrected_rmt] & ra.filter_codes["rmt_error"]:
                continue

            # correct
            ra.data["rmt"][idx] = ra.data["rmt"][idx_corrected_rmt]

            # report error
            ra.data["status"][idx] |= ra.filter_codes["rmt_error"]

    return pd.DataFrame(mapping, columns=["CB", "UR", "UB"])
def tls_client(tls_cluster, loop, security):
    """Yield a Client connected to the scheduler address of ``tls_cluster``.

    ``tls_cluster`` must unpack into ``(scheduler, workers)``; ``security``
    and ``loop`` are handed to the Client, which is closed when the generator
    is resumed after the yield.
    """
    scheduler_info, _workers = tls_cluster
    address = scheduler_info["address"]
    with Client(address, security=security, loop=loop) as secured:
        yield secured
def test_secede_with_no_processes(loop):  # noqa: F811
    """Run a small Parallel job on the 'dask' backend with an in-process client.

    Regression test for https://github.com/dask/distributed/issues/1775
    """
    with Client(loop=loop, processes=False, set_as_default=True), \
            parallel_backend('dask'):
        runner = Parallel(n_jobs=4)
        tasks = (delayed(id)(i) for i in range(2))
        runner(tasks)
def test_stream_shares_client_loop(loop):
    """A Stream with a scattered timed window uses the client's event loop."""
    with cluster() as (sched, [_a, _b]):
        address = sched['address']
        with Client(address, loop=loop) as client:  # noqa: F841
            source = Stream()
            windowed = source.timed_window('20ms')
            scattered = windowed.scatter()  # noqa: F841
            assert source.loop is client.loop
def test_dont_assume_function_purity(loop):  # noqa: F811
    """Two identical calls to ``random2`` must produce different results."""
    with cluster() as (sched, [_a, _b]):
        address = sched['address']
        with Client(address, loop=loop) as client:  # noqa: F841
            with parallel_backend('dask') as (_backend, _):
                runner = Parallel()
                first, second = runner(delayed(random2)() for i in range(2))
                assert first != second
def test_empty_dmatrix_hist():
    """Run the empty-DMatrix check with the 'hist' tree method on 5 workers."""
    with LocalCluster(n_workers=5) as cluster, Client(cluster) as client:
        hist_params = {'tree_method': 'hist'}
        run_empty_dmatrix(client, hist_params)
def setup():
    """Register a client on a single-worker, single-thread, in-process cluster.

    The client is handed to ``use_distributed``.
    """
    from distributed import LocalCluster, Client

    local = LocalCluster(n_workers=1, threads_per_worker=1, processes=False)
    dask_client = Client(local)
    use_distributed(dask_client)
async def test_config(cleanup):
    """Config passed to a Nanny is visible through ``dask.config`` on its worker."""
    async with Scheduler() as sched:
        async with Nanny(sched.address, config={"foo": "bar"}) as nanny:
            async with Client(sched.address, asynchronous=True) as client:
                per_worker = await client.run(dask.config.get, "foo")
                reported = per_worker[nanny.worker_address]
                assert reported == "bar"
def client():
    """Yield a synchronous, in-process Client and close it afterwards.

    The ``finally`` clause closes the client even when the consumer of the
    generator raises.
    """
    dask_client = Client(processes=False, asynchronous=False)
    try:
        yield dask_client
    finally:
        dask_client.close()
def test_empty_dmatrix_approx():
    """Run the empty-DMatrix regression and classification checks with 'approx'."""
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        approx_params = {'tree_method': 'approx'}
        run_empty_dmatrix_reg(client, approx_params)
        run_empty_dmatrix_cls(client, approx_params)
def make_datasets(in_csv, out_dir):
    """Processes csv file and saves a curated dataset to disk.

    The csv is loaded as a dask DataFrame, missing values are filled, several
    text columns are dropped, the remaining columns are categorized and the
    rows are split 10% / 90% into test and train sets, which are handed to
    ``_save_datasets``. The connection to the dask scheduler is closed again
    when the function exits, including on errors.

    Parameters
    ----------
    in_csv: str
        path to csv file in local disk
    out_dir:
        directory where files should be saved to (created when missing).

    Returns
    -------
    None
    """
    log = logging.getLogger('make-dataset')
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    log.info(
        f'Starting make_datasets with in_csv: {in_csv} and out_dir: {out_dir}')

    # Connect to the dask cluster. The context manager closes the client when
    # the work is done, so the scheduler connection is not leaked.
    log.info('Connecting to cluster')
    with Client('dask-scheduler:8786'):
        # load data as a dask Dataframe if you have trouble with dask
        # please fall back to pandas or numpy
        log.info('Reading csv file')
        ddf = dd.read_csv(in_csv, blocksize=1e6)
        log.info('ouput dataframe head')
        log.info(ddf.head())

        log.info('Trace 1')
        # we set the index so we can properly execute loc below
        ddf = ddf.set_index('Unnamed: 0')

        # trigger computation
        n_samples = len(ddf)

        # Fill NaN values with new 'Unknown' category
        ddf['country'] = ddf['country'].fillna('Unknown')
        ddf['province'] = ddf['province'].fillna('Unknown')
        ddf['taster_name'] = ddf['taster_name'].fillna('Unknown')

        log.info('Trace 2')
        # Fill region_1 missing values using the 'province' column.
        # Most common value for each province will be used. Rest are labeled Unknown
        mode = dd.Aggregation('mode', chunk, agg, finalize)
        most_common_region = ddf.groupby(['province']).agg({
            'region_1': mode
        }).compute()
        # ``where`` keeps the freshly computed value only for rows whose
        # region_1 is missing; all other rows keep their original value.
        ddf['region_1'] = ddf.apply(
            lambda x: most_common_region.loc[x.province, 'region_1']
            if x.province in most_common_region['region_1'].index else 'Unknown',
            axis=1).where(ddf['region_1'].isna(), ddf['region_1'])

        log.info('Trace 3')
        # We fill price values with the province's average price. If that is
        # not available, we use the global average price
        mean_prices = ddf.groupby(['province'])['price'].mean().compute()
        global_mean = ddf['price'].mean().compute()
        mean_prices = mean_prices.fillna(global_mean)
        ddf['price'] = ddf.apply(lambda x: mean_prices[x['province']],
                                 axis=1,
                                 meta=('x', 'f8')).where(ddf['price'].isna(),
                                                         ddf['price'])

        # Drop this columns as explained in notebook
        ddf = ddf.drop([
            'description', 'designation', 'region_2', 'taster_twitter_handle',
            'title'
        ], axis=1)

        # Encode categorical values using one-hot encoding.
        # This results in >6k columns. Maybe we'll need to change the encoding type
        # for some features such as 'winery' with so many unique values.
        # Also, I think this should be done in the model task.
        ddf = ddf.categorize()
        # encoder = DummyEncoder()
        # ddf = encoder.fit_transform(ddf)

        # # Normalize price values
        # scaler = StandardScaler()
        # ddf['price'] = scaler.fit_transform(ddf[['price']]).price

        log.info('dataset processed')

        # split dataset into train test feel free to adjust test percentage
        idx = np.arange(n_samples)
        test_idx = idx[:n_samples // 10]
        test = ddf.loc[test_idx]

        train_idx = idx[n_samples // 10:]
        train = ddf.loc[train_idx]

        # This also shuffles the data. Not sure if csv was shuffled before..
        # train, test = ddf.random_split([0.9, 0.1], shuffle=True)

        _save_datasets(train, test, out_dir)
def test_Client_with_local(loop):
    """A client attached to a LocalCluster sees one entry per cluster worker.

    Also checks that the cluster's repr mentions its scheduler address.
    """
    with LocalCluster(1, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as local:
        with Client(local, loop=loop) as attached:
            core_counts = attached.ncores()
            assert len(core_counts) == len(local.workers)
            assert local.scheduler_address in repr(local)
def test_Client_solo(loop):
    """The client's cluster reports 'closed' after the client context exits."""
    with Client(loop=loop) as solo:
        pass  # entering and leaving the context is the whole exercise
    final_status = solo.cluster.status
    assert final_status == 'closed'