Example #1
def test_Client_kwargs(loop):
    with Client(loop=loop, processes=False, n_workers=2) as c:
        assert len(c.cluster.workers) == 2
        assert all(isinstance(w, Worker) for w in c.cluster.workers)
    assert c.cluster.status == 'closed'
Example #2
    initialize_ray()
    num_cpus = ray.cluster_resources()["CPU"]
elif execution_engine == "Dask":  # pragma: no cover
    from distributed.client import get_client
    import warnings

    if threading.current_thread().name == "MainThread":
        warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            client = get_client()
        except ValueError:
            from distributed import Client

            num_cpus = os.environ.get("MODIN_CPUS",
                                      None) or multiprocessing.cpu_count()
            client = Client(n_workers=int(num_cpus))
elif execution_engine != "Python":
    raise ImportError(
        "Unrecognized execution engine: {}.".format(execution_engine))

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
    "read_csv",
    "read_parquet",
    "read_json",
    "read_html",
    "read_clipboard",
    "read_excel",
Example #3
        ray.register_custom_serializer(types.MethodType, use_pickle=True)


if execution_engine == "Ray":
    initialize_ray()
    num_cpus = ray.global_state.cluster_resources()["CPU"]
elif execution_engine == "Dask":  # pragma: no cover
    from distributed.client import _get_global_client

    if threading.current_thread().name == "MainThread":
        # initialize the dask client
        client = _get_global_client()
        if client is None:
            from distributed import Client

            client = Client()
        num_cpus = sum(client.ncores().values())
elif execution_engine != "Python":
    raise ImportError("Unrecognized execution engine: {}.".format(execution_engine))

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
    "read_csv",
    "read_parquet",
    "read_json",
    "read_html",
    "read_clipboard",
    "read_excel",
Example #4
def create_client_and_cluster(n_jobs, dask_kwargs, entityset_size):
    Client, LocalCluster = get_client_cluster()

    cluster = None
    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # diagnostics_port sets the default port for the bokeh web interface;
        # if it is set to None, the web interface will not be launched
        diagnostics_port = None
        if 'diagnostics_port' in dask_kwargs:
            diagnostics_port = dask_kwargs['diagnostics_port']
            del dask_kwargs['diagnostics_port']

        workers = n_jobs_to_workers(n_jobs)
        if n_jobs != -1 and workers < n_jobs:
            warning_string = "{} workers requested, but only {} workers created."
            warning_string = warning_string.format(n_jobs, workers)
            warnings.warn(warning_string)

        # Distributed default memory_limit for worker is 'auto'. It calculates worker
        # memory limit as total virtual memory divided by the number
        # of cores available to the workers (always 1 for featuretools setup).
        # This means reducing the number of workers does not increase the memory
        # limit for other workers.  Featuretools default is to calculate memory limit
        # as total virtual memory divided by number of workers. To use distributed
        # default memory limit, set dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in dask_kwargs:
            memory_limit = dask_kwargs['memory_limit']
            del dask_kwargs['memory_limit']
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **dask_kwargs)

        # if cluster has bokeh port, notify user if unexpected port number
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)

    warned_of_memory = False
    for worker in list(client.scheduler_info()['workers'].values()):
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            logger.warning(
                "Worker memory is between 1 to 2 times the memory"
                " size of the EntitySet. If errors occur that do"
                " not occur with n_jobs equals 1, this may be the "
                "cause.  See https://featuretools.alteryx.com/en/stable/guides/performance.html#parallel-feature-computation"
                " for more information.")
            warned_of_memory = True

    return client, cluster
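A minimal usage sketch for the helper above; the argument values below are illustrative assumptions, not part of the original source. Note that 'diagnostics_port' and 'memory_limit' are popped from dask_kwargs before the remaining keywords are forwarded to LocalCluster.

client, cluster = create_client_and_cluster(
    n_jobs=2,
    dask_kwargs={'diagnostics_port': None, 'memory_limit': 'auto'},
    entityset_size=10 ** 8,  # rough in-memory size of the EntitySet, in bytes
)
try:
    pass  # run the parallel feature computation here
finally:
    client.close()
    cluster.close()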
Example #5
def test_rabit_ops():
    from distributed import Client, LocalCluster
    n_workers = 3
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            run_rabit_ops(client, n_workers)
Example #6
def client_secondary(loop, cluster_fixture):
    scheduler, workers = cluster_fixture
    with Client(scheduler["address"], loop=loop) as client:
        yield client
Example #7
    n_runs = 200
    seed_start = 1000

    already_finished = [
        f.name for f in DATA_DIR.glob("*.parquet")
        if "epochs" in pd.read_parquet(f).columns
        and pd.read_parquet(f).epochs.max() >= epochs - 5
    ]
    with open("tuned-hyperparameters.json", "r") as f:
        params = json.load(f)
    print("n_runs =", n_runs)
    cont = input("Ok? y/n : ")
    if cont.lower() == "n":
        sys.exit(1)

    client = Client("localhost:8786")

    def submit(seed, **kwargs):
        import train
        assert train.__version__ == "0.1"

        import adadamp
        assert adadamp.__version__ == "0.1.4"

        return train.main(epochs=epochs,
                          verbose=False,
                          seed=seed,
                          tuning=False,
                          **kwargs)

    futures = []
Example #8
def test_Client_twice(loop):
    with Client(loop=loop, silence_logs=False, dashboard_address=None) as c:
        with Client(loop=loop, silence_logs=False,
                    dashboard_address=None) as f:
            assert c.cluster.scheduler.port != f.cluster.scheduler.port
Example #9
def test_blocks_until_full(loop):
    with Client(loop=loop) as c:
        assert len(c.nthreads()) > 0
Example #10
def test_Client_solo(loop):
    with Client(loop=loop, silence_logs=False) as c:
        pass
    assert c.cluster.status == "closed"
Example #11
def test_Client_kwargs(loop):
    with Client(loop=loop, processes=False, n_workers=2,
                silence_logs=False) as c:
        assert len(c.cluster.workers) == 2
        assert all(isinstance(w, Worker) for w in c.cluster.workers.values())
    assert c.cluster.status == "closed"
Example #12
def create_dask_client():
    print("Creating local cuda cluster as no dask scheduler is provided.")
    cluster = LocalCUDACluster()
    client = Client(cluster)
    print(client)
    return client
Example #13
def get_dask_client(self):
    return Client(self.scheduler)
Example #14
def test_Client_twice(loop):
    with Client(loop=loop) as c:
        with Client(loop=loop) as f:
            assert c.cluster.scheduler.port != f.cluster.scheduler.port
Example #15
import time
import numpy as np
from netCDF4 import Dataset
from datetime import datetime
import sys
import os
from distributed import Client, LocalCluster, wait  # used by the parallel branch below
""" Get the radars from given time.
Input the range of dates and time wanted for the collection of images """
start_year = int(sys.argv[1])
start_month = int(sys.argv[2])
start_day = int(sys.argv[3])
end_year = int(sys.argv[4])
end_month = int(sys.argv[5])
end_day = int(sys.argv[6])

# serial = 0: run in parallel; serial = 1: run in serial
serial = 0
times = time_procedures.get_radar_times_cpol(start_year, start_month,
                                             start_day, 19, 0, end_year,
                                             end_month, end_day, 0, 2)
if (serial == 0):
    # Initialize the cluster. Adjust the number of workers to your liking.
    Cluster = LocalCluster(n_workers=4, processes=False)
    client = Client(Cluster)
    # Map the calls to multidop onto the workers
    the_futures = client.map(do_multidop_for_time, times[0])
    wait(the_futures)
else:
    for timer in times[0]:
        do_multidop_for_time(timer)
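A hedged continuation of the serial == 0 branch above: once wait() returns, any values produced by do_multidop_for_time (an assumption; the function may return nothing) can be pulled back and the local cluster shut down.

    results = client.gather(the_futures)  # blocks until every mapped call has finished
    client.close()
    Cluster.close()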
Example #16
def test_client_cluster_synchronous(loop):
    with clean(threads=False):
        with Client(loop=loop, processes=False) as c:
            assert not c.asynchronous
            assert not c.cluster.asynchronous
Example #17
                    def coro():
                        with dask.config.set(config):
                            s = False
                            for i in range(5):
                                try:
                                    s, ws = yield start_cluster(
                                        ncores,
                                        scheduler,
                                        loop,
                                        security=security,
                                        Worker=Worker,
                                        scheduler_kwargs=scheduler_kwargs,
                                        worker_kwargs=worker_kwargs,
                                    )
                                except Exception as e:
                                    logger.error(
                                        "Failed to start gen_cluster, retrying",
                                        exc_info=True,
                                    )
                                else:
                                    workers[:] = ws
                                    args = [s] + workers
                                    break
                            if s is False:
                                raise Exception("Could not start cluster")
                            if client:
                                c = yield Client(s.address,
                                                 loop=loop,
                                                 security=security,
                                                 asynchronous=True,
                                                 **client_kwargs)
                                args = [c] + args
                            try:
                                future = func(*args)
                                if timeout:
                                    future = gen.with_timeout(
                                        timedelta(seconds=timeout), future)
                                result = yield future
                                if s.validate:
                                    s.validate_state()
                            finally:
                                if client and c.status not in ("closing",
                                                               "closed"):
                                    yield c._close(fast=s.status == "closed")
                                yield end_cluster(s, workers)
                                yield gen.with_timeout(
                                    timedelta(seconds=1),
                                    cleanup_global_workers())

                            try:
                                c = yield default_client()
                            except ValueError:
                                pass
                            else:
                                yield c._close(fast=True)

                            for i in range(5):
                                if all(c.closed() for c in Comm._instances):
                                    break
                                else:
                                    yield gen.sleep(0.05)
                            else:
                                L = [
                                    c for c in Comm._instances
                                    if not c.closed()
                                ]
                                Comm._instances.clear()
                                # raise ValueError("Unclosed Comms", L)
                                print("Unclosed Comms", L)

                            raise gen.Return(result)
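The fragment above is the retry-and-teardown coroutine behind distributed's gen_cluster test decorator. As a hedged sketch, this is how such a decorator is typically consumed in the yield-based style the fragment targets; the argument order (client, scheduler, workers) follows the args = [c] + args construction above, and exact signatures vary across distributed versions.

from distributed.utils_test import gen_cluster

@gen_cluster(client=True)
def test_submit_roundtrip(c, s, a, b):
    # c: Client, s: Scheduler, a/b: Workers, all running on the test's event loop
    future = c.submit(lambda x: x + 1, 10)
    result = yield future
    assert result == 11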
Example #18
def _correct_errors(ra, err_rate, p_value=0.05):

    # True: use Dask's broadcast (ra transfer via inproc/tcp)
    # False: each worker reads ra.pickle from disk
    use_dask_broadcast = False

    log.debug(
        "Available CPU / RAM: {} / {} GB".format(
            _get_cpu_count(), int(_get_available_memory() / 1024 ** 3)
        ),
        module_name="rmt_correction",
    )

    n_workers = _calc_max_workers(ra)

    log.debug(
        "Estimated optimum n_workers: {}".format(n_workers),
        module_name="rmt_correction",
    )

    if int(os.environ.get("SEQC_MAX_WORKERS", 0)) > 0:
        n_workers = int(os.environ.get("SEQC_MAX_WORKERS"))
        log.debug(
            "n_workers overridden with SEQC_MAX_WORKERS: {}".format(n_workers),
            module_name="rmt_correction",
        )

    # n_workers = 1
    # p_value = 0.005

    # configure dask.distributed
    # memory_terminate_fraction doesn't work for some reason
    # https://github.com/dask/distributed/issues/3519
    # https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
    # https://docs.dask.org/en/latest/scheduling.html#local-threads
    worker_kwargs = {
        "n_workers": n_workers,
        "threads_per_worker": 1,
        "processes": True,
        "memory_limit": "64G",
        "memory_target_fraction": 0.95,
        "memory_spill_fraction": 0.99,
        "memory_pause_fraction": False,
        # "memory_terminate_fraction": False,
    }

    # do not kill worker at 95% memory level
    dask.config.set({"distributed.worker.memory.terminate": False})
    dask.config.set({"distributed.scheduler.allowed-failures": 50})

    # setup Dask distributed client
    cluster = LocalCluster(**worker_kwargs)
    client = Client(cluster)

    # debug message
    log.debug(
        "Dask processes={} threads={}".format(
            len(client.nthreads().values()), np.sum(list(client.nthreads().values()))
        ),
        module_name="rmt_correction",
    )
    log.debug(
        "Dask worker_kwargs "
        + " ".join([f"{k}={v}" for k, v in worker_kwargs.items()]),
        module_name="rmt_correction",
    )
    log.debug("Dask Dashboard=" + client.dashboard_link, module_name="rmt_correction")

    # group by cells (same cell barcodes as one group)
    log.debug("Grouping...", module_name="rmt_correction")
    indices_grouped_by_cells = ra.group_indices_by_cell()

    if use_dask_broadcast:
        # send readarray in advance to all workers (i.e. broadcast=True)
        # this way, we reduce the serialization time
        log.debug("Scattering ReadArray...", module_name="rmt_correction")
        [future_ra] = client.scatter([ra], broadcast=True)
    else:
        # write ra to a pickle file that workers will later read to parallelize rmt correction
        with open("pre-correction-ra.pickle", "wb") as fout:
            pickle.dump(ra, fout, protocol=4)

    # correct errors per cell group in parallel
    log.debug("Submitting jobs to Dask...", module_name="rmt_correction")
    with performance_report(filename="dask-report.html"):
        futures = []

        # distribute chunks to workers evenly
        n_chunks = math.ceil(len(indices_grouped_by_cells) / n_workers)
        chunks = partition_all(n_chunks, indices_grouped_by_cells)

        for chunk in tqdm(chunks, disable=None):

            future = client.submit(
                _correct_errors_by_cell_group_chunks,
                future_ra if use_dask_broadcast else None,
                chunk,
                err_rate,
                p_value,
            )
            futures.append(future)

        # wait until all done
        log.debug("Waiting untill all tasks complete...", module_name="rmt_correction")
        completed, not_completed = wait(futures)

    if len(not_completed) > 0:
        raise Exception("There are uncompleted tasks!")

    # gather the results and release
    log.debug(
        "Collecting the task results from the workers...", module_name="rmt_correction"
    )
    results = []
    for future in tqdm(completed, disable=None):
        # this returns a list of lists
        # len(result) should be the number of chunks e.g. 50
        result = future.result()

        # remove empty lists
        result = list(filter(lambda x: len(x) > 0, result))

        # aggregate and release
        results.extend(result)
        future.release()

    # clean up
    del futures
    del completed
    del not_completed

    client.shutdown()
    client.close()

    # iterate through the list of returned read indices and donor rmts
    # create a mapping table of pre-/post-correction
    mapping = set()
    for result in results:
        for idx, idx_corrected_rmt in result:

            # record pre-/post-correction
            # skip if it's already marked as rmt error
            if (
                ra.data["cell"][idx],
                ra.data["rmt"][idx],
                ra.data["rmt"][idx_corrected_rmt],
            ) in mapping:
                continue

            mapping.add(
                (
                    ra.data["cell"][idx],
                    ra.data["rmt"][idx],
                    ra.data["rmt"][idx_corrected_rmt],
                )
            )

    # iterate through the list of returned read indices and donor rmts
    # actually, update the read array object with corrected UMI
    for result in results:
        for idx, idx_corrected_rmt in result:

            # skip if it's already marked as rmt error
            if ra.data["status"][idx_corrected_rmt] & ra.filter_codes["rmt_error"]:
                continue

            # correct
            ra.data["rmt"][idx] = ra.data["rmt"][idx_corrected_rmt]

            # report error
            ra.data["status"][idx] |= ra.filter_codes["rmt_error"]

    return pd.DataFrame(mapping, columns=["CB", "UR", "UB"])
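A stripped-down, hedged sketch of the chunked-submission pattern used above; process_chunk stands in for _correct_errors_by_cell_group_chunks and is an assumption, not part of the original module.

import math
from distributed import Client, LocalCluster, wait
from toolz import partition_all

def process_chunk(chunk):
    return [len(item) for item in chunk]  # placeholder per-group work

if __name__ == "__main__":
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    work_items = [[1, 2], [3], [4, 5, 6]] * 10
    chunk_size = math.ceil(len(work_items) / 2)  # spread the items over ~2 workers
    futures = [client.submit(process_chunk, c)
               for c in partition_all(chunk_size, work_items)]
    done, not_done = wait(futures)  # blocks until all chunks finish
    results = [f.result() for f in done]
    client.close()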
Example #19
def tls_client(tls_cluster, loop, security):
    s, workers = tls_cluster
    with Client(s["address"], security=security, loop=loop) as client:
        yield client
Example #20
def test_secede_with_no_processes(loop):  # noqa: F811
    # https://github.com/dask/distributed/issues/1775
    with Client(loop=loop, processes=False, set_as_default=True):
        with parallel_backend('dask'):
            Parallel(n_jobs=4)(delayed(id)(i) for i in range(2))
Example #21
def test_stream_shares_client_loop(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            source = Stream()
            d = source.timed_window('20ms').scatter()  # noqa: F841
            assert source.loop is client.loop
Example #22
def test_dont_assume_function_purity(loop):  # noqa: F811
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask') as (ba, _):
                x, y = Parallel()(delayed(random2)() for i in range(2))
                assert x != y
Example #23
def test_empty_dmatrix_hist():
    with LocalCluster(n_workers=5) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'hist'}
            run_empty_dmatrix(client, parameters)
Example #24
def setup():
    from distributed import LocalCluster, Client
    cluster = LocalCluster(n_workers=1, threads_per_worker=1, processes=False)
    use_distributed(Client(cluster))
Example #25
async def test_config(cleanup):
    async with Scheduler() as s:
        async with Nanny(s.address, config={"foo": "bar"}) as n:
            async with Client(s.address, asynchronous=True) as client:
                config = await client.run(dask.config.get, "foo")
                assert config[n.worker_address] == "bar"
Example #26
def client():
    client = Client(processes=False, asynchronous=False)
    try:
        yield client
    finally:
        client.close()
Example #27
def test_empty_dmatrix_approx():
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'approx'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
Example #28
def make_datasets(in_csv, out_dir):
    """Processes csv file and saves a curated dataset to disk.

    Parameters
    ----------
    in_csv: str
        path to the csv file on local disk
    out_dir: str
        directory where files should be saved.

    Returns
    -------
    None
    """
    log = logging.getLogger('make-dataset')
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the dask cluster
    log.info(
        f'Starting make_datasets with in_csv: {in_csv} and out_dir: {out_dir}')
    log.info('Connecting to cluster')
    c = Client('dask-scheduler:8786')

    # load the data as a dask DataFrame; if you have trouble with dask,
    # fall back to pandas or numpy
    log.info('Reading csv file')
    ddf = dd.read_csv(in_csv, blocksize=1e6)

    log.info('output dataframe head')
    log.info(ddf.head())
    log.info('Trace 1')
    # we set the index so we can properly execute loc below
    ddf = ddf.set_index('Unnamed: 0')

    # trigger computation
    n_samples = len(ddf)

    # Fill NaN values with new 'Unknown' category
    ddf['country'] = ddf['country'].fillna('Unknown')
    ddf['province'] = ddf['province'].fillna('Unknown')
    ddf['taster_name'] = ddf['taster_name'].fillna('Unknown')
    log.info('Trace 2')
    # Fill region_1 missing values using the 'province' column.
    # Most common value for each province will be used. Rest are labeled Unknown
    mode = dd.Aggregation('mode', chunk, agg, finalize)
    most_common_region = ddf.groupby(['province']).agg({
        'region_1': mode
    }).compute()
    ddf['region_1'] = ddf.apply(
        lambda x: most_common_region.loc[x.province, 'region_1']
        if x.province in most_common_region['region_1'].index else 'Unknown',
        axis=1).where(ddf['region_1'].isna(), ddf['region_1'])
    log.info('Trace 3')
    # We fill price values with the province's average price. If that is
    # not available, we use the global average price
    mean_prices = ddf.groupby(['province'])['price'].mean().compute()
    global_mean = ddf['price'].mean().compute()
    mean_prices = mean_prices.fillna(global_mean)
    ddf['price'] = ddf.apply(lambda x: mean_prices[x['province']],
                             axis=1,
                             meta=('x', 'f8')).where(ddf['price'].isna(),
                                                     ddf['price'])
    # Drop these columns as explained in the notebook
    ddf = ddf.drop([
        'description', 'designation', 'region_2', 'taster_twitter_handle',
        'title'
    ],
                   axis=1)

    # Encode categorical values using one-hot encoding.
    # This results in >6k columns. Maybe we'll need to change the encoding type
    # for some features such as 'winery' with so many unique values.
    # Also, I think this should be done in the model task.
    ddf = ddf.categorize()
    # encoder = DummyEncoder()
    # ddf = encoder.fit_transform(ddf)

    # # Normalize price values
    # scaler = StandardScaler()
    # ddf['price'] = scaler.fit_transform(ddf[['price']]).price
    log.info('dataset processed')

    # split dataset into train test feel free to adjust test percentage
    idx = np.arange(n_samples)
    test_idx = idx[:n_samples // 10]
    test = ddf.loc[test_idx]

    train_idx = idx[n_samples // 10:]
    train = ddf.loc[train_idx]

    # This also shuffles the data. Not sure if csv was shuffled before..
    # train, test = ddf.random_split([0.9, 0.1], shuffle=True)

    _save_datasets(train, test, out_dir)
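Example #28 passes chunk, agg and finalize into dd.Aggregation without showing them. Below is a hedged sketch of one common way to build such a 'mode' (most frequent value) aggregation; it may not match the original project's helpers exactly.

import dask.dataframe as dd

def chunk(s):
    # s is the grouped series of one partition; count occurrences of each value
    return s.value_counts()

def agg(s):
    # re-grouped per-partition counts; sum the counts for each (group, value) pair
    return s.apply(lambda part: part.groupby(level=-1).sum())

def finalize(s):
    # within each group, pick the value with the largest total count
    levels = list(range(s.index.nlevels - 1))
    return s.groupby(level=levels).apply(
        lambda part: part.reset_index(level=levels, drop=True).idxmax()
    )

mode = dd.Aggregation('mode', chunk, agg, finalize)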
Example #29
def test_Client_with_local(loop):
    with LocalCluster(1, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as c:
        with Client(c, loop=loop) as e:
            assert len(e.ncores()) == len(c.workers)
            assert c.scheduler_address in repr(c)
Example #30
def test_Client_solo(loop):
    with Client(loop=loop) as c:
        pass
    assert c.cluster.status == 'closed'