Example #1
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]
    yield c._shutdown()
    yield f._shutdown()
Example #2
def test_multiple_maxlen(c, s, a, b):
    c2 = Client((s.ip, s.port), start=False)
    yield c2._start()

    x = c.channel('x', maxlen=10)
    assert x.futures.maxlen == 10
    x2 = c2.channel('x', maxlen=20)
    assert x2.futures.maxlen == 20

    for i in range(10):
        x.append(c.submit(inc, i))

    while len(s.wants_what[c2.id]) < 10:
        yield gen.sleep(0.01)

    for i in range(10, 20):
        x.append(c.submit(inc, i))

    while len(x2) < 20:
        yield gen.sleep(0.01)

    yield gen.sleep(0.1)

    assert len(x2) == 20  # They stay at this length even after a delay
    assert len(s.task_state) == 20

    yield c2._shutdown()
Example #3
def create_client_and_cluster(n_jobs, num_tasks, dask_kwargs, entityset_size):
    cluster = None
    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # diagnostics_port sets the default port to launch bokeh web interface
        # if it is set to None web interface will not be launched
        diagnostics_port = None
        if 'diagnostics_port' in dask_kwargs:
            diagnostics_port = dask_kwargs['diagnostics_port']
            del dask_kwargs['diagnostics_port']

        workers = n_jobs_to_workers(n_jobs)
        workers = min(workers, num_tasks)

        # Distributed default memory_limit for worker is 'auto'. It calculates worker
        # memory limit as total virtual memory divided by the number
        # of cores available to the workers (always 1 for the featuretools setup).
        # This means reducing the number of workers does not increase the memory
        # limit for other workers.  Featuretools default is to calculate memory limit
        # as total virtual memory divided by number of workers. To use distributed
        # default memory limit, set dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in dask_kwargs:
            memory_limit = dask_kwargs['memory_limit']
            del dask_kwargs['memory_limit']
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **dask_kwargs)

        # if the cluster has a bokeh port, notify the user in case it is an unexpected port number
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)

    warned_of_memory = False
    for worker in list(client.scheduler_info()['workers'].values()):
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            logger.warn("Worker memory is between 1 to 2 times the memory"
                        " size of the EntitySet. If errors occur that do"
                        " not occur with n_jobs equals 1, this may be the "
                        "cause.  See https://docs.featuretools.com/guides/parallel.html"
                        " for more information.")
            warned_of_memory = True

    return client, cluster
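A minimal usage sketch for the helper above (not part of the original source): the n_jobs, task count, diagnostics port, and EntitySet size are hypothetical values chosen for illustration.

# Hypothetical call: let the helper size a LocalCluster from n_jobs and forward
# a diagnostics port so the Bokeh dashboard is launched.
dask_kwargs = {'diagnostics_port': 8787}
client, cluster = create_client_and_cluster(n_jobs=4,
                                            num_tasks=8,
                                            dask_kwargs=dask_kwargs,
                                            entityset_size=10 * 1024 ** 2)  # assumed ~10 MB EntitySet
try:
    print(list(client.scheduler_info()['workers'].keys()))
finally:
    client.close()
    cluster.close()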
Example #4
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        :param data_length: The total number of calculations that need to be distributed.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
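A brief, hypothetical usage sketch for ClusterDaskDistributor: it assumes a Dask scheduler is reachable at the address shown, and the worker function and chunk lists are made up for illustration.

def count_items(chunk_list, factor=1):
    # each worker receives a list of chunks and must return a list of results
    return [len(chunk) * factor for chunk in chunk_list]

distributor = ClusterDaskDistributor(address="tcp://127.0.0.1:8786")

# If the cluster had 5 workers, calculate_best_chunk_size(1000) would return 40,
# since divmod(1000, 5 * 5) == (40, 0).
chunk_size = distributor.calculate_best_chunk_size(1000)

partitioned_chunks = [[[1, 2, 3]], [[4, 5]]]   # two worker-sized lists of chunks
results = distributor.distribute(count_items, partitioned_chunks, kwargs={"factor": 2})
print(chunk_size, results)   # e.g. 40 and [6, 4] on a 5-worker cluster
distributor.close()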
Example #5
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
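An analogous local sketch (illustrative, not from the source): the summarizing function and chunk data are arbitrary, and the in-process cluster keeps everything on one machine.

def summarize(chunk_list):
    # each element handed to a worker is itself a list of chunks
    return [sum(chunk) for chunk in chunk_list]

local_distributor = LocalDaskDistributor(n_workers=2)
partitioned_chunks = [[[1, 2], [3]], [[4, 5, 6]]]
print(local_distributor.distribute(summarize, partitioned_chunks, kwargs={}))  # [3, 3, 15]
local_distributor.close()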
Example #6
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
Example #7
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < start + 15

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers
    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
Example #8
def test_wait_for_workers_timeout():
    # Start a cluster with 0 worker:
    cluster = LocalCluster(n_workers=0, processes=False, threads_per_worker=2)
    client = Client(cluster)
    try:
        with parallel_backend('dask', wait_for_workers_timeout=0.1):
            # Short timeout: DaskDistributedBackend
            msg = "DaskDistributedBackend has no worker after 0.1 seconds."
            with pytest.raises(TimeoutError, match=msg):
                Parallel()(delayed(inc)(i) for i in range(10))

        with parallel_backend('dask', wait_for_workers_timeout=0):
            # No timeout: fallback to generic joblib failure:
            msg = "DaskDistributedBackend has no active worker"
            with pytest.raises(RuntimeError, match=msg):
                Parallel()(delayed(inc)(i) for i in range(10))
    finally:
        client.close()
        cluster.close()
Example #9
def test_wait_for_workers(cluster_strategy):
    cluster = LocalCluster(n_workers=0, processes=False, threads_per_worker=2)
    client = Client(cluster)
    if cluster_strategy == "adaptive":
        cluster.adapt(minimum=0, maximum=2)
    elif cluster_strategy == "late_scaling":
        # Tell the cluster to start workers but this is a non-blocking call
        # and new workers might take time to connect. In this case the Parallel
        # call should wait for at least one worker to come up before starting
        # to schedule work.
        cluster.scale(2)
    try:
        with parallel_backend('dask'):
            # The following should wait a bit for at least one worker to
            # become available.
            Parallel()(delayed(inc)(i) for i in range(10))
    finally:
        client.close()
        cluster.close()
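The cluster_strategy argument implies a parametrized test; a plausible parametrization (assumed here, it is not shown in the snippet) would be:

import pytest

@pytest.mark.parametrize("cluster_strategy", ["adaptive", "late_scaling"])
def test_wait_for_workers(cluster_strategy):
    ...  # body as above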
Example #10
    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)
Example #11
def test_scale_up_and_down():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, nanny=False, silence_logs=False,
                           diagnostics_port=None, loop=loop, start=False)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    assert not cluster.workers

    yield cluster.scale_up(2)
    assert len(cluster.workers) == 2
    assert len(cluster.scheduler.ncores) == 2

    addr = cluster.workers[0].address
    yield cluster.scale_down([addr])

    assert len(cluster.workers) == 1
    assert addr not in cluster.scheduler.ncores

    yield c._shutdown()
    yield cluster._close()
Example #12
    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client

        cluster = LocalCluster(n_workers=n_workers, processes=False)
        self.client = Client(cluster)
        self.n_workers = n_workers
Example #13
def test_multiple_clients_restart(s, a, b):
    e1 = Client((s.ip, s.port), start=False)
    yield e1._start()
    e2 = Client((s.ip, s.port), start=False)
    yield e2._start()

    x = e1.submit(inc, 1)
    y = e2.submit(inc, 2)
    xx = yield x._result()
    yy = yield y._result()
    assert xx == 2
    assert yy == 3

    yield e1._restart()

    assert x.cancelled()
    assert y.cancelled()

    yield e1._shutdown(fast=True)
    yield e2._shutdown(fast=True)
Example #14
    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers
Example #15
def test_publish_simple(s, a, b):
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    data = yield c._scatter(range(3))
    out = yield c._publish_dataset(data=data)
    assert 'data' in s.extensions['publish'].datasets

    with pytest.raises(KeyError) as exc_info:
        out = yield c._publish_dataset(data=data)

    assert "exists" in str(exc_info.value)
    assert "data" in str(exc_info.value)

    result = yield c.scheduler.publish_list()
    assert result == ['data']

    result = yield f.scheduler.publish_list()
    assert result == ['data']

    yield c._shutdown()
    yield f._shutdown()
Example #16
def test_Client_solo(loop):
    with Client(loop=loop) as c:
        pass
    assert c.cluster.status == 'closed'
Example #17
def test_client_cluster_synchronous(loop):
    with clean(threads=False):
        with Client(loop=loop, processes=False) as c:
            assert not c.asynchronous
            assert not c.cluster.asynchronous
Example #18
def test_Client_twice(loop):
    with Client(loop=loop, silence_logs=False, dashboard_address=None) as c:
        with Client(loop=loop, silence_logs=False, dashboard_address=None) as f:
            assert c.cluster.scheduler.port != f.cluster.scheduler.port
Example #19
def test_Client_solo(loop):
    with Client(loop=loop, silence_logs=False) as c:
        pass
    assert c.cluster.status == Status.closed
Example #20
class Ensemble(object):
    '''
    Ensembles represent multiple SUMMA configurations based on
    changing the decisions or parameters of a given run.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    filemanager: (optional)
        Path to the file manager
    configuration:
        Dictionary of runs, along with settings
    num_workers:
        Number of parallel workers to use
    simulations:
        Dictionary of run names and Simulation objects
    '''
    def __init__(self,
                 executable: str,
                 configuration: dict,
                 filemanager: str = None,
                 num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 scheduler: str = None,
                 client: Client = None):
        """
        Create a new Ensemble object. The API mirrors that of the
        Simulation object.
        """
        self._status = 'Initialized'
        self.executable: str = executable
        self.filemanager: str = filemanager
        self.configuration: dict = configuration
        self.num_workers: int = num_workers
        self.simulations: dict = {}
        self.submissions: list = []
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(self.num_workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(self.num_workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self._generate_simulation_objects()

    def _generate_simulation_objects(self):
        """
        Create a mapping of configurations to the simulation objects.
        """
        if self.filemanager:
            for name, config in self.configuration.items():
                self.simulations[name] = Simulation(self.executable,
                                                    self.filemanager, False)
        else:
            for name, config in self.configuration.items():
                assert config['file_manager'] is not None, \
                    "No filemanager found in configuration or Ensemble!"
                self.simulations[name] = Simulation(self.executable,
                                                    config['file_manager'],
                                                    False)

    def _generate_coords(self):
        """
        Generate the coordinates that can be used to merge the output
        of the ensemble runs into a single dataset.
        """
        decision_dims = ChainDict()
        manager_dims = ChainDict()
        parameter_dims = ChainDict()
        for name, conf in self.configuration.items():
            for k, v in conf.get('decisions', {}).items():
                decision_dims[k] = v
            for k, v in conf.get('file_manager', {}).items():
                manager_dims[k] = v
            #for k, v in conf.get('parameters', {}).items():
            #    parameter_dims[k] = v
            for k, v in conf.get('trial_parameters', {}).items():
                parameter_dims[k] = v
        return {
            'decisions': decision_dims,
            'managers': manager_dims,
            'parameters': parameter_dims
        }

    def merge_output(self):
        """
        Open and merge all of the output datasets from the ensemble
        run into a single dataset.
        """
        nc = self._generate_coords()
        new_coords = (list(nc.get('decisions', {})) +
                      list(nc.get('parameters', {})))
        decision_tuples = [
            tuple(n.split('++')[1:-1]) for n in self.configuration.keys()
        ]
        for i, t in enumerate(decision_tuples):
            decision_tuples[i] = tuple(
                (float(l.split('=')[-1]) if '=' in l else l for l in t))
        decision_names = [
            '++'.join(tuple(n.split('++')[1:-1]))
            for n in self.configuration.keys()
        ]
        if sum([len(dt) for dt in decision_tuples]) == 0:
            raise NameError("Simulations in the ensemble do not share all"
                            " common decisions! Please use `open_output`"
                            " to retrieve the output of this Ensemble")
        for i, t in enumerate(decision_names):
            decision_names[i] = '++'.join(l.split('=')[0] for l in t)
        new_idx = pd.MultiIndex.from_tuples(decision_tuples, names=new_coords)
        out_file_paths = [
            s.get_output_files() for s in self.simulations.values()
        ]
        out_file_paths = [fi for sublist in out_file_paths for fi in sublist]
        full = xr.open_mfdataset(out_file_paths,
                                 concat_dim='run_number',
                                 combine='nested')
        merged = full.assign_coords(run_number=decision_names)
        merged['run_number'] = new_idx
        merged = merged.unstack('run_number')
        return merged

    def start(self, run_option: str, prerun_cmds: list = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for n, s in self.simulations.items():
            # Sleep calls are to ensure writeout happens
            config = self.configuration[n]
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option, prerun_cmds,
                                    config))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def map(self, fun, args, include_sims=True, monitor=True):
        for n, s in self.simulations.items():
            config = self.configuration[n]
            if include_sims:
                all_args = (s, n, *args, {'config': config})
            else:
                all_args = (*args, {'config': config})
            self.submissions.append(self._client.submit(fun, *all_args))
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def summary(self):
        """
        Show the user information about ensemble status
        """
        success, error, other = [], [], []
        for n, s in self.simulations.items():
            if s.status == 'Success':
                success.append(n)
            elif s.status == 'Error':
                error.append(n)
            else:
                other.append(n)
        return {'success': success, 'error': error, 'other': other}

    def rerun_failed(self,
                     run_option: str,
                     prerun_cmds=None,
                     monitor: bool = True):
        """
        Try to re-run failed simulations.

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        run_summary = self.summary()
        self.submissions = []
        for n in run_summary['error']:
            config = self.configuration[n]
            s = self.simulations[n]
            s.reset()
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option, prerun_cmds,
                                    config))
        if monitor:
            return self.monitor()
        else:
            return True
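A condensed, hypothetical usage sketch for the Ensemble class: the executable path, file manager paths, and run names are placeholders, and Simulation and _submit are assumed to come from the surrounding module.

configuration = {
    'run_a': {'file_manager': '/path/to/run_a/fileManager.txt'},
    'run_b': {'file_manager': '/path/to/run_b/fileManager.txt'},
}
ensemble = Ensemble(executable='/path/to/summa.exe',
                    configuration=configuration,
                    num_workers=2)
ensemble.run('local')        # submit every Simulation and block until done (monitor=True)
print(ensemble.summary())    # e.g. {'success': ['run_a', 'run_b'], 'error': [], 'other': []}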
Example #21
async def test_simple(cleanup):
    async with Scheduler(protocol="ucx") as s:
        async with Worker(s.address) as a:
            async with Client(s.address, asynchronous=True) as c:
                result = await c.submit(lambda x: x + 1, 10)
                assert result == 11
Example #22
def test_dont_assume_function_purity(loop):  # noqa: F811
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask') as (ba, _):
                x, y = Parallel()(delayed(random2)() for i in range(2))
                assert x != y
Example #23
def test_rolling_sync(loop):
    with cluster() as (c, [a, b]):
        with Client(('127.0.0.1', c['port']), loop=loop) as c:
            df = pd.util.testing.makeTimeDataFrame()
            ddf = dd.from_pandas(df, npartitions=10)
            dd.rolling_mean(ddf.A, 2).compute(get=c.get)
Example #24
def get_hostname(i):
    import socket
    time.sleep(5)
    return '{0} on {1}'.format(i, socket.gethostname())

if __name__ == '__main__':
    arg_parser = ArgumentParser(description='compute sum of squares and check '
                                            'task placement')
    arg_parser.add_argument('--scheduler', help='scheduler host')
    arg_parser.add_argument('--scheduler_port', default='8786',
                            help='scheduler port to use')
    arg_parser.add_argument('--n', type=int, default=100,
                            help='number of terms in sum')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='give verbose output')
    options = arg_parser.parse_args()
    client = Client('{0}:{1}'.format(options.scheduler,
                                     options.scheduler_port))
    if options.verbose:
        print('Client: {0}'.format(str(client)), flush=True)
    futures = client.map(square, range(options.n))
    total = client.submit(sum, futures)
    expected_total = (options.n - 1)*options.n*(2*options.n - 1)//6
    print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(total.result(),
                                                           expected_total))
    futures = client.map(get_hostname, range(options.n))
    process_locations = client.gather(futures)
    if options.verbose:
        print('task placement:')
        print('\t' + '\n\t'.join(process_locations))
    count = dict()
    for process_location in process_locations:
        _, _, hostname = process_location.split()
Example #25
def test_secede_with_no_processes(loop):  # noqa: F811
    # https://github.com/dask/distributed/issues/1775
    with Client(loop=loop, processes=False, set_as_default=True):
        with parallel_backend('dask'):
            Parallel(n_jobs=4)(delayed(id)(i) for i in range(2))
Example #26
def test_stream_shares_client_loop(loop):  # noqa: F811
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            source = Stream()
            d = source.timed_window('20ms').scatter()  # noqa: F841
            assert source.loop is client.loop
Example #27
async def test_config(cleanup):
    async with Scheduler() as s:
        async with Nanny(s.address, config={"foo": "bar"}) as n:
            async with Client(s.address, asynchronous=True) as client:
                config = await client.run(dask.config.get, "foo")
                assert config[n.worker_address] == "bar"
Example #28
# Validation: for this part we are going to rely on the functions that appear in the
# dask-ml Documentation manual, version 0.1, that I found online (starting from page 12)

regdask = LinearRegression()

param_grid = [{'C': [5, 10, 15], 'tol': [1e-4, 1e-3, 1e-2]}]

# In[22]:

dk_grid_search = GridSearchCV(regdask, param_grid=param_grid, n_jobs=-1, cv=5)

# In[23]:

# To run things in distributed mode we add the following code:
from dask.distributed import Client
client = Client("scheduler:8786")

# In[24]:

ini = timeit.default_timer()

lr_dask = dk_grid_search.fit(x_train, y_train)

print("Execution time: " + str(timeit.default_timer() - ini))

# In[25]:

lr_dask.best_estimator_

# In[26]:
Example #29
def test_rabit_ops():
    from distributed import Client, LocalCluster
    n_workers = 3
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            run_rabit_ops(client, n_workers)
Example #30
from lib_experimental_utils import simulation, qasm_simulator, ibmqx4, create_direction_only_pass_manager, \
    create_experiment_qobj, get_gate_times, experiment, ibmqx2, ibmq_16_melbourne, create_optimize_only_pass_manager, \
    ibmq_ourense, ibmq_vigo

LOG = logging.getLogger(__name__)


def setup_logging():
    import lib_experimental_utils
    logging.basicConfig(format=logging.BASIC_FORMAT, level='WARN')
    logging.getLogger(lib_experimental_utils.__name__).setLevel('DEBUG')
    logging.getLogger(__name__).setLevel('DEBUG')


module_path = os.path.dirname(__file__)
client = Client(address='localhost:8786')  # type: Client


class BackendEnum(Enum):
    SIMULATOR = 0
    IBMQX2 = 1
    IBMQX4 = 2
    IBMQ_OURENSE = 3
    IBMQ_VIGO = 4


def update_files():
    client.upload_file("{}/lib_circuits.py".format(module_path))
    client.upload_file("{}/lib_experimental_utils.py".format(module_path))
    client.upload_file("{}/lib_experiment_setups.py".format(module_path))
Example #31
def test_loc_sync(loop):
    with cluster() as (c, [a, b]):
        with Client(('127.0.0.1', c['port']), loop=loop) as c:
            df = pd.util.testing.makeTimeDataFrame()
            ddf = dd.from_pandas(df, npartitions=10)
            ddf.loc['2000-01-17':'2000-01-24'].compute(get=c.get)
Example #32
 def register_dask_scheduler_plugin(cls, client: distributed.Client):
     plugin = SharedMemoryRefCounter(cls.REFCOUNT_TAG)
     client.register_scheduler_plugin(plugin,
                                      idempotent=True,
                                      name="metagraph_shared_csr_refcount")
Example #33
async def test_security_dict_input_no_security(cleanup):
    async with Scheduler(security={}) as s:
        async with Worker(s.address, security={}) as w:
            async with Client(s.address, security={}, asynchronous=True) as c:
                result = await c.submit(inc, 1)
                assert result == 2
Example #34
async def test_no_workers(cleanup):
    async with Client(
        n_workers=0, silence_logs=False, dashboard_address=None, asynchronous=True
    ) as c:
        pass
Example #35
 def get_dask_client(self):
     return Client(self.scheduler)
Example #36
def test_Client_kwargs(loop):
    with Client(loop=loop, processes=False, n_workers=2, silence_logs=False) as c:
        assert len(c.cluster.workers) == 2
        assert all(isinstance(w, Worker) for w in c.cluster.workers.values())
    assert c.cluster.status == Status.closed
Example #37
from dask.diagnostics import ProgressBar
from dask.diagnostics import ResourceProfiler
from dask.dot import dot_graph
from dask.array.core import map_blocks
import dask.bag as db
import dask.dataframe as df
import random
import logging
logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL)
from chest import Chest
from random import shuffle
import os.path
from distributed import Client

hostname = "198.202.115.240:8786"
client = Client(hostname)

#from multiprocessing.pool import ThreadPool
RESULT_DIR = "results"
RESULT_FILE_PREFIX = "pair-distance-"
HEADER_CSV = "Scenario, Type, Time"
#BASE_DIRECTORY=os.getcwd()
# Dask has issues with NFS home directory on Comet
# BASE_DIRECTORY='/scratch/luckow/7146882'
BASE_DIRECTORY = '/oasis/scratch/comet/luckow/temp_project'
#BASE_DIRECTORY='/scratch/luckow/7218009/'
OUT_DIR = os.path.join(BASE_DIRECTORY, "npy_stack")

FILENAMES = [
    "../132k_dataset/atom_pos_132K.npy", "../145K_dataset/atom_pos_145K.npy",
    "../300K_dataset/atom_pos_291K.npy", '../840K_dataset/atom_pos_839K.npy'
Example #38
def test_blocks_until_full(loop):
    with Client(loop=loop) as c:
        assert len(c.nthreads()) > 0
Example #39
def test_empty_dmatrix_approx():
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'approx'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
Example #40
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a 
    processing folder. The input parameters are passed using argparse

    Parameters:
    -----------
    
    scheduler: string
        tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). 
        default = False. If False the process will run on the local computer using nCPUs-1

    path: string
        Path to the processing directory


    """


    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()
    
    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler
    
    if scheduler_address:
        # Start dask client on server or cluster
        client=Client(scheduler_address)

    else:
        # Start dask client on local machine. It will use all the available
        # cores - 1

        # number of core to use
        ncores = multiprocessing.cpu_count()-1
        cluster = LocalCluster(n_workers=ncores)
        client=Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for the
    # analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check trailing slash in the processing directory
    processing_directory=utils.check_trailing_slash(processing_directory,os_windows)

    # Get a list of the hybridization to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ]

    for processing_hyb in processing_hyb_list:
    
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash
        
        # Parse the Experimental metadata file (serial)
        experiment_infos,image_properties, hybridizations_infos, \
        converted_positions, microscope_parameters =\
        utils.experimental_metadata_parser(hyb_dir)
        
        # Parse the configuration file 
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
        
        
        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\
                    hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the files to process in a list whose order matches the gene_list for
        # parallel processing
        organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f  ]
        organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f  ]

        # Each .nd2 file will be processed in a worker part of a different node
        # Get the addresses of one process/node to use for conversion
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()]

        # Run the conversion
        futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list,
                                    tmp_gene_dirs,processing_hyb=processing_hyb,
                                    use_ram=flt_rawcnt_config['use_ram'],
                                    max_ram=flt_rawcnt_config['max_ram'],
                                    workers=workers_conversion)
        client.gather(futures_processes)

        

        # ---------------------------------------------------------------------
        
        
        # ----------------- FILTERING AND RAW COUNTING ------------------------
        
        # Create directories 

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                            suffix,add_slash,flt_rawcnt_config['skip_tags_counting'],
                            flt_rawcnt_config['skip_genes_counting'],
                            analysis_name=flt_rawcnt_config['analysis_name'])


        if flt_rawcnt_config['illumination_correction']:

            # Create the directory where to save the counting
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                                                suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate illumination
            for gene in hybridizations_infos[hybridization].keys():
                
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')

                logger.debug('Create average image for gene %s', gene)

                # Chunking the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks)

                # Scatter the images sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all/float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()

                logger.debug('Create illumination function for gene %s',gene)
                # Create illumination function
                Illumination=filters.gaussian(ImgMean,sigma=(20,300,300))

                # Normalization of the illumination
                Illumination_flat=np.amax(Illumination,axis=0)
                Illumination_norm=Illumination_flat/np.amax(Illumination_flat)

                logger.debug('Save illumination function for gene %s',gene)
                # Save the illumination function
                illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0]
                illumination_fname=illumination_path+gene+'_illumination_func.npy'
                np.save(illumination_fname,Illumination_norm,allow_pickle=False)  

                # Broadcast the illumination function to all the cores
                client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s',gene)
                # Filtering and counting
                futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \
                                illumination_function=Illumination_norm,\
                                filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\
                                filtered_img_gene_dirs =filtered_img_gene_dirs,\
                                counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \
                                min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\
                                skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
               

        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')
                # filtering
                logger.debug('Filtering without illumination correction %s',gene)

                futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \
                                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \
                                        filtered_img_gene_dirs=filtered_img_gene_dirs, \
                                        counting_gene_dirs=counting_gene_dirs, \
                                        plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\
                                        stringency=flt_rawcnt_config['stringency'],\
                                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                client.gather(futures_processes)
                
        # ---------------------------------------------------------------------
        
        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filter data in one single .ppf for each hybridization
        # # This step will run in serial mode and will not need to shuffle data
        # #  between cores because everything is on the common file system

        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,
        #                                 hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')
        # # Load and write the .npy tmp images into the hdf5 file

        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():

        #         logger.debug('Writing %s images in .ppf.hdf5',gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        #         # loop through the list of file
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file)
        #             f_hdl.flush()
        
        # # ---------------------------------------------------------------------
        
        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D need more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlapping between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped

        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending
        # # can be 'linear' or 'non linear'
        # # The methods that performs the best is the 'non linear'

        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        # # Create pointer of the hdf5 file that will store the stitched reference image
        # # for the current hybridization
        # # Writing
        # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb
        # data_name   = (tile_file_base_name
        #                 + '_' + reference_gene
        #                 + '_stitching_data')

        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest')  # replace with 'a' as soon as you fix the error


        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization,
        #                         est_overlap = est_overlap, y_flip = False, nr_dim = 2)



        # # Align the tiles 
        # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples,
        #                             filtered_files_list=filtered_files_list,micData=micData, 
        #                         nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)


        # # In this case the order of the returned contingency tuples is with
        # # the order of the input contig_tuples

        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all =[data_single[0] for data_single in data ]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #             'covs': covs_all}


        # # Calculates a shift in global coordinates for each tile (global
        # # alignment) and then applies these shifts to the  corner coordinates
        # # of each tile and returns and saves these shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples,
        #                                             micData, nr_pixels, z_count,
        #                                             alignment, data_name,
        #                                             nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend =  hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels,
        #                                 reference_gene, blend = 'non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask'])

            
        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0]


        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value,corner_coords in joining['corner_list']:
        #     if not(np.isnan(corner_coords[0])):
        #         cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #                             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]

        #         fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value)
        #         np.save(fname,cur_mask)


        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'],
        #                             stitching_files_dir = stitching_files_dir,
        #                             blended_tiles_directory = blended_tiles_directory,
        #                             masked_tiles_directory = masked_tiles_directory,
        #                             analysis_name = flt_rawcnt_config['analysis_name'],
        #                             processing_hyb = processing_hyb,reference_gene = reference_gene,
        #                             micData = micData,tiles = tiles,nr_pixels=nr_pixels,
        #                             linear_blending=linear_blending)



        # _ = client.gather(futures_processes)


        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()


        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Don't delete the *.npy files here because can be used to 
        # create the final images using the apply stitching related function    









    client.close()
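Since the script reads its inputs with argparse, one hypothetical way to drive it (the paths are placeholders; the scheduler address reuses the example from the docstring) is to populate sys.argv before calling it:

import sys

# run on the local machine (LocalCluster with nCPUs - 1 workers)
sys.argv = ['preprocessing_script.py', '-path', '/data/experiment_1']
preprocessing_script()

# or attach the workers to an existing dask scheduler
sys.argv = ['preprocessing_script.py',
            '-scheduler', 'tcp://192.168.0.4:7003',
            '-path', '/data/experiment_1']
preprocessing_script()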
Example #41
from netCDF4 import Dataset
import numpy as np
from datetime import datetime, timedelta
from copy import deepcopy
import glob
import math
import dask.array as da
from distributed import Client, LocalCluster
from dask import delayed, compute
import time
import sys
from scipy import ndimage

# Start a cluster with x workers
cluster = LocalCluster(n_workers=int(sys.argv[1]))
client = Client(cluster)

# Input the range of dates and time wanted for the collection of images
start_year = 2006
start_day = 1
start_month = 1
start_hour = 1
start_minute = 0
start_second = 0

end_year = 2006
end_month = 3
end_day = 1
end_hour = 0
end_minute = 00
end_second = 0
Example #42
 def _get_dask_client(self, options):
     if options.dask_cluster_uri:
         function = mlrun.import_function(options.dask_cluster_uri)
         return function.client, function.metadata.name
     return Client(), None
Example #43
def test_Client_twice(loop):
    with Client(loop=loop) as c:
        with Client(loop=loop) as f:
            assert c.cluster.scheduler.port != f.cluster.scheduler.port
Example #44
def test_publish_roundtrip(s, a, b):
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    data = yield c._scatter([0, 1, 2])
    yield c._publish_dataset(data=data)

    assert 'published-data' in s.who_wants[data[0].key]
    result = yield f._get_dataset(name='data')

    assert len(result) == len(data)
    out = yield f._gather(result)
    assert out == [0, 1, 2]

    with pytest.raises(KeyError) as exc_info:
        result = yield f._get_dataset(name='nonexistent')

    assert "not found" in str(exc_info.value)
    assert "nonexistent" in str(exc_info.value)

    yield c._shutdown()
    yield f._shutdown()
Example #45
def test_Client_twice(loop):
    with Client(loop=loop, silence_logs=False) as c:
        with Client(loop=loop, silence_logs=False) as f:
            assert c.cluster.scheduler.port != f.cluster.scheduler.port
Example #46
async def test_get_cluster_details(
    clusters_config: None,
    registered_user: Callable[..., Dict[str, Any]],
    async_client: httpx.AsyncClient,
    local_dask_gateway_server: DaskGatewayServer,
    cluster: Callable[..., Cluster],
    dask_gateway_cluster: GatewayCluster,
    dask_gateway_cluster_client: DaskClient,
):
    user_1 = registered_user()
    # define the cluster in the DB
    some_cluster = cluster(
        user_1,
        endpoint=local_dask_gateway_server.address,
        authentication=SimpleAuthentication(
            username="******",
            password=local_dask_gateway_server.password).dict(by_alias=True),
    )
    # in its present state, the cluster should have no workers
    cluster_out = await _get_cluster_details(async_client, user_1["id"],
                                             some_cluster.id)
    assert not cluster_out.scheduler.workers, "the cluster should not have any worker!"

    # now let's scale the cluster
    _NUM_WORKERS = 1
    await dask_gateway_cluster.scale(_NUM_WORKERS)
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(60),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            assert cluster_out.scheduler.workers, "the cluster has no workers!"
            assert (
                len(cluster_out.scheduler.workers) == _NUM_WORKERS
            ), f"the cluster is missing {_NUM_WORKERS}, currently has {len(cluster_out.scheduler.workers)}"
            print(
                f"cluster now has its {_NUM_WORKERS}, after {json.dumps(attempt.retry_state.retry_object.statistics)}"
            )
    print(f"!!> cluster dashboard link: {dask_gateway_cluster.dashboard_link}")

    # let's start some computation
    _TASK_SLEEP_TIME = 5

    def do_some_work(x: int):
        import time

        time.sleep(x)
        return True

    task = dask_gateway_cluster_client.submit(do_some_work, _TASK_SLEEP_TIME)
    # wait for the computation to start, we should see this in the cluster infos
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(10),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.executing == 1
                    ), "worker is not executing the task"
            print(
                f"!!> cluster metrics: {next(iter(cluster_out.scheduler.workers.values())).metrics=}"
            )
    # let's wait for the result
    result = task.result(timeout=_TASK_SLEEP_TIME + 5)
    assert result
    assert await result == True
    # wait for the computation to effectively stop
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(60),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            print(
                f"!!> cluster metrics: {next(iter(cluster_out.scheduler.workers.values())).metrics=}"
            )
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.executing == 0
                    ), "worker is still executing the task"
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.in_memory == 1
                    ), "worker did not keep the result in memory"
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.cpu == 0
                    ), "worker did not update the cpu metrics"

    # since the task is completed the worker should have stopped executing
    cluster_out = await _get_cluster_details(async_client, user_1["id"],
                                             some_cluster.id)
    worker_data = next(iter(cluster_out.scheduler.workers.values()))
    assert worker_data.metrics.executing == 0
    # in dask, the task remains in memory until the result is deleted
    assert worker_data.metrics.in_memory == 1
Example #47
def main():
    # Define parameters to use for multiprocessing
    client = Client()
    num_workers = min(multiprocessing.cpu_count(), 7)
    print('Number of workers = ', num_workers)
    run_start_time = time.time()

    # Directories to save data
    CUR_DIR = os.path.dirname(os.path.realpath(__file__))
    base_dir = os.path.join(CUR_DIR, BASELINE_DIR)
    reform_dir = os.path.join(CUR_DIR, REFORM_DIR)

    # Set some OG model parameters
    # See default_parameters.json for more description of these parameters
    alpha_T = np.zeros(50)  # Adjusting the path of transfer spending
    alpha_T[0:2] = 0.09
    alpha_T[2:10] = 0.09 + 0.01
    alpha_T[10:40] = 0.09 - 0.01
    alpha_T[40:] = 0.09
    alpha_G = np.zeros(7)  # Adjusting the path of non-transfer spending
    alpha_G[0:3] = 0.05 - 0.01
    alpha_G[3:6] = 0.05 - 0.005
    alpha_G[6:] = 0.05
    # Set start year for baseline and reform.
    START_YEAR = 2021
    # Also adjust the Frisch elasticity, the start year, the
    # effective corporate income tax rate, and the SS debt-to-GDP ratio
    og_spec = {
        'frisch': 0.41,
        'start_year': START_YEAR,
        'cit_rate': [0.21],
        'debt_ratio_ss': 1.0,
        'alpha_T': alpha_T.tolist(),
        'alpha_G': alpha_G.tolist()
    }
    '''
    ------------------------------------------------------------------------
    Run baseline policy first
    ------------------------------------------------------------------------
    '''
    p = Specifications(
        baseline=True,
        num_workers=num_workers,
        baseline_dir=base_dir,
        output_base=base_dir,
    )
    # Update parameters for baseline from default json file
    p.update_specifications(og_spec)

    start_time = time.time()
    runner(p, time_path=True, client=client)
    print('run time = ', time.time() - start_time)
    '''
    ------------------------------------------------------------------------
    Run reform policy
    ------------------------------------------------------------------------
    '''
    # update the effective corporate income tax rate
    og_spec.update({'cit_rate': [0.35]})
    p2 = Specifications(
        baseline=False,
        num_workers=num_workers,
        baseline_dir=base_dir,
        output_base=reform_dir,
    )
    # Update parameters for baseline from default json file
    p2.update_specifications(og_spec)

    start_time = time.time()
    runner(p2, time_path=True, client=client)
    print('run time = ', time.time() - start_time)

    # return ans - the percentage changes in macro aggregates and prices
    # due to policy changes from the baseline to the reform
    base_tpi = safe_read_pickle(os.path.join(base_dir, 'TPI', 'TPI_vars.pkl'))
    base_params = safe_read_pickle(os.path.join(base_dir, 'model_params.pkl'))
    reform_tpi = safe_read_pickle(
        os.path.join(reform_dir, 'TPI', 'TPI_vars.pkl'))
    reform_params = safe_read_pickle(
        os.path.join(reform_dir, 'model_params.pkl'))
    ans = ot.macro_table(base_tpi,
                         base_params,
                         reform_tpi=reform_tpi,
                         reform_params=reform_params,
                         var_list=['Y', 'C', 'K', 'L', 'r', 'w'],
                         output_type='pct_diff',
                         num_years=10,
                         start_year=og_spec['start_year'])

    # create plots of output
    op.plot_all(base_dir, reform_dir, os.path.join(CUR_DIR,
                                                   'run_example_plots'))

    print("total time was ", (time.time() - run_start_time))
    print('Percentage changes in aggregates:', ans)
    # save percentage change output to csv file
    ans.to_csv('ogcore_example_output.csv')
    client.close()