Example #1
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < start + 15

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers
    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
Example #2
def main(_):
    #Generate scheduler
    data = da.from_array(np.array(Image.open(r'dota2.jpg')),
                         chunks=(600, 400, 3))
    client = Client(args.address)
    client.upload_file('calcov.py')

    temp3 = np.zeros((3, 3))
    temp3[0, :] = [0.062467, 0.125000, 0.062467]
    temp3[1, :] = [0.125000, 0.250131, 0.125000]
    temp3[2, :] = [0.062467, 0.125000, 0.062467]

    D = []
    B = []
    for i in range(args.queue):
        D.append(np.array(data + i * 10))
        B.append(temp3 + 0.05)

    future = client.map(calcov.calCov, B, D)
    result = [[np.array(_[0]), str(_[1]), str(_[2])]
              for _ in client.gather(future)]

    shutil.rmtree(r'./data', ignore_errors=True)
    os.mkdir(r'./data')
    i = 0
    for _ in result:
        data = _[0]
        time = _[1]
        name = _[2].replace('tcp://', '')  # strip() removes characters, not the prefix
        new_im = Image.fromarray(data)
        new_im.save('./data/result_%s_%s_(%s).jpg' % (i, time, name))
        i += 1
Example #3
def main():
    #define parallel mcmc wrapper
    def parallel_mcmc(_):
        return (mcmc(initial_parameters=epa_0,
                     proposer=normal_prop,
                     param2res=param2res,
                     costfunction=costfunction,
                     nsimu=5000))

    #check jobs resources to initialize dask workers
    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()

    #run 10 chains
    [[c_form1, j_form1], [c_form2, j_form2], [c_form3, j_form3],
     [c_form4, j_form4], [c_form5, j_form5], [c_form6, j_form6],
     [c_form7, j_form7], [c_form8, j_form8], [c_form9, j_form9],
     [c_form10,
      j_form10]] = client.gather(client.map(parallel_mcmc, range(0, 10)))

    #print chain5 output as test
    formal_c_path = dataPath.joinpath('chain5_pmcmc_c.csv')
    formal_j_path = dataPath.joinpath('chain5_pmcmc_j.csv')
    pd.DataFrame(c_form5).to_csv(formal_c_path, sep=',')
    pd.DataFrame(j_form5).to_csv(formal_j_path, sep=',')
Example #4
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """
    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        Parameters
        ----------
        n_workers : int
            How many workers should the local dask cluster have?
        """

        super().__init__()
        import tempfile

        from distributed import Client, LocalCluster

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker
        kwargs : Dict
            Parameters for the map function
        Returns
        -------
        List
            The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
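
A minimal usage sketch for the class above (the worker function add_offset, the chunk values, and the offset keyword are made up; it assumes DistributorBaseClass and the LocalDaskDistributor shown in this example are importable):

def add_offset(chunk, offset=0):
    # hypothetical worker function: shift every value in a data chunk
    return [value + offset for value in chunk]


# hypothetical usage of the LocalDaskDistributor defined above
distributor = LocalDaskDistributor(n_workers=2)
chunks = [[1, 2, 3], [4, 5, 6]]
results = distributor.distribute(add_offset, chunks, {"offset": 10})
# results == [[11, 12, 13], [14, 15, 16]]
distributor.close()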
Example #5
def main(args=None):
    args = parse_args(args)
    client = Client(args.scheduler)
    keys = [
        f'nyc-tlc/trip data/yellow_tripdata_2009-{m:0>2}.csv'
        for m in range(1, 13)
    ]
    results = client.map(fetch, keys)
    wait(results)
Example #6
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """
    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
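
The calculate_best_chunk_size heuristic above aims for roughly five chunks per worker; a small illustration of the arithmetic with made-up numbers (4 workers, 103 calculations):

n_workers = 4            # hypothetical worker count
data_length = 103        # hypothetical number of calculations
chunk_size, extra = divmod(data_length, n_workers * 5)  # divmod(103, 20) -> (5, 3)
if extra:
    chunk_size += 1
print(chunk_size)        # 6, i.e. ceil(103 / 20); at most 5 * n_workers chunks are created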
Example #7
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
Example #8
def main():
    # get command line arguments controlling launch
    threads = 1
    workers = 8
    for x in sys.argv[1:]:
        if x.find("threads") > -1:
            z = x.split("=")
            threads = int(z[1])
        if x.find("workers") > -1:
            z = x.split("=")
            workers = int(z[1])


    # launch with either threads and/or workers specified (0 = default)
    if threads == 0 and workers != 0:
        print("launching %d workers, default threads" % (workers))
        cluster = LocalCluster(n_workers=workers)
    if threads != 0 and workers == 0:
        print("launching %d threads, default workers" % (threads))
        cluster = LocalCluster(threads_per_worker=threads)
    if threads != 0 and workers != 0:
        print("launching %d workers with %d threads" % (workers, threads))
        cluster = LocalCluster(n_workers=workers, threads_per_worker=threads)
    print(cluster)
    client = Client(cluster)
    print(client)

    # do serial
    # NOTE: it is possible to launch an asynchronous client
    # but here we just do serial synchronous.  See:
    # https://distributed.dask.org/en/latest/asynchronous.html
    result = []
    print("   pid  Start T")
    for i in range(0, 5):
        j = 2
        result.append(client.submit(test, i, j).result())
    print(result)
    print(Counter(result))
    #do parallel
    n = 15
    np.random.seed(1234)
    x = np.random.random(n) * 20
    #set to uniform nonzero to get uniform run times for each task
    x = np.ones(n) * 10
    print(x)
    print("   pid  Start T")
    L = client.map(test, range(n), x)
    mylist = client.gather(L)
    pids = []
    for m in mylist:
        x = m.split()[0]
        pids.append(x)
        print(m)
    pids = sorted(set(pids))
    print(len(pids), pids)
Example #9
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """
    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
Example #10
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0,
                           scheduler_port=0,
                           silence_logs=False,
                           processes=False,
                           diagnostics_port=None,
                           loop=loop,
                           start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < start + 15

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers
    assert not cluster.scheduler.workers
    yield gen.sleep(0.2)
    assert not cluster.workers
    assert not cluster.scheduler.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
Example #11
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
Example #12
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                      diagnostic_port=None, loop=loop, start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
Example #13
class Sender(sender.BatchSender):
    def __init__(self, *args, **kwargs):
        super(Sender, self).__init__(*args, **kwargs)
        self.client = None
        self.parser = None
        self.sections = kwargs.get('sections', 10)

    def send(self, event):
        self._queue.put(event.raw_data)
        if self._queue.qsize() >= self._flush_size:
            self.need_flush.set()

    def catch(self, agent):
        super(Sender, self).catch(agent)
        args = agent.client.args
        kwargs = agent.client.kwargs
        self.client = Client(*args, **kwargs)
        self.parser = agent.real_parser

    def push(self):
        ret = False
        if not self._buffers:
            self._buffers.append([])
            for i in range(self._max_batch_size):
                if not self._queue.empty():
                    self._buffers[-1].append(self._queue.get())
                    if len(self._buffers[-1]) >= int(
                            self._max_batch_size / self.sections):
                        self._buffers.append([])
                else:
                    break

        if self._buffers:
            pmap = partial(Parser.map, parser=self.parser)
            buffers = self.client.map(pmap, self._buffers)
            buffers = self.client.submit(Parser.reduce, buffers).result()
            if hasattr(self._output, 'sendmany'):
                self._output.sendmany(buffers)
            else:
                for event in buffers:
                    self._output.send(event)
            self._buffers = []

        if self._queue.qsize() < self._flush_size:
            ret = True
        return ret
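
The push method above fans the buffered batches out with client.map and reduces the resulting futures with a single client.submit call; a minimal sketch of that map-then-reduce pattern with hypothetical parse/merge functions (it assumes a local cluster can be started):

from distributed import Client


def parse_batch(batch):
    # hypothetical per-batch parser: upper-case every event in the batch
    return [event.upper() for event in batch]


def merge_batches(batches):
    # hypothetical reducer: flatten the parsed batches into one event list
    return [event for batch in batches for event in batch]


client = Client()
batches = [["a", "b"], ["c"], ["d", "e"]]
futures = client.map(parse_batch, batches)
# futures passed as arguments to submit are resolved before merge_batches runs
merged = client.submit(merge_batches, futures).result()
# merged == ['A', 'B', 'C', 'D', 'E']
client.close()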
Example #14
def test_use_with_dask():
    try:
        import dask
        import dask.distributed
        from distributed import Client
    except ImportError:
        import warnings

        warnings.warn("Dask and/or Distributed are not installed")
        return
    with open(f"{CURRENT_DIR}/test-ogusa-remote.json") as f:
        remote_outputs = json.loads(f.read())
    outputs = cs_storage.read(remote_outputs["outputs"])

    c = Client()
    futures = c.map(cs_storage.screenshot, outputs["renderable"])
    results = c.gather(futures)
    for result in results:
        assert isinstance(result, bytes)
Example #15
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0,
                           scheduler_port=0,
                           silence_logs=False,
                           nanny=False,
                           diagnostic_port=None,
                           loop=loop,
                           start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
Example #16
def main():
    from argparse import ArgumentParser

    parser = ArgumentParser()
    #parser.add_argument('min_num', type=int)
    #parser.add_argument('max_num', type=int)
    args = parser.parse_args()

    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()

    min_num = 10
    max_num = 100
    start_time = datetime.now()
    num_primes = sum(
        client.gather(client.map(slow_is_prime, range(min_num, max_num + 1))))
    end_time = datetime.now()

    print(f'{num_primes} primes between {min_num} and {max_num} '
          f'[{end_time - start_time}]')
Example #17
class DaskClient(Thread):
    def __init__(self, clientUrl, clientId, daqObjectGenerator, resultQ):
        Thread.__init__(self, name='DaskClient-%s' % clientId)
        self.client = Client(clientUrl)
        self.clientId = clientId
        self.daqObjectGenerator = daqObjectGenerator
        self.resultQ = resultQ
        self.idQ = Queue()
        self.remoteIdQ = self.client.scatter(self.idQ)
        self.generatorQ = self.client.map(self.daqObjectGenerator.generate,
                                          self.remoteIdQ)
        self.pvQ = self.client.gather(self.generatorQ)
        self.nGenerated = 0
        self.event = Event()

    def putTask(self, objectId):
        #t0 = time.time()
        self.idQ.put(objectId)
        #t1 = time.time()
        #dt = t1-t0
        #print('PUSH TASK: %s' % dt)
        #self.event.set()

    def getPv(self, timeout=None):
        #t0 = time.time()
        pv = self.pvQ.get(timeout=timeout)
        #t1 = time.time()
        #dt = t1-t0
        #print('GET PV: %s' % dt)
        return pv

    def run(self):
        print('STARTING THREAD, CLIENT ID: %s' % self.clientId)
        while True:
            pv = self.pvQ.get(timeout=None)
            self.nGenerated += 1
            #print('GOT PV , CLIENT ID %s: %s' % (self.clientId, pv['ArrayId']))
            #print('CLIENT ID %s: N GENERATED=%s' % (self.clientId, self.nGenerated))
            self.resultQ.put((pv, self.clientId))
Example #18
def parallelStatsDaskSimple(urlSplits,
                            ds,
                            nEpochs,
                            variable,
                            mask,
                            coordinates,
                            reader,
                            outHdfsPath,
                            averagingConfig,
                            sparkConfig,
                            accumulators=['count', 'mean', 'M2', 'min',
                                          'max']):
    '''Compute N-day climatology statistics in parallel using Dask distributed.'''
    if not sparkConfig.startswith('dask,'):
        print("dask: configuration must be of form 'dask,n'", file=sys.stderr)
        sys.exit(1)
    numPartitions = int(sparkConfig.split(',')[1])

    with Timer("Configure Dask distributed"):
        from distributed import Client, as_completed
        client = Client(DaskClientEndpoint)

    print('Starting parallel Stats using Dask . . .', file=sys.stderr)
    start = time.time()
    futures = client.map(
        lambda urls: parallelStatsPipeline(
            urls, ds, nEpochs, variable, mask, coordinates, reader,
            averagingConfig, outHdfsPath, accumulators), urlSplits)

    outputFiles = []
    for future in as_completed(futures):
        outputFile = future.result()
        outputFiles.append(outputFile)
        end = time.time()
        print("parallelStats: Completed %s in %0.3f seconds." %
              (outputFile, (end - start)),
              file=sys.stderr)
    return outputFiles
Example #19
def test_distributed_handler_distributed(values, expected_values):
    cluster = LocalCluster(processes=False)

    with DistributedHandler(cluster.scheduler_address) as handler:
        futures = handler.client.map(lambda x: x + 1, values)
        handler_map_results = handler.gather(futures)

    with DistributedHandler(cluster.scheduler_address) as handler:
        handler_batched_results = handler.batched_map(lambda x: x + 1, values)

    client = Client(cluster)
    futures = client.map(lambda x: x + 1, values)

    distributed_results = client.gather(futures)

    handler_map_results = set(handler_map_results)
    handler_batched_results = set(handler_batched_results)
    distributed_results = set(distributed_results)

    assert (handler_map_results == handler_batched_results
            and handler_map_results == distributed_results)

    cluster.close()
Example #20
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """
    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        Parameters
        ----------
        address : str
            The ip address and port number of the Dask Scheduler
        """

        super().__init__()
        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        Parameters
        ----------
        data_length: int
            A length which defines how many calculations there need to be.
        """

        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker
        kwargs : Dict
            Parameters for the map function
        Returns
        -------
        List
            The result of the calculation as a list - each item should be the result of the application of func
            to a single element
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
Example #21
        assert adadamp.__version__ == "0.1.4"

        return train.main(epochs=epochs,
                          verbose=False,
                          seed=seed,
                          tuning=False,
                          **kwargs)

    futures = []
    seeds = np.arange(seed_start, seed_start + n_runs)
    dampers = ["adadamp", "padadamp", "geodamp", "adagrad", "geodamplr"]
    assert set(dampers) == set(params.keys())

    for damper in dampers:
        kwargs = params[damper]
        futures.extend(client.map(submit, seeds, **kwargs))

    for future in as_completed(futures):
        try:
            data, train_data = future.result()
            #  data, train_data = future
        except:  # KilledWorker:
            # This is likely a problem with my code rather than with the
            # Dask cluster.
            #
            # https://stackoverflow.com/questions/46691675/what-do-killedworker-exceptions-mean-in-dask
            print("-" * 20)
            for info in sys.exc_info():
                print(info)
        else:
            df = pd.DataFrame(data)
Example #22
        a.append(url)
    return a

def get_url(r):
    url = 'https://s3.amazonaws.com/cloudydap/bytestream/'+r['md5']
    return url

def compute(url):
    # print url
    response = urllib2.urlopen(url)
    buf = response.read()
    # print len(buf)
    dec = zlib.decompressobj(32+zlib.MAX_WBITS)
    unzipped = dec.decompress(buf)
    # print len(unzipped)
    # Pick a specific point
    a = unzipped[1]+unzipped[13104]+unzipped[26208]+unzipped[39312]
    # print struct.unpack('<f', a)
    return struct.unpack('<f', a)

# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:MERRA2_100*")
a = search("PRECCU AND chunk_position:\[0,91,288\] AND filename:MERRA2_100*")
# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:*tavgM_2d_int_*")
# search("PRECCU AND chunk_position:\[0,91,288\] AND filename: MERRA2_400.tavgM_2d_int_Nx.201507.nc4")


c = Client('localhost:8786')
m = c.map(compute, a)
x = c.gather(m)
print x
Example #23
 arg_parser.add_argument('--scheduler_port',
                         default='8786',
                         help='scheduler port to use')
 arg_parser.add_argument('--n',
                         type=int,
                         default=100,
                         help='number of terms in sum')
 arg_parser.add_argument('--verbose',
                         action='store_true',
                         help='give verbose output')
 options = arg_parser.parse_args()
 client = Client('{0}:{1}'.format(options.scheduler,
                                  options.scheduler_port))
 if options.verbose:
     print('Client: {0}'.format(str(client)), flush=True)
 futures = client.map(square, range(options.n))
 total = client.submit(sum, futures)
 expected_total = (options.n - 1) * options.n * (2 * options.n - 1) // 6
 print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(
     total.result(), expected_total))
 futures = client.map(get_hostname, range(options.n))
 process_locations = client.gather(futures)
 if options.verbose:
     print('task placement:')
     print('\t' + '\n\t'.join(process_locations))
 count = dict()
 for process_location in process_locations:
     _, _, hostname = process_location.split()
     if hostname not in count:
         count[hostname] = 0
     count[hostname] += 1
Example #24
        return self.y * x**2**2**2


def bogus_helper(_args):
    bog, x = _args
    return bog.square(x)


def square(x):
    return x**2**2**2


def neg(x):
    return -x


# submit many function calls:
A = client.map(square, range(10))
# print(A)
B = client.map(neg, A)
# print(B)
# submit individual function calls:
total = client.submit(sum, B)
print(total.result())

bg = Bogus(2)
args = [[bg, x] for x in range(10)]
C = client.map(bogus_helper, args)
results = client.gather(C)
print(results)
Example #25
                                            parallel=False)
        pysp2.io.write_dat(my_binary, out_path + base + '.particle.dat')


# Get all of the unique dates
all_sp2_files = glob(sp2b_path + '*.sp2b')
sp2_date_list = [x.split(".")[3] for x in all_sp2_files]
sp2_date_list = sorted(list(set(sp2_date_list)))
#sp2_date_list = ['20200218']
#process_day('20200218')
print(sp2_date_list)
cluster = PBSCluster(processes=6,
                     cores=36,
                     walltime='5:00:00',
                     memory='270GB',
                     name='dask-worker',
                     queue='arm_high_mem',
                     project='arm',
                     job_extra=['-W group_list=cades-arm'],
                     interface='ib0',
                     extra=['--no-dashboard'])
cluster.scale(36 * 6)
client = Client(cluster)

print("Waiting for workers before starting processing...")
client.wait_for_workers(9)
print(client)
results = client.map(process_day, sp2_date_list)
wait(results)
#del client
Example #26
# Set up scheduler
s = Scheduler(loop=loop)
s.start()

#Set up Workers
w = Worker('comet-14-02.sdsc.edu', loop=loop)
w.start(0)

# Set up client
client = Client('comet-14-02.sdsc.edu:8786')


def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]


#pprint.pprint(list(chunks(range(0, 255), 64)))
output = []
y = list(chunks(range(0, 255), 64))
#print y[0]

for ix in y:
    # sum each chunk on the cluster (submit the whole chunk; mapping sum over the
    # individual integers would fail, since sum(int) is not valid)
    a = client.submit(sum, ix)
    output.append(a)

# add up the per-chunk partial sums; futures in the arguments are resolved first
total = client.submit(sum, output)
print(total.result())
Example #27
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a 
    processing folder. The input parameters are passed using argparse

    Parameters:
    -----------
    
    scheduler: string
        tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). 
        default = False. If False the process will run on the local computer using nCPUs-1

    path: string
        Path to the processing directory


    """


    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()
    
    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler
    
    if scheduler_address:
        # Start dask client on server or cluster
        client=Client(scheduler_address)

    else:
        # Start dask client on local machine. It will use all the available
        # cores -1

        # number of core to use
        ncores = multiprocessing.cpu_count()-1
        cluster = LocalCluster(n_workers=ncores)
        client=Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for the
    # analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check trailing slash in the processing directory
    processing_directory=utils.check_trailing_slash(processing_directory,os_windows)

    # Get a list of the hybridization to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ]

    for processing_hyb in processing_hyb_list:
    
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash
        
        # Parse the Experimental metadata file (serial)
        experiment_infos,image_properties, hybridizations_infos, \
        converted_positions, microscope_parameters =\
        utils.experimental_metadata_parser(hyb_dir)
        
        # Parse the configuration file 
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
        
        
        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\
                    hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the file to process in a list which order match the gene_list for
        # parallel processing
        organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f  ]
        organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f  ]

        # Each .nd2 file will be processed in a worker part of a different node
        # Get the addresses of one process/node to use for conversion
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()]

        # Run the conversion
        futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list,
                                    tmp_gene_dirs,processing_hyb=processing_hyb,
                                    use_ram=flt_rawcnt_config['use_ram'],
                                    max_ram=flt_rawcnt_config['max_ram'],
                                    workers=workers_conversion)
        client.gather(futures_processes)

        

        # ---------------------------------------------------------------------
        
        
        # ----------------- FILTERING AND RAW COUNTING ------------------------
        
        # Create directories 

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                            suffix,add_slash,flt_rawcnt_config['skip_tags_counting'],
                            flt_rawcnt_config['skip_genes_counting'],
                            analysis_name=flt_rawcnt_config['analysis_name'])


        if flt_rawcnt_config['illumination_correction']:

            # Create the directory where to save the counting
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                                                suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate illumination
            for gene in hybridizations_infos[hybridization].keys():
                
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')

                logger.debug('Create average image for gene %s', gene)

                # Chunking the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks)

                # Scatter the images sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all/float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()

                logger.debug('Create illumination function for gene %s',gene)
                # Create illumination function
                Illumination=filters.gaussian(ImgMean,sigma=(20,300,300))

                # Normalization of the illumination
                Illumination_flat=np.amax(Illumination,axis=0)
                Illumination_norm=Illumination_flat/np.amax(Illumination_flat)

                logger.debug('Save illumination function for gene %s',gene)
                # Save the illumination function
                illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0]
                illumination_fname=illumination_path+gene+'_illumination_func.npy'
                np.save(illumination_fname,Illumination_norm,allow_pickle=False)  

                # Broadcast the illumination function to all the cores
                client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s',gene)
                # Filtering and counting
                futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \
                                illumination_function=Illumination_norm,\
                                filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\
                                filtered_img_gene_dirs =filtered_img_gene_dirs,\
                                counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \
                                min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\
                                skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
               

        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')
                # filtering
                logger.debug('Filtering without illumination correction %s',gene)

                futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \
                                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \
                                        filtered_img_gene_dirs=filtered_img_gene_dirs, \
                                        counting_gene_dirs=counting_gene_dirs, \
                                        plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\
                                        stringency=flt_rawcnt_config['stringency'],\
                                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                client.gather(futures_processes)
                
        # ---------------------------------------------------------------------
        
        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filter data in one single .ppf for each hybridization
        # # This step will run in serial mode and will not need to shuffle data
        # #  between cores because everything is on the common file system

        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,
        #                                 hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')
        # # Load and write the .npy tmp images into the hdf5 file

        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():

        #         logger.debug('Writing %s images in .ppf.hdf5',gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        #         # loop through the list of file
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file)
        #             f_hdl.flush()
        
        # # ---------------------------------------------------------------------
        
        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D need more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlapping between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped

        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending
        # # can be 'linear' or 'non linear'
        # # The methods that performs the best is the 'non linear'

        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        # # Create pointer of the hdf5 file that will store the stitched reference image
        # # for the current hybridization
        # # Writing
        # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb
        # data_name   = (tile_file_base_name
        #                 + '_' + reference_gene
        #                 + '_stitching_data')

        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest')  # replace with 'a' as soon as you fix the error


        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization,
        #                         est_overlap = est_overlap, y_flip = False, nr_dim = 2)



        # # Align the tiles 
        # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples,
        #                             filtered_files_list=filtered_files_list,micData=micData, 
        #                         nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)


        # # In this case the order of the returned contingency tuples is with
        # # the order of the input contig_tuples

        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all =[data_single[0] for data_single in data ]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #             'covs': covs_all}


        # # Calculates a shift in global coordinates for each tile (global
        # # alignment) and then applies these shifts to the  corner coordinates
        # # of each tile and returns and saves these shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples,
        #                                             micData, nr_pixels, z_count,
        #                                             alignment, data_name,
        #                                             nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend =  hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels,
        #                                 reference_gene, blend = 'non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask'])

            
        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0]


        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value,corner_coords in joining['corner_list']:
        #     if not(np.isnan(corner_coords[0])):
        #         cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #                             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]

        #         fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value)
        #         np.save(fname,cur_mask)


        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'],
        #                             stitching_files_dir = stitching_files_dir,
        #                             blended_tiles_directory = blended_tiles_directory,
        #                             masked_tiles_directory = masked_tiles_directory,
        #                             analysis_name = flt_rawcnt_config['analysis_name'],
        #                             processing_hyb = processing_hyb,reference_gene = reference_gene,
        #                             micData = micData,tiles = tiles,nr_pixels=nr_pixels,
        #                             linear_blending=linear_blending)



        # _ = client.gather(futures_processes)


        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()


        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Don't delete the *.npy files here because can be used to 
        # create the final images using the apply stitching related function    









    client.close()
Example #28
from distributed import Client
import time

client = Client("192.168.0.106:8786")
client.restart()

from funcs import create_dirs, get_dirs, add_flag

future = client.map(create_dirs, range(100))
flags = client.submit(get_dirs, future)
results = client.gather(flags)
print(results)
Example #29
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if cluster has bokeh port, notify user if unexpected port number
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
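
The scatter/publish logic above caches the EntitySet on the cluster so later runs against the same scheduler can reuse it instead of re-shipping the data; a minimal sketch of that publish-and-reuse pattern (the dataset name and payload are made up, and a local cluster is assumed):

from distributed import Client

client = Client()
payload = list(range(1000))           # stand-in for the scattered EntitySet
name = "example-dataset"              # made-up dataset name

if name in client.list_datasets():
    shared = client.get_dataset(name)          # reuse the already-published future
else:
    shared = client.scatter([payload])[0]      # single future holding the whole object
    client.publish_dataset(**{name: shared})   # register it under a stable name

# any other client connected to the same scheduler can now call
# client.get_dataset("example-dataset") instead of scattering the data again
client.close()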
Example #30
def do(param):
    dataset = pickle.load(open(f'{os.environ["HOME"]}/dataset.pkl', 'rb'))
    Xs, ys, Xst, yst = dataset

    criterion, n_estimators, max_features, max_depth = param
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   criterion=criterion,
                                   max_features=max_features,
                                   max_depth=max_depth)
    model.fit(Xs, ys)
    ysp = model.predict(Xst)
    acc = accuracy_score(yst, ysp)
    print(acc)
    return [acc, list(param)]


params = []
for cri in ['gini', 'entropy']:
    for n_esti in range(5, 15):
        for max_features in range(10, 20):
            for max_depth in range(4, 20):
                params.append((cri, n_esti, max_features, max_depth))
L = client.map(do, params)

ga = client.gather(L)

import json
json.dump(ga, open('ga.json', 'w'), indent=2)
print(ga)
Example #31
# dask client
from distributed import Client
import numpy as np  # needed for np.mean below
from os.path import join
from math import ceil
from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func

array_list = []
file_list = list_thredds_folder(file_list_url)

# connect to dask
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(data_request + "/" + data_folder + "/" + f +
                    "?time1[0],Temperature_surface[0][0:360][0:719]")

# allocate url to threads servers
server_url_mapping = compute_url_to_thredds_server_map(url_list,
                                                       thredds_servers)

# launch the dask computation and collect results
avg_results_status = client.map(compute_avg_func, server_url_mapping)
avg_results = client.gather(avg_results_status)

final_avg = np.mean(avg_results)

print(final_avg)
Example #32
###Aux channels###
##################

chunk = 16384
pad = 256

# Find the data
#cache1=find_raw_frames(ifo, st1, st1+dur)
#cache2=find_raw_frames(ifo, st2, st2+dur)

# Connect to Dask scheduler
client = Client(args.address)

for t1, t2 in chunk_segments(segs, chunk, pad):
    print('Getting chunk', t1, t2)

    # Set up the channel list
    params_list = [(chan, ifo, t1, t2) for chan in channels
                   ]  #Add in st1, st2, dur for psd comparison tool

    # Run jobs on the cluster and return results
    jobs = client.map(aux_feat_get, params_list)
    result = client.gather(jobs)

    # Write out the results
    #Will sort the results by how much difference in the PSD there is
    #result.sort(key=lambda x: x[1], reverse=True)

    with open('results_of_aux_%u-%u.dat' % (t1, (t2 - t1)), 'wb') as fout:
        pickle.dump(result, fout)
Example #33
                            help='port of the dask scheduler')
    options = arg_parser.parse_args()
    client = Client(f'{options.host}:{options.port:d}')
    if options.implementation == 'python':
        from julia_python import julia_set
    elif options.implementation == 'cython':
        from julia_cython import julia_set
        client.register_worker_callbacks(init_pyx)
    elif options.implementation == 'cython_omp':
        from julia_cython_omp import julia_set
        client.register_worker_callbacks(init_omp_pyx)
    else:
        msg = '{0} version not implemented\n'
        sys.stderr.write(msg.format(options.implementation))
        sys.exit(1)

    domain = init_julia((options.re_min, options.re_max),
                        (options.im_min, options.im_max),
                        (options.n_re, options.n_im))
    domains = np.array_split(domain, options.partitions)
    iterations = np.array_split(
        np.zeros(options.n_re * options.n_im, dtype=np.int32),
        options.partitions)
    start_time = time.time()
    futures = client.map(julia_set, domains, iterations)
    results = client.gather(futures)
    end_time = time.time()
    print('compute time = {0:.6f} s'.format(end_time - start_time))
    np.savetxt('julia.txt',
               np.concatenate(results).reshape(options.n_re, options.n_im))
Example #34
async def assert_basic_futures(c: Client) -> None:
    futures = c.map(inc, range(10))
    results = await c.gather(futures)
    assert results == list(map(inc, range(10)))
Example #35
if __name__ == '__main__':
    arg_parser = ArgumentParser(description='compute sum of squares and check '
                                            'task placement')
    arg_parser.add_argument('--scheduler', help='scheduler host')
    arg_parser.add_argument('--scheduler_port', default='8786',
                            help='scheduler port to use')
    arg_parser.add_argument('--n', type=int, default=100,
                            help='number of terms in sum')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='give verbose output')
    options = arg_parser.parse_args()
    client = Client('{0}:{1}'.format(options.scheduler,
                                     options.scheduler_port))
    if options.verbose:
        print('Client: {0}'.format(str(client)), flush=True)
    futures = client.map(square, range(options.n))
    total = client.submit(sum, futures)
    expected_total = (options.n - 1)*options.n*(2*options.n - 1)//6
    print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(total.result(),
                                                           expected_total))
    futures = client.map(get_hostname, range(options.n))
    process_locations = client.gather(futures)
    if options.verbose:
        print('task placement:')
        print('\t' + '\n\t'.join(process_locations))
    count = dict()
    for process_location in process_locations:
        _, _, hostname = process_location.split()
        if hostname not in count:
            count[hostname] = 0
        count[hostname] += 1