Example #1
def gen_samples():
    ''' generates all monte carlo samples '''
    if DASK:
        # list of delayed operations
        operations = [delayed(gen_sample)(k, STATE[k]) for k in range(NS)]
        # submit futures to client
        futures = CLIENT.compute(operations)
        # progress bar
        if VERBOSE:
            print('----------------------')
            print('performing monte carlo')
            print('----------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(gen_sample)(k, STATE[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD,
                           backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        # loop through pressures
        if VERBOSE:
            print('----------------------')
            print('performing monte carlo')
            print('----------------------')
            futures = [gen_sample(k, STATE[k]) for k in tqdm(range(NS))]
        else:
            futures = [gen_sample(k, STATE[k]) for k in range(NS)]
    return futures
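Examples #1, #3, #5 and #8–#10 appear to come from the same project and share module-level globals (DASK, PARALLEL, CLIENT, NS, STATE, NTHREAD, VERBOSE) that are not shown. A minimal, hypothetical sketch of that setup, with gen_sample as a stand-in; note that the PARALLEL branch would need joblib.delayed rather than dask.delayed:

# hypothetical setup for the shared globals; the real project defines these
# elsewhere, and the values here are placeholders
from dask import delayed            # the PARALLEL branch would use joblib.delayed
from dask.distributed import Client, progress
from tqdm import tqdm

DASK, PARALLEL, VERBOSE = True, False, True
NS = 16                             # number of Monte Carlo samples
NTHREAD = 4                         # joblib thread count for the PARALLEL branch
STATE = [{} for _ in range(NS)]     # per-sample state objects
CLIENT = Client(processes=False) if DASK else None

def gen_sample(k, state):
    ''' stand-in for the real per-sample monte carlo move '''
    return k, state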
Example #2
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all

    outputs = get_dask_outputs(args.path,
                               packages=args.packages,
                               filter_dirty=filter_dirty,
                               git_rev=args.git_rev,
                               stop_rev=args.stop_rev,
                               steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize,
                               test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs,
                  filename=args.visualize)  # create neat looking graph.
    else:
        # many threads per worker, because this is just dispatch: it takes very
        # little compute and mostly waits for builds to complete.
        cluster = LocalCluster(n_workers=1,
                               threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)

        futures = client.persist(outputs)
        progress(futures)
Example #3
def write_outputs():
    ''' writes outputs for all samples '''
    if DASK:
        operations = [
            delayed(write_output)(OUTPUT[k], STATE[k]) for k in range(NS)
        ]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('\n---------------')
            print('writing outputs')
            print('---------------')
            progress(futures)
    elif PARALLEL:
        operations = [
            delayed(write_output)(OUTPUT[k], STATE[k]) for k in range(NS)
        ]
        futures = Parallel(n_jobs=NTHREAD,
                           backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('writing outputs')
            print('---------------')
            for k in tqdm(range(NS)):
                write_output(OUTPUT[k], STATE[k])
        else:
            for k in range(NS):
                write_output(OUTPUT[k], STATE[k])
Example #4
    def submit(
        self,
        client: Client = None,
        scheduler_address: str = None,
        priority: int = None,
        resources: Dict[str, Any] = None,
        show_progress=False,
        **kwargs,
    ) -> None:

        if priority is None:
            priority = self.priority

        if resources is None:
            resources = self.resources

        if client is None:
            client = Client(scheduler_address)

        self.scheduler_address = client.scheduler.address

        computation = client.compute(self.graph,
                                     retries=3,
                                     priority=priority,
                                     resources=resources)
        if show_progress:
            progress(computation)
        fire_and_forget(computation)
        if scheduler_address:
            client.close()
        return None
Example #5
def calculate_cdfs():
    ''' calculate cdfs for all samples '''
    if VERBOSE:
        print('computing %s %s samples' % (NS, EL.lower()))
    if DASK:
        operations = [
            delayed(calculate_cdf)(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
            for i in range(NS)
        ]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            progress(futures)
            print('\n')
    elif PARALLEL:
        operations = [
            delayed(calculate_cdf)(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
            for i in range(NS)
        ]
        futures = Parallel(n_jobs=NTHREAD,
                           backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            futures = [
                calculate_cdf(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                for i in tqdm(range(NS))
            ]
        else:
            futures = [
                calculate_cdf(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                for i in range(NS)
            ]
    return futures
Example #6
    def map_func(func, arg_list):
        with Cluster(**kwargs) as cluster:
            cluster.start_workers(n_workers)
            with Client(cluster) as client:
                _results = client.map(func, arg_list)
                progress(_results, notebook=False)
                result_list = client.gather(_results)
                print("")  # If there will be more output after a progressbar

        return result_list
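map_func closes over Cluster, kwargs, and n_workers from an enclosing scope that is not shown. A hypothetical enclosing factory, using distributed.LocalCluster as the Cluster stand-in and scale() in place of the older start_workers() API:

# hypothetical factory around map_func; LocalCluster stands in for whatever
# Cluster class the original scope provides
from dask.distributed import Client, LocalCluster as Cluster, progress

def make_mapper(n_workers, **kwargs):
    def map_func(func, arg_list):
        with Cluster(**kwargs) as cluster:
            cluster.scale(n_workers)    # modern spelling of start_workers()
            with Client(cluster) as client:
                _results = client.map(func, arg_list)
                progress(_results, notebook=False)
                result_list = client.gather(_results)
                print("")
        return result_list
    return map_func

squares = make_mapper(2, processes=False)(lambda x: x * x, list(range(8)))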
Example #7
    def get_step(self, res, show_progress=False):
        futures, step_t_start = res
        if show_progress:
            distributed.progress(futures)
            print()
        step_results = self.get_chunk(futures)

        descriptors, po_returns, po_lengths, po_novelties, extra = collect_po_results(
            step_results
        )
        episodes_this_step = 2 * len(po_returns)
        timesteps_this_step = po_lengths.sum()

        logger.info(
            "Optimizer {} finished running {} episodes, {} timesteps".format(
                self.optim_id, episodes_this_step, timesteps_this_step
            )
        )

        self.niche.update_extra(extra)

        self.update_population(descriptors, po_returns, po_novelties)
        logger.info("Optimizer {} finished updating population".format(self.optim_id))

        step_t_end = time.time()

        return (
            descriptors,
            po_returns,
            po_novelties,
            StepStats(
                po_returns_mean=po_returns.mean(),
                po_returns_median=np.median(po_returns),
                po_returns_std=po_returns.std(),
                po_returns_max=po_returns.max(),
                po_returns_min=po_returns.min(),
                po_novelties_mean=po_novelties.mean(),
                po_novelties_median=np.median(po_novelties),
                po_novelties_var=po_novelties.var(),
                po_novelties_max=po_novelties.max(),
                po_novelties_min=po_novelties.min(),
                po_len_mean=po_lengths.mean(),
                po_len_std=po_lengths.std(),
                episodes_this_step=episodes_this_step,
                timesteps_this_step=timesteps_this_step,
                time_elapsed_this_step=step_t_end - step_t_start,
            ),
        )
Example #8
def init_samples():
    ''' initializes all samples '''
    if DASK:
        operations = [delayed(init_sample)(k) for k in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('initializing samples')
            print('--------------------')
            progress(futures)
            print('\n')
    elif PARALLEL:
        operations = [delayed(init_sample)(k) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading', verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('initializing samples')
            print('--------------------')
            futures = [init_sample(k) for k in tqdm(range(NS))]
        else:
            futures = [init_sample(k) for k in range(NS)]
    return futures
Example #9
def init_headers():
    ''' writes headers for all samples '''
    if DASK:
        operations = [delayed(init_header)(k, OUTPUT[k]) for k in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('initializing headers')
            print('--------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(init_header)(k, OUTPUT[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading', verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('initializing headers')
            print('--------------------')
            for k in tqdm(range(NS)):
                init_header(k, OUTPUT[k])
        else:
            for k in range(NS):
                init_header(k, OUTPUT[k])
Example #10
def gen_mc_params():
    ''' generate adaptive monte carlo parameters for all samples '''
    if DASK:
        # list of delayed operations
        operations = [delayed(gen_mc_param)(STATE[k]) for k in range(NS)]
        # submit futures to client
        futures = CLIENT.compute(operations)
        # progress bar
        if VERBOSE:
            print('\n------------------')
            print('updating mc params')
            print('------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(gen_mc_param)(STATE[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading', verbose=VERBOSE)(operations)
    else:
        # loop through pressures
        if VERBOSE:
            print('updating mc params')
            print('------------------')
        futures = [gen_mc_param(STATE[k]) for k in range(NS)]
    return futures
Example #11
def dask_executor(items, function, accumulator, **kwargs):
    """Execute using dask futures

    Parameters
    ----------
        items : list
            List of input arguments
        function : callable
            A function to be called on each input, which returns an accumulator instance
        accumulator : AccumulatorABC
            An accumulator to collect the output of the function
        client : distributed.client.Client
            A dask distributed client instance
        treereduction : int, optional
            Tree reduction factor for output accumulators (default: 20)
        status : bool, optional
            If True (default), show a progress bar
        compression : int, optional
            Compress accumulator outputs in flight with LZ4 at the specified
            level (default 1). Set to ``None`` for no compression.
        priority : int, optional
            Task priority, default 0
        retries : int, optional
            Number of retries for failed tasks (default: 3)
        heavy_input : serializable, optional
            Any value placed here will be broadcast to workers and joined to input
            items in a tuple (item, heavy_input) that is passed to function.
        function_name : str, optional
            Name of the function being passed

            .. note:: If ``heavy_input`` is set, ``function`` is assumed to be pure.
    """
    from dask.delayed import delayed
    if len(items) == 0:
        return accumulator
    client = kwargs.pop('client')
    ntree = kwargs.pop('treereduction', 20)
    status = kwargs.pop('status', True)
    clevel = kwargs.pop('compression', 1)
    priority = kwargs.pop('priority', 0)
    retries = kwargs.pop('retries', 3)
    heavy_input = kwargs.pop('heavy_input', None)
    function_name = kwargs.pop('function_name', None)
    reducer = _reduce()
    # secret options
    direct_heavy = kwargs.pop('direct_heavy', None)
    worker_affinity = kwargs.pop('worker_affinity', False)

    if clevel is not None:
        function = _compression_wrapper(clevel, function, name=function_name)
        reducer = _compression_wrapper(clevel, reducer)

    if heavy_input is not None:
        heavy_token = client.scatter(heavy_input, broadcast=True, hash=False, direct=direct_heavy)
        items = list(zip(items, repeat(heavy_token)))

    work = []
    if worker_affinity:
        workers = list(client.run(lambda: 0))

        def belongsto(workerindex, item):
            if heavy_input is not None:
                item = item[0]
            hashed = _hash((item.fileuuid, item.treename, item.entrystart, item.entrystop))
            return hashed % len(workers) == workerindex

        for workerindex, worker in enumerate(workers):
            work.extend(client.map(
                function,
                [item for item in items if belongsto(workerindex, item)],
                pure=(heavy_input is not None),
                priority=priority,
                retries=retries,
                workers={worker},
                allow_other_workers=False,
            ))
    else:
        work = client.map(
            function,
            items,
            pure=(heavy_input is not None),
            priority=priority,
            retries=retries,
        )
    while len(work) > 1:
        work = client.map(
            reducer,
            [work[i:i + ntree] for i in range(0, len(work), ntree)],
            pure=True,
            priority=priority,
            retries=retries,
        )
    work = work[0]
    if status:
        from distributed import progress
        # FIXME: fancy widget doesn't appear, have to live with boring pbar
        progress(work, multi=True, notebook=False)
    accumulator += _maybe_decompress(work.result())
    return accumulator
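For orientation, a minimal usage sketch under stated assumptions: the client keyword is required, a plain int stands in for the accumulator (anything supporting += suffices here), and compression is disabled because the LZ4 wrapping targets the library's accumulator types rather than bare ints.

# hypothetical driver; assumes _reduce() folds a list of outputs with '+'
from dask.distributed import Client

def analyze(item):
    return item * item          # stand-in for the real per-item analysis

client = Client(processes=False)
total = dask_executor(
    list(range(100)),
    analyze,
    0,                          # minimal accumulator: plain int with '+='
    client=client,
    status=False,               # no progress bar in a plain script
    compression=None,           # skip LZ4 wrapping for this sketch
)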
Example #12
#
# - Display head of the dataframe
# - Display number of rows of this dataframe.
# - Compute the total number of passengers.
# - Count occurrences in the payment_type column both for the full dataset, and filtered by zero tip (tip_amount == 0).
# - Create a new column, tip_fraction
# - Plot the average of the new column tip_fraction grouped by day of week.
# - Plot the average of the new column tip_fraction grouped by hour of day.
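#
# (Each of these steps is sketched in a code cell after the loading code below.)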
#
# [Dask dataframe documentation](http://docs.dask.org/en/latest/dataframe.html)
#

# +
import dask.dataframe as dd
from distributed import Client, progress

c = Client('127.0.0.1:8786')
nyc2014 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2014/yellow*.csv',
                      parse_dates=['pickup_datetime', 'dropoff_datetime'],
                      skipinitialspace=True)

nyc2015 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2015/yellow*.csv',
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
nyc2014, nyc2015 = c.persist([nyc2014, nyc2015])

progress(nyc2014, nyc2015)
# -
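
# A sketch of the steps listed above, assuming the standard NYC TLC 2015
# column names (passenger_count, payment_type, tip_amount, fare_amount)
# alongside the datetime column parsed during loading.

# +
nyc2015.head()                                      # head of the dataframe
len(nyc2015)                                        # number of rows
nyc2015.passenger_count.sum().compute()             # total number of passengers

# payment_type counts: full dataset, then zero-tip rides only
nyc2015.payment_type.value_counts().compute()
nyc2015[nyc2015.tip_amount == 0].payment_type.value_counts().compute()

# new column: tip as a fraction of the fare
nyc2015['tip_fraction'] = nyc2015.tip_amount / nyc2015.fare_amount

# average tip_fraction by day of week and by hour of day
day = nyc2015.tpep_pickup_datetime.dt.dayofweek
hour = nyc2015.tpep_pickup_datetime.dt.hour
nyc2015.groupby(day).tip_fraction.mean().compute().plot(kind='bar')
nyc2015.groupby(hour).tip_fraction.mean().compute().plot()
# -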



Example #13
                tiles.append(gvi_)

            sng_date_utm_strip = merge_arrays(tiles, method='max', nodata=np.nan)

            sng_date_reproj = sng_date_utm_strip.rio.reproject('EPSG:4326')

            dates_da.append(sng_date_reproj)

    # time_agg = xr.combine_nested(dates_da, concat_dim=['time'])
    time_agg = xr.combine_nested(dates_da, concat_dim=[['time', 'x', 'y']])

    GVDM = xr.apply_ufunc(_decades, time_agg,
                          input_core_dims=[['time']],
                          exclude_dims={'time', },
                          dask='parallelized',
                          dask_gufunc_kwargs={'allow_rechunk': True},
                          vectorize=True,)
    GVDM.name = 'GVI'

    GVDM_f = GVDM.to_netcdf(rf'L:\HSL\observations\S2\GVI_S2{sensor}_{decad}_{alg}.nc',
                            compute=False,
                            encoding={'GVI': {'_FillValue': -999}}).persist()

    progress(GVDM_f)

    client.close()


Example #14
import os
import random
import time
from pathlib import Path

from distributed import Client, progress

# get_cluster, num_workers and mean_runtime are defined elsewhere in the
# original script and are assumed to be in scope here
def process_file(fname, delay):
    # return the size in bytes of the file
    result = os.path.getsize(fname)
    # simulate a long-running process by sleeping
    time.sleep(delay * random.random())
    return result


if __name__ == "__main__":
    # wrapping the client in a context manager ensures that the client and the cluster get cleaned up
    with get_cluster() as clust, Client(clust) as cl:
        print(f"Cluster dashboard running at {cl.cluster.dashboard_link}")
        # add workers
        cl.cluster.scale(num_workers)

        # list all the files in the home directory
        home_files = list(Path.home().glob("*"))

        # ensure that the total runtime is ~mean_runtime
        delays = ((mean_runtime * num_workers) / len(home_files),) * len(home_files)

        # map the function `process_file` over the arguments `home_files` and `delays`
        # this returns a collection of futures
        futures = cl.map(process_file, home_files, delays)
        progress(futures)
        # block until all the futures are finished
        result = cl.gather(futures)

    print(*zip(map(lambda v: str(v.name), home_files), result))