Example #1
def dask_cluster():
    cluster = LocalCluster(n_workers=2, threads_per_worker=2)
    yield cluster
    cluster.close()
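This looks like a pytest fixture (the yield/teardown pattern). A self-contained sketch of how such a fixture might be registered and consumed is shown below; the @pytest.fixture decorator and the test body are illustrative assumptions, not part of the original snippet.

import pytest
from dask.distributed import Client, LocalCluster


@pytest.fixture
def dask_cluster():
    # Assumed registration; mirrors the snippet above.
    cluster = LocalCluster(n_workers=2, threads_per_worker=2)
    yield cluster
    cluster.close()


def test_submit_on_cluster(dask_cluster):
    # Hypothetical test: connect a client to the fixture's cluster and run a trivial task.
    with Client(dask_cluster) as client:
        assert client.submit(sum, [1, 2, 3]).result() == 6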
Example #2
def compare(X):

    if 'client' not in X:  # start Dask if required
        c = LocalCluster(n_workers=X['workers'].value)
        X['client'] = Client(c)

    if X['Mtype'].value == 'Magnetisations':
        Mswitch = 0
    else:
        Mswitch = 1

    X['Mswitch'] = Mswitch

    # Create variables
    M = X['M']
    DM = X['DM']
    X['Hc'] = 0.5 * (X['H'] - X['Hr'])
    X['Hb'] = 0.5 * (X['H'] + X['Hr'])
    X['Mnorm'] = M / np.max(M)
    X['DMnorm'] = DM / np.max(DM)
    #X['Xlsq'] = np.column_stack((np.ones((X['Hc'].size,1)),X['Hc'],X['Hb'],X['Hc']**2,X['Hb']**2,X['Hc']*X['Hb'],X['Hc']**3,X['Hb']**3,X['Hc']**2*X['Hb'],X['Hc']*X['Hb']**2))
    X['Xlsq'] = np.column_stack(
        (np.ones((X['Hc'].size, 1)), X['H'], X['Hr'], X['H']**2, X['Hr']**2,
         X['H'] * X['Hr'], X['H']**3, X['Hr']**3, X['H']**2 * X['Hr'],
         X['H'] * X['Hr']**2))

    idx = np.argwhere(in_window(X, X['Hc'], X['Hb']))
    X['Hc0'] = X['Hc'][idx]
    X['Hb0'] = X['Hb'][idx]

    #scatter variables
    D = {}
    D['Xlsq'] = X['Xlsq']
    D['M'] = X['Mnorm']
    D['DM'] = X['DMnorm']
    D['Hc'] = X['Hc']
    D['Hb'] = X['Hb']
    D['dH'] = X['dH']
    D['Hc0'] = X['Hc0']
    D['Hb0'] = X['Hb0']
    X['Ds'] = X['client'].scatter(D, broadcast=True)

    Ntot = np.size(X['Hc0'])
    np.random.seed(999)
    Didx = np.sort(np.random.choice(Ntot, X['Ndown'].value,
                                    replace=False))  # downsampled indices

    X = variforc_array(X)  #get smoothing parameter

    jobs = []
    for i in range(len(X['Sp_i'])):
        job = X['client'].submit(process_split, X['Ds'], X['Sp_i'][i], Didx,
                                 Mswitch)
        jobs.append(job)

    results = X['client'].gather(jobs)

    L = np.concatenate(results)

    X['L'] = L

    #Make results plots
    i0 = np.argmax(L[:, 2])
    if (Mswitch < 0.5):
        BF = regress_split(X['Xlsq'], X['Mnorm'], X['Hc'], X['Hb'], X['dH'],
                           X['Hc'], X['Hb'], X['Sp'][i0, 0], X['Sp'][i0, 1],
                           X['Sp'][i0, 4], X['Sp'][i0, 2], X['Sp'][i0, 3],
                           X['Sp'][i0, 4])
    else:
        BF = regress_split(X['Xlsq'], X['DMnorm'], X['Hc'], X['Hb'], X['dH'],
                           X['Hc'], X['Hb'], X['Sp'][i0, 0], X['Sp'][i0, 1],
                           X['Sp'][i0, 4], X['Sp'][i0, 2], X['Sp'][i0, 3],
                           X['Sp'][i0, 4])

    BF[np.isinf(BF)] = 1E200
    X['BF'] = BF
    X['Pr'] = np.exp(BF - logsumexp(BF, axis=1)[:, np.newaxis])
    #Lpt provides labels to points for selected model order
    Lpt = np.argmax(BF - [np.log(3), 0, np.log(3), np.log(3)], axis=1)
    Lpt[np.max(X['BF'], axis=1) < 1] = 0

    X = plot_model_selection(X, Lpt[idx])

    return X
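For context, the scatter/submit/gather pattern that compare relies on reduces to the minimal sketch below; the worker function and data are made up for illustration and stand in for process_split and the scattered dictionary D.

from dask.distributed import Client, LocalCluster


def shifted_total(values, shift):
    # Toy stand-in for process_split: uses the scattered data plus one extra parameter.
    return sum(values) + shift


cluster = LocalCluster(n_workers=2)
client = Client(cluster)

scattered = client.scatter({'values': list(range(10))}, broadcast=True)  # dict of futures
jobs = [client.submit(shifted_total, scattered['values'], i) for i in range(4)]
print(client.gather(jobs))  # [45, 46, 47, 48]

client.close()
cluster.close()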
Example #3
def combine_probes_memory_efficient(probes_summary_dir, util_dir, n_workers):
    cluster = LocalCluster(n_workers=n_workers,
                           threads_per_worker=1,
                           memory_limit="0")
    client = Client(cluster)
    probes_filenames = glob.glob('{}/*_probes.csv'.format(probes_summary_dir))
    blast_lineage = pd.read_csv('{}/blast_lineage.tab'.format(util_dir),
                                sep='\t')
    taxonomic_levels = [
        'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]
    blast_lineage_slim = blast_lineage.loc[:,
                                           ['molecule_id'] + taxonomic_levels]
    index_list = np.arange(0, len(probes_filenames), 200)
    index_list = np.append(index_list, len(probes_filenames))
    extended_taxonomic_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'molecule_id'
    ]
    probes_summary_full = []
    probes_properties_full = []
    for i in range(len(index_list) - 1):
        print('Summarizing probes in group {} out of {} groups...'.format(
            i + 1,
            len(index_list) - 1))
        probes = dd.read_csv(probes_filenames[index_list[i]:index_list[i + 1]])
        probes['molecule_id'] = probes.source.apply(get_molecule_id,
                                                    meta=('str'))
        probes = probes.merge(blast_lineage_slim, on='molecule_id', how='left')
        probes['superkingdom'] = 2
        probes_taxa = probes.loc[:, ['seq'] + extended_taxonomic_levels]
        probes_summary = probes_taxa.groupby('seq').apply(
            calculate_source,
            meta=[('superkingdom', 'str'), ('phylum', 'str'), ('class', 'str'),
                  ('order', 'str'), ('family', 'str'), ('genus', 'str'),
                  ('species', 'str'), ('molecule_id', 'str')])
        probes_properties = probes.loc[:, [
            'seq', 'length', 'Tm', 'GC', 'N', 'self_any_th', 'self_end_th',
            'hair-pin', 'quality'
        ]]
        probes_summary = probes_summary.compute()
        probes_summary = probes_summary.reset_index()
        probes_properties = probes_properties.drop_duplicates().compute()
        probes_properties_full.append(probes_properties)
        probes_summary_full.append(probes_summary)
    probes_summary_full = pd.concat(probes_summary_full).drop_duplicates()
    probes_properties_full = pd.concat(
        probes_properties_full).drop_duplicates()
    probes_taxa_full_dd = dd.from_pandas(probes_summary_full, npartitions=1000)
    probes_summary_consolidate = probes_taxa_full_dd.groupby('seq').apply(
        consolidate_source,
        meta=[('superkingdom', 'str'), ('phylum', 'str'), ('class', 'str'),
              ('order', 'str'), ('family', 'int'), ('genus', 'str'),
              ('species', 'str'), ('molecule_id', 'str'),
              ('max_design_level_numeric', 'int'), ('max_design_level', 'str'),
              ('max_design_target', 'str')])
    probes_summary_compute = probes_summary_consolidate.reset_index()
    probes_summary_compute = probes_summary_compute.compute()
    probes_summary_compute = probes_summary_compute.merge(
        probes_properties_full, on='seq', how='left', copy=False)
    client.close()
    cluster.close()
    probe_summary_filename = '{}/probes_summary.h5'.format(probes_summary_dir)
    probes_summary_compute['max_design_target'] = \
        probes_summary_compute.max_design_target.astype(str)
    taxonomic_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'molecule_id'
    ]
    for i in range(8):
        design_level = taxonomic_levels[i]
        probes_summary_working_design_level = probes_summary_compute.loc[
            probes_summary_compute.max_design_level_numeric.values >= i, :]
        probes_summary_working_design_level.loc[:, 'design_level'] = design_level
        probes_summary_working_design_level.loc[:, 'design_target'] = \
            probes_summary_working_design_level.loc[:, design_level]
        probes_summary_working_design_level.groupby(
            ['design_level', 'design_target']).apply(write_to_hdf,
                                                     probe_summary_filename)
    return
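The groupby(...).apply(..., meta=...) calls above need an explicit meta because the computation is lazy and Dask cannot infer the output schema by running the function; a stripped-down sketch of that pattern with made-up data:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'seq': ['a', 'a', 'b'], 'count': [1, 2, 3]})
ddf = dd.from_pandas(pdf, npartitions=2)

# meta declares the name and dtype of the lazily produced result up front.
totals = ddf.groupby('seq')['count'].apply(lambda s: s.sum(), meta=('count', 'int64'))
print(totals.compute())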
Example #4
import dask as da
from dask.distributed import LocalCluster, Client
from datetime import date
import glob
import numpy as np
import time
import xarray as xr
%pylab inline
local_dir = "/g/data/e14/cp3790/dask-workers" #Replace this with your local directory 
cluster = LocalCluster(processes=False, local_dir=local_dir)
client = Client(cluster)

def reshape_data(da):
    da_groupby = list(da.groupby('time.dayofyear'))
    dayofyear = []
    da_dayofyear = []
    for item in da_groupby:
        dayofyear.append(item[0])
        da_tmp = item[1]
        da_tmp['time'] = da_tmp['time.year']
        da_tmp = da_tmp.rename({'time': 'year'})
        da_tmp = da_tmp.assign_coords(dayofyear=item[0])
        da_dayofyear.append(da_tmp)
    da_reshaped = xr.concat(da_dayofyear, dim='dayofyear')
    return da_reshaped


files = sorted(glob.glob('/g/data/e14/cp3790/Charuni/ERA5-new/era5_dailytmax_*.nc'))

obs_aus = (xr.open_mfdataset(files, combine='nested', concat_dim='time', chunks={'latitude': 10})
           .sel(time=slice('1983', '2012'), longitude=slice(113, 154), latitude=slice(-10, -44)))
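A hypothetical continuation that applies reshape_data to one variable of obs_aus; the variable name mx2t and the climatology step are assumptions, not part of the original snippet.

tmax = obs_aus['mx2t']                       # assumed variable name
tmax_by_doy = reshape_data(tmax)             # dims become (dayofyear, year, latitude, longitude)
climatology = tmax_by_doy.mean(dim='year')   # day-of-year climatology, still lazy
climatology = climatology.compute()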
Example #5
def client():
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)
    yield client
    client.close()
    cluster.close()
Example #6
    for filename in os.listdir(folder):
        #print(filename)
        infilename = os.path.join(folder, filename)
        print(infilename)
        x = glob.glob(infilename + "/*/")
        list_stations_meteo = []

        for path in x:
            print(path)
            os.chdir(infilename + path.split(infilename)[1])
            var = path.split('/')[-2]
            #client = Client(n_workers=int(multiprocessing.cpu_count()))

            #client = Client(n_workers=3)
            cluster = LocalCluster(n_workers=3,
                                   processes=True,
                                   threads_per_worker=3)
            client = Client(cluster)
            print(os.getcwd())
            ds = xr.open_mfdataset(
                glob.glob(os.getcwd() + '/*' + var + '*.grib2'),
                concat_dim='valid_time',
                engine='cfgrib',
                combine='nested',
                parallel=True,
                chunks={
                    "x": -1,
                    "y": -1
                },
                coords='minimal',
                compat='override')  # do I need to chunk here?
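Note that this loop builds a fresh LocalCluster and Client on every iteration and never closes them, which leaks worker processes; a sketch of the context-manager form that would handle cleanup automatically (illustrative only):

from dask.distributed import Client, LocalCluster

# Illustrative: the with-blocks shut the workers down at the end of each iteration.
with LocalCluster(n_workers=3, processes=True, threads_per_worker=3) as cluster:
    with Client(cluster) as client:
        pass  # xr.open_mfdataset(...) and any downstream work would go here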
Example #7
# Print out the EntitySet
print(es_train)
print(es_test)
#%%
logging.debug("Writing TRAIN entity set".format())
es_train.to_pickle(os.path.join(PATH_OFFLINE, "entity set TRAIN.pck"))
logging.debug("Done writing TRAIN entity set".format())

logging.debug("Writing TEST entity set".format())
es_test.to_pickle(os.path.join(PATH_OFFLINE, "entity set TEST.pck"))
logging.debug("Done writing TEST entity set".format())

#%%
n_workers = 12
n_workers = 6
cluster = LocalCluster(n_workers=n_workers, silence_logs=False)
dir(cluster)
print(cluster)


#%% Feature generation
# All features to depth 2
def gen_features_all():
    # Default primitives from featuretools
    default_agg_primitives = [
        "sum", "std", "max", "skew", "min", "mean", "count", "percent_true",
        "num_unique", "mode"
    ]
    default_trans_primitives = [
        "day", "year", "month", "weekday", "haversine", "numwords",
        "characters"
Example #8
    ncfiles = glob.glob(path + '/**/grid*.nc', recursive=True)
    ncfiles.sort()

    d = xr.open_dataset(ncfiles[0])
    relCOMS = d.RELCOM
    ind_receptor = d.ind_receptor
    d.close()

    if e_time is None:
        e_time = pd.to_datetime(ncfiles[-1][-17:-3]).strftime('%Y-%m-%d')
    if s_time is None:
        s_time = pd.to_datetime(ncfiles[0][-17:-3]).strftime('%Y-%m-%d')
    print(create_client)
    if create_client:
        cluster = LocalCluster(n_workers=32,
                               threads_per_worker=1,
                               memory_limit='16GB')
        client = Client(cluster)
        print(cluster)

    date_slice = slice(s_time, e_time)
    if ind_receptor == 1:
        f_name = 'Conc'
    elif ind_receptor == 3:
        f_name = 'WetDep'
    elif ind_receptor == 4:
        f_name = 'DryDep'
    else:
        f_name = 'Unknown'

    dir_p = outpath + '/' + f_name + '_mean_{}_{}'.format(s_time, e_time)
Example #9
        logging.info("Starting Analyzer: {}".format(sid))
        if sid not in self._analyzers:
            raise RuntimeError("Analyzer not found")
        else:
            self._analyzers[sid].start()

    def on_stop(self, sid):
        logging.info("Stopping Analyzer: {}".format(sid))
        if sid not in self._analyzers:
            raise RuntimeError("Analyzer not found")
        else:
            self._analyzers[sid].stop()


if __name__ == "__main__":
    cluster = LocalCluster(n_workers=0)

    # Add GPU workers
    # TODO: Get the number of GPU from configuration file
    cluster.start_worker(name="GPU_WORKER-1", resources={"GPU": 1})

    with cluster, Client(cluster.scheduler_address) as client:
        # Initialize GPU workers
        results = client.run(gpu_worker.init_worker, ".")
        assert all([v == "OK" for _, v in results.items()
                    ]), "Failed to initialize GPU workers"

        # Start analyzer manager
        io_loop = asyncio.get_event_loop()
        manager = AnalyzerManager(cluster, io_loop, ["nats://localhost:4222"])
        io_loop.run_forever()
Example #10
        },
    )

    return {"wf": wf, "acc": acc, "mol": mol, "mf": mf, "descriptors": descriptors, "descriptors_tbdm": descriptors_tbdm}

if __name__ == "__main__":
    import pyqmc
    import pyqmc.dasktools
    from pyqmc.dasktools import line_minimization, cvmc_optimize
    from dask.distributed import Client, LocalCluster   

    r = 1.1

    ncore = 2
    sys = setuph2(r)
    cluster = LocalCluster(n_workers=ncore, threads_per_worker=1)
    client = Client(cluster)

    # Set up calculation
    nconf = 800
    configs = pyqmc.initial_guess(sys["mol"], nconf)
    wf, df = line_minimization(
        sys["wf"],
        configs,
        pyqmc.gradient_generator(sys["mol"], sys["wf"]),
        client=client,
        maxiters=5,
    )

    forcing = {}
    obj = {}
Example #11
def set_client(args, stack, log):

    from omegaconf import open_dict
    # number of threads per worker
    if args.nthreads is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify nthreads when using a distributed scheduler"
            )
        import multiprocessing
        nthreads = multiprocessing.cpu_count()
        with open_dict(args):
            args.nthreads = nthreads
    else:
        nthreads = int(args.nthreads)

    # configure memory limit
    if args.mem_limit is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify mem-limit when using a distributed scheduler"
            )
        import psutil
        mem_limit = int(psutil.virtual_memory()[1] /
                        1e9)  # all available memory by default
        with open_dict(args):
            args.mem_limit = mem_limit
    else:
        mem_limit = int(args.mem_limit)

    if args.nworkers is None:
        raise ValueError("You have to specify the number of workers")
    else:
        nworkers = args.nworkers

    if args.nthreads_per_worker is None:
        nthreads_per_worker = 1
        with open_dict(args):
            args.nthreads_per_worker = nthreads_per_worker
    else:
        nthreads_per_worker = int(args.nthreads_per_worker)

    # the number of chunks being read in simultaneously is equal to
    # the number of dask threads
    nthreads_dask = nworkers * nthreads_per_worker

    if args.nvthreads is None:
        if args.host_address is not None:
            nvthreads = nthreads // nthreads_per_worker
        else:
            nvthreads = nthreads // nthreads_dask
        with open_dict(args):
            args.nvthreads = nvthreads

    os.environ["OMP_NUM_THREADS"] = str(args.nvthreads)
    os.environ["OPENBLAS_NUM_THREADS"] = str(args.nvthreads)
    os.environ["MKL_NUM_THREADS"] = str(args.nvthreads)
    os.environ["VECLIB_MAXIMUM_THREADS"] = str(args.nvthreads)
    os.environ["NUMBA_NUM_THREADS"] = str(args.nvthreads)
    # TODO - does this result in thread over-subscription?
    os.environ["NUMEXPR_NUM_THREADS"] = str(args.nvthreads)

    # set up client
    if args.host_address is not None:
        from distributed import Client
        print("Initialising distributed client.", file=log)
        client = stack.enter_context(Client(args.host_address))
    else:
        if nthreads_dask * args.nvthreads > args.nthreads:
            print(
                "Warning - you are attempting to use more threads than available. "
                "This may lead to suboptimal performance.",
                file=log)
        from dask.distributed import Client, LocalCluster
        print("Initialising client with LocalCluster.", file=log)
        cluster = LocalCluster(processes=True,
                               n_workers=nworkers,
                               threads_per_worker=nthreads_per_worker,
                               memory_limit=str(mem_limit / nworkers) + 'GB')
        cluster = stack.enter_context(cluster)
        client = stack.enter_context(Client(cluster))

    from pfb.scheduling import install_plugin
    client.run_on_scheduler(install_plugin)

    # return updated args
    return args
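A worked example of the thread bookkeeping above, with illustrative numbers for a non-distributed run (so the nthreads_dask branch applies):

# Illustrative numbers only.
nworkers = 2
nthreads_per_worker = 1
nthreads = 8                                    # cpu_count() on the assumed machine
nthreads_dask = nworkers * nthreads_per_worker  # 2 dask threads reading chunks
nvthreads = nthreads // nthreads_dask           # 4 threads each for OMP/BLAS/Numba
print(nthreads_dask, nvthreads)                 # 2 4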
Example #12
import time
from dask.distributed import Client, LocalCluster
import sys

print(sys.argv)
worker = int(sys.argv[1])

if __name__ == '__main__':
    cluster = LocalCluster(
        n_workers=worker,
        scheduler_port=8786,
        host='0.0.0.0',
        dashboard_address='0.0.0.0:8787',
    )

    while True:
        time.sleep(600)
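A client in a separate process could then attach to this long-running scheduler by address; the sketch below assumes it runs on the same host and uses the scheduler_port set above.

# Hypothetical companion script.
from dask.distributed import Client

client = Client("tcp://127.0.0.1:8786")   # matches scheduler_port=8786 above
print(client.scheduler_info()["workers"].keys())
client.close()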
Example #13
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            'pipeline_context',
            'Expected executor to be DaskExecutor got {}'.format(
                pipeline_context.executor),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.run_config.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == 'local':
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'yarn':
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'ssh':
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'pbs':
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'moab':
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'oar':
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'kube':
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={'in_process': {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )
                    variables = {
                        'executionParams': {
                            'selector': {
                                'pipelineName': pipeline_name,
                                'repositoryName':
                                recon_repo.get_definition().name,
                                'repositoryLocationName': '<<in_process>>',
                            },
                            'runConfigData': run_config,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=pipeline_context.pipeline.
                        get_reconstructable_repository().pointer)

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results on the master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
Example #14
'''Dask interface demo:

Use scikit-learn regressor interface with CPU histogram tree method.'''
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask import array as da
import xgboost

if __name__ == '__main__':
    cluster = LocalCluster(n_workers=2,
                           silence_logs=False)  # or use any other clusters
    client = Client(cluster)

    n = 100
    m = 10000
    partition_size = 100
    X = da.random.random((m, n), partition_size)
    y = da.random.random(m, partition_size)

    regressor = xgboost.dask.DaskXGBRegressor(verbosity=2, n_estimators=2)
    regressor.set_params(tree_method='hist')
    regressor.client = client

    regressor.fit(X, y, eval_set=[(X, y)])
    prediction = regressor.predict(X)

    bst = regressor.get_booster()
    history = regressor.evals_result()

    print('Evaluation history:', history)
    assert isinstance(prediction, da.Array)
Example #15
def main(prefetch_storage, block_size, n_files, reps, types, nworkers):

    types = list(types)
    header = ["vhs-bucket/hydi-header.trk"]

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]
    print(files)

    results_path = "../results/"

    bfile = op.join(
        results_path,
        f"real_{n_files}f_{reps}r_{block_size}b_{nworkers}w-recobundles.out",
    )

    helpers.setup_bench(bfile)

    cluster = LocalCluster(n_workers=nworkers, resources={"CPU": 3})

    client = Client(cluster)

    for r in range(reps):
        # random.shuffle(types)
        for t in types:
            print("***", t, "***")
            helpers.drop_caches()

            print(client)

            data = {}
            results = []

            if t == "s3fs":
                print(t)

                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_s3fs,
                        files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)
            else:
                print(t)

                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_prefetch,
                        header + files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        prefetch_storage,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)

            print(client.gather(results))
            system("pkill -f joblib")
Example #16
#importlib.reload(ELMlib)

# + endofcell="--"
port_dict = {
    'mm': 8789,
    #    'hmetzler':8790, # change at will
    'hmetzler': 8888,  # change at will
    'cs': 8791  # change at will
}
my_user_name = getuser()
print(my_user_name)

my_port = port_dict[my_user_name]
print(my_port)

my_cluster = LocalCluster(dashboard_address='localhost:' + str(my_port))

# -

Client(my_cluster)
# --

ELMDataDir = "/home/hmetzler/SOIL-R/Manuscripts/Berkeley/2019/Data/"
runID = "14C_transient_holger_fire.2x2_small"
fn = runID + ".nc"
ds = xr.open_dataset(Path(ELMDataDir).joinpath(runID + ".nc"))
ds

ds_depth = xr.open_dataset(Path(ELMDataDir).joinpath('DZSOI.nc'))

parameter_set = ELMlib.load_parameter_set(nstep=1, ds_depth=ds_depth)
Example #17
def run_tasks(pl_conf, task_type, task_fn, logging_init_fn):
    # Initialize local dask cluster
    logger.debug('Pipeline configuration: %s', pl_conf)
    cluster = LocalCluster(n_workers=pl_conf.n_workers,
                           threads_per_worker=1,
                           processes=True,
                           memory_limit=pl_conf.memory_limit,
                           ip='0.0.0.0')
    client = Client(cluster)

    # Split total region + tile indexes to process into separate lists for each worker
    # (by indexes of those combinations)
    tiles = pl_conf.region_tiles
    idx_batches = np.array_split(np.arange(len(tiles)), pl_conf.n_workers)

    # Assign gpus to tasks in round-robin fashion
    def get_gpu(i):
        if pl_conf.gpus is None:
            return None
        return pl_conf.gpus[i % len(pl_conf.gpus)]

    # Generate a single task configuration for each worker
    tasks = [
        pl_conf.get_task_config(region_indexes=tiles[idx_batch, 0],
                                tile_indexes=tiles[idx_batch, 1],
                                gpu=get_gpu(i))
        for i, idx_batch in enumerate(idx_batches)
    ]

    logger.info('Starting %s pipeline for %s tasks (%s workers)', task_type,
                len(tasks), pl_conf.n_workers)
    logger.debug('Task definitions:\n\t%s',
                 '\n\t'.join([str(t) for t in tasks]))
    try:
        # Passing logging initialization operation, if given, to workers now
        # running in separate processes
        if logging_init_fn:
            client.run(logging_init_fn)

        # Disable the "auto_restart" feature of dask workers which is of no use in this context
        for worker in cluster.workers:
            worker.auto_restart = False

        # Pass tasks to each worker to execute in parallel
        res = client.map(task_fn, tasks)
        res = [r.result() for r in res]
        if len(res) != len(tasks):
            raise ValueError(
                'Parallel execution returned {} results but {} were expected'.
                format(len(res), len(tasks)))
    finally:
        # Note that this often produces a non-critical error due to: https://github.com/dask/distributed/issues/1969
        # but that closing these resources is necessary to avoid GPU oom in post-processing
        client.close()
        cluster.close()

    # Save measurement data to disk
    measure_data = concat(res)
    if measure_data:
        path = exec.record_processor_data(measure_data, pl_conf.output_dir)
        logging.info('%s complete; Measurement data saved to "%s"', task_type,
                     path)
    else:
        logging.info('%s complete', task_type)
Example #18
    LOGGER = logger.get_logger(TEST_NAME)
    # Specify some constants
    URLPATH1 = "s3://dask-avro-data/application-data/app-1000*.avro"

    def filter_func(data):
        return data['payload']['originationCountryCode'] == 'CAN'


    for conf in [(1, 36), (4, 9), (12, 3), (36, 1)]:
        n_workers = conf[0]
        threads_per_worker = conf[1]

        test_name = "dsk_filter_pd_dist_{}_{}".format(n_workers, threads_per_worker)
        LOGGER.info('BEGIN: Running test: {}'.format(test_name))

        cluster = LocalCluster(
            n_workers=n_workers, threads_per_worker=threads_per_worker)
        client = Client(cluster)

        LOGGER.info('START: Creating dask bag with filter')
        bag = dask.bag.read_avro(
            URLPATH1,
            storage_options={
                'config_kwargs': {'max_pool_connections': 500}
            }, 
            blocksize=None
        )
        bag = bag.filter(filter_func)
        LOGGER.info('FINISH: Dask bag created')

        LOGGER.info('START: Creating dask dataframe')
        df = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
Example #19
RESULTS_DESTINATION = RESOURCES_DIR + "/results/python_dask/1/" + RUN_ID

if __name__ == "__main__":
    task_configs = [{
        "location": WAV_FILES_LOCATION,
        "name": file_metadata[0],
        "timestamp": parse(file_metadata[1]),
        "sample_rate": 1500.0,
        "wav_bits": 16,
        "n_samples": 3587,
        "n_channels": 1,
        "results_destination": RESULTS_DESTINATION,
        "calibration_factor": CALIBRATION_FACTOR,
        "segment_duration": SEGMENT_DURATION,
        "window_size": WINDOW_SIZE,
        "window_overlap": WINDOW_OVERLAP,
        "nfft": NFFT
    } for file_metadata in pd.read_csv(METADATA_FILE_PATH).values]

    ncpus = len(os.sched_getaffinity(0))

    cluster = LocalCluster(n_workers=1,
                           threads_per_worker=ncpus,
                           processes=False)

    client = Client(cluster)

    durations = client.map(single_file_handler.process_file, task_configs)
    avg_time = np.average(client.gather(durations))
Example #20
    def setUpClass(cls):
        cls.execution_path = os.path.dirname(os.path.abspath(__file__))
        cluster = LocalCluster(n_workers=1, threads_per_worker=2)
        client = Client(cluster)
        cls.lmp = LammpsLibrary(cores=2, mode='dask', client=client)
        cls.lmp.file(os.path.join(cls.execution_path, "in.simple"))
Example #21

def main():
    """docstring for main"""

    # Load the images with ASE
    images = Trajectory("cu_training.traj")

    calc = Potentials.load(
        model="cu_training.ml4c",
        params="cu_training.params",
        preprocessor="model.scaler",
    )

    for atoms in images:
        energy = calc.get_potential_energy(atoms)
        print("ML4Chem predicted energy = {}".format(energy))
        print("              DFT energy = {}".format(
            atoms.get_potential_energy()))


if __name__ == "__main__":
    logging.basicConfig(
        filename="cu_inference.log",
        level=logging.INFO,
        format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
    )
    cluster = LocalCluster(n_workers=8, threads_per_worker=2)
    client = Client(cluster, asynchronous=True)
    main()
Example #22
@author: donbo
https://docs.dask.org/en/latest/setup/single-distributed.html
http://localhost:8787/status

"""

# from dask.distributed import Client
# client = Client()

import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

cluster = LocalCluster()
client = Client(cluster)

df = pd.DataFrame({
    'A': np.random.randint(1000, size=100000),
    'B': np.random.randint(1000, size=100000)
})
df

ddf = dd.from_pandas(df, npartitions=4)

client.close()
cluster.close()

# cluster.run_on_scheduler(lambda dask_scheduler=None:
#     dask_scheduler.close() & sys.exit(0))
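The snippet builds ddf but never computes on it; a minimal, hypothetical computation (which would have to run before client.close() above) might look like:

# Hypothetical: trigger actual distributed work on the dask dataframe.
print(ddf['A'].mean().compute())
print(ddf.groupby('A')['B'].sum().compute().head())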
Example #23
##from dask.distributed import Client
##
##client = Client('localhost:8786')
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(
    n_workers=4,
    ip='127.0.0.1',
)
client = Client(cluster)
Example #24
def initialize_client():
    cluster = LocalCluster(n_workers=100, threads_per_worker=1)
    client = Client(cluster)
    return (client)
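A hypothetical call site for this helper; note that 100 single-threaded workers means 100 local worker processes, which only makes sense on a large machine. The workload below is made up.

# Hypothetical usage of the helper above.
client = initialize_client()
print(client.dashboard_link)                        # inspect the cluster in a browser
futures = client.map(lambda x: x * x, range(1000))  # toy workload
print(sum(client.gather(futures)))
client.close()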
Example #25
class YarnCluster(object):
    """Start a Dask cluster on YARN.

    You can define default values for this in Dask's ``yarn.yaml``
    configuration file. See http://docs.dask.org/en/latest/configuration.html
    for more information.

    Parameters
    ----------
    environment : str, optional
        Path to an archived Python environment (either ``tar.gz`` or ``zip``).
    n_workers : int, optional
        The number of workers to initially start.
    worker_vcores : int, optional
        The number of virtual cores to allocate per worker.
    worker_memory : str, optional
        The amount of memory to allocate per worker. Accepts a unit suffix
        (e.g. '2 GiB' or '4096 MiB'). Will be rounded up to the nearest MiB.
    worker_restarts : int, optional
        The maximum number of worker restarts to allow before failing the
        application. Default is unlimited.
    worker_env : dict, optional
        A mapping of environment variables to their values. These will be set
        in the worker containers before starting the dask workers.
    scheduler_vcores : int, optional
        The number of virtual cores to allocate per scheduler.
    scheduler_memory : str, optional
        The amount of memory to allocate to the scheduler. Accepts a unit
        suffix (e.g. '2 GiB' or '4096 MiB'). Will be rounded up to the nearest
        MiB.
    deploy_mode : {'remote', 'local'}, optional
        The deploy mode to use. If ``'remote'``, the scheduler will be deployed
        in a YARN container. If ``'local'``, the scheduler will run locally,
        which can be nice for debugging. Default is ``'remote'``.
    name : str, optional
        The application name.
    queue : str, optional
        The queue to deploy to.
    tags : sequence, optional
        A set of strings to use as tags for this application.
    skein_client : skein.Client, optional
        The ``skein.Client`` to use. If not provided, one will be started.

    Examples
    --------
    >>> cluster = YarnCluster(environment='my-env.tar.gz', ...)
    >>> cluster.scale(10)
    """
    def __init__(self,
                 environment=None,
                 n_workers=None,
                 worker_vcores=None,
                 worker_memory=None,
                 worker_restarts=None,
                 worker_env=None,
                 scheduler_vcores=None,
                 scheduler_memory=None,
                 deploy_mode=None,
                 name=None,
                 queue=None,
                 tags=None,
                 skein_client=None):

        spec = _make_specification(environment=environment,
                                   n_workers=n_workers,
                                   worker_vcores=worker_vcores,
                                   worker_memory=worker_memory,
                                   worker_restarts=worker_restarts,
                                   worker_env=worker_env,
                                   scheduler_vcores=scheduler_vcores,
                                   scheduler_memory=scheduler_memory,
                                   deploy_mode=deploy_mode,
                                   name=name,
                                   queue=queue,
                                   tags=tags)

        self._start_cluster(spec, skein_client)

    @cached_property
    def dashboard_link(self):
        """Link to the dask dashboard. None if dashboard isn't running"""
        if self._dashboard_address is None:
            return None
        template = dask.config.get('distributed.dashboard.link')
        dashboard = urlparse(self._dashboard_address)
        params = dict(os.environ)
        params.update({'host': dashboard.hostname, 'port': dashboard.port})
        return template.format(**params)

    @classmethod
    def from_specification(cls, spec, skein_client=None):
        """Start a dask cluster from a skein specification.

        Parameters
        ----------
        spec : skein.ApplicationSpec, dict, or filename
            The application specification to use. Must define at least one
            service: ``'dask.worker'``. If no ``'dask.scheduler'`` service is
            defined, a scheduler will be started locally.
        skein_client : skein.Client, optional
            The ``skein.Client`` to use. If not provided, one will be started.
        """
        self = super(YarnCluster, cls).__new__(cls)
        if isinstance(spec, dict):
            spec = skein.ApplicationSpec.from_dict(spec)
        elif isinstance(spec, str):
            spec = skein.ApplicationSpec.from_file(spec)
        elif not isinstance(spec, skein.ApplicationSpec):
            raise TypeError("spec must be an ApplicationSpec, dict, or path, "
                            "got %r" % type(spec).__name__)
        self._start_cluster(spec, skein_client)
        return self

    def _start_cluster(self, spec, skein_client=None):
        """Start the cluster and initialize state"""

        if 'dask.worker' not in spec.services:
            raise ValueError("Provided Skein specification must include a "
                             "'dask.worker' service")

        skein_client = _get_skein_client(skein_client)

        if 'dask.scheduler' not in spec.services:
            # deploy_mode == 'local'
            self._local_cluster = LocalCluster(n_workers=0,
                                               ip='0.0.0.0',
                                               diagnostics_port=('', 0),
                                               scheduler_port=0)
            scheduler = self._local_cluster.scheduler

            scheduler_address = scheduler.address
            try:
                dashboard_port = scheduler.services['bokeh'].port
            except KeyError:
                dashboard_address = None
            else:
                dashboard_host = urlparse(scheduler_address).hostname
                dashboard_address = 'http://%s:%d' % (dashboard_host,
                                                      dashboard_port)

            app = skein_client.submit_and_connect(spec)
            try:
                app.kv['dask.scheduler'] = scheduler_address.encode()
                if dashboard_address is not None:
                    app.kv['dask.dashboard'] = dashboard_address.encode()
            except BaseException:
                # Failed to connect, kill the application and reraise
                skein_client.kill_application(app.id)
                raise
        else:
            # deploy_mode == 'remote'
            app = skein_client.submit_and_connect(spec)
            try:
                scheduler_address = app.kv.wait('dask.scheduler').decode()
                dashboard_address = app.kv.get('dask.dashboard')
                if dashboard_address is not None:
                    dashboard_address = dashboard_address.decode()
            except BaseException:
                # Failed to connect, kill the application and reraise
                skein_client.kill_application(app.id)
                raise

        # Ensure application gets cleaned up
        self._finalizer = weakref.finalize(self, app.shutdown)

        self.scheduler_address = scheduler_address
        self._dashboard_address = dashboard_address
        self.app_id = app.id
        self.application_client = app

    @classmethod
    def from_current(cls):
        """Connect to an existing ``YarnCluster`` from inside the cluster.

        Returns
        -------
        YarnCluster
        """
        self = super(YarnCluster, cls).__new__(cls)
        app_id = os.environ.get('DASK_APPLICATION_ID', None)
        app_address = os.environ.get('DASK_APPMASTER_ADDRESS', None)
        if app_id is not None and app_address is not None:
            app = skein.ApplicationClient(app_address, app_id)
        else:
            app = skein.ApplicationClient.from_current()
        self._connect_existing(app)
        return self

    @classmethod
    def from_application_id(cls, app_id, skein_client=None):
        """Connect to an existing ``YarnCluster`` with a given application id.

        Parameters
        ----------
        app_id : str
            The existing cluster's application id.
        skein_client : skein.Client
            The ``skein.Client`` to use. If not provided, one will be started.

        Returns
        -------
        YarnCluster
        """
        self = super(YarnCluster, cls).__new__(cls)
        skein_client = _get_skein_client(skein_client)
        app = skein_client.connect(app_id)
        self._connect_existing(app)
        return self

    def _connect_existing(self, app):
        spec = app.get_specification()
        if 'dask.worker' not in spec.services:
            raise ValueError("%r is not a valid dask cluster" % app.id)

        scheduler_address = app.kv.wait('dask.scheduler').decode()
        dashboard_address = app.kv.get('dask.dashboard')
        if dashboard_address is not None:
            dashboard_address = dashboard_address.decode()

        self.app_id = app.id
        self.application_client = app
        self.scheduler_address = scheduler_address
        self._dashboard_address = dashboard_address
        self._finalizer = None

    def __repr__(self):
        return 'YarnCluster<%s>' % self.app_id

    def _dask_client(self):
        if hasattr(self, '_dask_client_ref'):
            client = self._dask_client_ref()
            if client is not None:
                return client
        client = get_client(address=self.scheduler_address)
        self._dask_client_ref = weakref.ref(client)
        return client

    def shutdown(self, status='SUCCEEDED', diagnostics=None):
        """Shutdown the application.

        Parameters
        ----------
        status : {'SUCCEEDED', 'FAILED', 'KILLED'}, optional
            The yarn application exit status.
        diagnostics : str, optional
            The application exit message, usually used for diagnosing failures.
            Can be seen in the YARN Web UI for completed applications under
            "diagnostics". If not provided, a default will be used.
        """
        if self._finalizer is not None and self._finalizer.peek() is not None:
            self.application_client.shutdown(status=status,
                                             diagnostics=diagnostics)
            self._finalizer.detach()  # don't call shutdown later
            # Shutdown in local deploy_mode
            if hasattr(self, '_local_cluster'):
                self._local_cluster.close()
                del self._local_cluster

    def close(self, **kwargs):
        """Close this cluster. An alias for ``shutdown``.

        See Also
        --------
        shutdown
        """
        self.shutdown(**kwargs)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def workers(self):
        """A list of all currently running worker containers."""
        return self.application_client.get_containers(services=['dask.worker'])

    def scale_up(self, n, workers=None):
        """Ensure there are atleast n dask workers available for this cluster.

        No-op if ``n`` is less than the current number of workers.

        Examples
        --------
        >>> cluster.scale_up(20)  # ask for twenty workers
        """
        if workers is None:
            workers = self.workers()
        if n > len(workers):
            self.application_client.scale(service='dask.worker', instances=n)

    def scale_down(self, workers):
        """Retire the selected workers.

        Parameters
        ----------
        workers: list
            List of addresses of workers to close.
        """
        self._dask_client().retire_workers(workers)

    def _select_workers_to_close(self, n):
        client = self._dask_client()
        worker_info = client.scheduler_info()['workers']
        # Sort workers by memory used
        workers = sorted(
            (v['metrics']['memory'], k) for k, v in worker_info.items())
        # Return just the ips
        return [w[1] for w in workers[:n]]

    def scale(self, n):
        """Scale cluster to n workers.

        Parameters
        ----------
        n : int
            Target number of workers

        Examples
        --------
        >>> cluster.scale(10)  # scale cluster to ten workers
        """
        workers = self.workers()
        if n >= len(workers):
            return self.scale_up(n, workers=workers)
        else:
            n_to_delete = len(workers) - n
            # Before trying to close running workers, check if there are any
            # pending containers and kill those first.
            pending = [
                w for w in workers if w.state in ('waiting', 'requested')
            ]

            for c in pending[:n_to_delete]:
                self.application_client.kill_container(c.id)
                n_to_delete -= 1

            if n_to_delete:
                to_close = self._select_workers_to_close(n_to_delete)
                self.scale_down(to_close)

    def _widget_status(self):
        client = self._dask_client()

        workers = client.scheduler_info()['workers']

        n_workers = len(workers)
        cores = sum(w['ncores'] for w in workers.values())
        memory = sum(w['memory_limit'] for w in workers.values())

        text = """
<div>
  <style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
  </style>
  <table style="text-align: right;">
    <tr><th>Workers</th> <td>%d</td></tr>
    <tr><th>Cores</th> <td>%d</td></tr>
    <tr><th>Memory</th> <td>%s</td></tr>
  </table>
</div>
""" % (n_workers, cores, format_bytes(memory))
        return text

    def _widget(self):
        """ Create IPython widget for display within a notebook """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

        client = self._dask_client()

        layout = Layout(width='150px')

        title = HTML('<h2>YarnCluster</h2>')

        status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

        request = IntText(0, description='Workers', layout=layout)
        scale = Button(description='Scale', layout=layout)

        @scale.on_click
        def scale_cb(b):
            with log_errors():
                self.scale(request.value)

        elements = [title, HBox([status, request, scale])]

        if self.dashboard_link is not None:
            link = HTML('<p><b>Dashboard: </b><a href="%s" target="_blank">%s'
                        '</a></p>\n' %
                        (self.dashboard_link, self.dashboard_link))
            elements.append(link)

        self._cached_widget = box = VBox(elements)

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=client.loop)
        pc.start()

        return box

    def _ipython_display_(self, **kwargs):
        try:
            return self._widget()._ipython_display_(**kwargs)
        except ImportError:
            print(self)
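Pulling the class docstring together, a typical end-to-end use might look like the sketch below; the archive name and sizing are placeholders modeled on the docstring, not a tested configuration.

from dask.distributed import Client

# Illustrative only: 'my-env.tar.gz' is the placeholder archive from the docstring.
cluster = YarnCluster(environment='my-env.tar.gz',
                      worker_vcores=2,
                      worker_memory='4 GiB')
cluster.scale(10)                           # ask YARN for ten worker containers
client = Client(cluster.scheduler_address)
# ... submit work through the client ...
client.close()
cluster.shutdown()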
Example #26
                i += 1


if __name__ == "__main__":
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    logger = logging.getLogger(__name__)

    use_local = True

    if use_local:
        logger.info("using local cluster")
        cluster = LocalCluster(n_workers=4, threads_per_worker=4)
        client = Client(cluster)
    else:
        logger.info("using remote cluster")
        client = Client("10.109.20.6:8786")
    logger.info(client)

    src_ds = open_dataset("Y:/ARod/4F/20200317_No5_CamA")
    print(src_ds.inventory)

    logger.info(f"tile by {src_ds.tile_shape}")

    # INPUT (x, y, z) -> TRUE (z, x, y)
    src_ds.remap_tiling_axes({"x": "z", "y": "x", "z": "y"})
    src_ds.flip_tiling_axes(["x", "y"])
Example #27
    merged.train(
        inputs=inputs,
        targets=targets,
        data=data_handler,
        regularization=regularization,
        convergence=convergence,
        optimizer=optimizer,
        device="cpu",
        batch_size=batch_size,
        lr_scheduler=lr_scheduler,
        lossfxn=losses,
        independent_loss=True,
    )

    for index, model in enumerate(merged.models):
        label = "{}_{}".format(index, model.name())
        Potentials.save(model, label=label)

    dump_ls = merged.models[0].get_latent_space(inputs[0])
    dump(dump_ls, filename="checkme.latent")


if __name__ == "__main__":
    logger()
    cluster = LocalCluster(n_workers=5,
                           threads_per_worker=2,
                           dashboard_address=8798)
    client = Client(cluster)
    # Let's do this
    hybrid()
Example #28
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.get_steps_to_execute_by_level()

        pipeline_name = pipeline_context.pipeline_name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "existing":
            # address passed directly to Client() below to connect to existing Scheduler
            cluster = self.cluster_configuration["address"]
        elif cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results on the master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_execution_interrupts, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
Example #29
import modin.config as config
import modin.pandas as pd
from contexttimer import Timer
from docopt import docopt
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    args = docopt(__doc__, version="1.0")
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]

    partitions = int(args["<num>"])
    config.NPartitions.put(partitions)

    cluster = LocalCluster(n_workers=partitions,
                           scheduler_port=0,
                           memory_limit="230G")
    client = Client(cluster)

    with Timer() as timer:
        df = pd.read_sql(
            f"SELECT * FROM {table}",
            conn,
            parse_dates=[
                "l_shipdate",
                "l_commitdate",
                "l_receiptdate",
            ],
        )
    print(f"[Total] {timer.elapsed:.2f}s")
Example #30
        help='Chunk size in x y f dimensions. Maybe helpful in many-cores '
             'low-memory-per-core system like Intel Xeon Phi.'
    )
    args = parser.parse_args()

    # FIXED parameters
    noise_dim = pd.Index(range(500), name='noise_field')
    mask = xr.open_dataarray('/scratch/pkittiwi/fg1p/hera331_fov_mask.nc')
    if args.xyf_chunks is not None:
        chunks = {'x': args.xyf_chunks[0], 'y': args.xyf_chunks[1],
                  'f': args.xyf_chunks[2]}
    else:
        chunks = None
    # Setup and start Dask Local Cluster
    cluster = LocalCluster(n_workers=args.n_workers, processes=args.processes,
                           threads_per_worker=args.threads_per_worker,
                           scheduler_port=args.scheduler_port,
                           diagnostics_port=args.diagnostics_port)
    client = Client(cluster)
    print('Hostname: {:s}'.format(os.environ['HOSTNAME']))
    print('Dask Scheduler address: {:s}'.format(cluster.scheduler_address))
    print('Dask Dashboard link: {:s}'.format(cluster.dashboard_link))

    # Loop over data parameters and perform calculation
    for bw, fbw, t, s in itertools.product(
            args.bin_width, args.filter_bandwidth, args.theta, args.shift
    ):
        start_time = datetime.now()
        ds = xr.open_mfdataset(
            ['/scratch/pkittiwi/fg1p/binned_noise_map/bin{:.2f}MHz/'
             'fbw{:.2f}MHz/theta{:.1f}/shift{:d}/binned_noise_map_bin{:.2f}MHz_'
             'fbw{:.2f}MHz_theta{:.1f}_shift{:d}_{:03d}.nc'
Example #31
    def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
        n_s = sphere.pixels.shape[0]
        n_v = self.u_arr.shape[0]
        
        lambduh = alpha/np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("Building Augmented Operator...")
            proj_operator_real = np.real(gamma).astype(np.float32)
            proj_operator_imag = np.imag(gamma).astype(np.float32)
            gamma = None
            proj_operator = np.block([[proj_operator_real], [proj_operator_imag]])
            proj_operator_real = None
            proj_operator_imag = None 
            logger.info('augmented: {}'.format(proj_operator.shape))
            
            vis_aux = np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32)
            logger.info('vis mean: {} shape: {}'.format(np.mean(vis_aux), vis_aux.shape))

            logger.info("Solving...")
            reg = linear_model.ElasticNet(alpha=lambduh, l1_ratio=0.05, max_iter=10000, positive=True)
            reg.fit(proj_operator, vis_aux)
            sky = reg.coef_
            
            score = reg.score(proj_operator, vis_aux)
            logger.info('Loss function: {}'.format(score))
            
        else:
            from dask_ml.linear_model import LinearRegression
            import dask_glm
            import dask.array as da
            from dask.distributed import Client, LocalCluster
            from dask.diagnostics import ProgressBar
            import dask
            
            logger.info('Starting Dask Client')
            
            if True:
                cluster = LocalCluster(dashboard_address=':8231', processes=False)
                client = Client(cluster)
            else:
                client = Client('tcp://localhost:8786')
                
            logger.info("Client = {}".format(client))
            
            harmonic_list = []
            p2j = 2*np.pi*1.0j
            
            dl = sphere.l
            dm = sphere.m
            dn = sphere.n
        
            n_arr_minus_1 = dn - 1

            du = self.u_arr
            dv = self.v_arr
            dw = self.w_arr
        
            for u, v, w in zip(du, dv, dw):
                harmonic = da.from_array(np.exp(p2j*(u*dl + v*dm + w*n_arr_minus_1)) / np.sqrt(sphere.npix), chunks=(n_s,))
                harmonic = client.persist(harmonic)
                harmonic_list.append(harmonic)

            gamma = da.stack(harmonic_list)
            logger.info('Gamma Shape: {}'.format(gamma.shape))
            #gamma = gamma.reshape((n_v, n_s))
            gamma = gamma.conj()
            gamma = client.persist(gamma)
            
            logger.info('Gamma Shape: {}'.format(gamma.shape))
            
            logger.info("Building Augmented Operator...")
            proj_operator_real = da.real(gamma)
            proj_operator_imag = da.imag(gamma)
            proj_operator = da.block([[proj_operator_real], [proj_operator_imag]])
            
            proj_operator = client.persist(proj_operator)
            
            logger.info("Proj Operator shape {}".format(proj_operator.shape))
            vis_aux = da.from_array(np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32))
            
            #logger.info("Solving...")

            
            en = dask_glm.regularizers.ElasticNet(weight=0.01)
            en =  dask_glm.regularizers.L2()
            #dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            ##dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            #dv = da.from_array(vis_aux)
            

            dask.config.set({'array.chunk-size': '1024MiB'})
            A = da.rechunk(proj_operator, chunks=('auto', n_s))
            A = client.persist(A)
            y = vis_aux # da.rechunk(vis_aux, chunks=('auto', n_s))
            y = client.persist(y)
            #sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000)

            logger.info("Rechunking completed.. A= {}.".format(A.shape))
            reg =  LinearRegression(penalty=en, C=1.0/lambduh,  
                                    fit_intercept=False, 
                                    solver='lbfgs', 
                                    max_iter=1000, tol=1e-8 )
            sky = reg.fit(A, y)
            sky = reg.coef_
            score = reg.score(proj_operator, vis_aux)
            logger.info('Loss function: {}'.format(score.compute()))

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        sphere.set_visible_pixels(sky, scale=True)
        return sky.reshape(-1,1)