def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, so avoid result caching
        pure=False,
    )
    client.gather(future)
    print('Shutting down dask workers')
Example no. 2
def main():
    n_mutation = 100
    client = Client('scheduler:8786')
    futures = client.map(initialize_network, range(n_mutation))
    results = client.gather(futures)
    results.sort(key=lambda x: -x[1])

    truncated = list(map(lambda x: x[0], results[:3]))
    futures = []
    for i, seed in enumerate(truncated):
        name = 'top-{}'.format(i)
        futures.append(
            client.submit(initialize_network, seed, store=True, name=name))
    results = client.gather(futures)
    print(results, flush=True)

    for g in range(10):
        futures = []
        for seed in range(n_mutation):
            futures.append(client.submit(update_network, seed, g + 1))
        results = client.gather(futures)
        results.sort(key=lambda x: -x[1])
        truncated = list(map(lambda x: x[0], results[:3]))

        futures = []
        for i, seed in enumerate(truncated):
            name = 'top-{}'.format(i)
            futures.append(
                client.submit(update_network,
                              seed,
                              g + 1,
                              store=True,
                              name=name))
        results = client.gather(futures)
        print(results, flush=True)
Example no. 3
def main():
    """."""
    host = os.getenv('DASK_SCHEDULER_HOST', default='localhost')
    port = os.getenv('DASK_SCHEDULER_PORT', default=8786)
    print(host, port)
    client = Client('{}:{}'.format(host, port))
    # client.run(init_logging)
    # client.run_on_scheduler(init_logging)

    # Run some mock functions and gather a result
    data = client.map(print_listdir, range(10))
    future = client.submit(print_values, data)
    progress(future)
    print('')
    result = client.gather(future)
    print(result)

    # Run a second stage which runs some additional processing.
    print('here A')
    data_a = client.map(set_value, range(100))
    print('here B')
    data_b = client.map(square, data_a)
    print('here C')
    data_c = client.map(neg, data_b)
    print('here D')
    # Submit a function application to the scheduler
    total = client.submit(sum, data_c)
    print('here E')
    progress(total)
    print(total.result())
    print('here F')
Example no. 4
def run_simulations_dask(clearance_heights, xgaps, Ds, tilts, kwargs):
    # Create client
    
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)
    
    # Iterate over inputs
    futures = []
    
    for clearance_height in clearance_heights:
        for xgap in xgaps:
            for tilt in tilts:
                for D in Ds:
                    futures.append(client.submit(simulate_single,
                                                 clearance_height=clearance_height,
                                                 xgap=xgap, tilt=tilt, D=D, **kwargs))

    # Get results for all simulations
    res = client.gather(futures)
    
    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example no. 5
    def handle(self, *args, **options):
        # Unpack variables
        name = options['name']
        model = options['model']
        segmentation = options['segmentation']
        spatial_aggregation = options['spatial_aggregation']
        categorical_variables = options['categorical_variables']
        scheduler_file = options['scheduler']

        # datacube query
        gwf_kwargs = { k: options[k] for k in ['product', 'lat', 'long', 'region']}
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run 
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(predict_object,
                       iterable,
                       pure=False,
                       model_name=model,
                       segmentation_name=segmentation,
                       categorical_variables=categorical_variables,
                       aggregation=spatial_aggregation,
                       name=name)
        result = client.gather(C)

        print('Successfully ran prediction on %d tiles' % sum(result))
        print('%d tiles failed' % result.count(False))
Example no. 6
def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
Example no. 7
    def rank_populations(self, top: float = 0.5):
        """Given all the populations, rank them according to the
        given fitness function.

        Inputs:
        =======
        top (float): fraction of top populations to return

        Outputs:
        ========
        best_populations (List): top populations
        """

        client = Client()

        client_input = [(self.data, p, self.yield_column)
                        for p in self.populations]

        futures = client.map(GeneticAlgorithm.evaluate_fitness, client_input)
        ranking = client.gather(futures)

        client.close()

        # return top performing populations
        top_n = int(top * len(self.populations))

        return [self.populations[i] for i in argsort(ranking)[-top_n:]]
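A hypothetical call site for rank_populations, assuming it is a method of a genetic-algorithm class (not shown here) that exposes data, populations and yield_column attributes:

# Hypothetical usage; `ga` stands for an instance of the (unshown) class that
# defines rank_populations together with self.data, self.populations and
# self.yield_column.
best_populations = ga.rank_populations(top=0.25)  # keep the top 25% of populations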
Example no. 8
def DASK_batch_mult(matrix_input, vector_input, workers, batch_size,
                    input_size, output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size

    for i in range(batch_no):
        batch = client.scatter(matrix_input[i * batch_size:i * batch_size +
                                            batch_size])
        results.append(
            client.submit(convolution_mean, batch, vector_input, batch_size,
                          vector_input.shape[0]))

    wait(results)
    data = client.gather(results)
    out_tensor = np.empty(
        (batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:i * batch_size +
                   batch_size] = data[i].reshape(batch_size, output_channels,
                                                 input_size, input_size)

    client.shutdown()

    return out_tensor
Example no. 9
class DaskParallelRunner(object):
    """Run the simulations using dask.distributed on a cluster. This requires some set up on the cluster
    (see the dask.distributed documentation).

    TO BE DOCUMENTED.
    """

    def __init__(self, client, chunk=10):

        if isinstance(client, str):
            from dask.distributed import Client
            self.client = Client(client)
        else:
            self.client = client
        self.chunk = chunk

    def __call__(self, function, argument_list):

        def function_with_single_numerical_threads(args):
            lib.set_max_numerical_threads(1)
            return function(*args)

        # make a bag
        argument_list = list(argument_list)
        n = self.chunk

        futures = []
        for i in range(0, len(argument_list), n):
            args = argument_list[i: i + n]
            future = self.client.map(function_with_single_numerical_threads, list(args))
            futures += future

        results = self.client.gather(futures, direct=False)

        return results
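A minimal usage sketch for the runner above; the scheduler address and the simulate function are placeholders, and the class still relies on the module-level lib helper from its original file:

# Hypothetical usage of DaskParallelRunner; the address and simulate() are
# placeholders, not part of the original snippet.
def simulate(a, b):
    return a * b

runner = DaskParallelRunner("tcp://127.0.0.1:8786", chunk=5)
results = runner(simulate, [(i, i + 1) for i in range(20)])
print(len(results))  # one result per argument tuple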
Example no. 10
def get_words(data):
    '''
    Find all high-frequency words in a column of strings
    (or any other pd.Series whose elements are all strings).

    A dask.distributed client is created to compute the
    per-line word frequencies in parallel.

    :param: data
    :type : pd.Series
    '''
    assert isinstance(data, pd.Series)
    assert all(isinstance(i, str) for i in data)
    from dask.distributed import Client

    c = Client()
    tasks = c.map(word_frequency, list(data))
    allDicts = c.gather(tasks)
    allDict = {}

    for dic in allDicts:
        for key in dic.keys():
            if key in allDict.keys():
                allDict[key] += 1
            else:
                allDict[key] = 1

    words = pd.Series(list(allDict.keys()), index=list(allDict.values()))
    words = words.sort_index()
    threshold = int(words.shape[0] / 50)
    words = words.loc[threshold:]
    c.close()
    return words
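A small usage sketch for get_words; it assumes the word_frequency helper that the workers map over is defined in the same module:

# Hypothetical usage; word_frequency (mapped onto the workers above) must be
# importable from the original module.
import pandas as pd

reviews = pd.Series(["the quick brown fox", "the lazy dog", "the fox again"])
frequent_words = get_words(reviews)
print(frequent_words)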
Example no. 11
def run_simulations_dask(tilts, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE

    for tilt in tilts:
        futures.append(client.submit(simulate_single, tilt=tilt, **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example no. 12
    def handle(self, *args, **options):
        # Unpack variables
        model_id = options['model_id']
        out_dir = options['out_dir']

        # Create output dir if does not exist
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # datacube query
        gwf_kwargs = {
            k: options[k]
            for k in ['product', 'lat', 'long', 'region']
        }
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run
        client = Client()
        client.restart()
        C = client.map(predict_pixel_tile, iterable,
                       model_id=model_id,
                       outdir=out_dir)
        filename_list = client.gather(C)
        print(filename_list)
Example no. 13
def load_data_parallel(data_path,
                       num_processes,
                       image_variable="abi",
                       count_variable="flash_counts",
                       time_variable="time"):
    cluster = LocalCluster(n_workers=num_processes, threads_per_worker=1)
    client = Client(cluster)
    data_files = sorted(glob(join(data_path, "*.nc")))
    data_jobs = []
    for data_file in data_files:
        data_jobs.append(
            client.submit(load_single_data_file,
                          data_file,
                          image_variable=image_variable,
                          count_variable=count_variable,
                          time_variable=time_variable))
    wait(data_jobs)
    data_results = client.gather(data_jobs)
    all_images = np.concatenate([d[0] for d in data_results])
    all_counts = np.concatenate([d[1] for d in data_results])
    all_time = pd.DatetimeIndex(np.concatenate([d[2] for d in data_results]))
    client.close()
    cluster.close()
    del client
    del cluster
    return all_images, all_counts, all_time
Example no. 14
class DaskHandler(IProcessingHandler):
    """This class wraps all Dask related functions."""

    def __init__(self, number_of_workers, class_cb: Callable, brain_class, worker_log_level=logging.WARNING):
        super().__init__(number_of_workers)
        self._client: Optional[Client] = None
        self._cluster: Optional[LocalCluster] = None

        self.class_cb = class_cb
        self.brain_class = brain_class
        self.worker_log_level = worker_log_level

    def init_framework(self):
        if self._client:
            raise RuntimeError("Dask client already initialized.")

        # threads_per_worker must be one because atari-env is not thread-safe,
        # and because we lower the thread count from the default we must increase the number of workers.
        self._cluster = LocalCluster(processes=True, asynchronous=False, threads_per_worker=1,
                                     silence_logs=self.worker_log_level,
                                     n_workers=self.number_of_workers,
                                     memory_pause_fraction=False,
                                     lifetime='1 hour', lifetime_stagger='5 minutes', lifetime_restart=True,
                                     interface="lo")
        self._client = Client(self._cluster)
        self._client.register_worker_plugin(_CreatorPlugin(self.class_cb, self.brain_class), name="creator-plugin")
        logging.info("Dask dashboard available at port: " + str(self._client.scheduler_info()["services"]["dashboard"]))

    def map(self, func, *iterable):
        if not self._client:
            raise RuntimeError("Dask client not initialized. Call \"init_framework\" before calling \"map\"")
        return self._client.gather(self._client.map(func, *iterable))

    def cleanup_framework(self):
        self._client.shutdown()
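A hypothetical lifecycle for DaskHandler; the callback, brain class, evaluation function and population below are placeholders, not part of the original snippet:

# Hypothetical usage; make_individual, MyBrain, evaluate_individual and
# population are placeholders.
handler = DaskHandler(number_of_workers=4, class_cb=make_individual, brain_class=MyBrain)
handler.init_framework()                                  # start the LocalCluster and register the worker plugin
fitnesses = handler.map(evaluate_individual, population)  # map + gather in one call
handler.cleanup_framework()                               # shut the client down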
Example no. 15
def run_simulations_dask(xgaps, numpanelss, sensorsxs, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    for numpanels in numpanelss:
        for xgap in xgaps:
            for sensorx in sensorsxs:
                futures.append(
                    client.submit(simulate_single,
                                  xgap=xgap,
                                  numpanels=numpanels,
                                  sensorx=sensorx,
                                  **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example no. 16
def run_simulations_dask(daylist, posxs, moduleWiths, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE

    for daydate in daylist:
        for posx in posxs:
            for moduleWith in moduleWiths:
                futures.append(
                    client.submit(simulate_single,
                                  daydate=daydate,
                                  posx=posx,
                                  moduleWith=moduleWith,
                                  **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except Exception:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example no. 17
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    future = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects, so avoid result caching
        pure=False,
    )
    run_id = client.gather(future)
    print(f'Train run id: {run_id}')
Example no. 18
def start_futures():
    t = time()
    isins = get_isins()

    client = Client('127.0.0.1:8786')

    data = client.map(load_data, isins)
    params_a = client.map(get_param, data, ['param_a'] * len(isins))
    params_b = client.map(get_param, data, ['param_b'] * len(isins))

    result_a = client.map(task_a, isins, params_a, params_b)

    group_args = list(chain(*zip(isins, result_a, params_b)))
    result_group = client.submit(task_group_alter, *group_args)

    result_b = client.map(task_b, isins, params_b, [result_group] * len(isins))

    result_c = client.map(task_c, isins, params_b)

    result = client.gather([result_group] + result_a + result_b + result_c)

    total = time() - t
    print(total)
    print(len(result))
    with open('/Users/vladimirmarunov/git/dask-test/res.txt', 'w') as f:
        f.write('{}\n'.format(total))
        json.dump(result, f, indent=4)
Example no. 19
def main(args):
    config_file = args.config_file

    # Configure on cluster
    if config_file:
        with open(config_file, 'r') as stream:
            inp = yaml.safe_load(stream)
        cores = inp['jobqueue']['slurm']['cores']
        memory = inp['jobqueue']['slurm']['memory']
        jobs = inp['jobqueue']['slurm']['jobs']
        cluster = SLURMCluster(
            cores=cores,
            memory=memory,
        )
        cluster.scale(jobs=jobs)

    # Configure locally
    else:
        cluster = LocalCluster()

    client = Client(cluster)
    raised_futures = client.map(sleep_more, range(100))
    progress(raised_futures)
    raised = client.gather(raised_futures)
    print('\n', raised)
Example no. 20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Configuration yaml file")
    parser.add_argument("-p",
                        "--proc",
                        type=int,
                        default=1,
                        help="Number of processors")
    args = parser.parse_args()
    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    #time_files = get_cam_output_times(config["model_path"], time_var=config["time_var"],
    #                                  file_start=config["model_file_start"],
    #                                  file_end=config["model_file_end"])
    if not exists(config["out_path"]):
        makedirs(config["out_path"])
    #print(time_files)

    #filenames = np.sort(time_files["filename"].unique())
    filenames = sorted(
        glob(
            join(config["model_path"],
                 config["model_file_start"] + "*" + config["model_file_end"])))
    if "dt" not in config.keys():
        config["dt"] = 1800
    if args.proc == 1:
        for filename in filenames:
            process_cesm_file_subset(
                filename,
                staggered_variables=config["staggered_variables"],
                out_variables=config["out_variables"],
                subset_variable=config["subset_variable"],
                subset_threshold=config["subset_threshold"],
                out_path=config["out_path"],
                out_format=config["out_format"],
                dt=config["dt"])
    else:
        cluster = LocalCluster(n_workers=0)
        cluster.scale(args.proc)
        client = Client(cluster)
        print(client)
        futures = client.map(process_cesm_file_subset,
                             filenames,
                             staggered_variables=config["staggered_variables"],
                             out_variables=config["out_variables"],
                             subset_variable=config["subset_variable"],
                             subset_threshold=config["subset_threshold"],
                             out_path=config["out_path"],
                             out_start=config["out_start"],
                             out_format=config["out_format"],
                             dt=config["dt"])
        out = client.gather(futures)
        print(out)
        client.close()
    return
Example no. 21
def eval_parameter_grid(run_ids,
                        job_name,
                        eval_function,
                        parameter_grid,
                        n_gpus=1):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='5:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:{n_gpus}',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
    original_parameters = []
    for params in parameters:
        original_params = {}
        original_params['n_samples'] = params.pop('n_samples', None)
        original_params['loss'] = params.pop('loss', 'mae')
        original_params['fixed_masks'] = params.pop('fixed_masks', False)
        original_parameters.append(original_params)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    for params, original_params, future in zip(parameters, original_parameters,
                                               futures):
        metrics_names, eval_res = client.gather(future)
        params.update(original_params)
        print('Parameters', params)
        print(metrics_names)
        print(eval_res)
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
Example no. 22
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None, model_name=None, model_size=None, loss='mae'):
    job_name = 'dealiasing_fastmri'
    model_specs = list(get_model_specs(force_res=True, dealiasing=True))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)
    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    train_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
    client = Client(train_cluster)
    futures = [client.submit(
        # function to execute
        train_dealiaser,
        model_fun=model_fun,
        model_kwargs=kwargs,
        run_id=f'{model_name}_{model_size}',
        n_scales=n_scales,
        contrast=contrast,
        n_epochs=n_epochs,
        n_samples=n_samples,
        loss=loss,
    ) for model_name, model_size, model_fun, kwargs, _, n_scales, _ in model_specs]
    run_ids = client.gather(futures)
    client.close()
    train_cluster.close()
    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
Example no. 23
def _get_online_sp():
    client = Client()  # start local workers as threads
    # TODO: Figure out a way to not hardwire the pages
    futures = client.map(request_online, range(1, 48))
    df = pd.concat(client.gather(futures)).reset_index(drop=True)
    cleaned_df = df[(df['snow_depth'] != '') & (df['lat'] != '') & (df['lon'] != '')].sort_values(
        by='time').reset_index(drop=True)
    cleaned_df.loc[:, 'lon'] = cleaned_df['lon'].astype(float)
    cleaned_df.loc[:, 'lat'] = cleaned_df['lat'].astype(float)
    return cleaned_df
Example no. 24
def distribute(func, parameters, scheduler_addr=None):
    """Run the function with the parameters in parallel distributedly."""
    client = None
    try:
        if scheduler_addr:
            addr = scheduler_addr
        elif not hasattr(parameters[0], 'scheduler_addr'):
            raise RuntimeError('The parameters or distribute() need a scheduler_addr parameter.')
        else:
            addr = parameters[0].scheduler_addr

        client = Client(addr)
        futures = client.map(func, parameters)
        results = client.gather(futures)
    except Exception as e:
        print('Distributed run failed.')
        raise e
    finally:
        if client is not None:
            client.close()

    return results
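A hypothetical call site for distribute; the worker function and the scheduler address below are placeholders:

# Hypothetical usage; the scheduler address is a placeholder.
def square(x):
    return x * x

results = distribute(square, [1, 2, 3, 4], scheduler_addr='tcp://127.0.0.1:8786')
print(results)  # [1, 4, 9, 16]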
Example no. 25
def eval_parameter_grid(job_name,
                        eval_function,
                        parameter_grid,
                        run_ids,
                        n_samples_eval=None):
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)
    assert n_parameters_config == len(
        run_ids), 'Not enough run ids provided for grid evaluation'
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='3:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    eval_cluster.scale(n_parameters_config)
    client = Client(eval_cluster)
    n_samples_list = []
    for params in parameters:
        n_samples = params.pop('n_samples', -1)
        n_samples_list.append(n_samples)
    futures = [
        client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=n_samples_eval,
            **params,
        ) for run_id, params in zip(run_ids, parameters)
    ]

    results = []
    for params, future, n_samples in zip(parameters, futures, n_samples_list):
        metrics_names, eval_res = client.gather(future)
        if n_samples != -1:
            params.update({'n_samples': n_samples})
        results.append((params, eval_res))
    print('Shutting down dask workers')
    client.close()
    eval_cluster.close()
    return metrics_names, results
Example no. 26
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")
    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()
    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
Example no. 27
class daskerator(object):
    _DSCH = {
        'd': 'distributed',
        't': 'threads',
        'p': 'processes',
        's': 'synchronous'
    }

    def _get_sched(mp_type) -> str:
        if mp_type in daskerator._DSCH.keys():
            return daskerator._DSCH[mp_type]
        else:
            return mp_type

    mp_type = attr.ib(default='s',
                      type=str,
                      converter=_get_sched,
                      validator=attr.validators.in_(
                          list(_DSCH.keys()) + list(_DSCH.values())))
    sch_add = attr.ib(default='', type=str)

    @sch_add.validator
    def check_dask_opts(instance, attribute, value):
        if instance.mp_type != 'distributed' and value != '':
            raise ValueError(
                'Only distributed dask can accept scheduler address.')

    _client = attr.ib(default=None)
    _cluster = attr.ib(default=None)

    def __attrs_post_init__(self):
        if self.mp_type[0] == 'd':
            from dask.distributed import Client, LocalCluster
            dbg("Creating distributed client object.")
            if self.sch_add == '':
                dbg("Creating new cluster on localhost.")
                self._cluster = LocalCluster()
                self._client = Client(self._cluster)
            else:
                dbg(f"Existing scheduler address: {self.sch_add}")
                self._client = Client(self.sch_add)
            log.info(self._client)

    @curry
    def run_dask(self, func, iterator):
        dbg(f'Scheduler: {self.mp_type}')
        if self.mp_type[0] == 'd':
            dbg('Using dask client')
            return self._client.gather(self._client.map(func, iterator))
        else:
            dbg('Not using dask client.')
            return compute(*map(delayed(func), iterator),
                           scheduler=self.mp_type)
Example no. 28
def generate_captchas():
    # start Dask distributed client with 6 workers / 1 thread per worker
    client = DaskClient(n_workers=6, threads_per_worker=1)
    # submit future functions to cluster
    futures = []
    for i in range(10000):
        futures.append(client.submit(synth_captcha, pure=False))
    # execute and compute results (synchronous / blocking!)
    results = client.gather(futures)
    print(len(results))
    # stop & release client
    client.close()
Example no. 29
def convert_batch(apkFilenameList):
    """Convert APK files to AppGene files in batch.  Dask creates multiple threads or use multiple nodes to execute the convertSingleApk function.
       Args: 
         apkFilenameList: A list of the base filenames of APK files available from the HTTP interface to be converted 
       Returns:
         A list of conversion result objects 
    """
    client = Client(daskSchedulerConnection)
    # One APK file per new task
    futures = client.map(convertSingleApk, apkFilenameList)
    # Await until all tasks are done
    results = client.gather(futures)
    return list(results)
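A hypothetical call site for convert_batch; the APK filenames are placeholders, and daskSchedulerConnection is assumed to be defined at module level as in the original file:

# Hypothetical usage; the filenames below stand in for APKs served by the
# HTTP interface mentioned in the docstring.
conversion_results = convert_batch(["app-debug.apk", "app-release.apk"])
print(len(conversion_results))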
Example no. 30
def main():
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)
    print("started cluster")
    num_layers = [2, 3, 4]
    num_neurons = [20, 40, 60]
    futures = []
    for l in num_layers:
        for n in num_neurons:
            futures.append(client.submit(train_random_model, l, n))
    results = client.gather(futures)
    print(results)
    client.close()
    return