Example #1
def run_hg38_variant_retrieval(client: Client = None,
                               force: bool = False) -> List[Future]:
    """
    Executes the genomic variant retrieval step of the ETL pipeline for hg38 variants.

    arguments
        client: a dask Client object
        force:  if true, datasets will be downloaded even if they exist locally

    returns
        a list of Futures, one per chromosome variant build
    """

    client = get_client() if client is None else client
    futures = []

    for chrom in Globals().var_human_chromosomes:

        ## Download from Ensembl
        dl = client.submit(download_hg38_variant_build, chrom, force=force)

        ## Decompress
        dl_unzip = client.submit(_unzip, dl, force=force)

        futures.append(dl_unzip)

    return futures
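The function above only schedules work and hands back one Future per chromosome. A minimal caller, as a sketch rather than part of the original pipeline (the local Client and the wait call are assumptions), might look like:

from dask.distributed import Client, wait

client = Client()  # assumption: a throwaway local cluster; any existing client works too
futures = run_hg38_variant_retrieval(client=client, force=False)
wait(futures)      # block until every download + unzip pair has finished
print('completed {} chromosome builds'.format(len(futures)))
client.close()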
Example #2
def main():
    n_mutation = 100
    client = Client('scheduler:8786')
    futures = client.map(initialize_network, range(n_mutation))
    results = client.gather(futures)
    results.sort(key=lambda x: -x[1])

    truncated = list(map(lambda x: x[0], results[:3]))
    futures = []
    for i, seed in enumerate(truncated):
        name = 'top-{}'.format(i)
        futures.append(
            client.submit(initialize_network, seed, store=True, name=name))
    results = client.gather(futures)
    print(results, flush=True)

    for g in range(10):
        futures = []
        for seed in range(n_mutation):
            futures.append(client.submit(update_network, seed, g + 1))
        results = client.gather(futures)
        results.sort(key=lambda x: -x[1])
        truncated = list(map(lambda x: x[0], results[:3]))

        futures = []
        for i, seed in enumerate(truncated):
            name = 'top-{}'.format(i)
            futures.append(
                client.submit(update_network,
                              seed,
                              g + 1,
                              store=True,
                              name=name))
        results = client.gather(futures)
        print(results, flush=True)
Example #3
def main():
    """."""
    host = os.getenv('DASK_SCHEDULER_HOST', default='localhost')
    port = os.getenv('DASK_SCHEDULER_PORT', default=8786)
    print(host, port)
    client = Client('{}:{}'.format(host, port))
    # client.run(init_logging)
    # client.run_on_scheduler(init_logging)

    # Run some mock functions and gather a result
    data = client.map(print_listdir, range(10))
    future = client.submit(print_values, data)
    progress(future)
    print('')
    result = client.gather(future)
    print(result)

    # Run a second stage which runs some additional processing.
    print('here A')
    data_a = client.map(set_value, range(100))
    print('here B')
    data_b = client.map(square, data_a)
    print('here C')
    data_c = client.map(neg, data_b)
    print('here D')
    # Submit a function application to the scheduler
    total = client.submit(sum, data_c)
    print('here E')
    progress(total)
    print(total.result())
    print('here F')
Example #4
def log2tf_winsorizer(meta: pd.DataFrame, counts: pd.DataFrame,
                      log2_transform: bool, threads: int) -> pd.DataFrame:
    """
    Splits the counts by cell-type cluster, winsorizes each cluster's values
    (capping the upper 5%), optionally log2(x + 1)-transforms them, and
    returns the re-concatenated DataFrame.
    """
    cluster_names = meta['cell_type'].drop_duplicates().tolist()

    def mstats_winsorizer(s):
        return mstats.winsorize(s, limits=[0, 0.05])

##########
# def WinsorizeSampleList(data):
#     # quantiles = data.quantile([0.95])
#     # q_05 = quantiles.loc[0.05]
#     # q_95 = quantiles.loc[0.95]

#     quantiles = data.quantile([0.25, 0.75])
#     q_25 = quantiles.loc[0.25]
#     q_75 = quantiles.loc[0.75]
#     step = (q_75 - q_25) * 1.5
#     # return data[(data.values >= q_05) & (data.values <= q_95)]
#     return data[(data.values >= q_75 + step)]
###########

# Winsorizer function

    def winsorizer_process(cluster_count, log2_transform):

        winsorized_cluster_count_array = cluster_count.apply(mstats_winsorizer,
                                                             axis=1)

        if log2_transform:
            winsorized_cluster_count_array = winsorized_cluster_count_array[:].apply(
                lambda x: np.log2(x + 1))

        return pd.DataFrame.from_records(winsorized_cluster_count_array,
                                         index=cluster_count.index,
                                         columns=cluster_count.columns)

    chunks = [
        counts.loc[:, meta[meta['cell_type'] == cluster_name].index]
        for cluster_name in cluster_names
    ]

    # Concatenating the individual dataframes
    def df_concatenate(dfs):

        return pd.concat(dfs, axis=1)

    # Starting DASK client and submitting parallel processing jobs
    client = Client()
    L = [
        client.submit(winsorizer_process, future, log2_transform)
        for future in chunks
    ]
    future = client.submit(df_concatenate, L)
    result = future.result()

    return result
Example #5
 def threadExample(self):
     world_rank = self.comms.comm.Get_rank()
     world_size = self.comms.comm.Get_size()
     client = Client()
     exec_time = 0
     exec_time -= time.time()
     # submit the send and the receive so they run concurrently on the cluster
     a = client.submit(self.sendToRank, 1, world_size)
     b = client.submit(self.recvFromRank, 0, world_size)
     print(a.result(), b.result())
     exec_time += time.time()
     print("Dask Time Taken : " + str(exec_time))
Example #6
def test_recv_any_rank(n_trials, ucx_cluster):

    client = Client(ucx_cluster)

    try:

        cb = CommsContext(comms_p2p=True)
        cb.init()

        dfs = [
            client.submit(func_test_recv_any_rank,
                          cb.sessionId,
                          n_trials,
                          random.random(),
                          workers=[w]) for w in cb.worker_addresses
        ]

        wait(dfs)

        result = [x.result() for x in dfs]

        assert result

    finally:
        cb.destroy()
        client.close()
Example #7
class QAExtractor(Extractor):
    def __init__(self, client):
        if isinstance(client, str):
            logger.info('Input is a str, inferring to be scheduler address. Initializing client')
            self.addr = client
            self.client = Client(client)#, serializers=['msgpack', 'dask'], deserializers=['msgpack', 'dask'])
            logger.info(self.client)
        else:
            self.client = client
            self.addr = None

    def __del__(self):
        if self.addr is not None:
            self.client.close()

    def extract(self, query, context):
        result = self.client.submit(QAExtractor._extract, query, context, resources={'qa': 1})
        result = result.result()
        return result

    @classmethod
    def _extract(cls, query, context):
        worker = get_worker()
        dp = None
        for plg in worker.plugins:
            if 'qa_extractor' in plg:
                dp = worker.plugins[plg]
                break
        if dp is None:
            raise Exception('No QA plugin registered')
        model = dp.model
        answer, score = model.extract(query, context)
        return answer, score
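`QAExtractor._extract` expects each worker to carry a plugin whose registered name contains 'qa_extractor' and whose `model` attribute exposes an `extract(query, context)` method. A hedged sketch of registering such a plugin is shown below; `QAWorkerPlugin` and `load_qa_model` are illustrative names, not part of the original code:

from dask.distributed import Client
from distributed import WorkerPlugin

class QAWorkerPlugin(WorkerPlugin):
    """Illustrative plugin that holds the QA model on each worker."""

    def setup(self, worker):
        # load_qa_model is a placeholder for whatever returns an object
        # exposing .extract(query, context)
        self.model = load_qa_model()

client = Client('scheduler:8786')
client.register_worker_plugin(QAWorkerPlugin(), name='qa_extractor')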
Example #8
def run_simulations_dask(xgaps, numpanelss, sensorsxs, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    for numpanels in numpanelss:
        for xgap in xgaps:
            for ii in sensorsxs:
                futures.append(
                    client.submit(simulate_single,
                                  xgap=xgap,
                                  numpanels=numpanels,
                                  sensorx=ii,
                                  **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example #9
def start_futures():
    t = time()
    isins = get_isins()

    client = Client('127.0.0.1:8786')

    data = client.map(load_data, isins)
    params_a = client.map(get_param, data, ['param_a'] * len(isins))
    params_b = client.map(get_param, data, ['param_b'] * len(isins))

    result_a = client.map(task_a, isins, params_a, params_b)

    group_args = list(chain(*zip(isins, result_a, params_b)))
    result_group = client.submit(task_group_alter, *group_args)

    result_b = client.map(task_b, isins, params_b, [result_group] * len(isins))

    result_c = client.map(task_c, isins, params_b)

    result = client.gather([result_group] + result_a + result_b + result_c)

    total = time() - t
    print(total)
    print(len(result))
    with open('/Users/vladimirmarunov/git/dask-test/res.txt', 'w') as f:
        f.write('{}\n'.format(total))
        json.dump(result, f, indent=4)
Example #10
 def _submit(self, cluster: "ClusterType", client: Client, f: Callable,
             *args, **kwargs) -> Future:
     # Keep the Dask default that functions are pure, unless the cluster
     # or the caller explicitly overrides it.
     kwargs.update(
         {"pure": getattr(cluster, "pure", kwargs.get("pure", True))})
     return client.submit(f, *args, **kwargs)
Example #11
def plugin_f_and_f(dump, plugin, params, user_pk):
    """
    Fire and forget plugin on dask
    """
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(run_plugin, dump, plugin, params, user_pk))
Example #12
def subdivideCheck(client: distributed.Client, lower: float, upper: float,
                   func: Callable[[float], float], step: float,
                   eps: float) -> distributed.Future:
    """Subdivide the range upper<=x<=lower into segments of size at most 1, and then call findZero(func, l, u, eps) on each."""
    if (upper - lower) <= 1:
        # proxy for determining if this interval should be searched. 1 in 4 chance.
        if 1 == randint(0, 3):
            return client.submit(piece, lower, upper, func, step, eps)
        else:
            return []
    else:
        # proxy for recursive exploration of a data space or structure.
        mid = (upper + lower) / 2.
        rl = subdivideCheck(client, lower, mid, func, step, eps)
        rr = subdivideCheck(client, mid, upper, func, step, eps)
        return client.submit(concat, rl, rr)
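The helpers piece, findZero and concat are not shown in this snippet. As an illustration only, and assuming piece returns a list of results, a concat consistent with how it is used here (merging the results of the two half-ranges, either of which may be an empty list) could be:

def concat(left, right):
    # Dask resolves any Futures passed as arguments, so both operands arrive
    # as plain lists of results (possibly empty) by the time this runs.
    return list(left) + list(right)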
Example #13
def run_simulations_dask(clearance_heights, xgaps, Ds, tilts, kwargs):
    # Create client
    
    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)
    
    # Iterate over inputs
    futures = []
    
    for clearance_height in clearance_heights:
        for xgap in xgaps:
            for tilt in tilts:
                for D in Ds:
                    futures.append(
                        client.submit(simulate_single,
                                      clearance_height=clearance_height,
                                      xgap=xgap, tilt=tilt, D=D, **kwargs))

    # Get results for all simulations
    res = client.gather(futures)
    
    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example #14
def test_allreduce(cluster):

    client = Client(cluster)

    try:
        cb = CommsContext()
        cb.init()

        start = time.time()
        dfs = [
            client.submit(func_test_allreduce,
                          cb.sessionId,
                          random.random(),
                          workers=[w]) for wid, w in
            zip(range(len(cb.worker_addresses)), cb.worker_addresses)
        ]
        wait(dfs)

        print("Time: " + str(time.time() - start))

        print(str(list(map(lambda x: x.result(), dfs))))

        assert all(list(map(lambda x: x.result(), dfs)))

    finally:
        cb.destroy()
        client.close()
Example #15
def run_simulations_dask(tilts, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE

    for tilt in tilts:
        futures.append(client.submit(simulate_single, tilt=tilt, **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example #16
def main():
    client = Client(address=DASK_SCHEDULER_URL)  # , asynchronous=True
    print(client)
    # client.restart()
    # client.close(10)
    # scheduler_info = client.scheduler_info()
    # from pprint import pprint
    # pprint(scheduler_info)
    x = client.submit(add, 1, 2)
    # print("status:{},key:{},done():{},result:{}".format(x.status, x.key, x.done(), x.result()))
    print("status:{},key:{},done():{}".format(x.status, x.key, x.done()))

    y = client.submit(np.random.random, 1000, pure=False)
    print(y.key)
    z = client.submit(np.random.random, 1000, pure=False)
    print(z.key)
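The pure=False flag is what gives y and z distinct keys above: by default Dask assumes submitted functions are pure and derives the task key from the function and its arguments, so identical calls are deduplicated into a single task. A small self-contained sketch of the contrast, using a throwaway local client (an assumption, not part of the original):

import numpy as np
from dask.distributed import Client

client = Client()  # throwaway local cluster for illustration

# pure=True (the default): identical calls hash to the same key and task
p1 = client.submit(np.random.random, 1000)
p2 = client.submit(np.random.random, 1000)
assert p1.key == p2.key

# pure=False: every submission gets a unique key, so both actually run
q1 = client.submit(np.random.random, 1000, pure=False)
q2 = client.submit(np.random.random, 1000, pure=False)
assert q1.key != q2.key

client.close()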
Example #17
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects
        pure=True,
    )
    run_id = client.gather(futures)
    print(f'Train run id: {run_id}')
Example #18
def DASK_batch_mult(matrix_input, vector_input, workers, batch_size,
                    input_size, output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size

    for i in range(batch_no):
        batch = client.scatter(matrix_input[i * batch_size:i * batch_size +
                                            batch_size])
        results.append(
            client.submit(convolution_mean, batch, vector_input, batch_size,
                          vector_input.shape[0]))

    wait(results)
    data = client.gather(results)
    out_tensor = np.empty(
        (batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:i * batch_size +
                   batch_size] = data[i].reshape(batch_size, output_channels,
                                                 input_size, input_size)

    client.shutdown()

    return out_tensor
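Note the scatter/submit pattern above: each batch is first moved onto the cluster with client.scatter, and the resulting Future is handed to client.submit, so the large array travels to a worker once instead of being serialized into every task. A minimal sketch of the same idea, with illustrative names and a throwaway local client:

import numpy as np
from dask.distributed import Client

client = Client()  # illustrative local cluster
big = np.ones((2000, 2000))

big_ref = client.scatter(big)            # push the data to a worker, keep a Future
total = client.submit(np.sum, big_ref)   # the task references the scattered copy
print(total.result())

client.close()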
Example #19
def test_recv_any_rank(n_trials, ucx_cluster):

    client = Client(ucx_cluster)

    try:

        cb = CommsContext(comms_p2p=True)
        cb.init()

        dfs = [
            client.submit(func_test_recv_any_rank,
                          cb.sessionId,
                          n_trials,
                          random.random(),
                          workers=[w]) for wid, w in
            zip(range(len(cb.worker_addresses)), cb.worker_addresses)
        ]

        wait(dfs)

        result = list(map(lambda x: x.result(), dfs))

        assert (result)

    finally:
        cb.destroy()
        client.close()
Example #20
def data_processing(input_file):
	try:
		featured_index_dict = fetch_pickle_FromS3('featured_index_dict.pkl')
		data_dataframe = pd.read_csv(input_file)
		total_rows = data_dataframe.shape[0]
		data_X = data_transformation(data_dataframe, featured_index_dict)
		print('1) data_X.shape: ', data_X.shape)
		for i in range(0, len(PICKLED_MODELS)):
			# Load Model
			model = fetch_pickle_FromS3(PICKLED_MODELS[i])
			model_name = secure_filename(PICKLED_MODELS[i]).rsplit('.', 1)[0]
			# Make prediction
			if total_rows > 5:
				client = Client(processes=False)
				print('2-if) data_X.shape: ', data_X.shape)
				prediction = client.submit(model.predict, data_X).result().tolist()
			else:
				print('2-else) data_X.shape: ', data_X.shape)
				prediction = model.predict(data_X).tolist()
			print('3) data_X.shape: ', data_X.shape)
			prediction_series = pd.Series(prediction)
			data_dataframe[model_name] = prediction_series
		return data_dataframe
	except Exception as e:
		print(str(e))
		raise e
Example #21
def load_data_parallel(data_path,
                       num_processes,
                       image_variable="abi",
                       count_variable="flash_counts",
                       time_variable="time"):
    cluster = LocalCluster(n_workers=num_processes, threads_per_worker=1)
    client = Client(cluster)
    data_files = sorted(glob(join(data_path, "*.nc")))
    data_jobs = []
    for data_file in data_files:
        data_jobs.append(
            client.submit(load_single_data_file,
                          data_file,
                          image_variable=image_variable,
                          count_variable=count_variable,
                          time_variable=time_variable))
    wait(data_jobs)
    data_results = client.gather(data_jobs)
    all_images = np.concatenate([d[0] for d in data_results])
    all_counts = np.concatenate([d[1] for d in data_results])
    all_time = pd.DatetimeIndex(np.concatenate([d[2] for d in data_results]))
    client.close()
    cluster.close()
    del client
    del cluster
    return all_images, all_counts, all_time
Example #22
def run_simulations_dask(daylist, posxs, moduleWiths, kwargs):
    # Create client

    scheduler_file = '/scratch/sayala/dask_testing/scheduler.json'
    client = Client(scheduler_file=scheduler_file)

    # Iterate over inputs
    futures = []

    # Add Iterations HERE

    for daydate in daylist:
        for posx in posxs:
            for moduleWith in moduleWiths:
                futures.append(
                    client.submit(simulate_single,
                                  daydate=daydate,
                                  posx=posx,
                                  moduleWith=moduleWith,
                                  **kwargs))

    # Get results for all simulations
    res = client.gather(futures)

    # Close all dask workers and scheduler
    try:
        client.shutdown()
    except:
        pass

    # Close client
    client.close()

    res = 'FINISHED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    return res
Example #23
def test_dask_connection():
    cluster = LocalCluster(
        scheduler_port=0,
        silence_logs=True,
        processes=False,
        asynchronous=False,
    )
    client = Client(cluster, asynchronous=False)

    def square(x):
        return x**2

    def neg(x):
        return -x

    # Run a computation on Dask
    a = client.map(square, range(10))
    b = client.map(neg, a)
    total = client.submit(sum, b)
    result = total.result()

    if result != -285:
        raise AssertionError("Result is " + str(result))
    else:
        print("The result is correct!!!")

    client.close()
    cluster.close()
    return True
Example #24
def _find_ports_for_workers(client: Client, worker_addresses: Iterable[str], local_listen_port: int) -> Dict[str, int]:
    """Find an open port on each worker.

    LightGBM distributed training uses TCP sockets by default, and this method is used to
    identify open ports on each worker so LightGBM can reliably create those sockets.

    Parameters
    ----------
    client : dask.distributed.Client
        Dask client.
    worker_addresses : Iterable[str]
        An iterable of addresses for workers in the cluster. These are strings of the form ``<protocol>://<host>:<port>``.
    local_listen_port : int
        First port to try when searching for open ports.

    Returns
    -------
    result : Dict[str, int]
        Dictionary where keys are worker addresses and values are an open port for LightGBM to use.
    """
    lightgbm_ports = set()
    worker_ip_to_port = {}
    for worker_address in worker_addresses:
        port = client.submit(
            func=_find_open_port,
            workers=[worker_address],
            worker_ip=urlparse(worker_address).hostname,
            local_listen_port=local_listen_port,
            ports_to_skip=lightgbm_ports
        ).result()
        lightgbm_ports.add(port)
        worker_ip_to_port[worker_address] = port

    return worker_ip_to_port
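The helper _find_open_port referenced above is not part of this snippet. A minimal sketch consistent with the documented contract (start probing at local_listen_port, skip ports already handed to other workers, return the first port that binds) might look like the following; it is an illustration, not LightGBM's actual implementation:

import socket

def _find_open_port(worker_ip: str, local_listen_port: int, ports_to_skip: set) -> int:
    """Return the first port >= local_listen_port that can be bound on this worker."""
    port = local_listen_port
    while True:
        if port not in ports_to_skip:
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind((worker_ip, port))
                    return port
            except OSError:
                pass  # already in use; try the next port
        port += 1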
Example #25
def train_on_jz_dask(job_name, train_function, *args, **kwargs):
    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(1)

    print(cluster.job_script())

    client = Client(cluster)
    futures = client.submit(
        # function to execute
        train_function,
        *args,
        **kwargs,
        # this function has potential side effects
        pure=True,
    )
    client.gather(futures)
    print('Shutting down dask workers')
Example #26
def test_send_recv(n_trials):

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    cb = CommsContext(comms_p2p=True)
    cb.init()

    cb = default_comms()

    start = time.time()
    dfs = [client.submit(func_test_send_recv,
                         cb.sessionId,
                         n_trials,
                         random.random(),
                         workers=[w])
           for wid, w in zip(range(len(cb.worker_addresses)),
                             cb.worker_addresses)]

    wait(dfs)
    print("Time: " + str(time.time() - start))

    result = list(map(lambda x: x.result(), dfs))

    print(str(result))

    assert(result)

    cb.destroy()
    client.close()
    cluster.close()
Example #27
def predict_outcome(data_X, data_dataframe):
	pickled_models = fetch_pickle_FromS3('pickled_models.pkl')
	total_rows = data_dataframe.shape[0]
	small_df = data_dataframe.filter(['customerID'], axis=1)
	for i in range(1, (len(pickled_models)+1)):
		print('2) data_X.shape.in_predict_outcome: ', data_X.shape)
		# Load Model
		model_rank = i
		print('model_rank: ', i)
		model = pickled_models[i][0]
		model_name = 'Rank '+str(model_rank)+': '+pickled_models[i][1]
		print('model_name: ', model_name)
		# Make prediction
		if total_rows > 5:
			client = Client(processes=False)
			print('2-if) data_X.shape: ', data_X.shape)
			prediction = client.submit(model.predict, data_X).result().tolist()
		else:
			print('2-else) data_X.shape: ', data_X.shape)
			prediction = model.predict(data_X).tolist()
		print('3) data_X.shape: ', data_X.shape)
		prediction_series = pd.Series(prediction)
		data_dataframe[model_name] = prediction_series
		small_df[model_name] = prediction_series
	return data_dataframe, small_df
Example #28
def plugin_f_and_f(dump, plugin, params):
    """
    Fire and forget plugin on dask
    """
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(run_plugin, dump, plugin, settings.ELASTICSEARCH_URL, params)
    )
Example #29
def index_f_and_f(dump_pk, user_pk):
    """
    Run all plugin for a new index on dask
    """
    dask_client = Client(settings.DASK_SCHEDULER_URL)
    fire_and_forget(
        dask_client.submit(unzip_then_run, dump_pk, user_pk, settings.ELASTICSEARCH_URL)
    )
Example #30
def run_nta_dask(parameters,
                 input_dfs,
                 tracer_df=None,
                 jobid="00000000",
                 verbose=True):
    dask_client = Client(processes=False)
    return dask_client.submit(run_nta, parameters, input_dfs, tracer_df, jobid,
                              verbose)
Example #31
def main():
    client = Client('localhost:8786')
    A = client.map(set_value, range(100))
    B = client.map(square, A)
    C = client.map(neg, B)
    total = client.submit(sum, C)
    print(progress(total))
    print(total.result())