Example #1
 def training(self, examplesDf, daskClient: Client = None):
     clusters = []
     groupSize = self.CONSTS.k * self.CONSTS.representationThr
     for label, group in examplesDf.groupby('label'):
         for chunk in range(0, len(group), groupSize):
             subgroup = group[chunk:chunk + groupSize]
             subgroupDf = pd.DataFrame(iter(subgroup['item']))
             if daskClient:
                 daskClient.scatter(subgroupDf)
             clusters += self.trainGroup(subgroupDf, label)
     return clusters
Example #2
def run_photoz_dask(runs, modelD, galcat, output_dir, fit_bands, ip_dask):
    """Run the photo-z on a Dask cluster."""

    path_out = Path(output_dir) / 'pzcat.pq'
    if path_out.exists():
        print('Photo-z catalogue already exists.')
        return

    # If not specified, we start up a local cluster.
    client = Client(ip_dask) if ip_dask is not None else Client()
    xnew_modelD = client.scatter(fix_model(modelD, fit_bands))
    #xnew_modelD = fix_model(modelD, fit_bands)

    galcat = dd.read_parquet(str(Path(output_dir) / 'galcat_in.pq'))

    #npartitions = int(302138 / 10) + 1
    npartitions = int(9900 / 10) + 1
    galcat = galcat.reset_index().repartition(npartitions=npartitions).set_index('ref_id')

    ebvD = dict(runs.EBV)
    pzcat = galcat.map_partitions(
        bcnz.fit.photoz_flatten, xnew_modelD, ebvD, fit_bands)


    pzcat = pzcat.repartition(npartitions=100)
    pzcat = dask.optimize(pzcat)[0]

    pzcat.to_parquet(str(path_out))
Example #3
def DASK_batch_mult(matrix_input, vector_input, workers, batch_size,
                    input_size, output_channels):
    client = Client(n_workers=workers)
    results = []
    batch_no = matrix_input.shape[0] // batch_size

    for i in range(batch_no):
        batch = client.scatter(matrix_input[i * batch_size:i * batch_size +
                                            batch_size])
        results.append(
            client.submit(convolution_mean, batch, vector_input, batch_size,
                          vector_input.shape[0]))

    wait(results)
    data = client.gather(results)
    out_tensor = np.empty(
        (batch_size * batch_no, output_channels, input_size, input_size))
    for i in range(batch_no):
        out_tensor[i * batch_size:i * batch_size +
                   batch_size] = data[i].reshape(batch_size, output_channels,
                                                 input_size, input_size)

    client.shutdown()

    return out_tensor
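The example above (DASK_batch_mult) batches a large matrix, scatters each batch, submits one task per batch, and finally gathers and reassembles the results. A minimal, self-contained sketch of that scatter/submit/gather batching pattern, with a hypothetical batch_mean function standing in for convolution_mean and a small local cluster:

import numpy as np
from dask.distributed import Client, wait


def batch_mean(batch):
    # hypothetical per-batch computation; the example above runs a convolution here
    return batch.mean(axis=1)


if __name__ == '__main__':
    client = Client(n_workers=2)                 # small local cluster for illustration
    data = np.random.rand(1000, 16)
    batch_size = 250

    futures = []
    for start in range(0, len(data), batch_size):
        batch = client.scatter(data[start:start + batch_size])   # ship the batch once
        futures.append(client.submit(batch_mean, batch))          # compute on a worker

    wait(futures)                                # block until every batch has finished
    result = np.concatenate(client.gather(futures))
    client.close()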
Example #4
def fit_spiking_likelihood(position,
                           spikes,
                           is_training,
                           place_bin_centers,
                           place_bin_edges,
                           is_track_interior,
                           penalty=1E1,
                           knot_spacing=30):
    """Estimate the place field model.

    Parameters
    ----------
    position : ndarray, shape (n_time,)
    spikes : ndarray, shape (n_time, n_neurons)
    place_bin_centers : ndarray, shape (n_place_bins,)
    penalty : float, optional
    knot_spacing : int, optional

    Returns
    -------
    spiking_likelihood : function

    """
    if np.any(np.ptp(place_bin_edges, axis=0) <= knot_spacing):
        logging.warning("Range of position is smaller than knot spacing.")

    is_training = np.asarray(is_training).astype(float)
    include = ~np.isclose(is_training, 0.0) & ~np.any(np.isnan(position),
                                                      axis=1)
    is_training = is_training[include]
    position = position[include]
    spikes = spikes[include]

    design_matrix = make_spline_design_matrix(position, place_bin_edges,
                                              knot_spacing)
    try:
        client = get_client()
    except ValueError:
        client = Client()
    dm = client.scatter(np.asarray(design_matrix), broadcast=True)

    place_field_coefficients = [
        fit_glm(is_spike, dm, is_training, penalty).params
        for is_spike in spikes.T
    ]
    place_field_coefficients = np.stack(
        dask.compute(*place_field_coefficients), axis=1)

    predict_matrix = make_spline_predict_matrix(design_matrix.design_info,
                                                place_bin_centers)
    place_conditional_intensity = get_firing_rate(predict_matrix,
                                                  place_field_coefficients,
                                                  sampling_frequency=1)

    return partial(spiking_likelihood,
                   design_matrix=design_matrix,
                   place_field_coefficients=place_field_coefficients,
                   place_conditional_intensity=place_conditional_intensity,
                   is_track_interior=is_track_interior)
Example #5
def estimate_place_fields(position,
                          spikes,
                          place_bin_centers,
                          place_bin_edges,
                          penalty=1E-1,
                          knot_spacing=10):
    '''Gives the conditional intensity of the neurons' spiking with respect to
    position.

    Parameters
    ----------
    position : ndarray, shape (n_time, n_position_dims)
    spikes : ndarray, shape (n_time, n_neurons)
    place_bin_centers : ndarray, shape (n_bins, n_position_dims)
    place_bin_edges : ndarray, shape (n_bins + 1, n_position_dims)
    penalty : float, optional
    knot_spacing : int, optional

    Returns
    -------
    conditional_intensity : ndarray, shape (n_bins, n_neurons)

    '''
    if np.any(np.ptp(place_bin_edges, axis=0) <= knot_spacing):
        logging.warning("Range of position is smaller than knot spacing.")
    design_matrix = make_spline_design_matrix(position, place_bin_edges,
                                              knot_spacing)
    design_info = design_matrix.design_info
    try:
        client = get_client()
    except ValueError:
        client = Client()
    design_matrix = client.scatter(np.asarray(design_matrix), broadcast=True)
    results = [
        fit_glm(is_spike, design_matrix, penalty) for is_spike in spikes.T
    ]
    results = dask.compute(*results)

    predict_matrix = make_spline_predict_matrix(design_info, place_bin_centers)
    place_fields = np.stack(
        [get_firing_rate(predict_matrix, result) for result in results],
        axis=1)

    DIMS = ['position', 'neuron']
    if position.shape[1] == 1:
        names = ['position']
        coords = {'position': place_bin_centers.squeeze()}
    elif position.shape[1] == 2:
        names = ['x_position', 'y_position']
        coords = {
            'position':
            pd.MultiIndex.from_arrays(place_bin_centers.T.tolist(),
                                      names=names)
        }

    return xr.DataArray(data=place_fields, coords=coords, dims=DIMS)
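Examples #4 and #5 share two idioms: they reuse an already-running client when one exists (falling back to a fresh local Client() when get_client() raises ValueError), and they broadcast the design matrix to every worker so the many per-neuron fits all read the same copy. A minimal sketch of that idiom using client.submit/client.gather, with a hypothetical fit_one (a plain least-squares fit) in place of fit_glm:

import numpy as np
from dask.distributed import Client, get_client


def fit_one(response, shared_matrix):
    # placeholder for the real GLM fit: an ordinary least-squares solution
    return np.linalg.lstsq(shared_matrix, response, rcond=None)[0]


def fit_all(design_matrix, responses):
    try:
        client = get_client()        # reuse the active client if there is one
    except ValueError:
        client = Client()            # otherwise start a local cluster
    shared = client.scatter(design_matrix, broadcast=True)   # one copy per worker
    futures = [client.submit(fit_one, column, shared) for column in responses.T]
    return np.stack(client.gather(futures), axis=1)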
Example #6
    def build_histogram(self, client: Client = None):
        """
        Use numpy histogram2d to build the counts for each cell.  Important if we later filter out cells that have
        insufficient density (soundings per cell).

        Parameters
        ----------
        client
            optional dask client, if provided will map to cluster
        """

        if self.x_range is None:
            self.construct_base_grid()

        # numpy histogram2d is slow as hell
        # self.cell_count, xedges, yedges = np.histogram2d(self.x, self.y, bins=(self.x_range, self.y_range))

        bins = np.array([len(self.x_range) - 1, len(self.y_range) - 1])
        if client is not None:
            # first index of chunks is the chunks in the 1st dim
            strt = 0
            chnks = []
            for c in self.x.chunks[0]:
                chnks.append([strt, strt + c])
                strt += c
            bin_futs = client.scatter([bins] * len(chnks))
            range_futs = client.scatter([self.ranges] * len(chnks))
            x_futs = client.scatter([self.x[c[0]:c[1]].values for c in chnks])
            y_futs = client.scatter([self.y[c[0]:c[1]].values for c in chnks])

            rslt = client.map(hist2d_numba_seq, x_futs, y_futs, bin_futs,
                              range_futs)
            summed_rslt = client.submit(_hist2d_add, rslt)
            self.cell_count = summed_rslt.result()
        else:
            try:
                self.cell_count = hist2d_numba_seq(self.x.values,
                                                   self.y.values, bins,
                                                   self.ranges)
            except AttributeError:  # numpy workflow
                self.cell_count = hist2d_numba_seq(self.x, self.y, bins,
                                                   self.ranges)
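When a client is supplied, build_histogram above splits the coordinate arrays along the dask chunk boundaries, scatters each piece, maps the histogram kernel over the pieces with client.map, and sums the partial grids in a final client.submit. A generic sketch of that scatter/map/reduce shape, with numpy.histogram2d standing in for the hist2d_numba_seq kernel used above:

import numpy as np
from dask.distributed import Client


def partial_hist(x, y, bins, ranges):
    counts, _, _ = np.histogram2d(x, y, bins=bins, range=ranges)
    return counts


def chunked_hist2d(client, x, y, bins, ranges, n_chunks=4):
    x_futs = client.scatter(np.array_split(x, n_chunks))      # one future per chunk
    y_futs = client.scatter(np.array_split(y, n_chunks))
    partials = client.map(partial_hist, x_futs, y_futs, bins=bins, ranges=ranges)
    total = client.submit(sum, partials)                      # sum the partial grids on a worker
    return total.result()


if __name__ == '__main__':
    client = Client()
    x, y = np.random.rand(100000), np.random.rand(100000)
    grid = chunked_hist2d(client, x, y, bins=[32, 32], ranges=[[0, 1], [0, 1]])
    client.close()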
Example #7
    def train(self, algorithm, imgs, labels):
        params = self.get_params(algorithm)
        model = self.get_model(algorithm, params)
        logging.info("Training %s with the following parameters:" %
                     (algorithm))
        logging.info(params)

        dask_client = Client(DASK_IP_ADRESS)
        img_train, img_test, lbl_train, lbl_test = train_test_split(
            self.imgs, self.labels, test_size=0.2)

        futures_img_train = dask_client.scatter(img_train)
        futures_img_test = dask_client.scatter(img_test)
        futures_lbl_train = dask_client.scatter(lbl_train)
        futures_lbl_test = dask_client.scatter(lbl_test)

        future_model_fit = dask_client.submit(model.fit, futures_img_train,
                                              futures_lbl_train)

        model = future_model_fit.result()

        future_score_train = dask_client.submit(model.score, futures_img_train,
                                                futures_lbl_train)
        future_score_test = dask_client.submit(model.score, futures_img_test,
                                               futures_lbl_test)

        score_test = future_score_test.result()
        score_train = future_score_train.result()

        logging.info("Training complete, saving model %s to file" %
                     (algorithm))

        # saving the model to file
        with self.hdfs_client.write('/' + str(self.model_folder) +
                                    str(algorithm) + ".model") as writer:
            joblib.dump(model, writer)

        logging.info("Score on training set: %.4f, score on test set: %.4f" %
                     (score_train, score_test))

        return score_train, score_test
Example #8
def run_dask_compute(h5_main):
    raw_data = h5_main[()]
    #cpu_cores = int(cpu_cores/8)
    #dask_raw_data = da.from_array(raw_data, chunks='auto')
    #cluster = LocalCluster(n_workers=cpu_cores/8)
    #client = Client(cluster, processes=True)
    #map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    #results = map.compute()
    client = Client(processes=False)
    dask_raw_data = client.scatter(raw_data)
    args = [[20, 60]]
    kwargs = {'num_steps': 30}
    L = client.submit(find_all_peaks, dask_raw_data, args, kwargs)
    dask_results = client.compute(L)
    cores = client.ncores()
    client.close()
    return cores
Example #9
def main():
    x = np.random.normal(size=(1000000, 5))
    y = x.mean(axis=1)
    cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='1G')
    client = Client(cluster)
    print(client)
    print("scattering")
    [x_ref, y_ref] = client.scatter([x, y], broadcast=True)
    jobs = []
    for e in range(1, 30):
        print(e)
        jobs.append(client.submit(train_rf, e, x_ref, y_ref))
    for job in as_completed(jobs):
        print(job.result())
        del job
        client.rebalance()
    client.close()
    cluster.close()
    return
Example #10
def run_dask(instances, cluster):
    from dask.distributed import Client

    client = Client(cluster)

    graphs = {}
    instance_to_graph = {}
    instances = list(instances)
    for (i, instance) in enumerate(instances):
        if instance.graph not in graphs:
            graphs[instance.graph] = client.scatter([instance.graph],
                                                    broadcast=True)[0]
        inst = instance._replace(graph=None)
        instance_to_graph[inst] = graphs[instance.graph]
        instances[i] = inst

    results = client.map(process_dask,
                         ((instance_to_graph[i], i) for i in instances))
    return client.gather(results)
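run_dask above avoids shipping the same large graph repeatedly: each distinct graph is scattered once with broadcast=True and every instance then carries the lightweight future instead of the object itself. A small sketch of that deduplication idea, with a hypothetical work function and a payloads dict of large shared objects:

from dask.distributed import Client


def work(payload, item):
    # hypothetical per-item computation that needs the shared payload
    return item, len(payload)


def run_shared(client, items, payloads):
    # items: iterable of (key, item) pairs; payloads: key -> large shared object
    futures_by_key = {key: client.scatter([payload], broadcast=True)[0]
                      for key, payload in payloads.items()}
    tasks = [client.submit(work, futures_by_key[key], item) for key, item in items]
    return client.gather(tasks)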
Example #11
def main():

    # Data creation
    times = pd.date_range(
        '2000-01-01', periods=300
    )  # to stress more the system just increase the period value
    x = range(1)
    y = range(int(14e3))
    cube = xr.DataArray(np.random.rand(len(times), len(x), len(y)),
                        coords=[times, x, y],
                        dims=['time', 'x', 'y'])
    pixels_pairs = np.argwhere(cube.isel(time=0).values)

    # Client
    client = Client(processes=False, n_workers=1, threads_per_worker=1)
    # client = Client()

    url = 'http://localhost:8787/status'
    webbrowser.open_new(url)

    for row_idx in cube.x.values:
        row = cube.isel(dict([('x', row_idx)]))

        px_list = [ith for ith in pixels_pairs if ith[0] == row_idx]
        output_carrier = pd.DataFrame(index=cube.time.values,
                                      columns=cube.y.values)

        chunks = np.array_split(px_list, multiprocessing.cpu_count() * 4)
        rowi = client.scatter(row, broadcast=True)
        futures = client.map(function, chunks, **{
            'data': rowi,
            'parameter': 10
        })

        for future, result in as_completed(futures, with_results=True):
            output_carrier.update(result)

        cube[:, row_idx] = output_carrier.values

    print(cube)
    client.close()
Example #12
def main():

    scheduler = os.environ.get('DASK_SCHEDULER', 'scheduler:8786')
    client = Client(scheduler)

    path, _ = os.path.split(DEM_TINDEX)
    dems = get_target_tiles(DEM_TINDEX, path)
    tindex = get_tif_fragments(LAS_TINDEX)
    frags = client.scatter(tindex, broadcast=True)

    print('submitting tasks', flush=True)
    futures = [
        client.submit(merge_frags, fname, bounds[1:], frags)
        for fname, bounds in zip(dems['location'], dems.bounds.itertuples())
    ]

    dems['processed'] = client.gather(futures)
    print('tasks gathered', flush=True)
    dems.to_file(DEM_TINDEX.replace('.gpkg', '_out.gpkg'), driver='GPKG')

    client.close()
    sys.exit(0)
Example #13
class ClientFuture():
    def __init__(self,
                 local_client_n_workers,
                 local_client_threads_per_worker,
                 use_dashboard=True):

        self.use_dashboard = use_dashboard

        if use_dashboard:
            self.dashboard_address = ':8787'
        else:
            self.dashboard_address = None

        host_ip = get_host_ip_address()
        self.local_cluster = LocalCluster(
            n_workers=local_client_n_workers,
            threads_per_worker=local_client_threads_per_worker,
            processes=True,
            host=host_ip,
            dashboard_address=self.dashboard_address)
        self.local_client = Client(address=self.local_cluster, timeout='2s')

    def submit(self, func, *args, **kwargs):

        future = self.local_client.submit(func, *args, **kwargs)
        return future

    def scatter(self, *args):

        scattered_args = self.local_client.scatter(args, broadcast=True)
        return scattered_args

    def get_dashboard_link(self):

        if self.use_dashboard:
            print('local cluster: ', self.local_cluster.dashboard_link)
        else:
            print('dashboard disabled')
Example #14
    ##### Loading up the ensemble results if needed #####
    if ensemble:
        try:
            ens_res, bestens = get_best_ensembles(CTDparams,
                                                  params_preproc_ens, dataseed)
        except FileNotFoundError:
            print("Ensemble not found, skipping\n")
            continue

    ##### Preparing data #####
    X, y, delaymask = get_X_y(dataseed, params_preproc_test, monkey, region,
                              taskvar)

    Xalpha = X[:, delaymask][:, ::5]

    Xfut, Xalphafut, yfut = client.scatter((X, Xalpha, y), broadcast=True)

    if not ensemble:
        nneurons = X.shape[2]
        bestens = [np.arange(nneurons) for i in range(5)]

    if permutes:
        np.random.seed(dataseed)
        permseeds = np.random.randint(0, 999999, permutes)
    else:
        permseeds = [None]

    outerxval = KFold(n_splits=nouterfolds)
    subxval = KFold(n_splits=2)

    acc_test_futs = []
Example #15
        count += 1
    if (results != []):
        return results
    return None


for i in range(parallel_execs):
    start = int(i * len(data_sentence) / parallel_execs)
    if i != parallel_execs - 1:
        end = int((i + 1) * len(data_sentence) / parallel_execs)
        scatter_data = data_sentence[start:end]
    else:
        scatter_data = data_sentence[start:]
    futures_array.append(client.scatter(scatter_data))

for i in range(parallel_execs):
    delayed_array.append(delayed(find_key)(futures_array[i], 'amet'))

results_array = compute(*delayed_array)
results_array = list(results_array)

final_result = list()
for index in range(len(results_array)):
    if results_array[index] is not None:
        for item in results_array[index]:
            item[0] = item[0] + int(index * len(data_sentence) / parallel_execs)
        final_result.extend(results_array[index])
print(final_result)
Example #16
    def run(self, client: DaskClient):
        """
        Run the algorithm.

        Parameters
        ----------
        client : DaskClient
            A client to Dask.
        rj : RedisClient
            A Redis client, a rejson.Client.

        Notes
        -----
        This function runs the adaptive algorithm. Because it's asynchronous,
        this function should return if
        ``"reset" in rj.keys() and rj.jsonget("reset")``.

        """
        rj = self.redis_client()

        answers: List = []
        logger.info(f"Staring {self.ident}")

        def submit(fn: str, *args, allow_other_workers=True, **kwargs):
            if "workers" in kwargs:
                kwargs.update({"allow_other_workers": allow_other_workers})
            return client.submit(
                getattr(type(self), fn),
                *args,
                **kwargs,
            )

        update = False
        queries = np.array([])
        scores = np.array([])
        n_model_updates = 0
        rj.jsonset(f"alg-perf-{self.ident}", root, [])
        save_deadline = 0.0  # right away
        data: List[Dict[str, Any]] = []

        error_raised: List[int] = []
        for k in itertools.count():
            try:
                loop_start = time()
                datum = {"iteration": k, "ident": self.ident, "time": time()}

                answers = self.get_answers(rj, clear=True)
                datum["num_answers"] = len(answers)
                self_future = client.scatter(self)

                _start = time()
                if len(queries) and len(scores):
                    queries_f = client.scatter(queries)
                    scores_f = client.scatter(scores)
                else:
                    queries_f = scores_f = []
                if update:
                    datum["cleared_queries"] = True
                    __start = time()
                    self.clear_queries(rj)
                    datum["time_clearing"] = time() - __start
                else:
                    datum["cleared_queries"] = False
                done = distributed.Event(name="pa_finished")
                done.clear()

                workers = list(client.has_what())
                random.shuffle(workers)
                f_post = submit(
                    "post_queries",
                    self_future,
                    queries_f,
                    scores_f,
                    done=done,
                    workers=workers[0],
                )
                f_model = submit(
                    "process_answers",
                    self_future,
                    answers,
                    workers=workers[1],
                )

                f_search = submit(
                    "get_queries",
                    self_future,
                    stop=done,
                    workers=workers[2],
                )

                time_model = 0.0
                time_post = 0.0
                time_search = 0.0

                def _model_done(_):
                    nonlocal time_model
                    nonlocal done
                    done.set()
                    time_model += time() - _start

                def _post_done(_):
                    nonlocal time_post
                    time_post += time() - _start

                def _search_done(_):
                    nonlocal time_search
                    time_search += time() - _start

                f_model.add_done_callback(_model_done)
                f_post.add_done_callback(_post_done)
                f_search.add_done_callback(_search_done)

                # Future.result raises errors automatically
                posted = f_post.result()
                new_self, update = f_model.result()
                queries, scores, search_meta = f_search.result()

                _datum_update = {
                    "n_queries_posted": posted,
                    "n_queries_scored": len(queries),
                    "n_queries_in_db": rj.zcard(f"alg-{self.ident}-queries"),
                    "model_updated": update,
                    "n_model_updates": n_model_updates,
                    "time_posting_queries": time_post,
                    "time_model_update": time_model,
                    "time_search": time_search,
                    "time": time(),
                    **search_meta,
                }
                datum.update(_datum_update)
                if update:
                    _s = time()
                    self.__dict__.update(new_self.__dict__)
                    datum["time_update"] = time() - _s
                    n_model_updates += 1

                if time() > save_deadline + 1e-3:
                    save_deadline = time() + 60
                    _s = time()
                    self.save()
                    datum["time_save"] = time() - _s
                datum["time_loop"] = time() - loop_start

                data.append(datum)
                logger.info(datum)
                posting_deadline = data[0]["time"] + 2 * 60
                if time() >= posting_deadline or k == 10 or k == 20:
                    flush_logger(logger)
                    keys = data[-1].keys()
                    to_post = {}
                    for _k in keys:
                        vals = [d.get(_k, None) for d in data]
                        vals = [v for v in vals if v]
                        if not len(vals):
                            continue
                        if isinstance(vals[0], (int, np.integer)):
                            Type = int
                        elif isinstance(vals[0], (float, np.floating)):
                            Type = float
                        else:
                            continue
                        _update = {
                            f"{_k}_median": np.median(vals),
                            f"{_k}_mean": np.mean(vals),
                            f"{_k}_min": np.min(vals),
                            f"{_k}_max": np.max(vals),
                        }
                        if _k == "time":
                            _update = {"time": _update["time_median"]}
                        to_post.update(
                            {_k: Type(v)
                             for _k, v in _update.items()})

                    try:
                        rj.jsonarrappend(f"alg-perf-{self.ident}", root,
                                         to_post)
                    except ResponseError as e:
                        if ("could not perform this operation on a key that doesn't exist"
                                in str(e)):
                            # I think this happens when the frontend deletes
                            # the database when /reset is triggered
                            pass
                        else:
                            raise e

                    data = []

                if "reset" in rj.keys() and rj.jsonget("reset", root):
                    logger.warning(f"Resetting {self.ident}")
                    self.reset(client, rj, futures=[f_model, f_post, f_search])
                    break

            except Exception as e:
                logger.exception(e)
                flush_logger(logger)
                error_raised.append(k)

                __n = 5
                if np.diff(error_raised[-__n:]).tolist() == [1] * (__n - 1):
                    logger.exception(e)
                    flush_logger(logger)
                    raise e
        return True
Example #17
def main():

    # client = Client(processes = False) # threads ?
    client = Client()
    size = 10000000
    # size       = 20
    # shards     = 20
    # shards     = 6
    # shards     = 1
    shards = 12
    shape = [size]
    lat = np.random.rand(size) * 180.0 - 90.0
    lon = np.random.rand(size) * 360.0 - 180.0
    resolution_ = 8
    resolution = np.full(shape, resolution_, dtype=np.int64)

    # print('lat shape: ',lat.shape)

    print('')
    serial_start = timer()
    s_sids = ps.from_latlon(lat, lon, resolution_)
    s_sidsstr = [hex16(s_sids[i]) for i in range(len(s_sids))]
    serial_end = timer()
    # print('0 s_sids: ',s_sids)
    print('time s_sids: ', serial_end - serial_start)

    def w_from_latlon(llr):
        # print('')
        # print('llr:  ',llr)
        sids = ps.from_latlon(llr[0], llr[1], int(llr[2][0]))
        # print('sids: ',sids)
        # print('')
        return sids

    # def w_from_latlon1(lat,lon,res):
    #     return ps.from_latlon(np.array([lat],dtype=np.double)\
    #                            ,np.array([lon],dtype=np.double)\
    #                            ,int(res))
    # sid        = ps.from_latlon(lat,lon,resolution)
    # sid        = client.map(w_from_latlon1,lat,lon,resolution) # futures

    dask_start = timer()
    shard_size = int(size / shards)
    shard_bins = np.arange(shards + 1) * shard_size
    shard_bins[-1] = size

    # print('---')
    # print('shards:     ',shards)
    # print('shard_size: ',shard_size)
    # print('shard_bins: ',shard_bins)
    # print('---')
    lat_shards = [lat[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    lon_shards = [lon[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)]
    res_shards = [
        resolution[shard_bins[i]:shard_bins[i + 1]] for i in range(shards)
    ]

    llr_shards = []
    for i in range(shards):
        llr_shards.append([lat_shards[i], lon_shards[i], res_shards[i]])

    # print('llr_shards len: ',len(llr_shards))
    # print('llr_shards: ',llr_shards)

    ## future = client.submit(func, big_data)    # bad
    ##
    ## big_future = client.scatter(big_data)     # good
    ## future = client.submit(func, big_future)  # good

    # sid        = client.map(w_from_latlon,llr_shards) # futures

    big_future = client.scatter(llr_shards)
    sid = client.map(w_from_latlon, big_future)  # futures

    # print('0 sid:  ',sid)
    # print('9 len(sid): ',len(sid))
    # for i in range(shards):
    #     print(i, ' 10 sid: ',sid[i])
    #     print(i, ' 11 sid: ',sid[i].result())

    # print('15 sid:    ',[type(i) for i in sid])

    sid_cat = np.concatenate([i.result() for i in sid])
    sidsstr = [hex16(sid_cat[i]) for i in range(len(sid_cat))]
    dask_end = timer()
    # print('2 sids: ',sids)
    sids = sid_cat

    print('')
    # for i in range(size-20,size):
    for i in np.array(np.random.rand(20) * size, dtype=np.int64):
        print("%09i" % i, sidsstr[i], s_sidsstr[i], ' ', sids[i] - s_sids[i])

    print('')
    print('dask total threads:  ', sum(client.nthreads().values()))
    print('size:                ', size)
    print('shards:              ', shards)
    print('')
    print('time sids:           ', dask_end - dask_start)
    print('time s_sids:         ', serial_end - serial_start)
    print('parallel speed up:   ',
          (serial_end - serial_start) / (dask_end - dask_start))

    client.close()
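The commented block in this example ('future = client.submit(func, big_data)  # bad' versus 'big_future = client.scatter(big_data)  # good') is the core idiom that most examples on this page illustrate: ship a large input to the cluster once with scatter, then hand the returned future to submit/map instead of embedding the data in each task. A minimal sketch of just that idiom on a local cluster:

import numpy as np
from dask.distributed import Client


def column_means(block):
    return block.mean(axis=0)


if __name__ == '__main__':
    client = Client()                                  # local cluster for illustration
    big_data = np.random.rand(1_000_000, 8)

    # bad: the array would be serialized into the task itself
    # future = client.submit(column_means, big_data)

    # good: scatter once, then pass the lightweight future around
    big_future = client.scatter(big_data)
    future = client.submit(column_means, big_future)
    print(future.result())
    client.close()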
Example #18
from dask.distributed import Client
import dask.array as da
from time import sleep


def sqrt(x):
    return x**0.5


if __name__ == '__main__':
    client = Client(n_workers=2,
                    nthreads=1,
                    memory_limit='512mb',
                    dashboard_address=8787)
    print(client.scheduler_info())
    sleep(3)
    while True:
        x = client.scatter(da.random.random((1000, 1000), chunks=(50, 50)))
        _ = client.submit(sqrt, x).result().compute()
        sleep(3)
Example #19
def parse_alignment_bam(
    input_bam: Path,
    fragment_df: FragmentDf,
    alignment_table: Path = None,
    read_table: Path = None,
    overlap_table: Path = None,
    alignment_summary: Path = None,
    read_summary: Path = None,
    chunksize: int = 50000,
    n_workers: int = 1,
):
    """Filter alignments to keep only alignments that contribute to contacts

    Parameters
    ----------

    input_bam : str
                Path to a namesorted bam with unfiltered alignments
    chunksize: int
                The alignments are batched for processing, this controls the batch size

    """

    source_aligns = NameSortedBamSource(input_bam, metadata={})
    source_aligns.discover()

    parallel = n_workers > 1
    fragment_df = fragment_df.set_index(
        ["fragment_id"]).sort_index()  # .rename_axis("index", axis=0)
    if parallel:
        from dask.distributed import Client, LocalCluster
        from time import sleep

        cluster = LocalCluster(processes=True,
                               n_workers=n_workers,
                               threads_per_worker=1)
        client = Client(cluster)
        fragment_df = client.scatter(fragment_df)

    writers = dict(
        alignment_table=TableWriter(alignment_table),
        read_table=TableWriter(read_table),
        overlap_table=TableWriter(overlap_table),
    )

    batch_progress_bar = tqdm(total=None,
                              desc="Alignments submitted: ",
                              unit=" alignments",
                              position=0)
    alignment_progress = AlignmentProgress(position=1)
    read_progress = ReadProgress(position=2)
    # perc_alignment_bar = tqdm(total=None, desc="Alignments processed: ", unit=" alignments", position=1)

    # stream that holds the raw alignment dfs
    bam_stream = Stream()

    # stream that holds the filtered/processed alignments
    if parallel:
        filtered_align_stream = (bam_stream.scatter().map(
            filter_read_alignments,
            fragment_df=fragment_df).buffer(n_workers).gather())
    else:
        filtered_align_stream = bam_stream.map(filter_read_alignments,
                                               fragment_df=fragment_df)

    # write the alignments using the table writer, updating progress bar as we go
    align_sink = (  # noqa: F841
        filtered_align_stream.pluck("alignment_table").accumulate(
            alignment_progress, returns_state=True,
            start=alignment_progress).sink(writers["alignment_table"]))

    read_sink = (  # noqa: F841
        filtered_align_stream.pluck("read_table").accumulate(
            read_progress, returns_state=True,
            start=read_progress).sink(writers["read_table"]))

    overlap_sink = filtered_align_stream.pluck("overlap_table").sink(
        writers["overlap_table"])  # noqa: F841

    for batch_idx, align_df in enumerate(
            source_aligns.read_chunked(chunksize=chunksize)):
        bam_stream.emit(align_df)
        batch_progress_bar.update(len(align_df))
        batch_progress_bar.set_postfix({"batches": batch_idx})

    if parallel:
        while True:
            processing = client.processing()
            still_running = [len(v) > 0 for k, v in processing.items()]
            if any(still_running):
                sleep(10)
            else:
                break
        client.close()
        cluster.close()

    batch_progress_bar.close()
    alignment_progress.close()
    alignment_progress.save(alignment_summary)
    read_progress.close()
    read_progress.save(read_summary)
    sys.stderr.write("\n\n\n")
    sys.stdout.write("\n")
    return read_progress.final_stats()
Example #20
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

client = Client("10.110.122.238:8888")

df = pd.read_csv('trainingData.csv')
future = client.scatter(df)  # send dataframe to one worker
ddf = dd.from_delayed([future], meta=df)  # build dask.dataframe on remote data
ddf = ddf.repartition(npartitions=20).persist()  # split
client.rebalance(ddf)  # spread around all of your workers
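This example turns a local pandas DataFrame into a distributed dask.dataframe: scatter the frame to a worker, wrap the returned future with dd.from_delayed (passing meta so the schema is known without computing), then repartition and persist so the pieces spread over the cluster. A self-contained sketch of the same steps on synthetic data, assuming a local cluster instead of the hard-coded scheduler address:

import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

if __name__ == '__main__':
    client = Client()                                  # local cluster for illustration
    df = pd.DataFrame({'x': range(10_000), 'y': range(10_000)})

    future = client.scatter(df)                        # send the frame to one worker
    ddf = dd.from_delayed([future], meta=df)           # single-partition dask.dataframe on remote data
    ddf = ddf.repartition(npartitions=20).persist()    # split and keep the pieces on the cluster
    print(ddf.npartitions, len(ddf))
    client.close()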
Example #21
def run_JK_distributed_massboosted(df, param):
    '''Receive the pandas dataframe with the objects containing the
    temperature decrements and the parameter object, then run the kSZ
    statistic and generate jackknives.
    Everything runs on the cluster, so the current terminal does not
    need to request many CPUs.

    df: dataframe object containing the variables for the calculation
    params: param file for this calculation
    NJK: how many subgroups we will make to run the calculation'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS

    #setup cluster
    cluster = SGECluster(
        walltime='172800',
        processes=1,
        cores=1,
        env_extra=[
            '#$-pe sge_pe %i' % Ncores,
            '-l m_core=%i' % Ncores, 'mkdir -p /tmp/pag227/dask/dask-scratch',
            'export NUMBA_NUM_THREADS=%i' % Ncores,
            'export OMP_NUM_THREADS=%i' % Ncores
            #                                    'export OMP_NUM_THREADS=1',  # noqa
        ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(get_pairwise_ksz_massboosted,
                                    future_fullDataset,
                                    future_params,
                                    multithreading=True)
    #done with the full dataset
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    for j in range(Ngroups):
        df_bs = df.copy()
        choose = np.random.choice(len(df), len(df))
        df_bs['dT'] = df.dT.values[choose]
        futureData.append(client.scatter(df_bs))

    if param.JK_RESAMPLING_METHOD.lower() == "bs_dt_mass_boosted_est":
        get_pw_func = get_pairwise_ksz_massboosted
    elif param.JK_RESAMPLING_METHOD.lower(
    ) == 'bs_dt_mass_boosted_est_debiased':  # noqa
        get_pw_func = get_pairwise_ksz_massboosted_debiased

    for j in range(Ngroups):
        jk_results.append(
            client.submit(get_pw_func,
                          futureData[j],
                          future_params,
                          multithreading=True))


    # extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
    #  cluster.close()

    return fullDataset_results, jk_results
Example #22
        matched, ambiguous, failed = pickle.load(f)

    modifiers_dict = {
        k: bool(int(v))
        for k, v in read_pairs_list('data/modifiers.txt')
    }
    modifiers_dict[None] = True
    candidates = make_candidates(matched, modifiers_dict.keys())
    print('Number of candidates : {}'.format(len(candidates)))

    if True:  # Dask processing
        cluster = LocalCluster(n_workers=48)
        client = Client(cluster)
        b = db.from_sequence(failed, partition_size=200)
        [c] = client.scatter(
            [candidates],
            broadcast=True)  # Broadcast the list of candidates to the workers
        r = b.map(_fn, c)
        f = client.compute(r)
        progress(f)
        matching_results = f.result()
    else:  # Multiprocessing
        matching_results = []
        with Pool(40) as p:
            for simple_result in tqdm(p.imap(_fn, failed, chunksize=300),
                                      total=len(failed)):
                matching_results.append(simple_result)

    matching_results = sorted(matching_results,
                              key=lambda x: x[1][0][0],
                              reverse=True)
Example #23
    epsilon = args['epsilon']  # convergence stopping criterion
    M = args['dictatoms']  # dimensionality of the learned dictionary
    R = int(args['pnonzero'] * P)  # enforces sparsity
    u_new = da.zeros(T)
    v = da.zeros(P)

    max_iterations = P * 10
    file_D = os.path.join(args['dictionary'],
                          "{}_D.txt".format(args["prefix"]))
    file_z = os.path.join(args['output'], "{}_z.txt".format(args["prefix"]))

    # Start the loop!
    for m in range(M):
        #Let us randomly generate an integer, broadcast that int, and create a seed.
        seed = np.random.randint(max_iterations + 1, high=4294967295)
        _SEED_ = client.scatter(seed, broadcast=True)
        np.random.seed(_SEED_.result())

        #Create a dense random vector
        #Then subtract off the mean and normalize it
        u_old = da.random.random(T)
        u_old = dask_normalize(u_old).compute()

        #Setting loop criteria
        num_iterations = 0
        delta = 2 * epsilon

        # Start the inner loop: this learns a single atom.
        while num_iterations < max_iterations and delta > epsilon:

            _U_ = client.scatter(u_old, broadcast=True)
Example #24
        n.symbols = None
        n.scores = None
        n.event = None
        n.char = None
        n.eventype = None
        n.AAevent = 0

    for i, l in enumerate(tree.leaf_nodes()):
        l.event = {}
        l.scores = {}
        l.symbols = {}
        l.char = {}
        l.calc = {}

    print('scattering tree')
    remote_tree = client.scatter(pickle.dumps(tree), broadcast=True)
    row_index = client.scatter(row_index, broadcast=True)
    print('done')
    retmatsize = (len(tree.nodes()), align_array.shape[0])
    for annot_index, annot_row in annotation.iterrows():
        #indexing starts at 1 for blast
        #####switch to sending the coordinates and masking for the matrix
        for j, codon in enumerate(
                range(annot_row.qstart - 1, annot_row.qend - 1, 3)):
            keep_codons += [count, count, count]
            keep_positions += [codon, codon + 1, codon + 2]
            count += 1

    print('selecting positions')
    print('positions to analyze:', len(keep_positions))
    mapping = dict(zip(keep_positions, keep_codons))
Example #25
    args = parser.parse_args()

    # Cluster scheduler
    cluster = args.scheduler
    client = Client(cluster)

    print(client)
    client.upload_file(
        "/nfs/paper-big-data-engines/utils.py")  # Allow workers to use module
    client.upload_file(
        "/nfs/paper-big-data-engines/incrementation/Increment.py")

    # Read images
    paths = crawl_dir(os.path.abspath(args.bb_dir))
    client.scatter(paths)

    results = []
    for path in paths:
        img = client.submit(read_img, path, start=start, args=args)

        # Increment the data n time:
        for _ in range(args.iterations):
            img = client.submit(increment,
                                img,
                                delay=args.delay,
                                start=start,
                                args=args)

        # Save the data
        results.append(client.submit(save_results, img, start=start,
Example #26
import time

import pandas as pd
from dask.distributed import Client
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

client = Client('tcp://172.17.0.2:8786')
df = pd.read_csv("train.csv")


def demo(df):

    X = df.drop(labels='Activity', axis=1)

    y = df['Activity']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

    gbm = GradientBoostingClassifier(learning_rate=0.05,
                                     max_features=106,
                                     n_estimators=300)
    gbm.fit(X_train, y_train)
    result = gbm.predict(X_test)
    score = accuracy_score(y_test, result)
    return score


start = time.time()
big_future = client.scatter(df)
output = client.submit(demo, big_future)
outcome = output.result()
# outcome = client.gather(output)
print(outcome)
print("Time_taken:", (time.time() - start) % 60)
Example #27
from Generators.gen_wordlist import generate_data
from dask import delayed, compute
from dask.distributed import Client
import sys

if (len(sys.argv) < 3):
    print("USAGE ./sort.py <size of file> <scheduler url>")
    exit(1)

num_words = int(sys.argv[1])
sched_IP = sys.argv[2]

data = generate_data(num_words)

client = Client(sched_IP)
result = [x for x in data.split("\n") if x != '']


def sort(data):
    content = data.split("\n")
    return "\n".join(sorted(content))


future = client.scatter(data)
task = delayed(sort)(future)
output = compute(task)
print("Done")
Example #28
    parser.add_argument("--benchmark", action="store_true", help="benchmark results")

    args = parser.parse_args()

    # Cluster scheduler
    cluster = args.scheduler
    client = Client(cluster)

    print(client)
    client.upload_file("/nfs/paper-big-data-engines/utils.py")
    client.upload_file("/nfs/paper-big-data-engines/bidsApp-examples/Example.py")
    from Example import run_group, run_participant, site_crawler, subject_crawler

    # Retrieve all subject path
    subjects = subject_crawler(args.bids_dir)
    client.scatter(subjects)

    results = list()
    for subject in subjects:
        results.append(
            client.submit(
                run_participant,
                subject_id=subject[1],
                start=start,
                args=args,
                site=subject[0],
            )
        )

    client.gather(results)
Example #29
                    else:
                        AA_mutation = sparseND.COO(
                            coords=(AAeventindex,
                                    np.ones(len(AAeventindex)) * column,
                                    AAeventypes),
                            data=np.ones(len(AAeventindex)),
                            shape=(matsize[0], matsize[1],
                                   len(transitiondict_AA)),
                            dtype=np.int32)
                count += 1
        print('FINAL SAVE !')
        save_mats(count, runName, AA_mutation,nucleotide_mutation)
        print('DONE ! ')
        brake.set(False)
        return None

    #######start the sankof algo here #######################
    print('starting sankof')
    #scale cluster
    #scatter the blank tree and row index for each process
    #remote_tree = client.scatter(tree)

    remote_index = client.scatter(IDindex)

    inq = Queue('inq')
    outq = Queue('outq')
    lock = Lock('x')

    stopiter = Variable(False)
    brake = Variable(True)


    saver_started = False
    workers_started = False

    #start workers
    for workers in range(NCORE*ncpu ):
        w = client.submit(  calculate_small_parsimony , inq= None ,outq = None  ,stopiter= stopiter ,  treefile=treefile , bootstrap_replicates = bootstrap_replicates,
Example #30
    class DaskHashBag(HashBag):
        """A HashBag that uses the `Dask <http://dask.org>`_ library."""

        def start_client(self, **kwargs):
            global client
            from dask.distributed import Client

            try:
                # create a single client and keep both the module-global and
                # instance references pointing at it
                client = Client(**kwargs)
                self.client = client
            except Exception as e:
                log.warning(e)

        def __init__(self, it=(), npartitions=None, client=None, **kwargs):
            self.client = client
            self.kwargs = kwargs
            self.try_npartitions = npartitions

            if kwargs:
                self.start_client(**kwargs)

            if isinstance(it, db.Bag):
                self.bag = it
            else:
                it = list(it)
                npartitions = npartitions or len(it) or None
                self.bag = db.from_sequence(it, npartitions=npartitions)

        def new(self, it):
            npartitions = max(self.try_npartitions or 1, self.bag.npartitions or 1)
            return DaskHashBag(it, npartitions=npartitions, client=self.client)

        def __repr__(self):
            kwargs = {"npartitions": self.bag.npartitions, **self.kwargs}
            args = (f"{k}={v.__repr__()}" for k, v in kwargs.items())
            return f"DaskHashBag(%s)" % ", ".join(args)

        def load(self, *f):
            cls = self.__class__
            from io import TextIOBase

            if len(f) == 1 and isinstance(f[0], TextIOBase):
                return cls(robust_json_loads_lines(f[0]), client=self.client)
            else:
                log.info(f"Reading {f} with {self.client}?")
                return cls(
                    db.read_text(f).map_partitions(robust_json_loads_lines),
                    client=self.client,
                )

        @classmethod
        def concat(cls, hashbags):
            return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))

        def take(self, n):
            self.bag = self.bag.take(n, npartitions=-1, compute=False)
            return self

        def persist(self):
            try:
                self.bag = self.bag.persist()
            except Exception as e:
                log.error(e)
            return self

        def __iter__(self):
            return iter(self.bag.compute())

        def __len__(self):
            self.persist()
            return self.bag.count().compute()

        def pipe(self, func, *args, **kwargs):
            newargs = list(args)
            newkwargs = dict(kwargs)
            if self.client:
                try:
                    if newargs:
                        newargs = self.client.scatter(newargs, broadcast=True)
                    if newkwargs:
                        newkwargs = self.client.scatter(newkwargs, broadcast=True)
                except Exception:
                    log.debug(f"Scattering for {func.__name__} failed!")

            @functools.wraps(func)
            def listify(x, *args, **kwargs):
                return list(func(x, *args, **kwargs))

            return self.new(self.bag.map_partitions(listify, *newargs, **newkwargs))

        def fold_tree(self, key, binop):
            return self.new(self.bag.foldby(key, binop=binop).map(lambda x: x[1]))

        def fold(self, key, binop):
            import pandas as pd

            def combine(df):
                return functools.reduce(binop, df.table)

            df = self.bag.map(lambda t: {"table": t}).to_dataframe(meta={"table": "object"})
            keymeta = pd.Series([key(t) for t in df.table.head(1)])
            index = df.table.apply(key, meta=keymeta)
            groups = df.assign(index=index).set_index("index").groupby("index")
            return self.new(groups.apply(combine).to_bag())

        def offset(self, get_attr, set_attr, default=0):

            d = self.bag.map(lambda t: {'table':t, get_attr: t.get(get_attr, default)})
            df = d.to_dataframe(meta={'table': 'object', get_attr: 'int'})
            vs = df[get_attr].cumsum() - df[get_attr]

            def setval(x, v):
                x[set_attr] = v
                return x

            return self.new(self.bag.map(setval, vs.to_bag()))

        def dump(self, f, **kwargs):
            from io import TextIOBase

            if isinstance(f, TextIOBase):
                HashBag.dump(self.bag.compute(), f)
                return self
            else:
                self.bag.map(json_dump).to_textfiles(f, last_endline=True)
                return self.load(f)