Example #1
def forward_modeling_multi_shots(c, par_files, d):
    'Parallel modeling function'
    my_dict = c.scatter(d, broadcast=True)
    records = list(d.keys())
    print(records)
    futures = []
    for record in records:
        futures.append(c.submit(forward_modeling_single_shot, record, table=my_dict, par_files=par_files))

    # Check progress
    progress(futures)

    # Wait for all workers to finish and collect shots
    wait(futures)
    # Getting length of list
    length = len(futures)
    final_image = np.array(futures[0].result())
    i = 1

    print('\n::: start user output :::')
    print(c)
    print('::: end user output :::\n')

    # Iterating using while loop
    while i < length:
        final_image[:] += futures[i].result()
        i += 1

    return final_image
def main():
    """."""
    host = os.getenv('DASK_SCHEDULER_HOST', default='localhost')
    port = os.getenv('DASK_SCHEDULER_PORT', default=8786)
    print(host, port)
    client = Client('{}:{}'.format(host, port))
    # client.run(init_logging)
    # client.run_on_scheduler(init_logging)

    # Run some mock functions and gather a result
    data = client.map(print_listdir, range(10))
    future = client.submit(print_values, data)
    progress(future)
    print('')
    result = client.gather(future)
    print(result)

    # Run a second stage which runs some additional processing.
    print('here A')
    data_a = client.map(set_value, range(100))
    print('here B')
    data_b = client.map(square, data_a)
    print('here C')
    data_c = client.map(neg, data_b)
    print('here D')
    # Submit a function application to the scheduler
    total = client.submit(sum, data_c)
    print('here E')
    progress(total)
    print(total.result())
    print('here F')
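
A minimal driver sketch for forward_modeling_multi_shots above; the scheduler address, the parameter files and the shot dictionary are placeholders, and forward_modeling_single_shot is assumed to be defined by the surrounding project.

from dask.distributed import Client

if __name__ == '__main__':
    client = Client('localhost:8786')             # running scheduler (placeholder address)
    par_files = ['model.par']                     # modeling parameter files (placeholder)
    shots = {0: 'shot_0.segy', 1: 'shot_1.segy'}  # record id -> shot data (placeholder)
    image = forward_modeling_multi_shots(client, par_files, shots)
    print(image.shape)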
Example #3
    def launch_actor(self, _class):
        """
        wrapper to launch an actor

        Parameters
        ----------
        _class : class object
            class to put on a worker

        Returns
        -------
        actor : dask.distributed.Actor pointer (future)
        """
        if self.client is not None:
            if self.library[0] == 'dask':
                future = self.client.submit(
                    _class,
                    workers=[self.workers[self.worker_counter]],
                    actor=True)  # Create a _class on a worker
                distributed.progress(future)
                actor = future.result()  # Get back a pointer to that object
                return actor
            else:
                raise Exception(
                    f"{self.library} is supported, but without actor launch functionality!"
                )
        else:
            actor = _class()
            return actor
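
For reference, the raw dask actor pattern that the wrapper above builds on looks roughly like this; the Counter class is a hypothetical example, not part of the original snippet.

from dask.distributed import Client

class Counter:
    """Hypothetical stateful object to host on a worker."""
    def __init__(self):
        self.n = 0

    def increment(self):
        self.n += 1
        return self.n

client = Client(processes=False)
future = client.submit(Counter, actor=True)  # create the object on a worker
counter = future.result()                    # get back an Actor handle
print(counter.increment().result())          # actor method calls return futures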
Example #4
    def enrich(self,
               input_path: str,
               output_path: str,
               dataset_id: str,
               threshold: float = 0.8,
               spans: int = 20):
        """
        Main method: calls the other processing methods and writes context-enriched parquet files.
        :param input_path: a directory of parquet files (ingest output) to process
        :param output_path: a directory for the context-enriched output parquets
        :param dataset_id: ingest process dataset_id
        :param threshold: cut-off on the postprocess table-detection score for treating a block as a table caption
        :param spans: number of words on each side of a table label to pull in as context from the content text;
                if None, a regex is used to pull out the full-stop-to-full-stop span around the table label
        """

        for pq in glob.glob(os.path.join(input_path, '*.parquet')):
            logger.info(f'processing file: {pq}')
            df = pd.read_parquet(pq)
            basename = os.path.basename(pq)

            needed_columns = [
                'content', 'postprocess_cls', 'postprocess_score'
            ]

            if needed_columns_are_in_df(needed_columns, list(df.columns)):

                if dataset_id:
                    logger.info(
                        f'limit enrichment to dataset id: {dataset_id}')
                    df = df[df['dataset_id'] == dataset_id]

                # GET ALL DOCUMENTS, LIST OF DFs
                pdf_names = list(df.pdf_name.unique())
                single_doc_dfs = []

                logger.info('split ingest output into docs')
                for name in tqdm(pdf_names):
                    single_doc_dfs.append(df[df['pdf_name'] == name])

                partial_get_context = functools.partial(
                    Enrich.get_contexts, threshold, spans)
                logger.info('start enrichment processing')
                enriched = [
                    self.client.submit(partial_get_context,
                                       doc_df,
                                       resources={'process': 1})
                    for doc_df in single_doc_dfs
                ]
                progress(enriched)
                logger.info('collecting all enriched docs')
                enriched = [e.result() for e in tqdm(enriched)]
                df = pd.concat(enriched)
                df = df.reset_index(drop=True)

            else:
                pass

            df.to_parquet(os.path.join(output_path, basename))
Example #5
def dask_executor(items, function, accumulator, **kwargs):
    """Execute using dask futures

    Parameters
    ----------
        items : list
            List of input arguments
        function : callable
            A function to be called on each input, which returns an accumulator instance
        accumulator : AccumulatorABC
            An accumulator to collect the output of the function
        client : distributed.client.Client
            A dask distributed client instance
        treereduction : int, optional
            Tree reduction factor for output accumulators (default: 20)
        status : bool, optional
            If true (default), enable progress bar
        compression : int, optional
            Compress accumulator outputs in flight with LZ4, at level specified (default 1)
            Set to ``None`` for no compression.
        priority : int, optional
            Task priority, default 0
        heavy_input : serializable, optional
            Any value placed here will be broadcast to workers and joined to input
            items in a tuple (item, heavy_input) that is passed to function.
        function_name : str, optional
            Name of the function being passed
    """
    if len(items) == 0:
        return accumulator
    client = kwargs.pop('client')
    ntree = kwargs.pop('treereduction', 20)
    status = kwargs.pop('status', True)
    clevel = kwargs.pop('compression', 1)
    priority = kwargs.pop('priority', 0)
    heavy_input = kwargs.pop('heavy_input', None)
    function_name = kwargs.pop('function_name', None)
    reducer = _reduce()
    if clevel is not None:
        function = _compression_wrapper(clevel, function, name=function_name)
        reducer = _compression_wrapper(clevel, reducer)

    if heavy_input is not None:
        heavy_token = client.scatter(heavy_input, broadcast=True, hash=False)
        items = list(zip(items, repeat(heavy_token)))
    futures = client.map(function, items, priority=priority)
    while len(futures) > 1:
        futures = client.map(
            reducer,
            [futures[i:i + ntree] for i in range(0, len(futures), ntree)],
            priority=priority,
        )
    if status:
        from dask.distributed import progress
        # FIXME: fancy widget doesn't appear, have to live with boring pbar
        progress(futures, multi=True, notebook=False)
    accumulator += _maybe_decompress(futures.pop().result())
    return accumulator
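
A sketch of how the executor above might be called. SumAccumulator and count_events are stand-ins for a real AccumulatorABC subclass and processing function, the single work item keeps the call away from the tree-reduction path, and the module's _maybe_decompress helper is assumed to pass uncompressed results through unchanged.

from dask.distributed import Client

class SumAccumulator:
    """Hypothetical stand-in for an AccumulatorABC implementation."""
    def __init__(self, value=0):
        self.value = value

    def __iadd__(self, other):
        self.value += other.value
        return self

def count_events(item):
    # toy "processing" step: count the entries in one chunk of input
    return SumAccumulator(len(item))

if __name__ == '__main__':
    client = Client(processes=False)
    out = dask_executor([[1, 2, 3]], count_events, SumAccumulator(),
                        client=client, compression=None, status=False)
    print(out.value)  # expected: 3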
Example #6
    def enrich(self, file_path: str, dataset_id: str, threshold: float, spans: int):
        """
        Iterate over all ingest output parquets and run the distributed context-enrichment process.
        :param file_path: a directory of parquet files (ingest output) to process
        :param dataset_id: ingest process dataset_id
        :param threshold: cut-off on the postprocess table-detection score for treating a block as a table caption
        :param spans: number of words on each side of a table label to pull in as context from the content text;
                if None, a regex is used to pull out the full-stop-to-full-stop span around the table label
        """

        for pq in glob.glob(os.path.join(file_path, '*.parquet')):
            logger.info(f'processing file: {pq}')
            df = pd.read_parquet(pq)
            basename = os.path.basename(pq)

            needed_columns = [
                'content',
                'postprocess_cls',
                'postprocess_score'
            ]

            if Ingest.needed_columns_are_in_df(needed_columns, list(df.columns)):

                if dataset_id:
                    logger.info(f'limit enrichment to dataset id: {dataset_id}')
                    df = df[df['dataset_id'] == dataset_id]

                # GET ALL DOCUMENTS, LIST OF DFs
                all_pdf_names = list(df.pdf_name.unique())
                single_doc_dfs = []

                logger.info('split ingest output into docs')
                for name in all_pdf_names:
                    single_doc_dfs.append(df[df['pdf_name'] == name])

                partial_get_context = functools.partial(Ingest.get_contexts,
                                                        threshold,
                                                        spans)
                logger.info(f'start enrichment processing with doc count {len(single_doc_dfs)}')
                enriched = [self.client.submit(partial_get_context,
                                               doc_df,
                                               resources={'process': 1})
                            for doc_df in single_doc_dfs]
                progress(enriched)
                logger.info('collecting all enriched docs')
                enriched = [e.result() for e in enriched]
                df = pd.concat(enriched)
                logger.info(f'size of df returned from enrichment: {len(df)}')
                df = df.reset_index(drop=True)

            else:
                pass
            logger.info(f'outputting data: {os.path.join(file_path, basename)}')
            df.to_parquet(os.path.join(file_path, basename))
Example #7
def parallel_write(filename: str, darray: dask.array) -> None:
    """Distribute Zarr writing task to workers using dask.
    Input filename should have extension .zarr"""
    client = Client()
    out = darray.to_zarr(filename, compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE),
        compute=False)
    try:
        fut = client.compute(out)
        progress(fut)  # show a progress bar while the write runs
    except BrokenPipeError:
        print('Process complete (likely)...')
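
A possible call for the helper above; the array shape, chunking and output path are illustrative only.

import dask.array as da

if __name__ == '__main__':
    stack = da.random.random((4096, 4096), chunks=(1024, 1024))
    parallel_write('stack.zarr', stack)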
Example #8
def write_file(ctx, *args, **kwargs):
    """Write the output dataset to file."""
    if ctx.obj["output"] is not None:
        if ctx.obj["verbose"]:
            click.echo(f"Writing to file {ctx.obj['output']}")
        with ProgressBar():
            r = ctx.obj["ds_out"].to_netcdf(ctx.obj["output"], compute=False)
            if ctx.obj["dask_nthreads"] is not None:
                progress(r.data)
            r.compute()
        if ctx.obj["dask_nthreads"] is not None:
            click.echo("")  # Distributed's progress doesn't print a final \n.
def save_multiple_images(
    array: da.Array, output_file: Path, write_mode: str = "x"
) -> None:
    """
    Calculate and store a Dask array in an HDF5 file without exceeding available memory.

    Use the Dask distributed scheduler to compute a Dask array and store the
    resulting values to a data set 'data' in the root group of an HDF5 file.  The
    distributed scheduler is capable of managing worker memory better than the
    default scheduler.  In the latter case, the workers can sometimes demand more
    than the available amount of memory.  Using the distributed scheduler avoids this
    problem.

    The distributed scheduler cannot write directly to HDF5 files because h5py.File
    objects are not serialisable.  To work around this issue, the data are first
    stored to a Zarr DirectoryStore, then copied to the final HDF5 file and the Zarr
    store deleted.

    Multithreading is used, as the calculation is assumed to be I/O bound.

    Args:
        array:  A Dask array to be calculated and stored.
        output_file:  Path to the output HDF5 file.
        write_mode:  HDF5 file opening mode.  See :class:`h5py.File`.
    """
    # Set a more generous connection timeout than the default 30s.
    with dask.config.set(
        {
            "distributed.comm.timeouts.connect": "60s",
            "distributed.comm.timeouts.tcp": "60s",
            "distributed.deploy.lost-worker-timeout": "60s",
            "distributed.scheduler.idle-timeout": "600s",
            "distributed.scheduler.locks.lease-timeout": "60s",
        }
    ):
        intermediate = str(output_file.with_suffix(".zarr"))

        # Overwrite any pre-existing Zarr storage.  Don't compute immediately but
        # return the Array object so we can compute it with a progress bar.
        method = {"overwrite": True, "compute": False, "return_stored": True}
        # Prepare to save the calculated images to the intermediate Zarr store.
        array = array.to_zarr(intermediate, component="data", **method)
        # Compute the Array and store the values, using a progress bar.
        progress(array.persist())

    print("\nTransferring the images to the output file.")
    store = zarr.DirectoryStore(intermediate)
    with h5py.File(output_file, write_mode) as f:
        zarr.copy_all(zarr.open(store), f, **Bitshuffle())

    # Delete the Zarr store.
    store.clear()
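
A sketch of how save_multiple_images might be driven; it assumes a dask.distributed client is active so that persist and progress go through the distributed scheduler, and the array here is random placeholder data.

from pathlib import Path

import dask.array as da
from dask.distributed import Client

if __name__ == '__main__':
    with Client(processes=False):
        frames = da.random.randint(0, 2**16, size=(100, 512, 512),
                                   chunks=(10, 512, 512)).astype('uint16')
        save_multiple_images(frames, Path('images.h5'), write_mode='w')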
Example #10
def get_dask_data(df, target_name):
    df = dd.from_pandas(df, chunksize=100)

    target = df[target_name]
    del df[target_name]  # Remove target  information from dataframe

    df, target = persist(
        df, target)  # Ask Dask to start work on these in the background
    progress(df, target)
    df = dd.get_dummies(df.categorize()).persist()
    X = df.to_dask_array(lengths=True)
    y = target.to_dask_array(lengths=True)
    return train_test_split_dask(X, y, test_size=.2, random_state=42)
def complexity_with_distributed(file_list):
    """function to compute complexity of the list of files in a distributed master-slave server methodology
                    Args:
                        file_list: list of files to be computed
    """

    # Start the timer to track time
    start_time = time.time()
    dask_master_server_node = dask_master_server.map(
        repo_complexity_analyzer.complexity_analyzer, file_list)
    progress(dask_master_server_node)
    print("--- %s seconds ---" % (time.time() - start_time))
    complexity_result_list = dask_master_server.gather(dask_master_server_node)
    return json.dumps(complexity_result_list)
Example #12
def _(w):
    if map_menu.marker is None:
        if at_time is not None:
            t = at_time.value
            da = ds.precipitationCal.sel(time=t,
                                         method='nearest').persist()
        else:
            t0 = from_time.value
            t1 = to_time.value
            da = ds.precipitationCal.sel(time=slice(t0, t1)).sum(
                ['time']).persist()
        pbar = progress(da, notebook=True, multi=False)
        out.clear_output()
        with out:
            display(pbar)
        #hbox.children = list(hbox.children) + [pbar.bar_widget]
        map_menu.da = da.sel(lat=slice(-85, 85)).compute()
        io = overlay(map_menu.m, map_menu.current_io, da, label)
        map_menu.current_io = io
    else:
        lat, lon = map_menu.marker.location
        #label.value = str(ds)
        da = ds.precipitationCal.sel(lat=lat, lon=lon,
                                     method='nearest').compute()
        s = da.to_series()
        line.x = s.index.values
        line.y = s
def save_multiple_image_sequences(
    array: da.Array,
    intermediate_store: Union[Path, str],
    output_files: Iterable[Path],
    write_mode: str = "x",
) -> None:
    intermediate_store = Path(intermediate_store).with_suffix(".zarr")

    # Set a more generous connection timeout than the default 30s.
    with dask.config.set(
        {
            "distributed.comm.timeouts.connect": "60s",
            "distributed.comm.timeouts.tcp": "60s",
            "distributed.deploy.lost-worker-timeout": "60s",
            "distributed.scheduler.idle-timeout": "600s",
            "distributed.scheduler.locks.lease-timeout": "60s",
        }
    ):
        # Overwrite any pre-existing Zarr storage.  Don't compute immediately but
        # return the Array object so we can compute it with a progress bar.
        method = {"overwrite": True, "compute": False, "return_stored": True}
        # Prepare to save the calculated images to the intermediate Zarr store.
        array = [
            sub_array.to_zarr(intermediate_store, component=f"{i:d}/data", **method)
            for i, sub_array in enumerate(array)
        ]
        # Compute the Array and store the values, using a progress bar.
        progress([sub_array.persist() for sub_array in array])
        print()

    print("Transferring the images to the output files.")
    store = zarr.DirectoryStore(str(intermediate_store))
    arrays = zarr.open(store)

    @delayed
    def sequence_to_disk(i, output_file):
        with h5py.File(output_file, write_mode) as f:
            return zarr.copy_all(arrays[i], f, **Bitshuffle())

    transfer = [sequence_to_disk(i, o).persist() for i, o in enumerate(output_files)]
    progress(transfer)
    da.compute(transfer)

    print()

    # Delete the Zarr store.
    store.clear()
def main():
    client = Client('localhost:8786')
    A = client.map(set_value, range(100))
    B = client.map(square, A)
    C = client.map(neg, B)
    total = client.submit(sum, C)
    progress(total)
    print(total.result())
Example #16
def distributed_main2():
    futures = []
    for i in range(NUM_JOBS):
        y_dim = Y_DIM // NUM_JOBS

        # Sends very little data over the network to each worker
        future = client.submit(parallel_func2, y_dim)
        futures.append(future)

    progress(futures)
    total = 0
    for future in as_completed(futures):
        total += future.result()

    print(total)
    client.close()
    return total
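
The snippet relies on several module-level names (client, NUM_JOBS, Y_DIM, parallel_func2); a hedged sketch of what they could look like is below. Note that identical pure submissions hash to the same task key in dask, so a real workload would pass per-job arguments or use pure=False in client.submit.

from dask.distributed import Client, as_completed, progress

NUM_JOBS = 4
Y_DIM = 1000
client = Client('localhost:8786')  # placeholder scheduler address

def parallel_func2(y_dim):
    # placeholder workload: each job handles one horizontal strip of the grid
    return sum(range(y_dim))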
Example #17
    def progress(self, futures):
        """
        wrapper to log the progress of futures

        Arguments
        ---------
        futures : list of <generalized> futures
            futures that are to be gathered
        """
        if self.client is None:
            pass
        else:
            if self.library[0] == 'dask':
                distributed.progress(futures)
            else:
                raise Exception(
                    f"{self.library} is supported, but without actor launch functionality!"
                )
Example #18
def get_data(df, target, is_dask=False, chunksize=200):
    if is_dask:
        df = dd.from_pandas(df, chunksize=chunksize)
        target_s = df[target]
        del df[target]
        df, target_s = persist(df, target_s)
        progress(df, target_s)
        df = dd.get_dummies(df.categorize()).persist()
        y = target_s.to_dask_array(lengths=True)
        X = df.to_dask_array(lengths=True)
    else:
        y = df[target].to_numpy()
        del df[target]

        df = pd.get_dummies(df)
        X = df.to_numpy()

    return train_test_split_normal(X, y, test_size=.1, random_state=18)
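
A hedged usage sketch for the helper above, assuming train_test_split_normal mirrors scikit-learn's return order and that a dask.distributed client is running so persist and progress have somewhere to report.

import pandas as pd
from dask.distributed import Client

if __name__ == '__main__':
    client = Client(processes=False)
    frame = pd.DataFrame({'color': ['red', 'blue'] * 100,
                          'size': range(200),
                          'label': [0, 1] * 100})
    X_train, X_test, y_train, y_test = get_data(frame, 'label',
                                                is_dask=True, chunksize=50)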
Example #19
def cache_sofk(fout, fjson, Sk, nx, verbose=True, para=True):
  if para:
    import dask
    from dask.distributed import Client, progress
    client = Client(processes=False)
    Sk = dask.delayed(Sk)

  import pandas as pd
  mdf = pd.read_json(fjson)
  box = mdf.iloc[0]['box']
  kvecs = legal_kvecs(nx, box)
  nk = len(kvecs)

  if verbose and (not para):
    from progressbar import ProgressBar, Bar, ETA
    widgets = [Bar('>'), ETA()]
    bar = ProgressBar(widgets=widgets, maxval=len(mdf))
    bar.start()

  skl = []
  sk2l = []
  isk = 0
  for label, row in mdf.iterrows():
    com = np.array(row['positions'])
    sk = Sk(kvecs, com)
    skl.append(sk)
    sk2l.append(sk**2)

    isk += 1
    if verbose and (not para):
      bar.update(isk)
  skm = np.mean(skl, axis=0)
  ske = (np.mean(sk2l, axis=0)-skm**2)**0.5/len(skl)**0.5

  if para:
    skm, ske = dask.persist(skm, ske)
    if verbose:
      progress(skm, ske)
    skm, ske = dask.compute(skm, ske)
    client.shutdown()

  # spherical average
  uk, uskm, uske = shavg(kvecs, skm, ske)
  np.savetxt(fout, np.array([uk, uskm, uske]).T)
Example #20
def main_body(cluster, IO, DATA, RESULT, RESTART, futures, nfutures):
    with Client(cluster, processes=False) as client:
        print('--------------------------------------------------------------')
        print('\t Starting clients ... \n')
        print(client)
        print(
            '-------------------------------------------------------------- \n'
        )

        # Do only consider non-nan fields
        DATA = DATA.where(DATA.MASK == 1, drop=True)

        # Go over the whole grid
        for i, j in product(DATA.lat, DATA.lon):
            mask = DATA.MASK.sel(lat=i, lon=j)
            # Provide restart grid if necessary
            if ((mask == 1) & (restart == False)):
                nfutures = nfutures + 1
                futures.append(
                    client.submit(cosipy_core, DATA.sel(lat=i, lon=j)))
            elif ((mask == 1) & (restart == True)):
                nfutures = nfutures + 1
                futures.append(
                    client.submit(cosipy_core, DATA.sel(lat=i, lon=j),
                                  IO.create_grid_restart().sel(lat=i, lon=j)))

        # Finally, do the calculations and print the progress
        progress(futures)

        if (restart == True):
            IO.get_grid_restart().close()

        print('\n')
        print('--------------------------------------------------------------')
        print('Copy local results to global')
        print(
            '-------------------------------------------------------------- \n'
        )
        for future in as_completed(futures):
            results = future.result()
            result_data = results[0]
            restart_data = results[1]
            IO.write_results_future(result_data)
            IO.write_restart_future(restart_data)
Example #21
def multiprocess(jobs, client):
    """Process jobs using a dask cluster.
    Inputs:
    jobs   = list of jobs where each job is a list of workflow scripts and parameters
    client = dask cluster client object

    :param jobs: list
    :param client: distributed.client.Client
    """
    # Keep a list of job futures
    processed = []
    # Submit the jobs to the scheduler
    for job in jobs:
        # Submit individual job
        processed.append(client.submit(_process_images_multiproc, job))
    # Watch job progress and print a progress bar
    progress(processed)
    # Each job outputs results to disk so we do not need to gather results here
    client.shutdown()
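
A sketch of the call pattern for multiprocess; the contents of each job list are placeholders, since they depend entirely on what _process_images_multiproc expects. Note that the function shuts the client down when it finishes, so the client cannot be reused afterwards.

from dask.distributed import Client

if __name__ == '__main__':
    client = Client(n_workers=4)
    jobs = [['workflow.py', f'image_{i}.png', './results'] for i in range(8)]
    multiprocess(jobs, client)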
Example #22
    def gather_actor_result(self, future):
        """
        wrapper to pull the .result() of a method called to an actor

        Arguments
        ---------
        future : <generalized> future
            the future object to be collected from an actor
        """
        if self.client is None:
            return future
        else:
            if self.library[0] == 'dask':
                distributed.progress(future)
                result = future.result()
                return result
            else:
                raise Exception(
                    f"{self.library} is supported, but without actor-gather functionality!"
                )
Example #23
    def write_images_for_annotation(self, pdf_dir, img_dir):
        """
        Helper function that will write images from PDFs for annotation.
        :param pdf_dir: Path to PDFs to write images for
        :param img_dir: Output directory where images will be written
        """
        logger.info(f"Converting PDFs to images and writing to target directory: {img_dir}")
        pdfnames = get_pdf_names(pdf_dir)
        pdf_to_images = functools.partial(Ingest.pdf_to_images, 'na', self.images_tmp)
        images = [self.client.submit(pdf_to_images, pdf, resources={'process': 1}) for pdf in pdfnames]
        progress(images)
        images = [i.result() for i in images]
        images = [i for i in images if i is not None]
        images = [i for il in images for i in il]
        paths = [f'{tmp_dir}/{pdf_name}_{pn}' for tmp_dir, pdf_name, pn in images]
        for path in paths:
            bname = os.path.basename(path)
            new_bname = bname + '.png'
            shutil.copy(path, os.path.join(img_dir, new_bname))
        logger.info('Done.')
        shutil.rmtree(self.tmp_dir)
Example #24
def _log_prob_pt_samples_dask(log_p_pt, samples,
		nthreads=1, cluster=None):
	from dask.distributed import Client, LocalCluster, progress
	import dask.bag as db

	# calculate the point-wise probabilities and stack them together
	if cluster is None:
		# start local dask cluster
		_cl = LocalCluster(n_workers=nthreads, threads_per_worker=1)
	else:
		# use provided dask cluster
		_cl = cluster

	with Client(_cl):
		_log_pred = db.from_sequence(samples).map(log_p_pt)
		progress(_log_pred)
		ret = np.stack(_log_pred.compute())

	if cluster is None:
		_cl.close()

	return ret
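
A toy invocation of the helper above; log_p_pt here is a hypothetical point-wise log-probability over three fixed data points.

import numpy as np

def log_p_pt(theta):
    # per-data-point Gaussian log-densities for a toy data set
    data = np.array([0.0, 1.0, 2.0])
    return -0.5 * (data - theta) ** 2

if __name__ == '__main__':
    samples = [0.1, 0.5, 0.9, 1.3]
    log_p = _log_prob_pt_samples_dask(log_p_pt, samples, nthreads=2)
    print(log_p.shape)  # (4, 3): one row per sample, one column per data point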
Example #25
def search(model,
           X,
           y,
           params,
           method="randomized",
           n_iter=30,
           cv=5,
           **kwargs):
    """Run a cross-validated search for hyperparameters."""
    if method.lower() == "randomized":
        search = RandomizedSearchCV(model,
                                    param_distributions=params,
                                    n_iter=n_iter,
                                    cv=cv)
    elif method.lower() == "grid":
        search = GridSearchCV(model, param_grid=params, cv=cv)
    elif method.lower() == "bayes":
        search = BayesSearchCV(model,
                               search_spaces=params,
                               n_iter=n_iter,
                               cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)
    when_started = time()

    progress(search.fit(X, y))

    total_time = time() - when_started
    n_settings = len(search.cv_results_['params'])
    LOGGER.warning(
        "{} took {:.2f} seconds for {} candidates parameter settings.".format(
            method_name, total_time, n_settings))
    return search
Example #26
def run_parallel(collector, seed_epoch, collect):
    if collect['dask']:
        # Launching Dask on cluster
        assert not collect['render'], 'Cannot render using dask'
        from daskoia import CPUCluster
        cluster = CPUCluster(mem_req=6000)
        cluster.start_workers(collect['workers'])
        client = Client(cluster)
        print('Scheduler Info {}'.format(client.scheduler_info()))
        futures_traj = client.map(collector, seed_epoch)
        progress(futures_traj)
        results = client.gather(futures_traj)
    else:
        if collect['workers'] > 1:
            assert not collect['render'], 'Cannot render using multiple processes'
            results = Parallel(n_jobs=collect['workers'])(
                delayed(collector)(se) for se in tqdm(seed_epoch))
        else:
            results = (collector(se) for se in tqdm(seed_epoch))

    return results
Example #27
def run_link(input_path, output_path, cluster, dataset_id):
    logger.info("Setting up client")
    client = Client(cluster, serializers=['msgpack', 'dask'], deserializers=['msgpack', 'dask', 'pickle'])
    logger.info(client)
    full_ent_set = []
    # Assumption: The input parquet fits into memory. Will need to switch to dask distributed otherwise
    for pq in glob.glob(os.path.join(input_path, '*.parquet')):
        df = pd.read_parquet(pq)
        if len(df) == 0:
            logger.warning(f"{pq} is empty -- skipping.")
            continue
        contents = df['content'].tolist()
        results = [client.submit(link, c, resources={'linking': 1}) for c in contents]
        progress(results)
        results = [r.result() for r in results]
        nonlinked_lists, ent_set = zip(*results)
        nonlinked_lists = list(nonlinked_lists)
        ent_set = list(ent_set)
        ent_set = [list(e) if e is not None and len(e) > 0 else None for e in ent_set]
        full_ent_set.extend(ent_set)
        nonlinked_lists = [list(e) for e in nonlinked_lists]
        df['ents_linked'] = ent_set
        df['ents_unlinked'] = nonlinked_lists
        basename = os.path.basename(pq)
        df.to_parquet(os.path.join(output_path, basename))

    logger.info('Starting entity info extraction')
    dfs = [client.submit(construct_linked_kb, e, resources={'linking': 1}) for e in full_ent_set]
    progress(dfs)
    dfs = [d.result() for d in dfs]
    dfs = [d for d in dfs if d is not None]
    dfs = pd.concat(dfs)
    dfs.drop_duplicates(inplace=True)
    dfs['aliases'] = dfs.apply(lambda row: list(row['aliases']), axis=1)
    dfs['types'] = dfs.apply(lambda row: list(row['types']), axis=1)
    dfs['dataset_id'] = dataset_id
    dfs.to_parquet(os.path.join(output_path, f'{dataset_id}_entities.parquet'))
def run(src_dir, dst_dir):
    client = get_client()

    # load data
    tiff_paths = find_src_files(src_dir, "tif")
    raw_data = tiff_paths.map(read_tiff)

    # create destination
    create_dst_dir(dst_dir)

    # # downsample
    # bin4_data = raw_data.map(partial(downsample_naive, ratio=(1, 4, 4)))
    # bin4_data = bin4_data.map(da.rechunk)
    # bin4_data = client.persist(bin4_data)
    #
    # logger.info("downsampling")
    # progress(bin4_data)

    logger.info("persist data on cluster")
    bin4_data = client.persist(raw_data, priority=-10)
    progress(bin4_data)

    # save intermediate result
    zarr_paths = tiff_paths.map(partial(build_zarr_path, dst_dir))
    name_data = db.zip(zarr_paths, bin4_data)
    futures = name_data.starmap(write_zarr, path="raw")

    logger.info("save as zarr")
    future = client.compute(futures, priority=10)
    progress(future)

    # convert to h5 for ingestion
    h5_paths = zarr_paths.map(partial(build_h5_path, dst_dir))
    src_dst = db.zip(zarr_paths, h5_paths)
    futures = src_dst.starmap(convert_hdf5)

    logger.info("convert zarr to h5")
    future = client.compute(futures, priority=20)
    progress(future)
Example #29
def main():
    def signal_handler(*args):
        # Handle any cleanup here
        print('SIGINT or CTRL-C detected. Exiting gracefully'
              ' and shutting down the dask kubernetes cluster')
        if cluster:
            cluster.close()
        exit(0)

    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    launch_kubernetes = arguments["--k8"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    signal.signal(signal.SIGINT, signal_handler)

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        if launch_kubernetes:
            cluster = DaskCluster(
                namespace="dhlab",
                cluster_id="impresso-pycommons-k8-rebuild",
                scheduler_pod_spec=make_scheduler_configuration(),
                worker_pod_spec=make_worker_configuration(
                    docker_image=
                    "ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
                    memory="5G"))
            try:
                cluster.create()
                cluster.scale(50, blocking=True)
                client = cluster.make_dask_client()
                print(client.get_versions(check=False))
            except Exception as e:
                print(e)
                cluster.close()
                exit(0)

            print(client)
        else:
            cluster = None
            client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        cluster = None
        client = Client(scheduler)
    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:

        try:
            for n, batch in enumerate(config):
                rebuilt_issues = []
                print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
                newspaper = list(batch.keys())[0]
                start_year, end_year = batch[newspaper]

                for year in range(start_year, end_year):
                    print(f'Processing year {year}')
                    print('Retrieving issues...')
                    try:
                        input_issues = read_s3_issues(newspaper, year,
                                                      bucket_name)
                    except FileNotFoundError:
                        print(f'{newspaper}-{year} not found in {bucket_name}')
                        continue

                    issue_key, json_files = rebuild_issues(
                        issues=input_issues,
                        input_bucket=bucket_name,
                        output_dir=outp_dir,
                        dask_client=client,
                        format=output_format,
                        filter_language=languages)
                    rebuilt_issues.append((issue_key, json_files))

                print((f"Uploading {len(rebuilt_issues)} rebuilt bz2files "
                       f"to {output_bucket_name}"))
                b = db.from_sequence(rebuilt_issues) \
                    .starmap(compress, output_dir=outp_dir) \
                    .starmap(upload, bucket_name=output_bucket_name) \
                    .starmap(cleanup)
                future = b.persist()
                progress(future)

        except Exception as e:
            traceback.print_tb(e.__traceback__)
            print(e)
            if cluster:
                cluster.close()
        finally:
            if cluster:
                cluster.close()

    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")
Example #30
def main():
    parser = argparse.ArgumentParser(
        description = 'Simple example for using dask-jobqueue in SLURM')

    parser.add_argument('--proc_per_job', type = int, default = 1,
                        help = 'Number of processes per job.')
    parser.add_argument('--cores_per_proc', type = float, default = 2,
                        help = 'Number of cores per process.')
    parser.add_argument('--n_jobs', type = int, default = 1,
                        help = 'Number of jobs')
    parser.add_argument('--array', type = int, default = 0,
                        help = 'EXPERIMENTAL. If >0, then submit a job-array '+\
                        'of this size. The total number of jobs will'+\
                        ' be `array * n_jobs`.')
    parser.add_argument('--container', type = str,
                        help = 'Path to singularity container. If `None`, '+\
                        'then assumes conda environment.')
    parser.add_argument('--qos', type = str, help = 'QOS to use.')
    parser.add_argument('--dry', action = 'store_true',
                        help = 'Print job script and exit (no submission)')
    parser.add_argument('--load', type = int, default = 1000,
                        help = 'Load for the function.')
    args = parser.parse_args()

    n_procs = args.proc_per_job * args.n_jobs

    params = {
        'cores' : int(args.cores_per_proc * args.proc_per_job),
        'memory' : '{0:d}00MB'.format(args.proc_per_job*5),
        'processes' : args.proc_per_job,
        # The name to assign to each worker
        'name' : 'dask_test'
    }

    job_extra = ['--requeue']
    env_extra = []

    if args.qos is not None:
        job_extra.append('--qos {}'.format(args.qos))

    if args.array > 0:
        n_procs = n_procs * args.array
        job_extra.append('--array 0-{0:d}'.format(args.array - 1))
        """
        This is added to ensure that each worker has a unique ID.
        This may be unnecessary.
        """
        env_extra.append(
            'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}')

    if args.container is not None:
        """
        When using a  container, dask needs to know how to enter the python
        environment.

        Note:
        The binding `-B..` is cluster-specific (OpenMind) but can be generalized.
        The binding is required since `singularity` will not bind by default.
        """
        cont = os.path.normpath(args.container)
        bind = cont.split(os.sep)[1]
        bind = '-B /{0!s}:/{0!s}'.format(bind)
        py = 'singularity exec {0!s} {1!s} python3'.format(bind, cont)
        params.update({'python' : py})
        """
        Dask will generate a job script but some elements will be missing
        due to the way the singularity container interfaces with slurm.
        The `modules` need to be initialized and `singularity` needs to be added.
        """
        env_extra += [ 'source /etc/profile.d/modules.sh',
        'module add openmind/singularity/2.6.0']

    params.update({ 'job_extra' : job_extra,
                    'env_extra' : env_extra})

    cluster = SLURMCluster(**params)
    """
    Display the job script.
    """
    print(cluster.job_script())
    pprint(params)

    t0 = time.time()
    num_crunch(100)
    expected_dur = (time.time() - t0) * args.load
    print('Expected time of linear call: {0:f}'.format(expected_dur))

    if args.dry:
        return

    """
    Scale the cluster to the number of jobs.
    """
    print('Scaling by {}'.format(args.n_jobs))
    cluster.scale_up(args.proc_per_job * args.n_jobs)

    """
    Setup a client that interfaces with the workers
    """
    client = distributed.Client(cluster)
    time.sleep(10)
    print(cluster)
    print(client)
    pprint(client.has_what())
    # pprint(client.scheduler_info())
    """
    Generate a transaction.
    """
    futures = client.map(num_crunch, range(args.load))
    t0 = time.time()

    """
    Compute (and then discard) while keeping track of progress.
    """
    distributed.progress(futures)
    dur = time.time() - t0
    msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format((expected_dur / dur),
                                                    expected_dur, dur)
    print(msg)
    msg = 'Ideal speed up is {0:f}x'.format(n_procs)
    print(msg)
    """
Example #31
    # sigma = [0.1]
    # r_fit = [6]
    # rep = [0, 1, 2, 3]
    param_list = [n, t, r, sigma, r_fit, rep, const]
    params = list(itertools.product(*param_list))
    param_df = pd.DataFrame(
        params, columns=['n', 't', 'r', 'sigma', 'r_fit', 'rep', 'const'])

    # setup dask job
    client = Client()
    client

    lazy_results = []
    for parameters in param_df.values:
        lazy_result = dask.delayed(run_simulation)(*parameters)
        lazy_results.append(lazy_result)

    futures = dask.persist(*lazy_results)
    progress(futures)
    # call computation
    results = dask.compute(*futures)
    data = pd.DataFrame(results,
                        columns=[
                            'loss_true', 'max_qnorm_ub_true', 'loss_fit',
                            'max_qnorm_ub_fit', 'gen_err_fit'
                        ])
    # param_df.to_csv("params_max.csv")
    # data.to_csv("results_max.csv")
    table = param_df.join(data)
    table.to_csv("max_n.csv")
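
The snippet assumes a run_simulation function defined earlier in the script; a placeholder with the right shape (seven scalar inputs, five scalar outputs matching the result columns) might look like this.

def run_simulation(n, t, r, sigma, r_fit, rep, const):
    # placeholder: return the five metrics collected into the results frame
    loss_true = max_qnorm_ub_true = loss_fit = max_qnorm_ub_fit = gen_err_fit = 0.0
    return loss_true, max_qnorm_ub_true, loss_fit, max_qnorm_ub_fit, gen_err_fit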