Example #1
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all

    outputs = get_dask_outputs(args.path,
                               packages=args.packages,
                               filter_dirty=filter_dirty,
                               git_rev=args.git_rev,
                               stop_rev=args.stop_rev,
                               steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize,
                               test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs,
                  filename=args.visualize)  # create neat looking graph.
    else:
        # many threads, because this is just the dispatch.  Takes very little compute.
        # Only waiting for build complete.
        cluster = LocalCluster(n_workers=1,
                               threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)

        futures = client.persist(outputs)
        progress(futures)
Example #2
def test_visualize_lists(tmpdir):
    pytest.importorskip('graphviz')
    fn = os.path.join(str(tmpdir), 'myfile.dot')
    dask.visualize([{'abc-xyz': (add, 1, 2)}], filename=fn)
    with open(fn) as f:
        text = f.read()
    assert 'abc-xyz' in text
Example #3
def test_visualize_lists(tmpdir):
    pytest.importorskip("graphviz")
    fn = os.path.join(str(tmpdir), "myfile.dot")
    dask.visualize([{"abc-xyz": (add, 1, 2)}], filename=fn)
    with open(fn) as f:
        text = f.read()
    assert "abc-xyz" in text
Example #4
    def exc(self):
        """
        
        """

        computations = [dask.delayed(self.series)(
            code=c['code'], institution=c['institution'], region=c['region'])
            for c in self.institutions.to_dict(orient='records')]

        dask.visualize(computations, filename='highcharts', format='pdf')
        dask.compute(computations, scheduler='processes')
        self.inspect()
Example #5
 def __init__(self, root_dir, png_path=None):
     self.session_dirs = list(find_session_dirs((root_dir,)))
     d = {}
     for session_dir in self.session_dirs:
         session_dir = str(session_dir)
         for task in missing_tasks(session_dir):
             d[('task_name', task.name)] = task.name
             dependencies = [(dt, session_dir) for dt in task.depends_on]
             d[(task.name, session_dir)] = (
                 run_task, ('task_name', task.name), session_dir, dependencies)
         d[('end', session_dir)] = (
             _count, [(task_name, session_dir) for task_name in TASK_CLASSES.keys()])
     if png_path:
         visualize(d, filename=png_path)
     self.graph = d
     self.create_cluster()
Example #6
    def visualize(self, filename='mydask', format=None, **kwargs):
        """Render the task graph for this parameter search using ``graphviz``.

        Requires ``graphviz`` to be installed.

        Parameters
        ----------
        filename : str or None, optional
            The name (without an extension) of the file to write to disk.  If
            `filename` is None, no file will be written, and we communicate
            with dot using only pipes.
        format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
            Format in which to write output file.  Default is 'png'.
        **kwargs
            Additional keyword arguments to forward to ``dask.dot.to_graphviz``.

        Returns
        -------
        result : IPython.display.Image, IPython.display.SVG, or None
            See ``dask.dot.dot_graph`` for more information.
        """
        check_is_fitted(self, 'dask_graph_')
        return dask.visualize(self.dask_graph_,
                              filename=filename,
                              format=format,
                              **kwargs)
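
A minimal usage sketch for Example #6. It assumes the method belongs to dask-searchcv / dask_ml's GridSearchCV (where this docstring appears to originate) and that graphviz is installed; the estimator, data, and filenames are illustrative only.

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from dask_ml.model_selection import GridSearchCV

X, y = make_classification(n_samples=100, random_state=0)
search = GridSearchCV(SVC(), {'C': [0.1, 1.0, 10.0]}, cv=3)
search.fit(X, y)                            # fitting populates dask_graph_
search.visualize(filename='search_graph')   # writes search_graph.png
img = search.visualize(filename=None)       # in-memory image, no file written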
Example #7
    def visualize(self, filename, **kwargs):
        """Generate an image describing the reporting structure.

        This is a shorthand for :meth:`dask.visualize`. Requires
        `graphviz <https://pypi.org/project/graphviz/>`__.
        """
        return dask.visualize(self.graph, filename=filename, **kwargs)
Example #8
    def exc(self):
        """

        :return:
        """

        calculations = []

        for model in self.models:
            measures = self.measures_(model=model)
            densities = self.densities_(model=model)
            properties = self.properties_(model=model)

            calculations.append(self.concatenate(measures, densities, properties))

        dask.visualize(calculations, filename='calculations', format='pdf')
        values = dask.compute(calculations, scheduler='processes')[0]
        return pd.concat(values, ignore_index=True)
Example #9
def main(args):

    start_date = datetime.strptime(str(args.start_date), "%Y%m%d")
    start_date = start_date.replace(hour=args.time_of_day)
    num_days = args.num_days
    dates = [start_date + timedelta(i) for i in range(-2, num_days)]
    source_path = args.source_file

    res = get_urls(dates, source_path)
    if args.visualize:
        dask.visualize(*res, filename='get_article_graph.svg')
    else:
        urlfiles = dask.compute(*res)
        urlfiles = sorted(urlfiles)

        rng = list(range(len(urlfiles)))[2:]
        _ = [pruneLinks(urlfiles[i - 2:i + 1]) for i in rng]

    print("\n\nDone!\n\n")
Example #10
 def visualize(
     self,
     filename: str = "mydask",
     format: str | None = None,
     optimize_graph: bool = False,
     **kwargs: Any,
 ) -> DisplayObject | None:
     return dask.visualize(
         self,
         filename=filename,
         format=format,
         optimize_graph=optimize_graph,
         **kwargs,
     )
Example #11
def visualize(dsk, df_tasks, label="", color="", current_time=0, **kwargs):
    """
    Draw a dask graph enhanced by additional information.

    Parameters
    ----------
    dsk: dict
        Dask task graph. Should be able to be plotted by dask.visualize.
    df_tasks: pd.DataFrame
        DataFrame of the dask task stream data. "key" column is mandatory to
        assign a row of the DataFrame to a node in the graph. "key" column
        must be of type string even when key is a tuple, because otherwise
        the type is not compatible with formats like parquet.
    label: str
        Column name of df_tasks DataFrame which contains the value for the
        node label.
    color: str
        Column name of df_tasks DataFrame which contains color information of
        node fill color.

        If the values are numerical the node is filled with grayscale tones.
        The label font color is adjusted to be always readable.

        If the values are strings each unique value is assigned a different
        color.

        If the value is "progress" each started node is filled with red and
        each finished is filled with blue. To set the current time use the
        argument "current_time". The option needs the columns "start_delta"
        and "stop_delta" in the df_tasks DataFrame containing the seconds
        passed since the start of the graph execution.
    current_time: float
        If color is set to "progress" this sets the current time influencing
        the fill color of the nodes.
    """
    attributes = _get_dsk_attributes(
        dsk, df_tasks, label_col=label, color_col=color, current_time=current_time
    )

    return dask.visualize(
        dsk,
        data_attributes=attributes["data"],
        function_attributes=attributes["func"],
        **kwargs,
    )
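
A hypothetical call of the wrapper above. The graph, the `duration` column, and the timings are made up; only the `key`, `start_delta`, and `stop_delta` columns and the `color="progress"` behaviour follow the docstring.

import operator
import pandas as pd

dsk = {
    'x': 1,
    'add': (operator.add, 'x', 2),
    'double': (operator.mul, 'add', 2),
}
df_tasks = pd.DataFrame({
    'key': ['x', 'add', 'double'],    # keys as strings, as the docstring requires
    'duration': [0.0, 0.4, 0.1],      # numeric column -> grayscale node fill
    'start_delta': [0.0, 0.0, 0.4],   # seconds since the start of graph execution
    'stop_delta': [0.0, 0.4, 0.5],
})

visualize(dsk, df_tasks, label='duration', color='duration', filename='graph.svg')
visualize(dsk, df_tasks, color='progress', current_time=0.45, filename='progress.svg')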
Example #12
def visualize_one_block(dataset, **kwargs):
    """
    Visualize one block of a Dataset or DataArray.
    """
    graph = None
    if isinstance(dataset, xr.DataArray):
        dataset = dataset._to_temp_dataset()

    keys = []
    block = get_one_block(dataset.unify_chunks())
    graph = block.__dask_graph__()

    for name, variable in block.variables.items():
        if isinstance(variable.data, dask.array.Array):
            key = (variable.data.name,) + (0,) * variable.ndim
            keys.append(key)

    if graph is None:
        raise ValueError("No dask variables!")
    return dask.visualize(graph.cull(set(keys)), **kwargs)
Example #13
    def visualize(self, filename='mydask', format=None, **kwargs):
        """Render the task graph for this parameter search using ``graphviz``.

        Requires ``graphviz`` to be installed.

        Parameters
        ----------
        filename : str or None, optional
            The name (without an extension) of the file to write to disk.  If
            `filename` is None, no file will be written, and we communicate
            with dot using only pipes.
        format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
            Format in which to write output file.  Default is 'png'.
        **kwargs
            Additional keyword arguments to forward to ``dask.dot.to_graphviz``.

        Returns
        -------
        result : IPython.display.Image, IPython.display.SVG, or None
            See ``dask.dot.dot_graph`` for more information.
        """
        check_is_fitted(self, 'dask_graph_')
        return dask.visualize(self.dask_graph_, filename=filename,
                              format=format, **kwargs)
Example #14
 def plot_dask(self, filename):
     visualize(self.as_dict(), filename=filename, collapse_outputs=True)
Example #15
def _clean(**kw):
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)

    import numpy as np
    import numexpr as ne
    import dask
    import dask.array as da
    from dask.distributed import performance_report
    from pfb.utils.fits import load_fits, set_wcs, save_fits, data_from_header
    from pfb.opt.hogbom import hogbom
    from astropy.io import fits

    print("Loading dirty", file=log)
    dirty = load_fits(args.dirty, dtype=args.output_type).squeeze()
    nband, nx, ny = dirty.shape
    hdr = fits.getheader(args.dirty)

    print("Loading psf", file=log)
    psf = load_fits(args.psf, dtype=args.output_type).squeeze()
    _, nx_psf, ny_psf = psf.shape
    hdr_psf = fits.getheader(args.psf)

    wsums = np.amax(psf.reshape(-1, nx_psf * ny_psf), axis=1)
    wsum = np.sum(wsums)

    psf /= wsum
    psf_mfs = np.sum(psf, axis=0)

    assert (psf_mfs.max() - 1.0) < 1e-4

    dirty /= wsum
    dirty_mfs = np.sum(dirty, axis=0)

    # get info required to set WCS
    ra = np.deg2rad(hdr['CRVAL1'])
    dec = np.deg2rad(hdr['CRVAL2'])
    radec = [ra, dec]

    cell_deg = np.abs(hdr['CDELT1'])
    if cell_deg != np.abs(hdr['CDELT2']):
        raise NotImplementedError('cell sizes have to be equal')
    cell_rad = np.deg2rad(cell_deg)

    freq_out, ref_freq = data_from_header(hdr, axis=3)

    hdr_mfs = set_wcs(cell_deg, cell_deg, nx, ny, radec, ref_freq)

    save_fits(args.output_filename + '_dirty_mfs.fits',
              dirty_mfs,
              hdr_mfs,
              dtype=args.output_type)

    # set up Hessian approximation
    if args.weight_table is not None:
        normfact = wsum
        from africanus.gridding.wgridder.dask import hessian
        from pfb.utils.misc import plan_row_chunk
        from daskms.experimental.zarr import xds_from_zarr

        xds = xds_from_zarr(args.weight_table)[0]
        nrow = xds.row.size
        freqs = xds.chan.data
        nchan = freqs.size

        # bin edges
        fmin = freqs.min()
        fmax = freqs.max()
        fbins = np.linspace(fmin, fmax, nband + 1)

        # chan <-> band mapping
        band_mapping = {}
        chan_chunks = {}
        freq_bin_idx = {}
        freq_bin_counts = {}
        band_map = np.zeros(freqs.size, dtype=np.int32)
        for band in range(nband):
            indl = freqs >= fbins[band]
            indu = freqs < fbins[band + 1] + 1e-6
            band_map = np.where(indl & indu, band, band_map)

        # to dask arrays
        bands, bin_counts = np.unique(band_map, return_counts=True)
        band_mapping = tuple(bands)
        chan_chunks = {'chan': tuple(bin_counts)}
        freqs = da.from_array(freqs, chunks=tuple(bin_counts))
        bin_idx = np.append(np.array([0]), np.cumsum(bin_counts))[0:-1]
        freq_bin_idx = da.from_array(bin_idx, chunks=1)
        freq_bin_counts = da.from_array(bin_counts, chunks=1)

        max_chan_chunk = bin_counts.max()
        bin_counts = tuple(bin_counts)
        # the first factor of 3 accounts for the intermediate visibilities
        # produced in Hessian (i.e. complex data + real weights)
        memory_per_row = (3 * max_chan_chunk * xds.WEIGHT.data.itemsize +
                          3 * xds.UVW.data.itemsize)

        # get approx image size
        pixel_bytes = np.dtype(args.output_type).itemsize
        band_size = nx * ny * pixel_bytes

        if args.host_address is None:
            # nworker bands on single node
            row_chunk = plan_row_chunk(args.mem_limit / args.nworkers,
                                       band_size, nrow, memory_per_row,
                                       args.nthreads_per_worker)
        else:
            # single band per node
            row_chunk = plan_row_chunk(args.mem_limit, band_size, nrow,
                                       memory_per_row,
                                       args.nthreads_per_worker)

        print(
            "nrows = %i, row chunks set to %i for a total of %i chunks per node"
            % (nrow, row_chunk, int(np.ceil(nrow / row_chunk))),
            file=log)

        def convolver(x):
            model = da.from_array(x, chunks=(1, nx, ny), name=False)

            xds = xds_from_zarr(args.weight_table,
                                chunks={
                                    'row': row_chunk,
                                    'chan': bin_counts
                                })[0]

            convolvedim = hessian(xds.UVW.data,
                                  freqs,
                                  model,
                                  freq_bin_idx,
                                  freq_bin_counts,
                                  cell_rad,
                                  weights=xds.WEIGHT.data.astype(
                                      args.output_type),
                                  nthreads=args.nvthreads,
                                  epsilon=args.epsilon,
                                  do_wstacking=args.wstack,
                                  double_accum=args.double_accum)
            return convolvedim
    else:
        normfact = 1.0
        from pfb.operators.psf import hessian
        from ducc0.fft import r2c
        iFs = np.fft.ifftshift

        npad_xl = (nx_psf - nx) // 2
        npad_xr = nx_psf - nx - npad_xl
        npad_yl = (ny_psf - ny) // 2
        npad_yr = ny_psf - ny - npad_yl
        padding = ((0, 0), (npad_xl, npad_xr), (npad_yl, npad_yr))
        unpad_x = slice(npad_xl, -npad_xr)
        unpad_y = slice(npad_yl, -npad_yr)
        lastsize = ny + np.sum(padding[-1])
        psf_pad = iFs(psf, axes=(1, 2))
        psfhat = r2c(psf_pad,
                     axes=(1, 2),
                     forward=True,
                     nthreads=args.nvthreads,
                     inorm=0)

        psfhat = da.from_array(psfhat, chunks=(1, -1, -1))

        def convolver(x):
            model = da.from_array(x, chunks=(1, nx, ny), name=False)

            convolvedim = hessian(model, psfhat, padding, args.nvthreads, unpad_x,
                                  unpad_y, lastsize)
            return convolvedim

        # psfo = PSF(psf, dirty.shape, nthreads=args.nthreads)
        # def convolver(x): return psfo.convolve(x)

    rms = np.std(dirty_mfs)
    rmax = np.abs(dirty_mfs).max()

    print("Iter %i: peak residual = %f, rms = %f" % (0, rmax, rms), file=log)

    residual = dirty.copy()
    residual_mfs = dirty_mfs.copy()
    model = np.zeros_like(residual)
    for k in range(args.nmiter):
        print("Running Hogbom", file=log)
        x = hogbom(residual,
                   psf,
                   gamma=args.hb_gamma,
                   pf=args.hb_peak_factor,
                   maxit=args.hb_maxit,
                   verbosity=args.hb_verbose,
                   report_freq=args.hb_report_freq)

        model += x
        print("Getting residual", file=log)

        convimage = convolver(model)
        dask.visualize(convimage,
                       filename=args.output_filename + '_hessian' + str(k) +
                       '_graph.pdf',
                       optimize_graph=False)
        with performance_report(filename=args.output_filename + '_hessian' +
                                str(k) + '_per.html'):
            convimage = dask.compute(convimage, optimize_graph=False)[0]
        ne.evaluate('dirty - convimage/normfact',
                    out=residual,
                    casting='same_kind')
        ne.evaluate('sum(residual, axis=0)',
                    out=residual_mfs,
                    casting='same_kind')

        rms = np.std(residual_mfs)
        rmax = np.abs(residual_mfs).max()

        print("Iter %i: peak residual = %f, rms = %f" % (k + 1, rmax, rms),
              file=log)

    print("Saving results", file=log)
    save_fits(args.output_filename + '_model.fits', model, hdr)
    model_mfs = np.mean(model, axis=0)
    save_fits(args.output_filename + '_model_mfs.fits', model_mfs, hdr_mfs)
    save_fits(args.output_filename + '_residual.fits',
              residual * wsums[:, None, None], hdr)
    save_fits(args.output_filename + '_residual_mfs.fits', residual_mfs, hdr_mfs)

    print("All done here.", file=log)
Example #16
import dask
import dask.array as da

kwargs = {
    'bgcolor': '#00000000',
    'rankdir': 'BT',
    'node_attr': {
        'color': 'black',
        'fontcolor': '#000000',
        'penwidth': '3'
    },
    'edge_attr': {
        'color': 'black',
        'penwidth': '3'
    }
}
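
# the kwargs above are graphviz attributes forwarded through .visualize()/dask.visualize:
# transparent background, bottom-to-top layout ('BT'), and thick black nodes and edges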

x = da.ones((15, 15), chunks=(5, 5))

x.mean(split_every=10).visualize('array-mean.svg', **kwargs)
(x + x.T).visualize('array-xxT.svg', **kwargs)
(x.dot(x.T + 1)).visualize('array-xdotxT.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).visualize('array-xdotxT-mean.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).std().visualize('array-xdotxT-mean-std.svg',
                                            **kwargs)

N = 25
x = da.ones((N, N), chunks=(5, 5))
xxT = x + x.T
U, S, V = da.linalg.svd(xxT.rechunk((5, N)) - x.mean())
dask.visualize(U, S, V, filename='array-svd.svg', **kwargs)
Example #17
import dask
import dask.array as da

kwargs = {'bgcolor': '#00000000',
          'rankdir': 'BT',
          'node_attr': {'color': 'black',
                        'fontcolor': '#000000',
                        'penwidth': '3'},
          'edge_attr': {'color': 'black', 'penwidth': '3'}}

x = da.ones((15, 15), chunks=(5, 5))

x.mean(split_every=10).visualize('array-mean.svg', **kwargs)
(x + x.T).visualize('array-xxT.svg', **kwargs)
(x.dot(x.T + 1)).visualize('array-xdotxT.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).visualize('array-xdotxT-mean.svg', **kwargs)
(x.dot(x.T + 1) - x.mean()).std().visualize('array-xdotxT-mean-std.svg', **kwargs)


N = 25
x = da.ones((N, N), chunks=(5, 5))
xxT = x + x.T
U, S, V = da.linalg.svd(xxT.rechunk((5, N)) - x.mean())
dask.visualize(U, S, V, filename='array-svd.svg', **kwargs)
Example #18
    'nwords': (len, (str.split, 'words')),
    'val1': 'orange',
    'val2': 'apple',
    'val3': 'pear',
    'count1': (str.count, 'words', 'val1'),
    'count2': (str.count, 'words', 'val2'),
    'count3': (str.count, 'words', 'val3'),
    'out1': (format_str, 'count1', 'val1', 'nwords'),
    'out2': (format_str, 'count2', 'val2', 'nwords'),
    'out3': (format_str, 'count3', 'val3', 'nwords'),
    'print1': (print_and_return, 'out1'),
    'print2': (print_and_return, 'out2'),
    'print3': (print_and_return, 'out3')
}

dask.visualize(dsk, filename='/Users/longguangbin/Work/temp/dask2.pdf')

from dask.threaded import get
from dask.optimization import cull
from dask.optimization import inline

outputs = ['print1', 'print2']
results = get(dsk, outputs)

# cull() drops every task that the requested outputs do not depend on
dsk1, dependencies = cull(dsk, outputs)

# inline() substitutes constant-like tasks directly into the tasks that use them
dsk2 = inline(dsk1, dependencies=dependencies)
results = get(dsk2, outputs)

# https://docs.dask.org/en/latest/optimize.html
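
An optional follow-up to the snippet above: the culled (dsk1) and inlined (dsk2) graphs can be rendered the same way as the original, which makes the effect of the two optimizations visible (output paths are illustrative).

dask.visualize(dsk1, filename='dask2_culled.pdf')
dask.visualize(dsk2, filename='dask2_inlined.pdf')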
Example #19
 def visualize_job(self, name):
     if self.jobs.get(name):
         dask.visualize(self.jobs[name]['future'])
Example #20
def _predict(ms, stack, **kw):
    args = OmegaConf.create(kw)
    OmegaConf.set_struct(args, True)
    pyscilog.log_to_file(args.output_filename + '.log')
    pyscilog.enable_memory_logging(level=3)

    # number of threads per worker
    if args.nthreads is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify nthreads when using a distributed scheduler"
            )
        import multiprocessing
        nthreads = multiprocessing.cpu_count()
        args.nthreads = nthreads
    else:
        nthreads = args.nthreads

    if args.mem_limit is None:
        if args.host_address is not None:
            raise ValueError(
                "You have to specify mem-limit when using a distributed scheduler"
            )
        import psutil
        mem_limit = int(psutil.virtual_memory()[0] /
                        1e9)  # 100% of memory by default
        args.mem_limit = mem_limit
    else:
        mem_limit = args.mem_limit

    nband = args.nband
    if args.nworkers is None:
        nworkers = nband
        args.nworkers = nworkers
    else:
        nworkers = args.nworkers

    if args.nthreads_per_worker is None:
        nthreads_per_worker = 1
        args.nthreads_per_worker = nthreads_per_worker
    else:
        nthreads_per_worker = args.nthreads_per_worker

    # the number of chunks being read in simultaneously is equal to
    # the number of dask threads
    nthreads_dask = nworkers * nthreads_per_worker

    if args.ngridder_threads is None:
        if args.host_address is not None:
            ngridder_threads = nthreads // nthreads_per_worker
        else:
            ngridder_threads = nthreads // nthreads_dask
        args.ngridder_threads = ngridder_threads
    else:
        ngridder_threads = args.ngridder_threads

    ms = list(ms)
    print('Input Options:', file=log)
    for key in kw.keys():
        print('     %25s = %s' % (key, args[key]), file=log)

    # numpy imports have to happen after this step
    from pfb import set_client
    set_client(nthreads, mem_limit, nworkers, nthreads_per_worker,
               args.host_address, stack, log)

    import numpy as np
    from pfb.utils.misc import chan_to_band_mapping
    import dask
    from dask.distributed import performance_report
    from dask.graph_manipulation import clone
    from daskms import xds_from_storage_ms as xds_from_ms
    from daskms import xds_from_storage_table as xds_from_table
    from daskms.utils import dataset_type
    mstype = dataset_type(ms[0])
    if mstype == 'casa':
        from daskms import xds_to_table
    elif mstype == 'zarr':
        from daskms.experimental.zarr import xds_to_zarr as xds_to_table
    import dask.array as da
    from africanus.constants import c as lightspeed
    from africanus.gridding.wgridder.dask import model as im2vis
    from pfb.utils.fits import load_fits
    from pfb.utils.misc import restore_corrs, plan_row_chunk
    from astropy.io import fits

    # always returns 4D
    # gridder expects freq axis
    model = np.atleast_3d(load_fits(args.model).squeeze())
    nband, nx, ny = model.shape
    hdr = fits.getheader(args.model)
    cell_d = np.abs(hdr['CDELT1'])
    cell_rad = np.deg2rad(cell_d)

    # chan <-> band mapping
    freqs, freq_bin_idx, freq_bin_counts, freq_out, band_mapping, chan_chunks = chan_to_band_mapping(
        ms, nband=nband)

    # degridder memory budget
    max_chan_chunk = 0
    for ims in ms:
        for spw in freqs[ims]:
            counts = freq_bin_counts[ims][spw].compute()
            max_chan_chunk = np.maximum(max_chan_chunk, counts.max())

    # assumes number of correlations are the same across MS/SPW
    xds = xds_from_ms(ms[0])
    ncorr = xds[0].dims['corr']
    nrow = xds[0].dims['row']
    if args.output_type is not None:
        output_type = np.dtype(args.output_type)
    else:
        output_type = np.result_type(np.dtype(args.real_type), np.complex64)
    data_bytes = output_type.itemsize
    bytes_per_row = max_chan_chunk * ncorr * data_bytes
    memory_per_row = bytes_per_row  # model
    memory_per_row += 3 * 8  # uvw

    if mstype == 'zarr':
        if args.model_column in xds[0].keys():
            model_chunks = getattr(xds[0], args.model_column).data.chunks
        else:
            model_chunks = xds[0].DATA.data.chunks
            print('Chunking model same as data')

    # get approx image size
    # this is not a conservative estimate when multiple SPW's map to a single
    # imaging band
    pixel_bytes = np.dtype(args.output_type).itemsize
    band_size = nx * ny * pixel_bytes

    if args.host_address is None:
        # full image on single node
        row_chunk = plan_row_chunk(mem_limit / nworkers, band_size, nrow,
                                   memory_per_row, nthreads_per_worker)

    else:
        # single band per node
        row_chunk = plan_row_chunk(mem_limit, band_size, nrow, memory_per_row,
                                   nthreads_per_worker)

    if args.row_chunks is not None:
        row_chunk = int(args.row_chunks)
        if row_chunk == -1:
            row_chunk = nrow

    print(
        "nrows = %i, row chunks set to %i for a total of %i chunks per node" %
        (nrow, row_chunk, int(np.ceil(nrow / row_chunk))),
        file=log)

    chunks = {}
    for ims in ms:
        chunks[ims] = []  # xds_from_ms expects a list per ds
        for spw in freqs[ims]:
            chunks[ims].append({
                'row': row_chunk,
                'chan': chan_chunks[ims][spw]['chan']
            })

    model = da.from_array(model.astype(args.real_type),
                          chunks=(1, nx, ny),
                          name=False)
    writes = []
    radec = None  # assumes we are only imaging field 0 of first MS
    for ims in ms:
        xds = xds_from_ms(ims, chunks=chunks[ims], columns=('UVW',))

        # subtables
        ddids = xds_from_table(ims + "::DATA_DESCRIPTION")
        fields = xds_from_table(ims + "::FIELD")
        spws = xds_from_table(ims + "::SPECTRAL_WINDOW")
        pols = xds_from_table(ims + "::POLARIZATION")

        # subtable data
        ddids = dask.compute(ddids)[0]
        fields = dask.compute(fields)[0]
        spws = dask.compute(spws)[0]
        pols = dask.compute(pols)[0]

        out_data = []
        for ds in xds:
            field = fields[ds.FIELD_ID]
            radec = field.PHASE_DIR.data.squeeze()

            # check fields match
            if radec is None:
                radec = field.PHASE_DIR.data.squeeze()

            if not np.array_equal(radec, field.PHASE_DIR.data.squeeze()):
                continue

            spw = ds.DATA_DESC_ID  # this is not correct, need to use spw

            uvw = clone(ds.UVW.data)

            bands = band_mapping[ims][spw]
            model = model[list(bands), :, :]
            vis = im2vis(uvw,
                         freqs[ims][spw],
                         model,
                         freq_bin_idx[ims][spw],
                         freq_bin_counts[ims][spw],
                         cell_rad,
                         nthreads=ngridder_threads,
                         epsilon=args.epsilon,
                         do_wstacking=args.wstack)

            model_vis = restore_corrs(vis, ncorr)
            if mstype == 'zarr':
                model_vis = model_vis.rechunk(model_chunks)
                uvw = uvw.rechunk((model_chunks[0], 3))

            out_ds = ds.assign(
                **{
                    args.model_column: (("row", "chan", "corr"), model_vis),
                    'UVW': (("row", "three"), uvw)
                })
            # out_ds = ds.assign(**{args.model_column: (("row", "chan", "corr"), model_vis)})
            out_data.append(out_ds)

        writes.append(xds_to_table(out_data, ims, columns=[args.model_column]))

    dask.visualize(*writes,
                   filename=args.output_filename + '_predict_graph.pdf',
                   optimize_graph=False,
                   collapse_outputs=True)

    if not args.mock:
        with performance_report(filename=args.output_filename +
                                '_predict_per.html'):
            dask.compute(writes, optimize_graph=False)

    print("All done here.", file=log)
Example #21

def do_something_1(x, y):
    return x + y + 2*x*y

def do_something_2(a, b):
    return a**3 - b**3

def do_something_3(p, q):
    return p*p + q*q

def do_something_4(x):
    return x * 3

#  define the graph
dsk = {
    'thrice_1': (do_something_4, 10),
    'thrice_2': (do_something_4, 20),
    'thrice_3': (do_something_4, 30),
    'thrice_4': (do_something_4, 40),
    'square_sum': (do_something_3, 'thrice_1', 'thrice_2'),
    'a_plus_b_wholeSqaure': (do_something_1, 'square_sum', 'thrice_3'),
    'some_complex_stuff': (do_something_2, 'thrice_4', 'a_plus_b_wholeSqaure')
}

print(get(dsk, 'some_complex_stuff'))

visualize(dsk, rankdir="LR", filename="task_graph.png")

# Do More.....
avg_by_postalcode.compute()


# In[ ]:


ops_by_postcalcode = narrow_df.set_index("PostCode", npartitions=10)
len(list(ops_by_postcalcode.partitions))


# In[ ]:


# Le sad, you can see this doesn't actually respect the partition size of one byte.
dask.visualize(narrow_df.set_index("PostCode", npartitions="auto", partition_size=1))


# In[ ]:


indexed = narrow_df.set_index("PostCode")
#tag::repartition[]
reparted = indexed.repartition(partition_size="20kb")
#end::repartition[]
dask.visualize(narrow_df.set_index("PostCode").repartition(partition_size="20kb"))


# In[ ]:

Example #23
# %% {"slideshow": {"slide_type": "fragment"}}
%%time
mean_delay_res, std_delay_res = dask.compute(mean_delay, std_delay)

# %% [markdown] {"slideshow": {"slide_type": "slide"}}
# Using `dask.compute` takes roughly 1/2 the time. This is because the task graphs for both results are merged when calling `dask.compute`, allowing shared operations to only be done once instead of twice. In particular, using `dask.compute` only does the following once:
#
# - the calls to `read_csv`
# - the filter (`df[~df.Cancelled]`)
# - some of the necessary reductions (`sum`, `count`)
#
# To see what the merged task graphs between multiple results look like (and what's shared), you can use the `dask.visualize` function (we might want to use `filename='graph.pdf'` to zoom in on the graph better):

# %% {"slideshow": {"slide_type": "slide"}}
dask.visualize(mean_delay, std_delay)
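# as noted above, writing to a file can make a large graph easier to inspect:
# dask.visualize(mean_delay, std_delay, filename='graph.pdf')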

# %% [markdown] {"slideshow": {"slide_type": "slide"}}
# ## Converting `CRSDepTime` to a timestamp
#
# This dataset stores timestamps as `HHMM`, which are read in as integers in `read_csv`:

# %% {"slideshow": {"slide_type": "fragment"}}
# recreate the read_csv task with parsed dates
df = dd.read_csv(filename,
                 parse_dates={'Date': [0, 1, 2]},
                 dtype={'TailNum': str,
                        'CRSElapsedTime': float,
                        'Cancelled': bool})

# %% {"slideshow": {"slide_type": "fragment"}}
Example #24
        self.features = features


i = NumpyInfo("boo", np.array(0))
numpybits = [i]

# Surprisingly this works, despite the implication that we would need to call register_generic
from distributed.protocol import register_generic
register_generic(NumpyInfo)

dask.compute(ok_fun(1))
#end::serialize_class_with_numpy[]

# In[ ]:

dask.visualize(ok_fun(1))

# In[ ]:

ok_fun(1).visualize()

# In[ ]:

ok_fun(1)

# In[ ]:


# From ch2 for visualize
@dask.delayed
def crawl(url, depth=0, maxdepth=1, maxlinks=4):
Example #25
 def show_workflow(self, output_filepath=None):
     dask.visualize(self._result, filename=output_filepath)
Example #26
                               cv=3,
                               n_jobs=-1)
    with joblib.parallel_backend("dask"):
        grid_search.fit(X, y)
    return np.sum(grid_search.predict(X)[:5])


if __name__ == "__main__":
    import networkx

    os.makedirs("graphs", exist_ok=True)
    usecases = {
        "pandas-groupby-1-1T-1H": bench_pandas_groupby(1, "1T", "1H"),
        "pandas-groupby-1-1T-8H": bench_pandas_groupby(1, "1T", "8H"),
        "pandas-join-1-1T-1H": bench_pandas_join(1, "1T", "1H"),
        "pandas-join-1-1T-8H": bench_pandas_join(1, "1T", "8H"),
        "bag-1000": bench_bag(1000),
        "merge-1000": bench_merge(1000),
        "numpy-2000": bench_numpy(2000),
        "tree-8": bench_tree(8),
        "xarray-20": bench_xarray(20)
    }
    for (name, graph) in usecases.items():
        dot_filename = f"graphs/{name}"
        dask.visualize(graph, format="dot", filename=dot_filename)
        dask.visualize(graph, filename=f"graphs/{name}.svg")
        g = networkx.drawing.nx_agraph.read_dot(f"{dot_filename}.dot")
        print(f"""
{name}: {len(g.nodes)} vertices, {len(g.edges)} edges, longest path: {networkx.dag_longest_path_length(g)}
""".strip())