Example #1
File: core.py  Project: flrgsr/dask
 def _visualize(self, optimize_graph=False):
     from dask.dot import dot_graph
     from .optimize import optimize
     if optimize_graph:
         dot_graph(optimize(self.dask, self._keys()))
     else:
         dot_graph(self.dask)
Example #2
 def visualize(self, optimize_graph=False, **kwargs):
     """Visualize the dask as a graph"""
     from dask.dot import dot_graph
     if optimize_graph:
         return dot_graph(optimize(self.dask, self.key), **kwargs)
     else:
         return dot_graph(self.dask, **kwargs)
Example #3
def printGraph(dsk, outfile):
    '''Render the dask graph to a PDF. `outfile` is the output filename without extension.
    Make sure you have two packages installed:
    python-graphviz (the Python bindings to graphviz) and graphviz (the system library).
    Try `pip install graphviz` for the bindings and `conda install graphviz` for the library.
    If you don't use conda, or the conda install fails, install graphviz from `http://www.graphviz.org/`.
    '''
    dot_graph(dsk,filename=outfile,format='pdf')
    print("output image in {}.pdf".format(outfile))
Example #4
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)
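The test snippets above and below reference a module-level sample graph `dsk` that the listing does not show. A minimal sketch of what such a graph might look like, with hypothetical keys and tasks:

def add(x, y):
    return x + y

# Hypothetical stand-in for the sample graph used by the tests;
# each value is either a literal or a (callable, *args) task tuple.
dsk = {'a': 1,
       'b': 2,
       'c': (add, 'a', 'b'),
       'd': (add, 'c', 10)}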
Example #5
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
    except (ImportError, AttributeError):
        pass
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)
Example #6
def test_dot_graph_no_filename(tmpdir, format, typ):
    before = tmpdir.listdir()
    result = dot_graph(dsk, filename=None, format=format)
    # We shouldn't write any files if filename is None.
    after = tmpdir.listdir()
    assert before == after
    assert isinstance(result, typ)
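The `format` and `typ` arguments indicate a parametrized test; a hedged sketch of the kind of pytest parametrization this assumes (the exact decorator in the dask test suite may differ):

import pytest
from IPython.display import Image, SVG

# Hypothetical parametrization matching the format -> return-type mapping
# used elsewhere in these examples.
@pytest.mark.parametrize('format,typ', [
    ('png', Image),
    ('jpeg', Image),
    ('dot', type(None)),
    ('pdf', type(None)),
    ('svg', SVG),
])
def test_dot_graph_no_filename(tmpdir, format, typ):
    ...  # body as in the example above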
Example #7
def run_all(values, base, get=get_proc, num_workers = 4):
    full_dask = toolz.merge(val.dask for val in values)
    full_keys = [val._key for val in values]

    cache = {}
    if exists("{}.cache".format(base["prefix"])):
        with open("{}.cache".format(base["prefix"]), "r") as f:
            cache = json.load(f)

    full_dask.update(cache)
    dot_graph(full_dask)

    with ProgressBar(), NekCallback(base) as rprof:
        res = get(full_dask, full_keys, cache=cache, num_workers=num_workers)

    return res
Example #8
 def visualize(self, filename=None, optimize_graph=False):
     from dask.dot import dot_graph
     if optimize_graph:
         dsk = self._optimize(self.dask, self._keys())
     else:
         dsk = self.dask
     return dot_graph(dsk, filename=filename)
Example #9
def test_dot_graph_no_filename(tmpdir):
    # Map from format extension to expected return type.
    result_types = {"png": Image, "jpeg": Image, "dot": type(None), "pdf": type(None), "svg": SVG}
    for format in result_types:
        before = tmpdir.listdir()
        result = dot_graph(dsk, filename=None, format=format)
        # We shouldn't write any files if filename is None.
        after = tmpdir.listdir()
        assert before == after
        assert isinstance(result, result_types[format])
Example #10
File: base.py  Project: zcf7822/dask
def visualize(*args, **kwargs):
    """
    Visualize several dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    dsk : dict(s) or collection(s)
        The dask graph(s) to visualize.
    filename : str or None, optional
        The name (without an extension) of the file to write to disk.  If
        `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    http://dask.pydata.org/en/latest/optimize.html
    """

    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if isinstance(arg, Base)]
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks.extend([
            optimization_function(arg)(ensure_dict(arg.dask), arg._keys())
            for arg in args
        ])
    else:
        dsks.extend([arg.dask for arg in args])
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
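A short usage sketch for this function, assuming a dask collection such as a dask.array; the names and sizes are illustrative:

import dask.array as da

x = da.ones((15, 15), chunks=(5, 5))
y = (x + x.T).sum(axis=0)

# Writes the combined (optionally optimized) graph to mydask.png and
# returns an IPython Image when the png format is used.
visualize(y, filename='mydask', optimize_graph=True)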
Example #11
def visualize(*args, **kwargs):
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks = [arg._optimize(arg.dask, arg._keys()) for arg in args]
    else:
        dsks = [arg.dask for arg in args]
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
Example #12
def visualize(*args, **kwargs):
    filename = kwargs.get('filename', 'mydask')
    optimize_graph = kwargs.get('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks = [arg._optimize(arg.dask, arg._keys()) for arg in args]
    else:
        dsks = [arg.dask for arg in args]
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename)
Example #13
def test_dot_graph():
    fn = 'test_dot_graph'
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    try:
        i = dot_graph(dsk, filename=fn)
        assert all(os.path.exists(f) for f in fns)
        assert isinstance(i, Image)
    finally:
        for f in fns:
            if os.path.exists(f):
                os.remove(f)

    fn = 'mydask' # default, remove existing files
    fns = [fn + ext for ext in ['.png', '.pdf', '.dot']]
    for f in fns:
        if os.path.exists(f):
            os.remove(f)
    i = dot_graph(dsk, filename=None)
    assert all(not os.path.exists(f) for f in fns)
    assert isinstance(i, Image)
Example #14
File: base.py  Project: rlugojr/dask
def visualize(*args, **kwargs):
    """
    Visualize several dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    dsk : dict(s) or collection(s)
        The dask graph(s) to visualize.
    filename : str or None, optional
        The name (without an extension) of the file to write to disk.  If
        `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    http://dask.pydata.org/en/latest/optimize.html
    """

    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if isinstance(arg, Base)]
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks.extend([arg._optimize(dict(arg.dask), arg._keys())
                     for arg in args])
    else:
        dsks.extend([arg.dask for arg in args])
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
Example #15
def visualize(*args, **kwargs):
    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if isinstance(arg, Base)]
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks.extend([arg._optimize(arg.dask, arg._keys()) for arg in args])
    else:
        dsks.extend([arg.dask for arg in args])
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
Example #16
def test_dot_graph_defaults():
    # Test with default args.
    default_name = 'mydask'
    default_format = 'png'
    target = '.'.join([default_name, default_format])

    ensure_not_exists(target)
    try:
        result = dot_graph(dsk)
        assert os.path.isfile(target)
        assert isinstance(result, Image)
    finally:
        ensure_not_exists(target)
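Several of these tests call an `ensure_not_exists` helper that is not included in the listing; a minimal sketch of what it plausibly does:

import os

def ensure_not_exists(filename):
    """Delete filename if present; do nothing if it is already absent."""
    try:
        os.remove(filename)
    except OSError:
        pass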
Example #17
def visualize(*args, **kwargs):
    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if isinstance(arg, Base)]
    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)
    from dask.dot import dot_graph
    if optimize_graph:
        dsks.extend([arg._optimize(arg.dask, arg._keys()) for arg in args])
    else:
        dsks.extend([arg.dask for arg in args])
    dsk = merge(dsks)

    return dot_graph(dsk, filename=filename, **kwargs)
Example #18
def test_dot_graph_defaults():
    # Test with default args.
    default_name = 'mydask'
    default_format = 'png'
    target = '.'.join([default_name, default_format])

    ensure_not_exists(target)
    try:
        result = dot_graph(dsk)
        assert os.path.isfile(target)
        assert isinstance(result, Image)
    finally:
        ensure_not_exists(target)
Example #19
def test_tsqr(create_func):
    mat, data = create_func()
    n = mat.shape[1]

    q, r = csnmf.tsqr.qr(data)

    dot_graph(q.dask, filename='q')
    dot_graph(r.dask, filename='r')

    print q.shape
    q = np.array(q)

    r = np.array(r)
    print r.shape

    print np.linalg.norm(mat - np.dot(q, r))

    assert np.allclose(mat, np.dot(q, r))
    assert np.allclose(np.eye(n, n), np.dot(q.T, q))
    assert np.all(r == np.triu(r))

    plt.figure()
    plt.subplot(2, 4, 1)
    plt.imshow(mat, interpolation='nearest')
    plt.title('Original matrix')
    plt.subplot(2, 4, 2)
    plt.imshow(q, interpolation='nearest')
    plt.title('$\mathbf{Q}$')
    plt.subplot(2, 4, 3)
    plt.imshow(np.dot(q.T, q), interpolation='nearest')
    plt.title('$\mathbf{Q}^T \mathbf{Q}$')
    plt.subplot(2, 4, 4)
    plt.imshow(r, interpolation='nearest')
    plt.title('$\mathbf{R}$')

    plt.subplot(2, 4, 8)
    plt.spy(r)
    plt.title('Nonzeros in $\mathbf{R}$')
Example #20
def test_tsqr(create_func):
    mat, data = create_func()
    n = mat.shape[1]

    q, r = csnmf.tsqr.qr(data)

    dot_graph(q.dask, filename='q')
    dot_graph(r.dask, filename='r')

    print q.shape
    q = np.array(q)

    r = np.array(r)
    print r.shape

    print np.linalg.norm(mat - np.dot(q, r))

    assert np.allclose(mat, np.dot(q, r))
    assert np.allclose(np.eye(n, n), np.dot(q.T, q))
    assert np.all(r == np.triu(r))

    plt.figure()
    plt.subplot(2, 4, 1)
    plt.imshow(mat, interpolation='nearest')
    plt.title('Original matrix')
    plt.subplot(2, 4, 2)
    plt.imshow(q, interpolation='nearest')
    plt.title('$\mathbf{Q}$')
    plt.subplot(2, 4, 3)
    plt.imshow(np.dot(q.T, q), interpolation='nearest')
    plt.title('$\mathbf{Q}^T \mathbf{Q}$')
    plt.subplot(2, 4, 4)
    plt.imshow(r, interpolation='nearest')
    plt.title('$\mathbf{R}$')

    plt.subplot(2, 4, 8)
    plt.spy(r)
    plt.title('Nonzeros in $\mathbf{R}$')
Example #21
def test_filenames_and_formats():
    # Test with a variety of user provided args
    filenames = ["mydaskpdf", "mydask.pdf", "mydask.pdf", "mydaskpdf"]
    formats = ["svg", None, "svg", None]
    targets = ["mydaskpdf.svg", "mydask.pdf", "mydask.pdf.svg", "mydaskpdf.png"]

    result_types = {"png": Image, "jpeg": Image, "dot": type(None), "pdf": type(None), "svg": SVG}

    for filename, format, target in zip(filenames, formats, targets):
        expected_result_type = result_types[target.split(".")[-1]]
        result = dot_graph(dsk, filename=filename, format=format)
        assert os.path.isfile(target)
        assert isinstance(result, expected_result_type)
        ensure_not_exists(target)
Example #22
def test_dot_graph(tmpdir, format, typ):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    target = '.'.join([filename, format])
    ensure_not_exists(target)
    try:
        result = dot_graph(dsk, filename=filename, format=format)

        assert not os.path.exists('should_not_get_created.txt')
        assert os.path.isfile(target)
        assert isinstance(result, typ)
    finally:
        ensure_not_exists(target)
Example #23
def test_dot_graph(tmpdir, format, typ):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    target = '.'.join([filename, format])
    ensure_not_exists(target)
    try:
        result = dot_graph(dsk, filename=filename, format=format)

        assert not os.path.exists('should_not_get_created.txt')
        assert os.path.isfile(target)
        assert isinstance(result, typ)
    finally:
        ensure_not_exists(target)
Example #24
def test_dot_graph_no_filename(tmpdir):
    # Map from format extension to expected return type.
    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }
    for format in result_types:
        before = tmpdir.listdir()
        result = dot_graph(dsk, filename=None, format=format)
        # We shouldn't write any files if filename is None.
        after = tmpdir.listdir()
        assert before == after
        assert isinstance(result, result_types[format])
Example #25
def test_dot_graph(tmpdir):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join("$(touch should_not_get_created.txt)"))

    # Map from format extension to expected return type.
    result_types = {"png": Image, "jpeg": Image, "dot": type(None), "pdf": type(None), "svg": SVG}
    for format in result_types:
        target = ".".join([filename, format])
        ensure_not_exists(target)
        try:
            result = dot_graph(dsk, filename=filename, format=format)

            assert not os.path.exists("should_not_get_created.txt")
            assert os.path.isfile(target)
            assert isinstance(result, result_types[format])
        finally:
            ensure_not_exists(target)
Example #26
def test_filenames_and_formats():
    # Test with a variety of user provided args
    filenames = ['mydaskpdf', 'mydask.pdf', 'mydask.pdf', 'mydaskpdf', 'mydask.pdf.svg']
    formats = ['svg', None, 'svg', None, None]
    targets = ['mydaskpdf.svg', 'mydask.pdf', 'mydask.pdf.svg', 'mydaskpdf.png', 'mydask.pdf.svg']

    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }

    for filename, format, target in zip(filenames, formats, targets):
        expected_result_type = result_types[target.split('.')[-1]]
        result = dot_graph(dsk, filename=filename, format=format)
        assert os.path.isfile(target)
        assert isinstance(result, expected_result_type)
        ensure_not_exists(target)
Example #27
def test_filenames_and_formats():
    # Test with a variety of user provided args
    filenames = ['mydaskpdf', 'mydask.pdf', 'mydask.pdf', 'mydaskpdf', 'mydask.pdf.svg']
    formats = ['svg', None, 'svg', None, None]
    targets = ['mydaskpdf.svg', 'mydask.pdf', 'mydask.pdf.svg', 'mydaskpdf.png', 'mydask.pdf.svg']

    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }

    for filename, format, target in zip(filenames, formats, targets):
        expected_result_type = result_types[target.split('.')[-1]]
        result = dot_graph(dsk, filename=filename, format=format)
        assert os.path.isfile(target)
        assert isinstance(result, expected_result_type)
        ensure_not_exists(target)
Example #28
def test_dot_graph(tmpdir):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    # Map from format extension to expected return type.
    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }
    for format in result_types:
        target = '.'.join([filename, format])
        ensure_not_exists(target)
        try:
            result = dot_graph(dsk, filename=filename, format=format)

            assert not os.path.exists('should_not_get_created.txt')
            assert os.path.isfile(target)
            assert isinstance(result, result_types[format])
        finally:
            ensure_not_exists(target)
Example #29
def test_dot_graph(tmpdir):
    # Use a name that the shell would interpret specially to ensure that we're
    # not vulnerable to shell injection when interacting with `dot`.
    filename = str(tmpdir.join('$(touch should_not_get_created.txt)'))

    # Map from format extension to expected return type.
    result_types = {
        'png': Image,
        'jpeg': Image,
        'dot': type(None),
        'pdf': type(None),
        'svg': SVG,
    }
    for format in result_types:
        target = '.'.join([filename, format])
        ensure_not_exists(target)
        try:
            result = dot_graph(dsk, filename=filename, format=format)

            assert not os.path.exists('should_not_get_created.txt')
            assert os.path.isfile(target)
            assert isinstance(result, result_types[format])
        finally:
            ensure_not_exists(target)
Example #30
def visualize(dsk, state, filename='dask'):
    """ Visualize state of compputation as dot graph """
    from dask.dot import dot_graph
    data, func = color_nodes(dsk, state)
    dot_graph(dsk, filename=filename, data_attributes=data,
              func_attributes=func)
Example #31
# %% [markdown]
# ### The following calculation uses numpy, so it releases the GIL

# %%
result = (da_input**2. + da_input**3.).mean(axis=0)
result

# %% [markdown]
# ### Note that result hasn't been computed yet
#
# Here is a graph of how the calculation will be split among 4 threads

# %%
from dask.dot import dot_graph
dot_graph(result.dask)

# %% [markdown]
# ### Now do the calculation

# %%
with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof,\
              CacheProfiler() as cprof:
    answer = result.compute()

# %% [markdown]
# Visualize the cpu, memory and cache for the 4 threads

# %%
visualize([prof, rprof, cprof], min_border_top=15, min_border_bottom=15)
Example #32
def test_filenames_and_formats(tmpdir, filename, format, target,
                               expected_result_type):
    result = dot_graph(dsk, filename=str(tmpdir.join(filename)), format=format)
    assert tmpdir.join(target).exists()
    assert isinstance(result, expected_result_type)
Example #33
def vis(blocker_list):
    _b = convert_ldicts_to_sdict(blocker_list)
    dot_graph(_b)
Example #34
def feature_extraction(dirpath,
                       suffix_seg,
                       suffix_int,
                       num_LMs,
                       downsample,
                       clustering,
                       features,
                       recurse=False,
                       select_IDs='all',
                       assign_landmarks_kwargs='default',
                       compute_TFOR=True,
                       transform_to_TFOR_kwargs='default',
                       perform_CBE_TFOR_kwargs='default',
                       compute_CFOR=True,
                       perform_CBE_CFOR_kwargs='default',
                       processes=None,
                       dask_graph_path=None,
                       profiling=False,
                       verbose=False):
    """Extract latent features from fluorescence distributions of single-cell
    segmentations by point cloud sampling and cluster-based embedding.

    This is a dask pipeline that applies point-cloud sampling from
    `katachi.tools.assign_landmarks`, transformation to the TFOR (optional)
    from `katachi.tools.find_TFOR` and cluster-based embedding (either on TFOR
    data or by constructing a CFOR, or both) from `katachi.tools.perform_CBE`
    to a dataset of single-cell segmentations that has been generated by
    `katachi.pipelines.segmentation` or an equivalent approach.

    WARNING: Not all options provided by this pipeline have been extensively
    tested. Use with prudence!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    suffix_seg : string
        File suffix that identifies target segmentation files as produced by
        `katachi.pipelines.segmentation`. This will usually be "seg.tif" but
        could contain more information to distinguish different segmentations.
    suffix_int : string
        File suffix that identifies target intensity files matching the shape
        of the target segmentation files. Each retrieved segmentation file must
        have a matching intensity file.
    num_LMs : int
        The number of landmarks to extract for each cell.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction.
        See `katachi.tools.perform_CBE` for more information.
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction.
        See `katachi.tools.perform_CBE` for more information.
        Special case: both elements of clustering (i.e. `algorithm` and
        `n_clusters`) may themselves be tuples. In this case, their first and
        second elements will be used in CBE on TFOR and CFOR, respectively.
    features : list of strings
        List containing any number of cluster features to be extracted.
        See `katachi.tools.perform_CBE` for more information.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of fpath.
    select_IDs : 'all' or list of strings, optional, default 'all'
        If 'all' (default), all detected input files (i.e. all samples) are
        used. Instead, a list of strings containing IDs (as assigned by
        `katachi.tools.initialize`) can be passed, in which case only samples
        whose IDs are in the list are used. If there are IDs in the list for
        which no matching files were found, a warning is shown.
    assign_landmarks_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for assign_landmarks function.
        See `katachi.tools.assign_landmarks.assign_landmarks` for information
        about available options.
        See section "Prepare kwargs for landmark assignment" in this function
        for information on default settings.
    compute_TFOR : bool, optional, default True
        If True, the prim frame of reference is computed and CBE is performed
        on the TFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    transform_to_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for transform_to_TFOR function.
        See `katachi.tools.find_TFOR.transform_to_TFOR` for information
        about available options.
        See section "Prepare kwargs for transformation to TFOR" in this
        function for information on default settings.
    perform_CBE_TFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to TFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on TFOR" in this function for
        information on default settings.
    compute_CFOR : bool, optional, default True
        If True, the cell frame of reference is computed and CBE is performed
        on the CFOR landmark data.
        At least one of compute_TFOR or compute_CFOR must be set to True.
    perform_CBE_CFOR_kwargs : dict or 'default', optional, default 'default'
        Dictionary specifying kwargs for cbe function applied to CFOR.
        See `katachi.tools.perform_CBE.cbe` for information about available
        options.
        See section "Prepare kwargs for CBE on CFOR" in this function for
        information on default settings.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (but dask is still required for CBE!).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    # Function to select pairs of files (seg, dir) and create paths
    def prepare_fpaths(dirpath, fnames):

        # Find segmentation files
        seg_names = [
            fname for fname in fnames if fname.endswith(suffix_seg + ".tif")
        ]

        # Exclude files not in select_IDs
        if not select_IDs == 'all':
            seg_names = [
                fname for fname in seg_names
                if any([fname.startswith(ID) for ID in select_IDs])
            ]

        # Get IDs
        seg_IDs = [fname[:10] for fname in seg_names]

        # Get matching intensity files
        int_names = []
        for ID in seg_IDs:
            int_name = [
                fname for fname in fnames
                if fname.startswith(ID) and fname.endswith(suffix_int + ".tif")
            ]
            try:
                int_names.append(int_name[0])
            except IndexError:
                raise IOError("Could not find matching intensity file for " +
                              "segmentation file with ID " + ID)

        # Create path
        seg_paths = [os.path.join(dirpath, name) for name in seg_names]
        int_paths = [os.path.join(dirpath, name) for name in int_names]

        # Return results
        return [(seg_paths[i], int_paths[i]) for i in range(len(seg_paths))]

    # Remove .tif if it was specified with the suffix
    if suffix_seg.endswith(".tif"): suffix_seg = suffix_seg[:-4]
    if suffix_int.endswith(".tif"): suffix_int = suffix_int[:-4]

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = prepare_fpaths(dirpath, fnames)

    # Run for multiple subdirs
    if recurse:
        fpaths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths += prepare_fpaths(dpath, fnames)

    # Test if all samples in select_IDs are present
    if not select_IDs == 'all':
        fpaths_IDs = [os.path.split(fp[0])[1][:10] for fp in fpaths]
        orphan_IDs = [ID for ID in select_IDs if ID not in fpaths_IDs]
        if any(orphan_IDs):
            warn(
                "No matching files found for some of the IDs in select_IDs: " +
                ", ".join(orphan_IDs))

    # Check
    if len(fpaths) == 0:
        raise IOError("No matching files found in target directory.")

    # Handle processes
    if processes is None:
        processes = cpu_count() // 2

    # More checks
    if not compute_TFOR and not compute_CFOR:
        raise IOError("At least one of compute_TFOR or compute_CFOR must be " +
                      "set to True.")

    # Report
    if verbose:
        print "Detected", len(fpaths), "target file pairs."

    #--------------------------------------------------------------------------

    ### Prepare kwargs for landmark assignment

    # Default kwargs for landmark assignment
    la_kwargs = dict()
    la_kwargs['save_centroids'] = True
    la_kwargs['fpath_out'] = None
    la_kwargs['show_cells'] = None
    la_kwargs['verbose'] = False
    la_kwargs['global_prep_func'] = None
    la_kwargs['global_prep_params'] = None
    la_kwargs['local_prep_func'] = None
    la_kwargs['local_prep_params'] = None
    la_kwargs['landmark_func'] = 'default'
    la_kwargs['landmark_func_params'] = None

    # User-specified kwargs for landmark assignment
    if assign_landmarks_kwargs != 'default':
        for kw in assign_landmarks_kwargs.keys():
            la_kwargs[kw] = assign_landmarks_kwargs[kw]

    # Safety check
    if la_kwargs['fpath_out'] is not None:
        raise IOError(
            "`assign_landmarks_kwargs['fpath_out']` must be set to " +
            "`None`, otherwise files will overwrite each other.")

    #--------------------------------------------------------------------------

    ### Prepare kwargs for TFOR transformation

    # Default kwargs for transformation to TFOR
    TFOR_kwargs = dict()
    TFOR_kwargs['n_points'] = 3000
    TFOR_kwargs['verbose'] = False
    TFOR_kwargs['show'] = False

    # User-specified kwargs for TFOR
    if transform_to_TFOR_kwargs != 'default':
        for kw in transform_to_TFOR_kwargs.keys():
            TFOR_kwargs[kw] = transform_to_TFOR_kwargs[kw]

    # Safety check
    if not compute_TFOR and transform_to_TFOR_kwargs != 'default':
        warn("Non-default kwargs were passed for transformation to TFOR but " +
             "compute_TFOR is set to False!")

    #--------------------------------------------------------------------------

    ### Prepare args for CBE

    # Handle differing clustering inputs for TFOR and CFOR
    if type(clustering[0]) == tuple:
        clustering_TFOR = (clustering[0][0], clustering[1][0])
        clustering_cfor = (clustering[0][1], clustering[1][1])
    else:
        clustering_TFOR = clustering_cfor = clustering

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on TFOR

    # Default kwargs for CBE
    cbe_TFOR_kwargs = dict()
    cbe_TFOR_kwargs['normalize_vol'] = None
    cbe_TFOR_kwargs['presample'] = None
    cbe_TFOR_kwargs['cfor'] = None
    cbe_TFOR_kwargs['standardize'] = False
    cbe_TFOR_kwargs['custom_feature_funcs'] = None
    cbe_TFOR_kwargs['dask_graph_path'] = None
    cbe_TFOR_kwargs['processes'] = processes
    cbe_TFOR_kwargs['profiling'] = False
    cbe_TFOR_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_TFOR_kwargs['save_metadata'] = True
    cbe_TFOR_kwargs['save_presampled'] = False
    cbe_TFOR_kwargs['save_cfor'] = False
    cbe_TFOR_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_TFOR_kwargs != 'default':
        for kw in perform_CBE_TFOR_kwargs.keys():
            cbe_TFOR_kwargs[kw] = perform_CBE_TFOR_kwargs[kw]

    #--------------------------------------------------------------------------

    ### Prepare kwargs for CBE on CFOR

    # Default kwargs for CBE
    cbe_cfor_kwargs = dict()
    cbe_cfor_kwargs['normalize_vol'] = True
    cbe_cfor_kwargs['presample'] = None
    cbe_cfor_kwargs['cfor'] = ('PD', 3)
    cbe_cfor_kwargs['standardize'] = True
    cbe_cfor_kwargs['custom_feature_funcs'] = None
    cbe_cfor_kwargs['dask_graph_path'] = None
    cbe_cfor_kwargs['processes'] = processes
    cbe_cfor_kwargs['profiling'] = False
    cbe_cfor_kwargs['suffix_out'] = {'META': suffix_int}
    cbe_cfor_kwargs['save_metadata'] = True
    cbe_cfor_kwargs['save_presampled'] = False
    cbe_cfor_kwargs['save_cfor'] = True
    cbe_cfor_kwargs['verbose'] = False

    # User-specified kwargs for CBE
    if perform_CBE_CFOR_kwargs != 'default':
        for kw in perform_CBE_CFOR_kwargs.keys():
            cbe_cfor_kwargs[kw] = perform_CBE_CFOR_kwargs[kw]

    #--------------------------------------------------------------------------

    ### If desired: run sequentially

    if processes == 1:

        if verbose: print "Processing target file pairs sequentially..."

        # Landmark extraction
        if verbose: print "--Assigning landmarks..."
        fpaths_lm = []
        for seg_path, int_path in fpaths:
            assign_landmarks(seg_path, int_path, num_LMs, **la_kwargs)
            fpaths_lm.append((seg_path, int_path[:-4] + "_LMs.npy"))

        # Computing the TFOR and performing CBE on TFOR
        if compute_TFOR:

            # Run the transformation to TFOR
            if verbose: print "--Transforming to TFOR..."
            fpaths_TFOR = []
            for seg_path, lm_path in fpaths_lm:
                transform_to_TFOR(seg_path, lm_path, **TFOR_kwargs)
                fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

            # Performing CBE on TFOR
            if verbose: print "--Performing CBE on TFOR..."
            cbe(fpaths_TFOR, downsample, clustering_TFOR, features,
                **cbe_TFOR_kwargs)

        # Performing CBE on CFOR
        if compute_CFOR:
            if verbose: print "--Performing CBE on CFOR..."
            lm_paths = [fpath[1] for fpath in fpaths_lm]
            cbe(lm_paths, downsample, clustering_cfor, features,
                **cbe_cfor_kwargs)

        # Done
        if verbose: print "Processing complete!"
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict

    dask_graph = dict()

    # For each input...
    fpaths_lm = []
    fpaths_TFOR = []
    for idx, fpath in enumerate(fpaths):

        # Landmark extraction nodes
        seg_path, int_path = fpath
        asgn_lms = partial(assign_landmarks, **la_kwargs)
        dask_graph["asgn_lms_%i" % idx] = (asgn_lms, seg_path, int_path,
                                           num_LMs)
        lm_path = int_path[:-4] + "_LMs.npy"
        fpaths_lm.append(lm_path)

        # Transform to TFOR
        if compute_TFOR:

            # Transform to TFOR
            tf2TFOR = partial(transform_to_TFOR, **TFOR_kwargs)
            tf2TFOR_await = lambda _, s, lmp: tf2TFOR(s, lmp)
            dask_graph["tf2TFOR_%i" % idx] = (tf2TFOR_await,
                                              "asgn_lms_%i" % idx, seg_path,
                                              lm_path)
            fpaths_TFOR.append(lm_path[:-4] + "_TFOR.npy")

    # Perform CBE on TFOR
    if compute_TFOR:
        cbe_TFOR = partial(cbe, **cbe_TFOR_kwargs)
        cbe_TFOR_await = lambda _, lmp, ds, cl, fe: cbe_TFOR(lmp, ds, cl, fe)
        dask_graph["CBE_TFOR"] = (cbe_TFOR_await, [
            "tf2TFOR_%i" % idx for idx in range(len(fpaths))
        ], fpaths_TFOR, downsample, clustering_TFOR, features)

    # Perform CBE on CFOR
    if compute_CFOR:

        cbe_cfor = partial(cbe, **cbe_cfor_kwargs)
        cbe_cfor_await = lambda _, lmp, ds, cl, fe: cbe_cfor(lmp, ds, cl, fe)

        # Don't parallelize CBEs; wait for TFOR-CBE to finish
        if compute_TFOR:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, "CBE_TFOR", fpaths_lm,
                                      downsample, clustering_cfor, features)
        else:
            dask_graph["CBE_CFOR"] = (cbe_cfor_await, [
                "asgn_lms_%i" % idx for idx in range(len(fpaths))
            ], fpaths_lm, downsample, clustering_cfor, features)

    # Create dask graph
    if dask_graph_path is not None:
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # Report
    if verbose: print "Processing target file pairs in parallel..."

    # Set number of threads
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_CFOR')
        else:
            with ProgressBar(dt=1):
                dask.threaded.get(dask_graph, 'CBE_TFOR')

    # Run the pipeline (with resource profiling)
    if profiling:
        if compute_CFOR:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_CFOR')
                visualize([prof, rprof], save=False)
        else:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    dask.threaded.get(dask_graph, 'CBE_TFOR')
                visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print "Processing complete!"
    return
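A sketch of how this pipeline might be invoked; the directory, suffixes, and parameter choices below are purely illustrative and not taken from the katachi project:

feature_extraction(dirpath='data/experiment_01',
                   suffix_seg='seg',
                   suffix_int='int',
                   num_LMs=100,
                   downsample=('ddds', 200000),
                   clustering=('kmeans', 10),
                   features=['kNN-distEuclid', 'count-near'],
                   compute_TFOR=True,
                   compute_CFOR=True,
                   processes=4,
                   dask_graph_path='pipeline_graph.png',
                   verbose=True)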
Example #35
        if np.array_equal(da_new_centroids.compute(), da_centroids.compute()):
            break

        da_centroids = da_new_centroids

    return da_clusters, da_centroids


if __name__ == '__main__':

    # Generate sample data
    centers = [[1, 1], [-1, -1], [1, -1]]

    X, labels_true = make_blobs(n_samples=50,
                                centers=centers,
                                cluster_std=0.5,
                                random_state=0)

    result = kmeans(X, k=10)
    dot_graph(result[0].dask, filename='clusters')
    dot_graph(result[1].dask, filename='centroids')

    print("Result:\nClusters")
    print(result[0].compute())

    print("Centroids")
    print(result[1].compute())

    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        X.tolist(), result[0].compute().tolist(), metric='euclidean'))
Example #36
# - Airflow - https://airflow.apache.org/
# - KNIME - https://www.knime.com/
# - Google Tensorflow - https://www.tensorflow.org/
# - Pytorch / Torch - http://pytorch.org/

# # Concrete example
# What is a DAG good for?

# In[32]:

import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((5, 5), chunks=(5, 5))
image_2 = da.ones((5, 5), chunks=(5, 5))
dot_graph(image_1.dask)

# In[33]:

image_3 = image_1 + image_2
dot_graph(image_3.dask)

# In[34]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)

# # Let's go big
# Now let's see where this can be really useful

# In[35]:
Example #37
 def _visualize(self, optimize_graph=False):
     from dask.dot import dot_graph
     if optimize_graph:
         return dot_graph(optimize(self.dask, self._keys()))
     else:
         return dot_graph(self.dask)
Example #38
def cbe(fpaths_lm,
        downsample,
        clustering,
        features,
        normalize_vol=False,
        presample=None,
        cfor=None,
        standardize='default',
        custom_feature_funcs=None,
        bw_method=None,
        dask_graph_path=None,
        processes=None,
        profiling=False,
        suffix_out='default',
        save_metadata=True,
        save_presampled=False,
        save_cfor=False,
        verbose=False,
        legacy=False):
    """Create a feature space from a set of point clouds by cluster-based
    embedding (CBE).

    This includes the following steps:
        1. Loading a set of point clouds
        2. Normalizing point clouds by volume (optional)
        3. Down-sampling of each point cloud individually (optional)
            - Available options are random, kmeans or custom downsampling
        4. Making point clouds invariant to spatial transformation (optional)
            - Also called the "Cell Frame Of Reference" (CFOR)
            - There are currently 2 ways of accomplishing this
                - Transform to pairwise distance space (PD)
                - Transform to PCA space (PCA) [DEPRECATED]
            - It is also possible to pass a custom transform function.
        5. Merging point clouds
        6. Downsampling of merged point clouds (optional but recommended!)
            - Reduces computational cost/scaling of subsequent step
            - Options are density-dep., kmeans, random or custom downsampling
        7. Extracting cluster centers as common reference points
            - Options are kmeans, dbscan and custom clustering
        8. Extracting "cluster features" relative to the reference points
            - Done with dask for efficient chaining of operations
            - Multiple feature options available, see below
        9. Saving the resulting feature space as well as intermediate results

    Cluster features that can be extracted:
        - "kNN-distsManh"  : Manhatten distance in all dimensions of each
                             cluster to the mean point of its k nearest
                             neighbor landmarks.
        - "kNN-distEuclid" : Euclidean distance of each cluster to the mean
                             point of its k nearest neighbor landmarks.
        - "NN-distsManh"   : Manhatten distance in all dimensions of each
                             cluster to the nearest neighboring landmark.
        - "NN-distEuclid"  : Euclidean distance of each cluster to the nearest
                             neighboring landmark.
        - "count-near"     : Number of landmarks near to the cluster, where
                             'near' is the mean distance of the k nearest
                             neighbor landmarks of the cluster.
        - "count-assigned" : Number of landmarks assigned to the cluster during
                             the clustering itself.
        - "kde"            : KDE estimated from cell landmarks sampled for each
                             cluster center.
        - custom features  : See custom_feature_funcs in parameters.

    Feature computations are in part dependent on each other. To make this both
    efficient and readable/elegant, dask is used for chaining the feature
    extraction steps appropriately.

    At the end, features are concatenated into a single array of shape
    (cells, features) and then saved for each input stack separately.

    Parameters
    ----------
    fpaths_lm : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing cellular landmarks as generated by
        `katachi.tools.assign_landmarks` or `...find_TFOR`.
    downsample : tuple (algorithm, output_size) or None
        A tuple specifying the algorithm to use for downsampling of the merged
        point cloud prior to cluster extraction. Available algorithms are
        "ddds" (density-dependent downsampling), "kmeans" (perform kmeans and
        use cluster centers as new points) or "random". If "default" is passed,
        "ddds" is used.
        Example: ("ddds", 200000).
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for downsampling.
        The call signature is
        `all_lms_ds = downsample[0](all_lms, downsample)`
        where all_lms is an array of shape (all_landmarks, dimensions) holding
        all input landmarks merged into one point cloud. Since the `downsample`
        tuple itself is passed, additional arguments can be specified in
        additional elements of that tuple. all_lms_ds must be an array of shape
        (output_size, dimensions).
        If None, no downsampling is performed. This is not recommended for
        inputs of relevant sizes (total landmarks > 20000).
        WARNING: downsampling (especially by ddds) can be very expensive for
        large numbers of cells. In those cases, it is recommended to first run
        a representative subsets of the cells and then use the resulting CBE
        clusters to extract features for the entire dataset (using the
        `previous` setting in the `clustering` argument).
    clustering : tuple (algorithm, n_clusters)
        A tuple specifying the algorithm to use for computing the clusters to
        use in cluster-based feature extraction. Available algorithms are
        "kmeans" or "dbscan". If "default" is passed, "kmeans" is used.
        Example: ('kmeans', 10)
        Alternatively, one may pass a tuple `('previous', clustering_object)`,
        where `clustering_object` is a previously fitted clustering instance
        similar to an instantiated and fitted sklearn.cluster.KMeans object. It
        must have the attribute `cluster_centers_`, which is an array of shape
        (clusters, dimensions) and the method `predict`, which given an array
        of shape `(all_landmarks, dimensions)` will return cluster labels for
        each landmark. Clustering objects from previous runs are stored in
        the metadata under the key `"clustobj-"+identifier`.
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for clustering.
        The call signature is
        `clust_labels, clust_centers = clustering[0](all_lms, clustering)`
        where all_lms is an array of shape (all_landmarks, dimensions) holding
        all input landmarks merged into one point cloud (and downsampled in the
        previous step). Since the `clustering` tuple itself is passed,
        additional arguments can be specified in additional elements of that
        tuple. `clust_labels` must be a 1D integer array assigning each input
        landmark to a corresponding cluster center. `clust_centers` must be an
        array of shape (clusters, dimensions) and contain the coordinates of
        the cluster centers. The first axis must be ordered such that the
        integers in `clust_labels` index it correctly. The number of clusters
        must match n_clusters.
    features : list of strings
        List containing any number of cluster features to be extracted. The
        strings noted in the explanation above are allowed. If custom feature
        extraction functions are passed (see below), their names must also be
        included in this list.
        Example: ["kNN-distEuclid", "count-near"]
    normalize_vol : bool, optional, default False
        If True, the volume of each input point cloud is normalized by dividing
        each landmark vector magnitude by the sum of all magnitudes.
    presample : tuple (algorithm, output_size) or None, optional, default None
        If not None, the algorithm specified is used to downsample each input
        cloud individually to output_size points. Available algorithms are
        "kmeans" (perform kmeans and use cluster centers as new points) or
        "random".
        Example: ('random', 50)
        Alternatively, if instead of a string denoting the algorithm a callable
        is passed, that callable is used for downsampling.
        The call signature is
        ```for cell in range(lms.shape[0]):
               lms_ds[cell,:,:] = presample[0](lms[cell,:,:], presample)```
        where lms is an array of shape (cells, landmarks, dimensions) holding
        the set of input point clouds. Since the `presample` tuple itself is
        passed, additional arguments can be specified in additional elements of
        that tuple. lms_ds must be an array of shape
        (cells, output_size, dimensions).
        If None, no presampling is performed.
    cfor : tuple (algorithm, dimensions) or None, optional, default None
        A tuple specifying the algorithm to use for recasting the landmarks in
        a space that is invariant to spatial transformations. There are two
        options available: "PD" (pairwise distance transform) and "PCA"
        (per-cell PCA and transform).
        For "PD", the total complement of pairwise distances between all points
        is computed and then subsampled to `dimensions` by selecting a
        corresponding number of distance percentiles in a linear range between
        the 10th to the 90th percentile (inclusive).
        For "PCA", the number of dimensions in the resulting space is equal to
        the number of dimensions of the input (should be 3). The `dimensions`
        part of the argument is ignored (but it must still be supplied!).
        If "default" is passed, "PD" is used.
        Example 1: ('PD', 6)
        Example 2: ('default', 6)  # defaults to 'PD'
        Example 3: ('PCA', 3)
        Alternatively, if a callable is passed instead of a string, that
        callable is used for downsampling.
        The call signature is
        ```for cell in range(lms.shape[0]):
               lms_cfor[cell,:,:] = cfor[0](lms[cell,:,:], cfor)```
        where lms is an array of shape (cells, landmarks, dimensions) holding
        the set of input point clouds. Since the `cfor` tuple itself is passed,
        additional arguments can be specified in additional elements of that
        tuple. lms_ds must be an array of shape
        (cells, output_size, dimensions).
        If None, no transformation is performed; cells are left in the original
        3D space.
    standardize : bool or 'default', optional, default 'default'
        If True, the point cloud dimensions of the merged CFOR point cloud are
        standardised to zero mean and unit variance. This is also propagated
        to the individual clouds used for feature extraction and for saving
        in case the CFOR is being saved.
        If 'default', standardization is performed only if cfor is set to "PD".
        If False, no standardization is performed.
    custom_feature_funcs : list of tuples or None, optional, default None
        List used to specify one or more custom feature extraction functions.
        Each custom function is specified through a tuple in the list that
        is structured as such:
            `(feature_name, extraction_func, parent_names, other_params)`
        where `feature_name` is the name of the feature as it appears in the
        `features` argument, `extraction_func` is a callable, `parent_names`
        is a list of parent feature names (as they appear in the `features`
        argument) used as input to `extraction_func`, and `other_params` is a
        list of other parameters for `extraction_func`.
        The call signature is
        ```dask_graph[custom_func[0]+"_%i" % c] =
               (feature_name, [parent+"_%i" % c for parent in parent_names],
                other_params, lms[c,:,:], clust_centers, clust_labels[c]) ```
        within the dask graph, where `c` is the index of a cell.
        The callable must therefore accept a list of parent features (can be
        an empty list), a list of other parameters (can also be empty), the
        (preprocessed) landmarks of the given cell, the cluster centers and
        the cluster labels of the given cell.
        It must return a 1D array of float values; the feature vector for the
        current cell `c`.
    bw_method : str, scalar, callable or None, optional, default None
        The method used to calculate the estimator bandwidth for the gaussian
        kde when computing the "kde" feature. This can be ‘scott’, ‘silverman’,
        a scalar constant or a callable. If a scalar, this will be used
        directly as `kde.factor`. If a callable, it should take a gaussian_kde
        instance as only parameter and return a scalar. If None (default),
        ‘scott’ is used. This is ignored if "kde" is not in `features`.
        < Modified from `scipy.stats.gaussian_kde` doc string. >
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        summarizes the feature extraction pipeline for the first 3 cells.
        Note: If the resulting graph contains multiple separate graphs, the
        only relevant graph is the one leading into `fspace` as an end result.
    processes : int or None, optional, default None
        Number of processes to use in multiprocessed and dask-controlled
        operations. If None, a number equal to half the available CPUs is used.
        If `1` (one), no multiprocessing is performed and `dask.get` is used
        instead of `dask.threaded.get`.
    profiling : bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    suffix_out : 'default' or dict, optional, default 'default'
        If 'default', the output is saved using '_PRES', '_CFOR', '_DS', and
        '_CBE' as suffices for the presampled landmarks (if `presample` is not
        None), for the CFOR-transformed landmarks (if `cfor` is not None), for
        overlayed downsampling (if `downsample` is not None)(note that this is
        not saved explicitly but is part of the suffix for the CBE-embedded
        feature space), and for the CBE-embedded feature space, respectively.
        The suffices are chained as appropriate. If a dict is passed, each of
        these suffices can be specified manually using the keys 'PRES', 'CFOR',
        'DS', 'CBE' and 'META'.
        The suffix specified in 'META' is added to all relevant metadata
        dictionary keys. For any suffices not specified in the suffix_out dict,
        the 'default' suffix is used.
    save_metadata : bool, optional, default True
        If True, cluster samples, cluster labels and a feature header are saved
        to the metadata of each input stack as appropriate.
    save_presampled : bool, optional, default False
        If True, the result of the presampling step is saved with the suffix
        "PRES" for later use.
    save_cfor : bool, optional, default False
        If True, the result of the cfor step is saved with the suffix "CFOR"
        for later use.
    verbose : bool, optional, default False
        If True, more information is printed.
    legacy : bool, optional, default False
        If True (and standardize is also set to True), the feature extraction
        is not performed in standardized space. Instead, the cluster centroids
        are transformed back to the un-standardized space.
        Triggers a deprecation warning.
    """

    #--------------------------------------------------------------------------

    ### Load data

    if verbose: print "Loading data..."

    # Handle cases of single paths
    if type(fpaths_lm) == str:
        fpaths_lm = [fpaths_lm]
    if len(fpaths_lm) == 1:
        warn(
            "fpaths_lm specifies only a single path. Usually, multiple paths" +
            " are specified so that many samples can be overlayed for" +
            " feature extraction!")

    # Import the landmark data
    # Note: The order of fpaths_lm is maintained and an index array is created!
    lms = []
    lms_idx = []
    for idx, fpath_lm in enumerate(fpaths_lm):
        try:
            lms_in = np.load(fpath_lm)
            lms.append(lms_in)
            lms_idx += [idx for i in range(lms_in.shape[0])]
        except:
            print "Attempting to load landmark data from " + str(fpath_lm),
            print "failed with this error:"
            raise
    lms_idx = np.array(lms_idx, dtype=np.int)
    lms = np.concatenate(lms)
    if verbose: print "Total input data shape:", lms.shape

    # Check if downsampling is specified
    if downsample is None:
        warn("It is highly recommended to use downsampling (unless the data " +
             "set is very small)!")

    # Handle processes being None
    if processes is None:
        processes = cpu_count() // 2

    # Handle standardize being default
    if standardize == 'default':
        standardize = False
        if cfor is not None and cfor[0] == 'PD':
            standardize = True

    # Handle legacy mode
    if legacy:
        warn("Running in LEGACY mode! This is DEPRECATED!", DeprecationWarning)

    #--------------------------------------------------------------------------

    ### Normalize volume [per cell]

    if normalize_vol:
        if verbose: print "Normalizing volumes..."
        lms = vol_normalize(lms, verbose=verbose)

    #--------------------------------------------------------------------------

    ### Individual downsampling (presampling) [per cell]

    if presample is not None:
        if verbose: print "Presampling..."

        # Prep
        lms_ps = np.zeros((lms.shape[0], presample[1], lms.shape[2]))

        # Random subsampling
        if presample[0] == 'random':
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = ds.random_subsample(
                    lms[cell, :, :], presample[1])

        # Kmeans-based downsampling
        elif presample[0] == 'kmeans':
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = ds.kmeans_subsample(
                    lms[cell, :, :], presample[1])

        # Custom downsampling function
        elif callable(presample[0]):
            for cell in range(lms.shape[0]):
                lms_ps[cell, :, :] = presample[0](lms[cell, :, :], presample)

        # Handle other cases
        else:
            raise ValueError("Invalid presampling method: " +
                             str(presample[0]))

        # Assign the downsampled data back
        lms = lms_ps

    #--------------------------------------------------------------------------

    ### Transform to "Cell Frame Of Reference" (CFOR) [per cell]

    if cfor is not None:
        if verbose: print "Transforming to CFOR..."

        # Prep
        lms_cfor = np.zeros((lms.shape[0], lms.shape[1], cfor[1]))

        # Pairwise distance transform
        if cfor[0] == 'PD' or cfor[0] == 'default':
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = pd_transform(lms[cell, :, :],
                                                    percentiles=cfor[1])

        # PCA transform
        elif cfor[0] == 'PCA':
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = PCA().fit_transform(lms[cell, :, :])

        ## RBF transform by Nystroem embedding
        ## REMOVED: This does not create matched dimensions and thus cannot be
        ##          used for this purpose.
        #if cfor[0] == 'RBF':
        #    for cell in range(lms.shape[0]):
        #        Ny = kernel_approximation.Nystroem(kernel='rbf',
        #                                           gamma=1/lms.shape[1],
        #                                           n_components=cfor[1],
        #                                           random_state=42)
        #        lms_cfor[cell,:,:] = Ny.fit_transform(lms[cell,:,:])

        # Custom CFOR transform
        elif callable(cfor[0]):
            for cell in range(lms.shape[0]):
                lms_cfor[cell, :, :] = cfor[0](lms[cell, :, :], cfor)

        # Handle other cases
        else:
            raise ValueError("Invalid CFOR method: " + str(cfor[0]))

        # Assign the CFOR data back
        lms = lms_cfor

    #--------------------------------------------------------------------------

    ### Collective downsampling (all cells overlayed) [altogether]
    #   Note: This is done to improve cluster retrieval and to make it more
    #         efficient. It does not affect the feature extraction afterwards.

    # Flatten cells of all samples together
    all_lms = lms.reshape((lms.shape[0] * lms.shape[1], lms.shape[2]))

    # For CFOR-PD: standardize the dimensions
    if standardize and not legacy:

        # Standardize pooled landmarks
        cloud_means = all_lms.mean(axis=0)
        cloud_stds = all_lms.std(axis=0)
        all_lms = (all_lms - cloud_means) / cloud_stds

        # Overwrite unpooled landmarks for feature extraction in standard space
        lms = all_lms.reshape((lms.shape[0], lms.shape[1], lms.shape[2]))

    # Downsampling
    if downsample is not None and clustering[0] != 'previous':
        if verbose: print "Downsampling merged cloud..."

        # Default is density dependent downsampling
        if downsample[0] == 'default' or downsample[0] == 'ddds':
            all_lms_ds = ds.ddds(all_lms,
                                 downsample[1],
                                 presample=downsample[1],
                                 processes=processes)

        # Alternative: kmeans downsampling
        elif downsample[0] == 'kmeans':
            all_lms_ds = ds.kmeans_subsample(all_lms, downsample[1])

        # Alternative: random downsampling
        elif downsample[0] == 'random':
            all_lms_ds = ds.random_subsample(all_lms, downsample[1])

        # Custom downsampling
        elif callable(downsample[0]):
            all_lms_ds = downsample[0](all_lms, downsample)

        # Handle other cases
        else:
            raise ValueError("Invalid downsampling method: " +
                             str(downsample[0]))

    # No downsampling
    else:
        all_lms_ds = all_lms

    # LEGACY: Standardization after downsampling and without overwriting the
    #         unpooled landmarks!
    if legacy and standardize:
        cloud_means = all_lms_ds.mean(axis=0)
        cloud_stds = all_lms_ds.std(axis=0)
        all_lms_ds = (all_lms_ds - cloud_means) / cloud_stds

    #--------------------------------------------------------------------------

    ### Find reference points by clustering [altogether]

    if verbose: print "Clustering to find reference points..."

    # Default: kmeans clustering
    if clustering[0] == 'default' or clustering[0] == 'kmeans':

        # Perform clustering
        my_clust = MiniBatchKMeans(n_clusters=clustering[1], random_state=42)
        my_clust.fit(all_lms_ds)

        # Get labels and centroids
        clust_labels = my_clust.labels_
        clust_centers = my_clust.cluster_centers_

        # Predict labels for whole data set (if downsampled)
        if downsample is not None:
            clust_labels = my_clust.predict(all_lms)

    # To be added: DBSCAN
    elif clustering[0] == 'dbscan':
        raise NotImplementedError("And likely never will be...")

    # Using a given (already fitted) clustering object
    elif clustering[0] == 'previous':
        my_clust = clustering[1]
        clust_centers = my_clust.cluster_centers_
        clust_labels = my_clust.predict(all_lms)

    # Custom alternative
    elif callable(clustering[0]):
        clust_labels, clust_centers = clustering[0](all_lms, clustering)

    # Handle other cases
    else:
        raise ValueError("Invalid clustering method: " + str(clustering[0]))

    # LEGACY: Back-transform of centroids to un-standardized space
    #         In legacy, feature extraction was done on the un-standardized
    #         space, using the back-transformed centroids
    if legacy and standardize:
        clust_centers = clust_centers * cloud_stds + cloud_means

    # Unpool cluster labels
    clust_labels = clust_labels.reshape((lms.shape[0], lms.shape[1]))

    #--------------------------------------------------------------------------

    ### Extract features relative to reference points [per cell]

    if verbose: print "Extracting cluster features..."

    # Init dask graph
    dask_graph = dict()
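    # A dask graph is a plain dict mapping a key (here a string such as
    # "kdtree_0") to a task: a tuple whose first element is a callable and
    # whose remaining elements are its arguments. Arguments that are
    # themselves keys of the graph (or lists of such keys) are replaced by
    # the corresponding results before the callable is invoked, which is how
    # the dependencies between the nodes below are expressed.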

    # For each cell...
    for c in range(lms.shape[0]):

        # Node to compute kdtree
        dask_graph["kdtree_%i" % c] = (fe.build_kdtree, lms[c, :, :])

        # Nodes for the features
        dask_graph["kNN-distsManh_%i" % c] = (fe.feature_distsManhatten_kNN,
                                              "kdtree_%i" % c, lms[c, :, :],
                                              clust_centers)

        dask_graph["kNN-distEuclid_%i" % c] = (fe.feature_distEuclidean_kNN,
                                               "kNN-distsManh_%i" % c,
                                               lms.shape[2])

        dask_graph["NN-distsManh_%i" % c] = (fe.feature_distsManhatten_NN,
                                             "kdtree_%i" % c, lms[c, :, :],
                                             clust_centers)

        dask_graph["NN-distEuclid_%i" % c] = (fe.feature_distEuclidean_NN,
                                              "NN-distsManh_%i" % c,
                                              lms.shape[2])

        dask_graph["count-near_%i" % c] = (fe.feature_count_near, [
            "kdtree_%i" % c, "kNN-distEuclid_%i" % c
        ], lms[c, :, :], clust_centers)

        dask_graph["count-assigned_%i" % c] = (fe.feature_count_assigned,
                                               clust_centers, clust_labels[c])

        dask_graph["kde_%i" % c] = (fe.feature_kde, lms[c, :, :],
                                    clust_centers, bw_method)

        # Nodes for custom feature extraction functions
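        # Each entry is expected to be a tuple of the form
        # (node_name, function, parent_node_names, extra_args); the parent
        # names are suffixed with the cell index so the custom node is wired
        # into this cell's subgraph.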
        if custom_feature_funcs is not None:
            for custom_func in custom_feature_funcs:
                custom_parents = [
                    parent + "_%i" % c for parent in custom_func[2]
                ]
                dask_graph[custom_func[0] +
                           "_%i" % c] = (custom_func[1], custom_parents,
                                         custom_func[3], lms[c, :, :],
                                         clust_centers, clust_labels[c])

        # Node to collect requested features for a cell
        dask_graph["fvector_%i" % c] = (fe.assemble_cell,
                                        [f + "_%i" % c
                                         for f in features], features)

        # Render example graph for first 3 cells
        if c == 2 and dask_graph_path is not None:
            from dask.dot import dot_graph
            dask_graph["fspace"] = (fe.assemble_fspace,
                                    ["fvector_%i" % c for c in range(3)])
            dot_graph(dask_graph, filename=dask_graph_path)

    # Final node to put per-cell features into a feature space
    dask_graph["fspace"] = (fe.assemble_fspace,
                            ["fvector_%i" % c for c in range(lms.shape[0])])

    # Run without multiprocessing
    if processes == 1:
        with ProgressBar(dt=1):
            fspace, fheader = dask.get(dask_graph, 'fspace')

    # Run with multiprocessing
    else:

        # Set number of threads
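        # Note: dask.set_options is the pre-0.18 configuration API; on newer
        # dask versions the rough equivalent would be dask.config.set(pool=...).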
        dask.set_options(pool=ThreadPool(processes))

        # Run the pipeline (no profiling)
        if not profiling:
            with ProgressBar(dt=1):
                fspace, fheader = dask.threaded.get(dask_graph, 'fspace')

        # Run the pipeline (with resource profiling)
        if profiling:
            with ProgressBar(dt=1):
                with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                    fspace, fheader = dask.threaded.get(dask_graph, 'fspace')
                visualize([prof, rprof], save=False)

    #--------------------------------------------------------------------------

    ### Save [per stack], report and return

    if verbose: print "Saving result..."

    # For each stack...
    for sample_idx, sample_fpath in enumerate(fpaths_lm):

        # Prepare suffix
        suffix = ""

        # Save individually downsampled landmark distributions if desired
        if presample is not None and save_presampled:
            if suffix_out == 'default' or 'PRES' not in suffix_out.keys():
                suffix = suffix + "_PRES"
            else:
                suffix = suffix + suffix_out['PRES']
            np.save(sample_fpath[:-4] + suffix,
                    lms_ps[lms_idx == sample_idx, :, :])

        # Save CFOR if desired
        if cfor is not None and save_cfor:
            if suffix_out == 'default' or 'CFOR' not in suffix_out.keys():
                suffix = suffix + "_CFOR"
            else:
                suffix = suffix + suffix_out['CFOR']
            np.save(sample_fpath[:-4] + suffix,
                    lms[lms_idx == sample_idx, :, :])

        # Include downsampling in suffix
        if downsample is not None:
            if suffix_out == 'default' or 'DS' not in suffix_out.keys():
                suffix = suffix + '_DS'
            else:
                suffix = suffix + suffix_out['DS']

        # Save shape space
        if suffix_out == 'default' or 'CBE' not in suffix_out.keys():
            suffix = suffix + "_CBE"
        else:
            suffix = suffix + suffix_out['CBE']
        np.save(sample_fpath[:-4] + suffix, fspace[lms_idx == sample_idx, :])

        # Save new metadata
        if save_metadata:

            # Construct metadata path
            dirpath, fname = os.path.split(sample_fpath)
            fpath_meta = os.path.join(dirpath,
                                      fname[:10] + "_stack_metadata.pkl")

            # Open metadata
            with open(fpath_meta, 'rb') as metafile:
                meta_dict = pickle.load(metafile)

            # Prepare metadata suffix
            if suffix_out == 'default' or 'META' not in suffix_out.keys():
                if suffix[0] == '_':
                    m_suffix = suffix[1:]
                else:
                    m_suffix = suffix
            else:
                if suffix[0] == '_':
                    m_suffix = suffix[1:] + suffix_out['META']
                else:
                    m_suffix = suffix + suffix_out['META']

            # Slightly awkward addition of TFOR tag
            if 'TFOR' in fpaths_lm[0]:
                m_suffix = 'TFOR_' + m_suffix

            # Add new metadata
            meta_dict["clustobj-" + m_suffix] = my_clust
            meta_dict["clusters-" + m_suffix] = clust_centers
            meta_dict["labels-" +
                      m_suffix] = clust_labels[lms_idx == sample_idx]
            meta_dict["features-" + m_suffix] = fheader

            # Write metadata
            with open(fpath_meta, 'wb') as metafile:
                pickle.dump(meta_dict, metafile, pickle.HIGHEST_PROTOCOL)

    # Report and return
    if verbose: print "Processing complete!"
    return
Ejemplo n.º 39
0
def visualize(
    *args, filename="mydask", traverse=True, optimize_graph=False, maxval=None, **kwargs
):
    """
    Visualize several dask graphs simultaneously.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    args : object
        Any number of objects. If it is a dask collection (for example, a
        dask DataFrame, Array, Bag, or Delayed), its associated graph
        will be included in the output of visualize. By default, python builtin
        collections are also traversed to look for dask objects (for more
        information see the ``traverse`` keyword). Arguments lacking an
        associated graph will be ignored.
    filename : str or None, optional
        The name of the file to write to disk. If the provided `filename`
        doesn't include an extension, '.png' will be used by default.
        If `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    traverse : bool, optional
        By default, dask traverses builtin python collections looking for dask
        objects passed to ``visualize``. For large collections this can be
        expensive. If none of the arguments contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    color : {None, 'order', 'ages', 'freed', 'memoryincreases', 'memorydecreases', 'memorypressure'}, optional
        Options to color nodes. colormap:

        - None, the default, no colors.
        - 'order', colors the nodes' border based on the order they appear in the graph.
        - 'ages', how long the data of a node is held.
        - 'freed', the number of dependencies released after running a node.
        - 'memoryincreases', how many more outputs are held after the lifetime of a node.
          Large values may indicate nodes that should have run later.
        - 'memorydecreases', how many fewer outputs are held after the lifetime of a node.
          Large values may indicate nodes that should have run sooner.
        - 'memorypressure', the number of data held when the node is run (circle), or
          the data is released (rectangle).
    maxval : {int, float}, optional
        Maximum value used to normalize the colormap from 0 to 1.0. Default is
        ``None``, which uses the maximum of the computed values.
    collapse_outputs : bool, optional
        Whether to collapse output boxes, which often have empty labels.
        Default is False.
    verbose : bool, optional
        Whether to label output and input boxes even if the data aren't chunked.
        Beware: these labels can get very long. Default is False.
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Examples
    --------
    >>> x.visualize(filename='dask.pdf')  # doctest: +SKIP
    >>> x.visualize(filename='dask.pdf', color='order')  # doctest: +SKIP

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See Also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    https://docs.dask.org/en/latest/optimize.html
    """
    from dask.dot import dot_graph

    args, _ = unpack_collections(*args, traverse=traverse)

    dsk = dict(collections_to_dsk(args, optimize_graph=optimize_graph))

    color = kwargs.get("color")

    if color in {
        "order",
        "order-age",
        "order-freed",
        "order-memoryincreases",
        "order-memorydecreases",
        "order-memorypressure",
        "age",
        "freed",
        "memoryincreases",
        "memorydecreases",
        "memorypressure",
    }:
        import matplotlib.pyplot as plt

        from dask.order import diagnostics, order

        o = order(dsk)
        try:
            cmap = kwargs.pop("cmap")
        except KeyError:
            cmap = plt.cm.RdBu
        if isinstance(cmap, str):
            import matplotlib.pyplot as plt

            cmap = getattr(plt.cm, cmap)

        def label(x):
            return str(values[x])

        data_values = None
        if color != "order":
            info = diagnostics(dsk, o)[0]
            if color.endswith("age"):
                values = {key: val.age for key, val in info.items()}
            elif color.endswith("freed"):
                values = {key: val.num_dependencies_freed for key, val in info.items()}
            elif color.endswith("memorypressure"):
                values = {key: val.num_data_when_run for key, val in info.items()}
                data_values = {
                    key: val.num_data_when_released for key, val in info.items()
                }
            elif color.endswith("memoryincreases"):
                values = {
                    key: max(0, val.num_data_when_released - val.num_data_when_run)
                    for key, val in info.items()
                }
            else:  # memorydecreases
                values = {
                    key: max(0, val.num_data_when_run - val.num_data_when_released)
                    for key, val in info.items()
                }

            if color.startswith("order-"):

                def label(x):
                    return str(o[x]) + "-" + str(values[x])

        else:
            values = o
        if maxval is None:
            maxval = max(1, max(values.values()))
        colors = {k: _colorize(cmap(v / maxval, bytes=True)) for k, v in values.items()}
        if data_values is None:
            data_values = values
            data_colors = colors
        else:
            data_colors = {
                k: _colorize(cmap(v / maxval, bytes=True))
                for k, v in data_values.items()
            }

        kwargs["function_attributes"] = {
            k: {"color": v, "label": label(k)} for k, v in colors.items()
        }
        kwargs["data_attributes"] = {k: {"color": v} for k, v in data_colors.items()}
    elif color:
        raise NotImplementedError("Unknown value color=%s" % color)

    return dot_graph(dsk, filename=filename, **kwargs)
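
# A minimal usage sketch for the options documented above (hypothetical array
# and filename; assumes a dask version that supports ``color``/``maxval`` and
# that graphviz is installed):
import dask
import dask.array as da

x = da.ones((30, 30), chunks=(10, 10))
y = (x + x.T).sum()

# Color task nodes by memory pressure and pin the colormap normalization.
dask.visualize(y, filename="mydask_pressure", color="memorypressure", maxval=10)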
Ejemplo n.º 40
0
 def _visualize(self, optimize_graph=False):
     from dask.dot import dot_graph
     if optimize_graph:
         dot_graph(optimize(self.dask, self._keys()))
     else:
         dot_graph(self.dask)
Ejemplo n.º 41
0
                         B,
                         '_id',
                         'ltable_id',
                         'rtable_id',
                         'id',
                         'id',
                         nchunks=4,
                         feature_table=F,
                         attrs_after='label',
                         show_progress=False,
                         compute=False)

# print(len(L))
# print(L.head(1))
predictions = dt.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    append=True,
    target_attr='predicted',
    inplace=False,
    nchunks=2,
    compute=False)
from dmagellan.optimization.exfeatvecs_predict_sequence_opt import delay_concat, fuse_dag
opt1 = delay_concat(dict(predictions.dask))
opt2 = fuse_dag(opt1)

from dask.dot import dot_graph
dot_graph(opt2)
# print(predictions.head())
# predictions.visualize()
Ejemplo n.º 42
0
def test_filenames_and_formats(filename, format, target, expected_result_type):
    result = dot_graph(dsk, filename=filename, format=format)
    assert os.path.isfile(target)
    assert isinstance(result, expected_result_type)
    ensure_not_exists(target)
Ejemplo n.º 43
0

def fit(): pass


def vis_struct(): pass


t = {'raw': (get_raw,),
     # 'vis': (vis,),
     'dark': (get_dark, 'raw'),
     'dark_corrected': (sub, 'raw', 'dark'),
     'calibration': (get_calibration, 'raw'),
     'polarization_corrected': (pol_correct, 'calibration',
                                'dark_corrected'),
     # '2dvis': (vis2d, 'dark_corrected', 'vis'),
     'mask': (make_mask, 'calibration', 'polarization_corrected'),
     'iq': (
     integrate, 'polarization_corrected', 'calibration', 'mask'),
     # 'vis1d_iq': (vis1d, 'iq', 'vis'),
     # 'bg_iq': (get_background, 'raw'),
     # 'bg_corrected_iq': (sub, 'iq', 'muxed_bg'),
     # 'gr': (get_gr, 'bg_corrected_iq', 'raw'),
     # 'vis1d_gr': (vis1d, 'gr', 'vis'),
     # 'candidate_structures': (get_candidates, 'raw'),
     # 'fit_structures': (fit, 'candidate_structures', 'gr'),
     # 'vis_struc': (vis_struct, 'fit_structures', 'vis')
     }

dot_graph(t, 'xpd_pipeline2.pdf')
Ejemplo n.º 44
0
dsk = {}
files = sorted(glob.glob("{0}/*.tif".format(data_path)))
final_saves = []
for filename in files:
    filename_cleaned = filename.split("/")[-1].split(".")[0]
    dsk['threshold-{0}'.format(filename_cleaned)] = (threshold, filename)
    dsk['min_size-{0}'.format(filename_cleaned)] = (
        min_size, 'threshold-{0}'.format(filename_cleaned))
    dsk['clean-{0}'.format(filename_cleaned)] = (
        clean, 'min_size-{0}'.format(filename_cleaned))
    dsk['reveal-{0}'.format(filename_cleaned)] = (
        reveal, 'clean-{0}'.format(filename_cleaned))
    dsk['pearlite-{0}'.format(filename_cleaned)] = (
        pearlite, 'reveal-{0}'.format(filename_cleaned))
    dsk['ferrite-{0}'.format(filename_cleaned)] = (
        ferrite, 'pearlite-{0}'.format(filename_cleaned))
    dsk['cemmentite-{0}'.format(filename_cleaned)] = (
        cemmentite, 'ferrite-{0}'.format(filename_cleaned))
    dsk['save-{0}'.format(filename_cleaned)] = (
        save, 'cemmentite-{0}'.format(filename_cleaned))
    final_saves.append('save-{0}'.format(filename_cleaned))
dsk['finalize'] = (finalize, final_saves)

dot_graph(dsk)

with ResourceProfiler(0.25) as rprof, Profiler() as prof, CacheProfiler(
) as cprof, ProgressBar():
    dak_get(dsk, 'finalize')

visualize([prof, rprof, cprof])
Ejemplo n.º 45
0
def test_dask():
    import dask.array as da
    x = da.ones((5, 15), chunks=(5, 5))
    d = (x + 1).dask
    from dask.dot import dot_graph
    dot_graph(d, format='svg')
Ejemplo n.º 46
0
Archivo: base.py Proyecto: fm100/dask
def visualize(*args, **kwargs):
    """
    Visualize several low level dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    args : dict(s) or collection(s)
        The low level dask graph(s) to visualize.
    filename : str or None, optional
        The name of the file to write to disk. If the provided `filename`
        doesn't include an extension, '.png' will be used by default.
        If `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    color : {None, 'order', 'ages', 'freed', 'memoryincreases', 'memorydecreases',
             'memorypressure'}, optional
        Options to color nodes. colormap:
        - None, the default, no colors.
        - 'order', colors the nodes' border based on the order they appear in the graph.
        - 'ages', how long the data of a node is held.
        - 'freed', the number of dependencies released after running a node.
        - 'memoryincreases', how many more outputs are held after the lifetime of a node.
          Large values may indicate nodes that should have run later.
        - 'memorydecreases', how many fewer outputs are held after the lifetime of a node.
          Large values may indicate nodes that should have run sooner.
        - 'memorypressure', the number of data held when:
            - the node is run (circle)
            - the data is released (rectangle)
    collapse_outputs : bool, optional
        Whether to collapse output boxes, which often have empty labels.
        Default is False.
    verbose : bool, optional
        Whether to label output and input boxes even if the data aren't chunked.
        Beware: these labels can get very long. Default is False.
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Examples
    --------
    >>> x.visualize(filename='dask.pdf')  # doctest: +SKIP
    >>> x.visualize(filename='dask.pdf', color='order')  # doctest: +SKIP

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See Also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    https://docs.dask.org/en/latest/optimize.html
    """
    from dask.dot import dot_graph

    filename = kwargs.pop("filename", "mydask")
    optimize_graph = kwargs.pop("optimize_graph", False)

    dsks = []
    args3 = []
    for arg in args:
        if isinstance(arg, (list, tuple, set)):
            for a in arg:
                if isinstance(a, Mapping):
                    dsks.append(a)
                if is_dask_collection(a):
                    args3.append(a)
        else:
            if isinstance(arg, Mapping):
                dsks.append(arg)
            if is_dask_collection(arg):
                args3.append(arg)

    dsk = dict(collections_to_dsk(args3, optimize_graph=optimize_graph))
    for d in dsks:
        dsk.update(d)

    color = kwargs.get("color")

    if color in {
            "order",
            "order-age",
            "order-freed",
            "order-memoryincreases",
            "order-memorydecreases",
            "order-memorypressure",
            "age",
            "freed",
            "memoryincreases",
            "memorydecreases",
            "memorypressure",
    }:
        import matplotlib.pyplot as plt

        from .order import diagnostics, order

        o = order(dsk)
        try:
            cmap = kwargs.pop("cmap")
        except KeyError:
            cmap = plt.cm.RdBu
        if isinstance(cmap, str):
            import matplotlib.pyplot as plt

            cmap = getattr(plt.cm, cmap)

        def label(x):
            return str(values[x])

        data_values = None
        if color != "order":
            info = diagnostics(dsk, o)[0]
            if color.endswith("age"):
                values = {key: val.age for key, val in info.items()}
            elif color.endswith("freed"):
                values = {
                    key: val.num_dependencies_freed
                    for key, val in info.items()
                }
            elif color.endswith("memorypressure"):
                values = {
                    key: val.num_data_when_run
                    for key, val in info.items()
                }
                data_values = {
                    key: val.num_data_when_released
                    for key, val in info.items()
                }
            elif color.endswith("memoryincreases"):
                values = {
                    key:
                    max(0, val.num_data_when_released - val.num_data_when_run)
                    for key, val in info.items()
                }
            else:  # memorydecreases
                values = {
                    key:
                    max(0, val.num_data_when_run - val.num_data_when_released)
                    for key, val in info.items()
                }

            if color.startswith("order-"):

                def label(x):
                    return str(o[x]) + "-" + str(values[x])

        else:
            values = o
        maxval = kwargs.pop("maxval", None)
        if maxval is None:
            maxval = max(1, max(values.values()))
        colors = {
            k: _colorize(cmap(v / maxval, bytes=True))
            for k, v in values.items()
        }
        if data_values is None:
            data_values = values
            data_colors = colors
        else:
            data_colors = {
                k: _colorize(cmap(v / maxval, bytes=True))
                for k, v in data_values.items()
            }

        kwargs["function_attributes"] = {
            k: {
                "color": v,
                "label": label(k)
            }
            for k, v in colors.items()
        }
        kwargs["data_attributes"] = {
            k: {
                "color": v
            }
            for k, v in data_colors.items()
        }
    elif color:
        raise NotImplementedError("Unknown value color=%s" % color)

    return dot_graph(dsk, filename=filename, **kwargs)
Ejemplo n.º 47
0
    def test_dask_workflow_and_paramenter_sweeping(self):
        """
        We test a workflow with dask
        """
        import dask
        # runner = GlobalFakeRunner()
        runner = FakeRunner()

        # decorate functions...
        generate_pricedata = dfp.job_delayed(runner)(self.generate_pricedata)
        generate_fundata = dfp.job_delayed(runner)(self.generate_fundata)
        generate_riskdata = dfp.job_delayed(runner)(self.generate_riskdata)
        generate_predictors = dfp.job_delayed(runner)(self.generate_predictors)
        generate_positions = dfp.job_delayed(runner)(self.generate_positions)
        # declare the dataflow
        dsk = dict()
        pools = ['pool1', 'pool2', 'pool3']
        for pool in pools:
            dsk[(pool, 'pricedata')] = generate_pricedata(pool),
            dsk[(pool, 'fundata')] = generate_fundata(pool),
            dsk[(pool, 'riskdata')] = generate_riskdata(pool,
                                                        'risk'), (pool,
                                                                  'pricedata')
            dsk[(pool, 'pred')] = generate_predictors(pool, 'risk'), [
                (pool, t) for t in ['pricedata', 'fundata', 'riskdata']
            ]
            for max_risk in range(3):
                dsk[(pool, 'positions', ('max_risk',
                                         max_risk))] = generate_positions(
                                             pool,
                                             'risk',
                                             'momentum',
                                             'markowitz_aversion',
                                             max_risk=max_risk), (pool, 'pred')
        # from dask.multiprocessing import get
        # from dask.threaded import get
        from dask.async import get_sync as get
        # get(dsk, [(pool,'pred') for pool in pools])  # executes in parallel
        # results = get(dsk, dsk.keys())
        # Execute (to convert in other formats): dot mydask.dot -Teps > mydask.eps
        import pandas as pd
        jobids = dict(zip(dsk.keys(), get(dsk, dsk.keys())))
        jobids_s = pd.DataFrame(jobids).ix[0]
        assert len(jobids) == 21
        status = runner.get_status(jobids)
        assert status == {
            ('pool3', 'pred'): 'valid',
            ('pool2', 'positions', ('max_risk', 2)): 'invalid',
            ('pool2', 'riskdata'): 'valid',
            ('pool3', 'riskdata'): 'valid',
            ('pool3', 'pricedata'): 'valid',
            ('pool3', 'fundata'): 'valid',
            ('pool2', 'positions', ('max_risk', 0)): 'invalid',
            ('pool1', 'pred'): 'pending',
            ('pool2', 'pred'): 'invalid',
            ('pool1', 'positions', ('max_risk', 1)): 'pending',
            ('pool3', 'positions', ('max_risk', 0)): 'valid',
            ('pool3', 'positions', ('max_risk', 2)): 'valid',
            ('pool2', 'fundata'): 'valid',
            ('pool1', 'positions', ('max_risk', 2)): 'pending',
            ('pool1', 'positions', ('max_risk', 0)): 'pending',
            ('pool1', 'riskdata'): 'pending',
            ('pool2', 'pricedata'): 'valid',
            ('pool1', 'pricedata'): 'valid',
            ('pool1', 'fundata'): 'valid',
            ('pool3', 'positions', ('max_risk', 1)): 'valid',
            ('pool2', 'positions', ('max_risk', 1)): 'invalid'
        }
        # Plot the graph with color corresponding to the status of jobs
        from dask.dot import dot_graph

        # dot_graph(dsk)
        # sdfsdf
        def get_status_dot_attributes(v):
            if v == 'valid':
                return dict(style='filled', color='lightgreen')
            if v == 'invalid':
                return dict(style='filled', color='red')
            if v == 'pending':
                return dict(style='filled', color='lightgrey')

        dot_status = {
            k: get_status_dot_attributes(v)
            for k, v in status.iteritems()
        }
        dot_graph(dsk,
                  filename='dask_graph',
                  format='dot',
                  data_attributes=dot_status,
                  function_attributes=dot_status)
        # dot_graph(dsk, filename='dask_graph', format='png', data_attributes=dot_status, function_attributes=dot_status)
        # dot_graph(dsk, filename='dask_graph', format='pdf', data_attributes=dot_status, function_attributes=dot_status)
        dot_graph(dsk,
                  filename='dask_graph',
                  format='svg',
                  data_attributes=dot_status,
                  function_attributes=dot_status)
Ejemplo n.º 48
0
import dask
import dask.array as da

x = da.random.normal(10, 0.1, size=(2000, 2000), chunks=(100, 100))
result = x.mean()

from dask.dot import dot_graph


dot_graph(result.dask)
Ejemplo n.º 49
0
# Assumed imports for running this snippet standalone; the @profile decorator
# itself is injected by memory_profiler / kernprof when run under the profiler.
import dask.array as da
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from dask.dot import dot_graph


@profile
def dask_compute(dask_array):
    res = dask_array.compute()
    return res

# Random 3D array (drop-in NumPy replacement)
x = da.random.normal(10, 0.1, size=(100, 100, 100), chunks=(50, 50, 50))

# print x.dask

# Squash to 2D (DO not compute!)
mean = x.mean(axis=0)

# print mean.dask

# See the plot!
dot_graph(mean.dask)

res = dask_compute(mean)

# print res.shape

plt.figure()
image = mpimg.imread("mydask.png")
plt.imshow(image)

plt.figure()
plt.imshow(res)
plt.show()
Ejemplo n.º 50
0
Archivo: base.py Proyecto: z7ye/dask-1
def visualize(*args, **kwargs):
    """
    Visualize several dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    dsk : dict(s) or collection(s)
        The dask graph(s) to visualize.
    filename : str or None, optional
        The name of the file to write to disk. If the provided `filename`
        doesn't include an extension, '.png' will be used by default.
        If `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    color : {None, 'order'}, optional
        Options to color nodes.  Provide ``cmap=`` keyword for additional
        colormap
    collapse_outputs : bool, optional
        Whether to collapse output boxes, which often have empty labels.
        Default is False.
    verbose : bool, optional
        Whether to label output and input boxes even if the data aren't chunked.
        Beware: these labels can get very long. Default is False.
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Examples
    --------
    >>> x.visualize(filename='dask.pdf')  # doctest: +SKIP
    >>> x.visualize(filename='dask.pdf', color='order')  # doctest: +SKIP

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See Also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    https://docs.dask.org/en/latest/optimize.html
    """
    from dask.dot import dot_graph

    filename = kwargs.pop("filename", "mydask")
    optimize_graph = kwargs.pop("optimize_graph", False)

    dsks = []
    args3 = []
    for arg in args:
        if isinstance(arg, (list, tuple, set)):
            for a in arg:
                if isinstance(a, Mapping):
                    dsks.append(a)
                if is_dask_collection(a):
                    args3.append(a)
        else:
            if isinstance(arg, Mapping):
                dsks.append(arg)
            if is_dask_collection(arg):
                args3.append(arg)

    dsk = dict(collections_to_dsk(args3, optimize_graph=optimize_graph))
    for d in dsks:
        dsk.update(d)

    color = kwargs.get("color")

    if color == "order":
        from .order import order
        import matplotlib.pyplot as plt

        o = order(dsk)
        try:
            cmap = kwargs.pop("cmap")
        except KeyError:
            cmap = plt.cm.RdBu
        if isinstance(cmap, str):
            import matplotlib.pyplot as plt

            cmap = getattr(plt.cm, cmap)
        mx = max(o.values()) + 1
        colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()}

        kwargs["function_attributes"] = {
            k: {
                "color": v,
                "label": str(o[k])
            }
            for k, v in colors.items()
        }
        kwargs["data_attributes"] = {
            k: {
                "color": v
            }
            for k, v in colors.items()
        }
    elif color:
        raise NotImplementedError("Unknown value color=%s" % color)

    return dot_graph(dsk, filename=filename, **kwargs)
Ejemplo n.º 51
0
def visualize(*args, **kwargs):
    """
    Visualize several dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    dsk : dict(s) or collection(s)
        The dask graph(s) to visualize.
    filename : str or None, optional
        The name (without an extension) of the file to write to disk.  If
        `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    color: {None, 'order'}, optional
        Options to color nodes.  Provide ``cmap=`` keyword for additional
        colormap
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Examples
    --------
    >>> x.visualize(filename='dask.pdf')  # doctest: +SKIP
    >>> x.visualize(filename='dask.pdf', color='order')  # doctest: +SKIP

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See Also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    http://dask.pydata.org/en/latest/optimize.html
    """
    from dask.dot import dot_graph

    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)

    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if is_dask_collection(arg)]

    dsk = collections_to_dsk(args, optimize_graph=optimize_graph)
    for d in dsks:
        dsk.update(d)

    color = kwargs.get('color')

    if color == 'order':
        from .order import order
        import matplotlib.pyplot as plt
        o = order(dsk)
        try:
            cmap = kwargs.pop('cmap')
        except KeyError:
            cmap = plt.cm.RdBu
        if isinstance(cmap, str):
            import matplotlib.pyplot as plt
            cmap = getattr(plt.cm, cmap)
        mx = max(o.values()) + 1
        colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()}

        kwargs['function_attributes'] = {k: {'color': v, 'label': str(o[k])}
                                         for k, v in colors.items()}
        kwargs['data_attributes'] = {k: {'color': v} for k, v in colors.items()}
    elif color:
        raise NotImplementedError("Unknown value color=%s" % color)

    return dot_graph(dsk, filename=filename, **kwargs)
Ejemplo n.º 52
0
 def plot(self, *args, **kwargs):
     self.graph[0] = 'data'
     dot_graph(self.graph)
     del self.graph[0]
Ejemplo n.º 53
0
# - Google Tensorflow - https://www.tensorflow.org/
# - Pytorch / Torch - http://pytorch.org/

# ## Tensor Comprehensions
# Facebook shows an example of why such representations are useful: they allow the operations to be optimized later, yielding massive performance improvements even for _fairly_ basic operations.
#
# ![Comprehensions](https://research.fb.com/wp-content/uploads/2018/02/tc_evol_slower.gif)

# In[14]:

import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((5, 5), chunks=(5, 5))
image_2 = da.ones((5, 5), chunks=(5, 5))
dot_graph(image_1.dask)

# In[15]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)

# In[16]:

image_5 = da.matmul(image_1, image_4)
dot_graph(image_5.dask)

# ## Image Processing
# The initial examples were shown on very simple image problems. Here we can see how it looks for real imaging tasks.

# In[17]:
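
# (The original In[17] cell is not included in this excerpt; the lines below
# are only a hypothetical sketch of a graph for a larger, image-sized array.)

big_image = da.random.random((2048, 2048), chunks=(512, 512))
smoothed = (big_image + big_image.T) / 2.0  # stand-in for a real image filter
dot_graph(smoothed.dask)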
Ejemplo n.º 54
0
def visualize(*args, **kwargs):
    """
    Visualize several dask graphs at once.

    Requires ``graphviz`` to be installed. All options that are not the dask
    graph(s) should be passed as keyword arguments.

    Parameters
    ----------
    dsk : dict(s) or collection(s)
        The dask graph(s) to visualize.
    filename : str or None, optional
        The name (without an extension) of the file to write to disk.  If
        `filename` is None, no file will be written, and we communicate
        with dot using only pipes.
    format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional
        Format in which to write output file.  Default is 'png'.
    optimize_graph : bool, optional
        If True, the graph is optimized before rendering.  Otherwise,
        the graph is displayed as is. Default is False.
    color: {None, 'order'}, optional
        Options to color nodes.  Provide ``cmap=`` keyword for additional
        colormap
    **kwargs
       Additional keyword arguments to forward to ``to_graphviz``.

    Examples
    --------
    >>> x.visualize(filename='dask.pdf')  # doctest: +SKIP
    >>> x.visualize(filename='dask.pdf', color='order')  # doctest: +SKIP

    Returns
    -------
    result : IPython.display.Image, IPython.display.SVG, or None
        See dask.dot.dot_graph for more information.

    See Also
    --------
    dask.dot.dot_graph

    Notes
    -----
    For more information on optimization see here:

    https://docs.dask.org/en/latest/optimize.html
    """
    from dask.dot import dot_graph

    filename = kwargs.pop('filename', 'mydask')
    optimize_graph = kwargs.pop('optimize_graph', False)

    dsks = [arg for arg in args if isinstance(arg, dict)]
    args = [arg for arg in args if is_dask_collection(arg)]

    dsk = collections_to_dsk(args, optimize_graph=optimize_graph)
    for d in dsks:
        dsk.update(d)

    color = kwargs.get('color')

    if color == 'order':
        from .order import order
        import matplotlib.pyplot as plt
        o = order(dsk)
        try:
            cmap = kwargs.pop('cmap')
        except KeyError:
            cmap = plt.cm.RdBu
        if isinstance(cmap, str):
            import matplotlib.pyplot as plt
            cmap = getattr(plt.cm, cmap)
        mx = max(o.values()) + 1
        colors = {k: _colorize(cmap(v / mx, bytes=True)) for k, v in o.items()}

        kwargs['function_attributes'] = {
            k: {
                'color': v,
                'label': str(o[k])
            }
            for k, v in colors.items()
        }
        kwargs['data_attributes'] = {
            k: {
                'color': v
            }
            for k, v in colors.items()
        }
    elif color:
        raise NotImplementedError("Unknown value color=%s" % color)

    return dot_graph(dsk, filename=filename, **kwargs)
Ejemplo n.º 55
0
def feature_engineering(dirpath,
                        channels,
                        IDs=None,
                        recurse=False,
                        overwrite_previous=False,
                        seg_channel="",
                        no_lms=False,
                        no_tfor=False,
                        no_cfor=False,
                        mem_d=3,
                        M=8,
                        save_baselines=True,
                        processes=None,
                        dask_graph_path=None,
                        profiling=False,
                        verbose=False):
    """Extract a series of measurements from segmented images and point clouds.

    This is a dask pipeline that runs the covariate extraction functions in
    `katachi.tools.get_image_covariates` & `katachi.tools.get_cloud_covariates`
    on datasets that have been initialized, segmented and feature-extracted
    using other katachi pipelines.

    WARNING: The approach used here has been developed for the Zebrafish
    posterior lateral line primordium. It is likely not readily applicable to
    other tissues!

    Parameters
    ----------
    dirpath : string
        The path (either local from cwd or global) to the directory with the
        input data to be processed.
    channels : list
        A list of channels from which to extract channel-specific covariates.
        For each channel, a tif file must be present that ends on
        `channel+".tif"` and a .npy file must be present that ends either on
        `channel+"_LMs_TFOR.npy"` (recommended) or on `channel+"_LMs.npy"`.
        The channels will be used as class attributes in the output object and
        therefore must not contain characters incompatible with this use.
    IDs : list of strings or None, optional, default None
        If a list of strings (IDs) is given, only samples within dirpath that
        match this ID will be processed.
    recurse : bool, optional, default False
        If True, files are searched recursively in the subdirs of dirpath.
    overwrite_previous : bool, optional, default False
        If True and a covariate file already exists for a given sample, that
        file will be deleted and a completely new file will be written in its
        place. If False and a covariate file already exists for a given sample,
        the new covariates will be added to it if they have a different name.
        For covariates with identical names, the new will overwrite the old.
    seg_channel : str or "", optional, default ""
        If for some reason the target directories are expected to contain more
        than one file that ends on "_seg.tif", seg_channel can be specified to
        identify the correct target file, which will have the form
        `<basename> + seg_channel + "_seg.tif"`.
        Note that having multiple segmentation files in one target directory is
        deprecated in general.
    no_lms : bool, optional, default False
        If True, it is expected that no landmark data is available. In this
        case, only image covariates are computed.
    no_tfor : bool, optional, default False
        If True, it is expected that no TFOR landmark data is available. In
        this case, untransformed landmarks are loaded and covariates depending
        on TFOR covariates are not computed (specifically pcl_covars_sample and
        pcl_covars_tissue).
    no_cfor : bool, optional, default False
        If True, the CFOR-based moments and baseline will not be computed and
        no CFOR data is required at any point.
    mem_d : int, optional, default 3
        Estimated diameter (in pixels) of the membrane region in the shell of a
        single cell. Used for extraction of intensity-based covariates.
    M : int, optional, default 8
        Highest-level moments to extract from point cloud. The moments array
        constructed will have shape (M+1,M+1,M+1).
    save_baselines : bool, optional, default True
        Whether to save the flattened moments arrays as feature space baselines
        in the form (N_cells, N_features), where N_features is (M+1)**3.
        If True, two files are created for each channel, one for the base
        moments (usually TFOR, unless no_tfor is set to True or no TFOR data is
        available) and one for the PD-transformed (rotationally invariant) and
        volume-normalized cells, suffixed "_baseline.npy" and
        "_volnormPDbaseline.npy", respectively.
    processes : int or None, optional
        Number of processes dask may use for parallel processing. If None, half
        of the available CPUs are used. If set to 1, the entire code is run
        sequentially (dask is not used).
    dask_graph_path : string or None, optional, default None
        If a path (including a file ending matching a known image format, such
        as '.png') is specified as a string, a dask graph image is created that
        shows the constructed dask pipeline.
        Note: The resulting graph may get very large if many samples are used
        at the same time.
    profiling: bool, optional, default False
        If True, dask resource profiling is performed and visualized after the
        pipeline run is finished. This may generate a `profile.html` file in
        the working directory [bug in dask].
    verbose : bool, optional, default False
        If True, more information is printed.
    """

    #--------------------------------------------------------------------------

    ### Get a list of files to run

    if verbose: print "Retrieving matching datasets..."

    # Function to select suitable datasets and create paths
    def prepare_fpaths(fpath, fnames):

        # Keep only those in specified IDs
        if IDs is not None:
            fnames = [
                fname for fname in fnames
                if any([fname.startswith(ID) for ID in IDs])
            ]

        # Find the metadata file
        meta_file = None
        for fname in fnames:
            if fname.endswith("_stack_metadata.pkl"):
                meta_file = fname
                meta_path = os.path.join(fpath, meta_file)

        # Quit if no metadata file is found
        if meta_file is None:
            return None

        # Find segmentation file
        seg_file = [
            fname for fname in fnames
            if fname.endswith(seg_channel + "_seg.tif")
        ]

        # Handle failure cases
        if len(seg_file) == 0:
            return None
        if len(seg_file) > 1:
            raise IOError(
                "More than one segmentation file (*_seg.tif) found " +
                "in directory " + fpath + ". Use seg_channel kwarg to " +
                "specify which file to use.")
        else:
            seg_file = seg_file[0]
            seg_path = os.path.join(fpath, seg_file)

        # Find TFOR segmentation landmarks
        tfor_path = []
        if not no_tfor and not no_lms:

            # Search for the file
            tfor_file = [
                fname for fname in fnames
                if fname.endswith(seg_channel + "_seg_LMs_TFOR.npy")
            ]

            # Give up if nothing is found
            if len(tfor_file) == 0:
                return None

            # Else keep the result
            tfor_file = tfor_file[0]
            tfor_path = os.path.join(fpath, tfor_file)

        # Find channel landmark files
        lm_paths = []
        if not no_lms:
            for channel in channels:

                # Search for TFOR landmarks
                if not no_tfor:
                    lm_file = [
                        fname for fname in fnames
                        if fname.endswith(channel + "_LMs_TFOR.npy")
                    ]
                else:
                    lm_file = []

                # Search for non-TFOR landmarks
                if len(lm_file) == 0:
                    lm_file = [
                        fname for fname in fnames
                        if fname.endswith(channel + "_LMs.npy")
                    ]
                    if not no_tfor:
                        warn("No TFOR landmarks found for channel " + channel +
                             ". " + "Using standard landmarks.")

                # Give up if nothing is found
                if not lm_file:
                    return None

                # Else keep the result
                lm_file = lm_file[0]
                lm_path = os.path.join(fpath, lm_file)
                lm_paths.append(lm_path)

        # Find CFOR-transformed channel landmark files
        cfor_paths = []
        if not no_cfor and not no_lms:
            for channel in channels:

                # Get CFOR landmark paths
                cfor_file = [
                    fname for fname in fnames
                    if channel in fname and fname.endswith('CFOR.npy')
                ][0]
                cfor_path = os.path.join(fpath, cfor_file)
                cfor_paths.append(cfor_path)

        # Find image files
        img_paths = []
        for channel in channels:

            # Search for image files
            img_file = [
                fname for fname in fnames if fname.endswith(channel + ".tif")
            ]

            # Give up if nothing is found
            if not img_file:
                return None

            # Else keep the result
            img_file = img_file[0]
            img_path = os.path.join(fpath, img_file)
            img_paths.append(img_path)

        # Return the paths
        return {
            "meta_path": meta_path,
            "seg_path": seg_path,
            "tfor_path": tfor_path,
            "lm_paths": lm_paths,
            "img_paths": img_paths,
            "cfor_paths": cfor_paths
        }

    # Run for single dir
    if not recurse:
        fnames = os.listdir(dirpath)
        fpaths = prepare_fpaths(dirpath, fnames)
        if fpaths is None:
            raise IOError("The specified path does not contain the required " +
                          "files (and recurse=False).")
        all_paths = [fpaths]

    # Run for multiple subdirs
    if recurse:
        all_paths = []
        for dpath, _, fnames in os.walk(dirpath):
            fpaths = prepare_fpaths(dpath, fnames)
            if fpaths is not None:
                all_paths.append(fpaths)
        if not all_paths:
            raise IOError("Could not find any data directories containing " +
                          "all required files.")

    # Report
    if verbose: print("-- Retrieved", len(all_paths), "matching data sets.")

    #--------------------------------------------------------------------------

    ### If desired: run sequentially (does not use dask/multiprocessing)

    if processes == 1:

        if verbose: print("Processing target files sequentially...")

        # For each dataset...
        for paths in all_paths:

            # Load previously generated covariates file (if available)
            has_previous = False
            if not overwrite_previous:
                mroot, mfile = os.path.split(paths["meta_path"])
                prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
                if os.path.isfile(prevfpath):
                    with open(prevfpath, 'rb') as prevfile:
                        covars = pickle.load(prevfile)
                    has_previous = True

            # Load data
            img_seg = imread(paths["seg_path"])
            if not no_lms and not no_tfor:
                tfor_lms = np.load(paths["tfor_path"])
            with open(paths["meta_path"], 'rb') as metafile:
                meta_dict = pickle.load(metafile)
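
            # Each extraction step below receives the covariates collected so
            # far via the `covars` kwarg and returns an updated object, so the
            # results accumulate into a single structure per dataset.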

            # Extract image covariates
            covars = gic.get_img_covars_sample(
                "_", img_seg=img_seg, covars=covars if has_previous else None)
            covars = gic.get_img_covars_tissue("_",
                                               img_seg=img_seg,
                                               covars=covars)
            covars = gic.get_img_covars_cell_seg("_",
                                                 '_',
                                                 img_seg=img_seg,
                                                 metadata=meta_dict,
                                                 covars=covars)
            for c, channel in enumerate(channels):
                covars = gic.get_img_covars_cell_int("_",
                                                     paths["img_paths"][c],
                                                     channel,
                                                     mem_d,
                                                     img_seg=img_seg,
                                                     covars=covars)

            # Extract point cloud covariates
            if not no_tfor and not no_lms:
                covars = gcc.get_pcl_covars_sample("_",
                                                   "_",
                                                   tfor_lms=tfor_lms,
                                                   metadata=meta_dict,
                                                   covars=covars)
                covars = gcc.get_pcl_covars_tissue("_",
                                                   "_",
                                                   tfor_lms=tfor_lms,
                                                   metadata=meta_dict,
                                                   covars=covars)
            if not no_lms:
                for c, channel in enumerate(channels):
                    covars = gcc.get_pcl_covars_cell(
                        paths["lm_paths"][c],
                        channel,
                        M=M,
                        no_cfor=no_cfor,
                        fpath_lms_cfor=paths["cfor_paths"][c],
                        covars=covars)

                    # Saving the moments as a baseline feature space
                    if save_baselines:

                        # Prep base path
                        bp = paths["lm_paths"][c][:-4]

                        # Save TFOR baseline
                        m = covars.pcl.cell._gad(channel).moments
                        np.save(bp + "_baseline.npy", m)

                        # Save CFOR baseline
                        if not no_cfor:
                            m = covars.pcl.cell._gad(channel).moments_cfor
                            np.save(bp + "_CFORbaseline.npy", m)

            # Saving the extracted covariates
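            # (the first 10 characters of the metadata file name, presumably a
            # dataset ID prefix, are reused to name the output file)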
            mroot, mfile = os.path.split(paths["meta_path"])
            outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
            with open(outfpath, 'wb') as outfile:
                pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL)

        # Report and return
        if verbose: print("Processing complete!")
        return

    #--------------------------------------------------------------------------

    ### Prepare dask dict
    # Note: This is slightly suboptimal because some datasets have to be
    #       reloaded multiple times. However, it seems difficult to solve this
    #       in a way that permits carrying them over.
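    #       In practice this means e.g. the segmentation image at
    #       paths["seg_path"] is re-read by each gic.* task below rather than
    #       being loaded once and shared, as it is in the sequential branch.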

    if verbose: print("Processing target files in parallel...")

    dask_graph = dict()
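    # Each entry maps a task name to a tuple (callable, arg1, arg2, ...);
    # arguments that are themselves keys of this dict (e.g. "merge_results_0")
    # are replaced by that task's output when the scheduler runs the graph.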

    # For each dataset...
    for idx, paths in enumerate(all_paths):

        # Getting previous covariates: function
        def get_previous_covariates(prevfpath):
            with open(prevfpath, 'rb') as prevfile:
                covars = pickle.load(prevfile)
            return covars

        # Get previous covars (if existing and desired)
        has_previous = False
        if not overwrite_previous:
            mroot, mfile = os.path.split(paths["meta_path"])
            prevfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
            if os.path.isfile(prevfpath):
                dask_graph['prev_covars_%i' % idx] = (get_previous_covariates,
                                                      prevfpath)
                has_previous = True

        # Extract image covariates
        dask_graph["img_sample_%i" % idx] = (gic.get_img_covars_sample,
                                             paths["seg_path"])
        dask_graph["img_tissue_%i" % idx] = (gic.get_img_covars_tissue,
                                             paths["seg_path"])
        dask_graph["img_cell_seg_%i" % idx] = (gic.get_img_covars_cell_seg,
                                               paths["seg_path"],
                                               paths["meta_path"])
        for c, channel in enumerate(channels):
            dask_graph["img_cell_int_%s_%i" %
                       (channel, idx)] = (gic.get_img_covars_cell_int,
                                          paths["seg_path"],
                                          paths["img_paths"][c], channel,
                                          mem_d)

        # Extract point cloud covariates
        if not no_tfor and not no_lms:
            dask_graph["pcl_sample_%i" % idx] = (gcc.get_pcl_covars_sample,
                                                 paths["tfor_path"],
                                                 paths["meta_path"])
            dask_graph["pcl_tissue_%i" % idx] = (gcc.get_pcl_covars_tissue,
                                                 paths["tfor_path"],
                                                 paths["meta_path"])
        if not no_lms:
            for c, channel in enumerate(channels):
                dask_graph["pcl_cell_%s_%i" %
                           (channel, idx)] = (gcc.get_pcl_covars_cell,
                                              paths["lm_paths"][c], channel, M,
                                              no_cfor, paths["cfor_paths"][c])

                # Saving the moments as a baseline feature space
                if save_baselines:

                    # Baseline saving function
                    def save_baseline(covars, channel, basepath, no_cfor):

                        # Save TFOR baseline
                        m = covars.pcl.cell._gad(channel).moments
                        np.save(basepath + "_baseline.npy", m)

                        # Save CFOR baseline
                        if not no_cfor:
                            m = covars.pcl.cell._gad(channel).moments_cfor
                            np.save(basepath + "_CFORbaseline.npy", m)

                        # Forward result
                        return covars

                    # Add to graph
                    basepath = paths["lm_paths"][c][:-4]
                    dask_graph["pcl_cell_blsave_%s_%i" %
                               (channel, idx)] = (save_baseline,
                                                  "pcl_cell_%s_%i" %
                                                  (channel, idx), channel,
                                                  basepath, no_cfor)

        # Merging the extracted covariates: function
        def merge_covariates(covars_list):
            covars = covars_list[0]
            for cv in covars_list[1:]:
                covars._merge(cv)
            return covars

        # Merging the extracted covariates: input name list construction
        covars_list = [
            "img_sample_%i" % idx,
            "img_tissue_%i" % idx,
            "img_cell_seg_%i" % idx
        ]
        covars_list += [
            "img_cell_int_%s_%i" % (channel, idx) for channel in channels
        ]
        if not no_tfor and not no_lms:
            covars_list += ["pcl_sample_%i" % idx, "pcl_tissue_%i" % idx]
        if save_baselines and not no_lms:
            covars_list += [
                "pcl_cell_blsave_%s_%i" % (channel, idx)
                for channel in channels
            ]
        elif not no_lms:
            covars_list += [
                "pcl_cell_%s_%i" % (channel, idx) for channel in channels
            ]
        if has_previous:
            covars_list += ['prev_covars_%i' % idx]

        # Merging the extracted covariates: dask call
        dask_graph["merge_results_%i" % idx] = (merge_covariates, covars_list)

        # Saving the extracted covariates
        def save_covariates(covars, outfpath):
            with open(outfpath, 'wb') as outfile:
                pickle.dump(covars, outfile, pickle.HIGHEST_PROTOCOL)

        mroot, mfile = os.path.split(paths["meta_path"])
        outfpath = os.path.join(mroot, mfile[:10] + "_covariates.pkl")
        dask_graph["save_results_%i" % idx] = (save_covariates,
                                               "merge_results_%i" % idx,
                                               outfpath)

    # Collecting the results
    dask_graph['done'] = (lambda x: "done", [
        "save_results_%i" % idx for idx in range(len(all_paths))
    ])
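    # Requesting 'done' from the scheduler forces every "save_results_*" task
    # (and hence each dataset's full pipeline) to execute; the collector
    # itself just returns the string "done".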

    # Saving the graph visualization
    if dask_graph_path is not None:
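        # Note: dot_graph requires both the graphviz system library and its
        # python bindings to be installed.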
        from dask.dot import dot_graph
        dot_graph(dask_graph, filename=dask_graph_path)

    #--------------------------------------------------------------------------

    ### Run in parallel (with dask)

    # If necessary: choose number of threads (half of available cores)
    if processes is None:
        processes = cpu_count() // 2

    # Set number of threads
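    # Note: dask.set_options is the older dask configuration API; newer
    # releases expose the same setting through dask.config.set.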
    dask.set_options(pool=ThreadPool(processes))

    # Run the pipeline (no profiling)
    if not profiling:
        with ProgressBar(dt=1):
            dask.threaded.get(dask_graph, 'done')

    # Run the pipeline (with resource profiling)
    if profiling:
        with ProgressBar(dt=1):
            with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof:
                dask.threaded.get(dask_graph, 'done')
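            # Render the profiling timelines as a bokeh plot; save=False
            # prevents visualize() from also writing an HTML file to disk.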
            visualize([prof, rprof], save=False)

    # Report and return
    if verbose: print("Processing complete!")
    return