Example #1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 17:18:08 2019

@author: cliffk
"""

from dask import compute, delayed
import pylab as pl
import sciris as sc

from distributed import LocalCluster
c = LocalCluster(processes=False)

print(c.scheduler)
print(c.workers)

inputs = [0, 1, 2, 3]


def process(data):
    pl.seed(data)
    output = 0
    for i in pl.arange(1e6):
        this = pl.randn()
        # print('%s: %s' % (i, this))
        output += this
    return output
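
The snippet above imports compute and delayed but stops before any work is dispatched. A minimal sketch of how process and inputs would typically be fed to the cluster created above (the Client attachment and the variable names below are assumptions, not part of the original):

# Hypothetical continuation: attach a Client so dask routes work to the cluster,
# then wrap `process` with dask.delayed and evaluate it over `inputs`.
from distributed import Client
client = Client(c)

tasks = [delayed(process)(value) for value in inputs]
results = compute(*tasks)
print(results)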

Example #2
def client():
    with Client(LocalCluster(n_workers=2)) as client:
        yield (client)
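
The fragment above has neither imports nor a decorator; it reads like a pytest fixture. A self-contained sketch under that assumption (the decorator, imports, and the example test are additions, not part of the original):

import pytest
from distributed import Client, LocalCluster

@pytest.fixture
def client():
    with Client(LocalCluster(n_workers=2)) as client:
        yield client

def test_cluster_has_workers(client):
    # Hypothetical test that consumes the fixture.
    assert len(client.scheduler_info()["workers"]) == 2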
Example #3
import os
import pandas as pd
import argparse
from dask.distributed import Client, LocalCluster

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()

    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name), sep='\t').transpose()

    cluster = LocalCluster()
    client = Client(cluster)
    print('here')
    network = grnboost2(expression_data=ex_matrix.to_numpy(), gene_names=ex_matrix.columns, client_or_address=client)
    network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name), sep='\t', header=True, index=False)
    client.close()
    cluster.close()
Example #4
 def __init__(self, n_cores=None):
     if n_cores is None:
         n_cores = psutil.cpu_count() - 2
     self.cluster = LocalCluster(processes=True, n_workers=1)
     self.client = Client(self.cluster)
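
This is a method lifted out of a class definition. A self-contained version with the imports it relies on might look like the sketch below (the class name is hypothetical):

import psutil
from distributed import Client, LocalCluster

class DaskWorkerPool:  # hypothetical name for the enclosing class
    def __init__(self, n_cores=None):
        if n_cores is None:
            n_cores = psutil.cpu_count() - 2
        # Note: as in the original, n_cores is computed but a single worker is started.
        self.cluster = LocalCluster(processes=True, n_workers=1)
        self.client = Client(self.cluster)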
Example #5
def _simulate_tn(circuit: any, initial_state: any, final_state: any,
                 optimize: any, backend: any, complex_type: any,
                 tensor_only: bool, verbose: bool, **kwargs):
    import quimb.tensor as tn
    import cotengra as ctg

    # Get random leaves_prefix
    leaves_prefix = ''.join(
        np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), size=20))

    # Initialize info
    _sim_info = {}

    # Alias for tn
    if optimize == 'tn':
        optimize = 'cotengra'

    if isinstance(circuit, Circuit):

        # Get number of qubits
        qubits = circuit.all_qubits()
        n_qubits = len(qubits)

        # If initial/final state is None, set to all .'s
        initial_state = '.' * n_qubits if initial_state is None else initial_state
        final_state = '.' * n_qubits if final_state is None else final_state

        # Initial and final states must be valid strings
        for state, sname in [(initial_state, 'initial_state'),
                             (final_state, 'final_state')]:
            # Get alphabet
            from string import ascii_letters

            # Check if string
            if not isinstance(state, str):
                raise ValueError(f"'{sname}' must be a valid string.")

            # Deprecated error
            if any(x in 'xX' for x in state):
                from hybridq.utils import DeprecationWarning
                from warnings import warn

                # Warn the user that '.' is used to represent open qubits
                warn(
                    "Since '0.6.3', letters in the alphabet are used to "
                    "trace selected qubits (including 'x' and 'X'). "
                    "Instead, '.' is used to represent an open qubit.",
                    DeprecationWarning)

            # Check only valid symbols are present
            if set(state).difference('01+-.' + ascii_letters):
                raise ValueError(f"'{sname}' contains invalid symbols.")

            # Check number of qubits
            if len(state) != n_qubits:
                raise ValueError(f"'{sname}' has the wrong number of qubits "
                                 f"(expected {n_qubits}, got {len(state)})")

        # Check memory
        if 2**(initial_state.count('.') +
               final_state.count('.')) > kwargs['max_largest_intermediate']:
            raise MemoryError("Memory for the given number of open qubits "
                              "exceeds the 'max_largest_intermediate'.")

        # Compress circuit
        if kwargs['compress']:
            if verbose:
                print(
                    f"Compress circuit (max_n_qubits={kwargs['compress']}): ",
                    end='',
                    file=stderr)
                _time = time()

            circuit = utils.compress(
                circuit,
                kwargs['compress']['max_n_qubits'] if isinstance(
                    kwargs['compress'], dict) else kwargs['compress'],
                verbose=verbose,
                **({
                    k: v
                    for k, v in kwargs['compress'].items()
                    if k != 'max_n_qubits'
                } if isinstance(kwargs['compress'], dict) else {}))

            circuit = Circuit(
                utils.to_matrix_gate(c, complex_type=complex_type)
                for c in circuit)
            if verbose:
                print(f"Done! ({time()-_time:1.2f}s)", file=stderr)

        # Get tensor network representation of circuit
        tensor, tn_qubits_map = utils.to_tn(circuit,
                                            return_qubits_map=True,
                                            leaves_prefix=leaves_prefix)

        # Define basic MPS
        _mps = {
            '0': np.array([1, 0]),
            '1': np.array([0, 1]),
            '+': np.array([1, 1]) / np.sqrt(2),
            '-': np.array([1, -1]) / np.sqrt(2)
        }

        # Attach initial/final state
        for state, ext in [(initial_state, 'i'), (final_state, 'f')]:
            for s, q in ((s, q) for s, q in zip(state, qubits) if s in _mps):
                inds = [f'{leaves_prefix}_{tn_qubits_map[q]}_{ext}']
                tensor &= tn.Tensor(_mps[s], inds=inds, tags=inds)

        # For each unique letter, apply trace
        for x in set(initial_state + final_state).difference(''.join(_mps) +
                                                             '.'):
            # Get indexes
            inds = [
                f'{leaves_prefix}_{tn_qubits_map[q]}_i'
                for s, q in zip(initial_state, qubits) if s == x
            ]
            inds += [
                f'{leaves_prefix}_{tn_qubits_map[q]}_f'
                for s, q in zip(final_state, qubits) if s == x
            ]

            # Apply trace
            tensor &= tn.Tensor(np.reshape([1] + [0] * (2**len(inds) - 2) +
                                           [1], (2, ) * len(inds)),
                                inds=inds)

        # Simplify if requested
        if kwargs['simplify_tn']:
            tensor.full_simplify_(kwargs['simplify_tn']).astype_(complex_type)
        else:
            # Otherwise, just convert to the given complex_type
            tensor.astype_(complex_type)

        # Get contraction from heuristic
        if optimize == 'cotengra' and kwargs['max_iterations'] > 0:

            # Create local client if MPI has been detected (not compatible with Dask at the moment)
            if _mpi_env and kwargs['parallel']:

                from distributed import Client, LocalCluster
                _client = Client(LocalCluster(processes=False))

            else:

                _client = None

            # Set cotengra parameters
            cotengra_params = lambda: ctg.HyperOptimizer(
                methods=kwargs['methods'],
                max_time=kwargs['max_time'],
                max_repeats=kwargs['max_repeats'],
                minimize=kwargs['minimize'],
                progbar=verbose,
                parallel=kwargs['parallel'],
                **kwargs['cotengra'])

            # Get optimized path
            opt = cotengra_params()
            info = tensor.contract(all, optimize=opt, get='path-info')

            # Get target size
            tli = kwargs['target_largest_intermediate']

            # Repeat for the requested number of iterations
            for _ in range(1, kwargs['max_iterations']):

                # Break if largest intermediate is equal or smaller than target
                if info.largest_intermediate <= tli:
                    break

                # Otherwise, restart
                _opt = cotengra_params()
                _info = tensor.contract(all, optimize=_opt, get='path-info')

                # Store the best
                if kwargs['minimize'] == 'size':

                    if _info.largest_intermediate < info.largest_intermediate or (
                            _info.largest_intermediate
                            == info.largest_intermediate
                            and _opt.best['flops'] < opt.best['flops']):
                        info = _info
                        opt = _opt

                else:

                    if _opt.best['flops'] < opt.best['flops'] or (
                            _opt.best['flops'] == opt.best['flops']
                            and _info.largest_intermediate <
                            info.largest_intermediate):
                        info = _info
                        opt = _opt

            # Close client if exists
            if _client:

                _client.shutdown()
                _client.close()

        # Just return tensor if required
        if tensor_only:
            if optimize == 'cotengra' and kwargs['max_iterations'] > 0:
                return tensor, (info, opt)
            else:
                return tensor

    else:

        # Set tensor
        tensor = circuit

        if len(optimize) == 2 and isinstance(
                optimize[0], PathInfo) and isinstance(
                    optimize[1], ctg.hyper.HyperOptimizer):

            # Get info and opt from optimize
            info, opt = optimize

            # Set optimization
            optimize = 'cotengra'

        else:

            # Get tensor and path
            tensor = circuit

    # Print some info
    if verbose:
        print(
            f'Largest Intermediate: 2^{np.log2(float(info.largest_intermediate)):1.2f}',
            file=stderr)
        print(
            f'Max Largest Intermediate: 2^{np.log2(float(kwargs["max_largest_intermediate"])):1.2f}',
            file=stderr)
        print(f'Flops: 2^{np.log2(float(info.opt_cost)):1.2f}', file=stderr)

    if optimize == 'cotengra':

        # Get indexes
        _inds = tensor.outer_inds()

        # Get input indexes and output indexes
        _i_inds = sort([x for x in _inds if x[-2:] == '_i'],
                       key=lambda x: int(x.split('_')[1]))
        _f_inds = sort([x for x in _inds if x[-2:] == '_f'],
                       key=lambda x: int(x.split('_')[1]))

        # Get order
        _inds = [_inds.index(x) for x in _i_inds + _f_inds]

        # Get slice finder
        sf = ctg.SliceFinder(info,
                             target_size=kwargs['max_largest_intermediate'])

        # Find slices
        with tqdm(kwargs['temperatures'], disable=not verbose,
                  leave=False) as pbar:
            for _temp in pbar:
                pbar.set_description(f'Find slices (T={_temp})')
                ix_sl, cost_sl = sf.search(temperature=_temp)

        # Get slice contractor
        sc = sf.SlicedContractor([t.data for t in tensor])

        # Update infos
        _sim_info.update({
            'flops': info.opt_cost,
            'largest_intermediate': info.largest_intermediate,
            'n_slices': cost_sl.nslices,
            'total_flops': cost_sl.total_flops
        })

        # Print some infos
        if verbose:
            print(
                f'Number of slices: 2^{np.log2(float(cost_sl.nslices)):1.2f}',
                file=stderr)
            print(f'Flops+Cuts: 2^{np.log2(float(cost_sl.total_flops)):1.2f}',
                  file=stderr)

        if kwargs['max_n_slices'] and sc.nslices > kwargs['max_n_slices']:
            raise RuntimeError(
                f'Too many slices ({sc.nslices} > {kwargs["max_n_slices"]})')

        # Contract tensor
        _li = np.log2(float(info.largest_intermediate))
        _mli = np.log2(float(kwargs["max_largest_intermediate"]))
        _tensor = sc.gather_slices((sc.contract_slice(
            i, backend=backend
        ) for i in tqdm(
            range(sc.nslices),
            desc=f'Contracting tensor (li=2^{_li:1.0f}, mli=2^{_mli:1.1f})',
            leave=False)))

        # Create map
        _map = ''.join([get_symbol(x) for x in range(len(_inds))])
        _map += '->'
        _map += ''.join([get_symbol(x) for x in _inds])

        # Reorder tensor
        tensor = contract(_map, _tensor)

        # Deprecated
        ## Reshape tensor
        #if _inds:
        #    if _i_inds and _f_inds:
        #        tensor = np.reshape(tensor, (2**len(_i_inds), 2**len(_f_inds)))
        #    else:
        #        tensor = np.reshape(tensor,
        #                            (2**max(len(_i_inds), len(_f_inds)),))

    else:

        # Contract tensor
        tensor = tensor.contract(optimize=optimize, backend=backend)

        if hasattr(tensor, 'inds'):

            # Get input indexes and output indexes
            _i_inds = sort([x for x in tensor.inds if x[-2:] == '_i'],
                           key=lambda x: int(x.split('_')[1]))
            _f_inds = sort([x for x in tensor.inds if x[-2:] == '_f'],
                           key=lambda x: int(x.split('_')[1]))

            # Transpose tensor
            tensor.transpose(*(_i_inds + _f_inds), inplace=True)

            # Deprecated
            ## Reshape tensor
            #if _i_inds and _f_inds:
            #    tensor = np.reshape(tensor, (2**len(_i_inds), 2**len(_f_inds)))
            #else:
            #    tensor = np.reshape(tensor,
            #                        (2**max(len(_i_inds), len(_f_inds)),))

    if kwargs['return_info']:
        return tensor, _sim_info
    else:
        return tensor
Example #6
def tall_clutter(files,
                 config,
                 clutter_thresh_min=0.0002,
                 clutter_thresh_max=0.25,
                 radius=1,
                 max_height=2000.,
                 write_radar=True,
                 out_file=None,
                 use_dask=False):
    """
    Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for the clutter calculation.
    config : str
        String representing the configuration for the radar.
        Such possible configurations are listed in default_config.py

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold value above which a clutter value is considered clutter,
        as long as it is also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold value below which a clutter value is considered clutter,
        as long as it is also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will
        be also flagged as clutter.
    max_height: float
        Maximum height above the radar to mark a gate as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        String of the location and filename to write the radar object to,
        if write_radar is True.
    use_dask : bool
        Use dask instead of running stats for the calculation. This will
        reduce run time.

    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated.
        This radar only has the clutter field, but maintains all
        other radar specifications.

    """
    field_names = get_field_names(config)
    refl_field = field_names["reflectivity"]
    vel_field = field_names["velocity"]
    ncp_field = field_names["normalized_coherent_power"]

    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(
                file, include_fields=[refl_field, ncp_field, vel_field])
            reflect_array = deepcopy(radar.fields[refl_field]['data'])
            ncp = radar.fields[ncp_field]['data']
            height = radar.gate_z["data"]
            up_in_the_air = height > max_height
            the_mask = np.logical_or.reduce(
                (ncp < 0.9, reflect_array.mask, up_in_the_air))
            reflect_array = np.ma.masked_where(the_mask, reflect_array)
            del radar
            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except (TypeError, OSError):
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields[refl_field]['data']
                ncp = deepcopy(radar.fields[ncp_field]['data'])
                height = radar.gate_z["data"]
                reflect_array = np.ma.masked_where(
                    np.logical_or(height > max_height, ncp < 0.8),
                    reflect_array)

                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                    run_stats.push(reflect_array)
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except (TypeError, OSError):
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values_no_mask = clutter_values.filled(clutter_thresh_max + 1)
    else:
        cluster = LocalCluster(n_workers=20, processes=True)
        client = Client(cluster)
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields[refl_field]['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except (TypeError, OSError):
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [
            delayed(get_reflect_array)(file, first_shape) for file in files
        ]
        array = [
            da.from_delayed(a, shape=first_shape, dtype=float) for a in arrays
        ]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        count = np.array(da.sum(da.isfinite(array), axis=0))
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values = np.ma.masked_where(
            np.logical_or(clutter_values.mask, count < 20), clutter_values)
        # Masked arrays can suck
        clutter_values_no_mask = clutter_values.filled(
            (clutter_thresh_max + 1))

    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and.reduce((
            clutter_values_no_mask > clutter_thresh_min,
            clutter_values_no_mask < clutter_thresh_max,
        )))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_array = clutter_array.filled(0)
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_value_dict = _clutter_to_dict(clutter_values)
    clutter_value_dict["long_name"] = "Clutter value (std. dev/mean Z)"
    clutter_radar.add_field('ground_clutter',
                            clutter_dict,
                            replace_existing=True)
    clutter_radar.add_field('clutter_value',
                            clutter_value_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
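
A minimal call of tall_clutter following the docstring above; the file glob, config name, and output path are placeholders, not values taken from the original:

import glob

radar_files = sorted(glob.glob('/data/radar/*.nc'))   # placeholder input files
tall_clutter(radar_files, 'some_radar_config',        # placeholder config name
             clutter_thresh_min=0.0002,
             clutter_thresh_max=0.25,
             radius=1,
             max_height=2000.,
             write_radar=True,
             out_file='/data/radar/clutter_radar.nc',  # placeholder output path
             use_dask=True)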
Example #7
def test_list():
    with LocalCluster(name="testcluster", scheduler_port=8786) as _:
        output = check_output(["daskctl", "list"])
        assert b"ProxyCluster" in output
        assert b"Running" in output
Example #8
    def run(
        self,
        dataset: str,
        include_raw: bool = False,
        batch_size: Optional[int] = None,
        distributed: bool = False,
        n_workers: int = 10,
        worker_cpu: int = 8,
        worker_mem: str = "120GB",
        overwrite: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        dataset: str
            The dataset to use for the pipeline.

        include_raw: bool
            A boolean option to determine if the raw data should be included in the
            Quilt package.
            Default: False (Do not include the raw data)

        batch_size: Optional[int]
            An optional batch size to provide to each step for processing their items.
            Default: None (auto batch size depending on CPU / threads available)

        distributed: bool
            A boolean option to determine if the jobs should be distributed to a SLURM
            cluster when possible.
            Default: False (Do not distribute)

        n_workers: int
            Number of workers to request (when distributed is enabled).
            Default: 10

        worker_cpu: int
            Number of cores to provide per worker (when distributed is enabled).
            Default: 8

        worker_mem: str
            Amount of memory to provide per worker (when distributed is enabled).
            Default: 120GB

        overwrite: bool
            If this pipeline has already partially or completely run, should it
            overwrite the previous files or not.
            Default: False (Do not overwrite or regenerate files)

        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc. Additionally, if debug is True, any mapped
            operation will run on threads instead of processes.
            Default: False (Do not debug)
        """
        # Initialize steps
        raw = steps.Raw()
        standardize_fov_array = steps.StandardizeFOVArray()
        single_cell_features = steps.SingleCellFeatures()
        single_cell_images = steps.SingleCellImages()
        diagnostic_sheets = steps.DiagnosticSheets()

        # Cluster / distributed defaults
        distributed_executor_address = None

        # Choose executor
        if debug:
            exe = LocalExecutor()
            log.info("Debug flagged. Will use threads instead of Dask.")
        else:
            if distributed:
                # Create or get log dir
                # Do not include ms
                log_dir_name = datetime.now().isoformat().split(".")[0]
                log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
                # Log dir settings
                log_dir.mkdir(parents=True, exist_ok=True)

                # Create cluster
                log.info("Creating SLURMCluster")
                cluster = SLURMCluster(
                    cores=worker_cpu,
                    memory=worker_mem,
                    queue="aics_cpu_general",
                    walltime="10:00:00",
                    local_directory=str(log_dir),
                    log_directory=str(log_dir),
                )

                # Spawn workers
                cluster.scale(n_workers)
                log.info("Created SLURMCluster")

                # Use the port from the created connector to set executor address
                distributed_executor_address = cluster.scheduler_address

                # Only auto batch size if it is not None
                if batch_size is None:
                    # Batch size is n_workers * worker_cpu * 0.75
                    # We could just do n_workers * worker_cpu but 3/4 of that is safer
                    batch_size = int(n_workers * worker_cpu * 0.75)

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")
            else:
                # Create local cluster
                log.info("Creating LocalCluster")
                cluster = LocalCluster()
                log.info("Created LocalCluster")

                # Set distributed_executor_address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        # Configure your flow
        with Flow("actk") as flow:
            if include_raw:
                dataset = raw(dataset, **kwargs)

            standardized_fov_paths_dataset = standardize_fov_array(
                dataset=dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--desired_pixel_sizes [{float},{float},{float}]`
                **kwargs,
            )

            single_cell_features_dataset = single_cell_features(
                dataset=standardized_fov_paths_dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--cell_ceiling_adjustment {int}`
                **kwargs,
            )

            single_cell_images_dataset = single_cell_images(
                dataset=single_cell_features_dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--cell_ceiling_adjustment {int}`
                **kwargs,
            )

            diagnostic_sheets(
                dataset=single_cell_images_dataset,
                distributed_executor_address=distributed_executor_address,
                overwrite=overwrite,
                # Allows us to pass `--metadata {str}`,
                # `--feature {str}'`
                **kwargs,
            )

        # Run flow and get ending state, log duration
        start = datetime.now()
        state = flow.run(executor=exe)
        duration = datetime.now() - start
        log.info(f"Total duration of pipeline: "
                 f"{duration.seconds // 60 // 60}:"
                 f"{duration.seconds // 60}:"
                 f"{duration.seconds % 60}")

        # Get and display any outputs you want to see on your local terminal
        log.info(single_cell_images_dataset.get_result(state, flow))
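
Assuming this run method sits on a pipeline class (the class itself is not shown here), a driver call might look like the following sketch; the class name and dataset path are assumptions:

# Hypothetical driver; `Pipeline` stands in for whatever class defines `run`.
pipeline = Pipeline()
pipeline.run(
    dataset="path/to/dataset.csv",  # placeholder dataset reference
    distributed=True,
    n_workers=10,
    worker_cpu=8,
    worker_mem="120GB",
)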
Example #9
import pyart
from netCDF4 import Dataset
import numpy as np
from datetime import datetime, timedelta
from copy import deepcopy
import glob
import math
import dask.array as da
from distributed import Client, LocalCluster
from dask import delayed, compute
import time
import sys
from scipy import ndimage

# Start a cluster with x workers
cluster = LocalCluster(n_workers=int(sys.argv[1]))
client = Client(cluster)

# Input the range of dates and time wanted for the collection of images
start_year = 2006
start_day = 1
start_month = 1
start_hour = 1
start_minute = 0
start_second = 0

end_year = 2006
end_month = 3
end_day = 1
end_hour = 0
end_minute = 0
Example #10
    folder = to_convert[int(folder_num) -
                        1].rpartition('/')[0].rpartition(visit)[2][1:]
    try:
        save_location = os.path.join('/dls', beamline, 'data', year, visit,
                                     'processing', folder)
        if not os.path.exists(save_location):
            os.makedirs(save_location)
        watch_convert(beamline, year, visit, folder)

    except Exception as e:
        print('** ERROR processing** \n ', e)


if __name__ == "__main__":
    from distributed import Client, LocalCluster
    cluster = LocalCluster(n_workers=20, memory_limit=100e9)
    client = Client(cluster)
    parser = argparse.ArgumentParser()
    parser.add_argument('beamline', help='Beamline name')
    parser.add_argument('year', help='Year')
    parser.add_argument('visit', help='Session visit code')
    parser.add_argument('folder',
                        nargs='?',
                        default=None,
                        help='OPTION to add a specific folder within a visit \
                        to look for data, e.g. sample1/dataset1/. If None the assumption would be to look in Merlin folder'
                        )
    parser.add_argument('folder_num', nargs='?', help='passed by scheduler')
    v_help = "Display all debug log messages"
    parser.add_argument("-v",
                        "--verbose",
Example #11
def test_empty_dmatrix_hist():
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'hist'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
Example #12
def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the external link checker.
    If a list of spreadsheet ids are provided, run the external link checker
    against the list of spreadsheet ids, instead of the spreadsheet ids gathered
    from the master spreadsheet.

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    spreadsheet_ids_str: Optional[str]
        The list of spreadsheet ids, delimited by commas.
    """
    log.info("Finished external link checker set up, start checking external link.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Check external links") as flow:
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )

        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        # Extract links from list of SheetData
        # Get back list of list of URLData
        links_data = _extract_external_links.map(flatten(spreadsheets_data))
        # Unique the url data
        unique_links_data = _unique_external_links(flatten(links_data))
        # Check external links
        _check_external_link.map(unique_links_data)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)
    # Get error links
    error_links = [link for link in checked_links if link.has_error]
    gs_cells = []
    for error_link in error_links:
        for cell in error_link.url_data.cells:
            gs_cells.append(
                GoogleSheetCell(
                    spreadsheet_title=cell.spreadsheet_title,
                    sheet_title=cell.sheet_title,
                    row_index=cell.row_index,
                    col_index=cell.col_index,
                    url=error_link.url_data.url,
                    msg=error_link.msg,
                )
            )

    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )
    # Write error links to a csv file
    with open("external_links.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
Example #13
def test_rabit_ops():
    from distributed import Client, LocalCluster
    n_workers = 3
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            run_rabit_ops(client, n_workers)
Example #14
"""
WSGI config for AutoOut project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""

import math
import multiprocessing
import os

from distributed import LocalCluster, Client
from django.core.wsgi import get_wsgi_application
from psutil import virtual_memory

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'AutoOut.settings')

# Start dask cluster
no_cpus = multiprocessing.cpu_count()
threads_per_worker = 2
no_workers = math.floor((no_cpus-2)/threads_per_worker)

mem = virtual_memory()

c = LocalCluster(processes=False, n_workers=no_workers, threads_per_worker=threads_per_worker,
                 memory_limit=mem.free/no_workers)
dask_client = Client(c)

application = get_wsgi_application()
Example #15
def test_empty_dmatrix_approx() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'approx'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
Example #16
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a
    processing folder. The input parameters are passed using argparse.

    Parameters:
    -----------
    
    scheduler: string
        tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). 
        default = False. If False the process will run on the local computer using nCPUs-1

    path: string
        Path to the processing directory


    """


    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()
    
    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler
    
    if scheduler_address:
        # Start dask client on server or cluster
        client=Client(scheduler_address)

    else:
        # Start dask client on local machine. It will use all the available
        # cores -1

        # number of cores to use
        ncores = multiprocessing.cpu_count()-1
        cluster = LocalCluster(n_workers=ncores)
        client=Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for the
    # analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check trailing slash in the processing directory
    processing_directory=utils.check_trailing_slash(processing_directory,os_windows)

    # Get a list of the hybridization to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ]

    for processing_hyb in processing_hyb_list:
    
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash
        
        # Parse the Experimental metadata file (serial)
        experiment_infos,image_properties, hybridizations_infos, \
        converted_positions, microscope_parameters =\
        utils.experimental_metadata_parser(hyb_dir)
        
        # Parse the configuration file 
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
        
        
        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\
                    hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the file to process in a list which order match the gene_list for
        # parallel processing
        organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f  ]
        organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f  ]

        # Each .nd2 file will be processed in a worker part of a different node
        # Get the addresses of one process/node to use for conversion
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()]

        # Run the conversion
        futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list,
                                    tmp_gene_dirs,processing_hyb=processing_hyb,
                                    use_ram=flt_rawcnt_config['use_ram'],
                                    max_ram=flt_rawcnt_config['max_ram'],
                                    workers=workers_conversion)
        client.gather(futures_processes)

        

        # ---------------------------------------------------------------------
        
        
        # ----------------- FILTERING AND RAW COUNTING ------------------------
        
        # Create directories 

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                            suffix,add_slash,flt_rawcnt_config['skip_tags_counting'],
                            flt_rawcnt_config['skip_genes_counting'],
                            analysis_name=flt_rawcnt_config['analysis_name'])


        if flt_rawcnt_config['illumination_correction']:

            # Create the directory where to save the counting
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                                                suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate illumination
            for gene in hybridizations_infos[hybridization].keys():
                
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')

                logger.debug('Create average image for gene %s', gene)

                # Chunking the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks)

                # Scatter the images sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all/float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()

                logger.debug('Create illumination function for gene %s',gene)
                # Create illumination function
                Illumination=filters.gaussian(ImgMean,sigma=(20,300,300))

                # Normalization of the illumination
                Illumination_flat=np.amax(Illumination,axis=0)
                Illumination_norm=Illumination_flat/np.amax(Illumination_flat)

                logger.debug('Save illumination function for gene %s',gene)
                # Save the illumination function
                illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0]
                illumination_fname=illumination_path+gene+'_illumination_func.npy'
                np.save(illumination_fname,Illumination_norm,allow_pickle=False)  

                # Broadcast the illumination function to all the cores
                client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s',gene)
                # Filtering and counting
                futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \
                                illumination_function=Illumination_norm,\
                                filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\
                                filtered_img_gene_dirs =filtered_img_gene_dirs,\
                                counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \
                                min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\
                                skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
               

        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')
                # filtering
                logger.debug('Filtering without illumination correction %s',gene)

                futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \
                                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \
                                        filtered_img_gene_dirs=filtered_img_gene_dirs, \
                                        counting_gene_dirs=counting_gene_dirs, \
                                        plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\
                                        stringency=flt_rawcnt_config['stringency'],\
                                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                client.gather(futures_processes)
                
        # ---------------------------------------------------------------------
        
        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filter data in one single .ppf for each hybridization
        # # This step will run in serial mode and will not need to shuffle data
        # #  between cores because everything is on the common file system

        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,
        #                                 hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')
        # # Load and write the .npy tmp images into the hdf5 file

        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():

        #         logger.debug('Writing %s images in .ppf.hdf5',gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        #         # loop through the list of file
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file)
        #             f_hdl.flush()
        
        # # ---------------------------------------------------------------------
        
        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D need more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlapping between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped

        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending
        # # can be 'linear' or 'non linear'
        # # The methods that performs the best is the 'non linear'

        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        # # Create pointer of the hdf5 file that will store the stitched reference image
        # # for the current hybridization
        # # Writing
        # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb
        # data_name   = (tile_file_base_name
        #                 + '_' + reference_gene
        #                 + '_stitching_data')

        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest')  # replace with 'a' as soon as you fix the error


        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization,
        #                         est_overlap = est_overlap, y_flip = False, nr_dim = 2)



        # # Align the tiles 
        # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples,
        #                             filtered_files_list=filtered_files_list,micData=micData, 
        #                         nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)


        # # In this case the order of the returned contingency tuples is with
        # # the order of the input contig_tuples

        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all =[data_single[0] for data_single in data ]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #             'covs': covs_all}


        # # Calculates a shift in global coordinates for each tile (global
        # # alignment) and then applies these shifts to the  corner coordinates
        # # of each tile and returns and saves these shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples,
        #                                             micData, nr_pixels, z_count,
        #                                             alignment, data_name,
        #                                             nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend =  hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels,
        #                                 reference_gene, blend = 'non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask'])

            
        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0]


        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value,corner_coords in joining['corner_list']:
        #     if not(np.isnan(corner_coords[0])):
        #         cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #                             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]

        #         fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value)
        #         np.save(fname,cur_mask)


        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'],
        #                             stitching_files_dir = stitching_files_dir,
        #                             blended_tiles_directory = blended_tiles_directory,
        #                             masked_tiles_directory = masked_tiles_directory,
        #                             analysis_name = flt_rawcnt_config['analysis_name'],
        #                             processing_hyb = processing_hyb,reference_gene = reference_gene,
        #                             micData = micData,tiles = tiles,nr_pixels=nr_pixels,
        #                             linear_blending=linear_blending)



        # _ = client.gather(futures_processes)


        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()


        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Don't delete the *.npy files here because can be used to 
        # create the final images using the apply stitching related function

    client.close()
Example #17
def test_aft_survival() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            run_aft_survival(client, DaskDMatrix)
Example #18
    def run(
        self,
        distributed: bool = False,
        clean: bool = False,
        debug: bool = False,
        structs: list = ["Nuc"],
        flow_viz: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.
        Parameters
        ----------
        distributed: bool
            A boolean option to determine if the jobs should be distributed to a remote
            cluster when possible.
            Default: False (Do not distribute)
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)
        structs: List
            List of structure data to run pipeline on. Currently, only
            'Nuc' (nuclear membrane) and 'Cell' (cell membrane) are supported.
        flow_viz: bool
            Make flow chart to visualize pipeline - requires conda install of graphviz.

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/
        Basic prefect example:
        https://docs.prefect.io/core/
        """

        # Initialize steps
        if "Nuc" in structs:
            loaddata_nuc = steps.LoadData()
            shparam_nuc = steps.Shparam(step_name="shparam_nuc")
            avgshape_nuc = steps.Avgshape(step_name="avgshape_nuc")
            nma_nuc = steps.Nma(step_name="nma_nuc")

        if "Cell" in structs:
            single_cell = steps.Singlecell(step_name="single_cell")
            shparam_cell = steps.Shparam(step_name="shparam_cell")
            avgshape_cell = steps.Avgshape(step_name="avgshape_cell")
            nma_cell = steps.Nma(step_name="nma_cell")

        if "Nuc" in structs and "Cell" in structs:
            compare_nuc_cell = steps.CompareNucCell()

        # Choose executor
        if debug:
            exe = LocalExecutor()
            distributed_executor_address = None
            log.info(f"Debug flagged. Will use threads instead of Dask.")
        else:
            if distributed:
                # Create or get log dir
                # Do not include ms
                log_dir_name = datetime.now().isoformat().split(".")[0]
                log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
                # Log dir settings
                log_dir.mkdir(parents=True, exist_ok=True)

                # Configure dask config
                dask.config.set({
                    "scheduler.work-stealing": False,
                    "logging.distributed.worker": "info",
                })

                # Create cluster
                log.info("Creating SLURMCluster")
                cluster = SLURMCluster(
                    cores=4,
                    memory="20GB",
                    queue="aics_cpu_general",
                    walltime="10:00:00",
                    local_directory=str(log_dir),
                    log_directory=str(log_dir),
                )
                log.info("Created SLURMCluster")

                # Scale cluster
                cluster.scale(60)

                # Use the port from the created connector to set executor address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")
            else:
                # Create local cluster
                log.info("Creating LocalCluster")
                cluster = LocalCluster()
                log.info("Created LocalCluster")

                # Set distributed_executor_address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        try:
            # Configure your flow
            with Flow("mti_nma") as flow:
                # If your step utilizes dask pass the executor address
                # If you want to clean the local staging directories pass clean
                # If you want to utilize some debugging functionality pass debug
                # If you don't utilize any of these, just pass the parameters you need.

                if "Nuc" in structs:
                    struct = "Nuc"

                    ld_nuc_df = loaddata_nuc(distributed_executor_address=
                                             distributed_executor_address,
                                             clean=clean,
                                             debug=debug,
                                             struct=struct,
                                             **kwargs)
                    sh_nuc_df = shparam_nuc(sc_df=ld_nuc_df,
                                            distributed_executor_address=
                                            distributed_executor_address,
                                            clean=clean,
                                            debug=debug,
                                            struct=struct,
                                            **kwargs)
                    avg_nuc_df = avgshape_nuc(sh_df=sh_nuc_df,
                                              distributed_executor_address=
                                              distributed_executor_address,
                                              clean=clean,
                                              debug=debug,
                                              struct=struct,
                                              **kwargs)
                    nma_nuc_df = nma_nuc(avg_df=avg_nuc_df,
                                         distributed_executor_address=
                                         distributed_executor_address,
                                         clean=clean,
                                         debug=debug,
                                         struct=struct,
                                         **kwargs)

                if "Cell" in structs:
                    struct = "Cell"

                    sc_cell_df = single_cell(distributed_executor_address=
                                             distributed_executor_address,
                                             clean=clean,
                                             debug=debug,
                                             struct=struct,
                                             **kwargs)
                    sh_cell_df = shparam_cell(sc_df=sc_cell_df,
                                              distributed_executor_address=
                                              distributed_executor_address,
                                              clean=clean,
                                              debug=debug,
                                              struct=struct,
                                              **kwargs)
                    avg_cell_df = avgshape_cell(sh_df=sh_cell_df,
                                                distributed_executor_address=
                                                distributed_executor_address,
                                                clean=clean,
                                                debug=debug,
                                                struct=struct,
                                                **kwargs)
                    nma_cell_df = nma_cell(avg_df=avg_cell_df,
                                           distributed_executor_address=
                                           distributed_executor_address,
                                           clean=clean,
                                           debug=debug,
                                           struct=struct,
                                           **kwargs)

                # If nucleus and cell membrane were analyzed, draw comparison plot
                if "Nuc" in structs and "Cell" in structs:
                    compare_nuc_cell(nma_nuc_df, nma_cell_df)

            # Run flow, get ending state, and visualize pipeline
            flow.run(executor=exe)

            # Create pipeline visualization if flag is True
            # Note:
            # Flag False by default as a required package is not pip-installable
            # To use this feature, first `conda install graphviz`
            if flow_viz:
                flow.visualize()

        # Catch any error and kill the remote dask cluster
        except Exception as err:
            log.error(f"Something went wrong during pipeline run: {err}")
Exemple #19
0
def test_autocompletion():
    with LocalCluster(scheduler_port=8786) as _:
        assert len(autocomplete_cluster_names(None, None, "")) == 1
        assert len(autocomplete_cluster_names(None, None, "proxy")) == 1
        assert len(autocomplete_cluster_names(None, None, "local")) == 0
Exemple #20
0
    def __init__(self,
                 name=dask.config.get('jobqueue.name'),
                 threads=dask.config.get('jobqueue.threads'),
                 processes=dask.config.get('jobqueue.processes'),
                 memory=dask.config.get('jobqueue.memory'),
                 interface=dask.config.get('jobqueue.interface'),
                 death_timeout=dask.config.get('jobqueue.death-timeout'),
                 local_directory=dask.config.get('jobqueue.local-directory'),
                 extra=dask.config.get('jobqueue.extra'),
                 env_extra=dask.config.get('jobqueue.env-extra'),
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if not self.cancel_command or not self.submit_command:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_threads = threads
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = ('%(python)s -m distributed.cli.dask_worker' %
                               dict(python=sys.executable))
        self._command_template = ' '.join(
            [dask_worker_command, self.scheduler.address])
        if threads is not None:
            self._command_template += " --nthreads %d" % threads
        if processes is not None:
            self._command_template += " --nprocs %d" % processes
        if memory is not None:
            self._command_template += " --memory-limit %s" % memory
        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra
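# A hedged sketch of what a concrete subclass of this abstract cluster might
# look like; the scheduler name, commands, and header directives are
# illustrative only, not a real dask-jobqueue backend:
class ExampleQueueCluster(JobQueueCluster):
    # The abstract check in __init__ only requires these two attributes
    submit_command = 'qsub'
    cancel_command = 'qdel'

    def __init__(self, queue='default', **kwargs):
        super(ExampleQueueCluster, self).__init__(**kwargs)
        # A real subclass would render its scheduler-specific batch header here
        self.job_header = '#EXAMPLE -q %s\n#EXAMPLE -N %s' % (queue, self.name)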
Exemple #21
0
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )

        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if dask.config.get("jobqueue.%s.threads", None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
Exemple #22
0
from copy import deepcopy
import math
import dask.array as da
from distributed import Client, LocalCluster
from dask import delayed, compute
import time
import sys
from scipy import ndimage
import pandas
import time_procedures
import matplotlib
matplotlib.use('Agg')
import pyart

# Start a cluster with x workers
cluster = LocalCluster(n_workers=int(sys.argv[1]), processes=False)
client = Client(cluster)

# Input the range of dates and time wanted for the collection of images
start_year = 2005
start_month = 11
start_day = 1
start_hour = 1
start_minute = 0
start_second = 0

end_year = 2011
end_month = 5
end_day = 2
end_hour = 0
end_minute = 0
Exemple #23
0
def run_sigla_pipeline(master_spreadsheet_id: str,
                       google_api_credentials_path: str,
                       db_connection_url: str):
    """
    Run the SIGLA ETL pipeline

    Parameters
    ----------
    master_spreadsheet_id:
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("SIGLA Data Pipeline") as flow:
        # Delete all documents from db
        clean_up_task = _clean_up(db_connection_url)
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path)
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(clean_up_task)],
        )

        # Transform list of SheetData into FormattedSheetData
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create instituton filter
        gs_institution_filter = _create_filter_task([
            gs_format.standard_institution,
            gs_format.multiple_sigla_answer_variable,
        ])
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task([
            gs_format.composite_variable,
            gs_format.institution_and_composite_variable,
        ])
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)

        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url))
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data,
                          upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
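# A minimal, self-contained sketch of the same Flow-on-LocalCluster pattern,
# assuming the Prefect 1.x API (prefect.executors.DaskExecutor); the task and
# flow names are illustrative:
from distributed import LocalCluster
from prefect import Flow, task
from prefect.executors import DaskExecutor

@task
def double(x):
    return 2 * x

def run_minimal_flow():
    cluster = LocalCluster(n_workers=2)
    with Flow("minimal-dask-flow") as flow:
        double.map([1, 2, 3])
    # Point the Prefect executor at the local cluster's scheduler
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    cluster.close()
    return state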
Exemple #24
0
def create_client_and_cluster(n_jobs, num_tasks, dask_kwargs, entityset_size):
    cluster = None
    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # diagnostics_port sets the default port to launch bokeh web interface
        # if it is set to None, the web interface will not be launched
        diagnostics_port = None
        if 'diagnostics_port' in dask_kwargs:
            diagnostics_port = dask_kwargs['diagnostics_port']
            del dask_kwargs['diagnostics_port']

        cpu_workers = n_jobs_to_workers(n_jobs)
        workers = min(cpu_workers, num_tasks)
        if n_jobs != -1 and workers < n_jobs:
            warning_string = "{} workers requested, but only {} workers created."
            warning_string = warning_string.format(n_jobs, workers)
            if cpu_workers < n_jobs:
                warning_string += " Not enough cpu cores ({}).".format(
                    cpu_workers)

            if num_tasks < n_jobs:
                chunk_warning = " Not enough chunks ({}), consider reducing the chunk size"
                warning_string += chunk_warning.format(num_tasks)
            warnings.warn(warning_string)

        # Distributed default memory_limit for worker is 'auto'. It calculates worker
        # memory limit as total virtual memory divided by the number
        # of cores available to the workers (always 1 for featuretools setup).
        # This means reducing the number of workers does not increase the memory
        # limit for other workers.  Featuretools default is to calculate memory limit
        # as total virtual memory divided by number of workers. To use distributed
        # default memory limit, set dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in dask_kwargs:
            memory_limit = dask_kwargs['memory_limit']
            del dask_kwargs['memory_limit']
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **dask_kwargs)

        # if the cluster exposes a bokeh port, tell the user where the dashboard is
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)

    warned_of_memory = False
    for worker in list(client.scheduler_info()['workers'].values()):
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            logger.warn(
                "Worker memory is between 1 to 2 times the memory"
                " size of the EntitySet. If errors occur that do"
                " not occur with n_jobs equals 1, this may be the "
                "cause.  See https://docs.featuretools.com/guides/parallel.html"
                " for more information.")
            warned_of_memory = True

    return client, cluster
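# A hedged usage sketch for the helper above; entityset_size would normally be
# the in-memory size of the EntitySet, here it is just an illustrative count:
if __name__ == '__main__':
    client, cluster = create_client_and_cluster(
        n_jobs=2,
        num_tasks=8,
        dask_kwargs={},
        entityset_size=10 * 1024 * 1024,  # assumed ~10 MB EntitySet
    )
    try:
        print(list(client.scheduler_info()['workers'].keys()))
    finally:
        client.close()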
Exemple #25
0
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {
        0: logging.WARNING,
        1: logging.INFO,
        2: TRACE,
        3: logging.DEBUG
    }
    logging.basicConfig(level=verbosity[output_settings['verbosity']],
                        filename=output_settings['logfile'])

    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning(
            'No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'
            .format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(
            phase_models,
            datasets,
            refdata,
            excess_model,
            ridge_alpha=ridge_alpha,
            dbf=input_dbf,
            aicc_penalty_factor=aicc_penalty,
        )
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError(
                'Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'
                .format(tracefile))
        if os.path.exists(probfile):
            raise OSError(
                'Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'
                .format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if (cores > multiprocessing.cpu_count()):
                cores = multiprocessing.cpu_count()
                logging.warning(
                    "The number of cores chosen is larger than available. "
                    "Defaulting to run on the {} available cores.".format(
                        cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users can debug. Should have the same log levels as verbosity
            scheduler = LocalCluster(n_workers=cores,
                                     threads_per_worker=1,
                                     processes=True,
                                     memory_limit=0)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info(
                )['services']['bokeh']
                logging.info(
                    "bokeh server for dask scheduler at localhost:{}".format(
                        bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info(
                "Not using a parallel scheduler. ESPEI is running MCMC on a single core."
            )
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (client.scheduler, sum(client.ncores().values())))

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))

        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None

        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')

        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(
            dbf)
        optimizer.fit(all_symbols,
                      datasets,
                      prior=prior,
                      iterations=iterations,
                      chains_per_parameter=chains_per_parameter,
                      chain_std_deviation=chain_std_deviation,
                      deterministic=deterministic,
                      restart_trace=restart_trace,
                      tracefile=tracefile,
                      probfile=probfile,
                      mcmc_data_weights=data_weights)
        optimizer.commit()

        optimizer.dbf.to_file(output_settings['output_db'],
                              if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf
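# A hedged usage sketch for the wrapper above: run_espei takes the same
# settings dictionary that ESPEI builds from its YAML input file (the file
# name below is illustrative):
import yaml

def run_from_yaml(path='espei-in.yaml'):
    with open(path) as f:
        input_settings = yaml.safe_load(f)
    return run_espei(input_settings)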
Exemple #26
0
    def __init__(self,
                 name='dask',
                 queue='dav',
                 project=None,
                 threads_per_worker=4,
                 processes=8,
                 memory='7GB',
                 walltime='00:30:00',
                 interface=None,
                 extra='',
                 **kwargs):
        """ Initialize a SLURM Cluster

        Parameters
        ----------
        name : str
            Name of worker jobs. Passed to `#SBATCH -J` option.
        queue : str
            Destination queue for each worker job.
            Passed to `#SBATCH -p` option.
        project : str
            Accounting string associated with each worker job. Passed to
            `#SBATCH -A` option.
        threads_per_worker : int
            Number of threads per process.
        processes : int
            Number of processes per node.
        memory : str
            Bytes of memory that the worker can use. This should be a string
            like "7GB" that can be interpretted both by PBS and Dask.
        walltime : str
            Walltime for each worker job.
        interface : str
            Network interface like 'eth0' or 'ib0'.
        extra : str
            Additional arguments to pass to `dask-worker`
        kwargs : dict
            Additional keyword arguments to pass to `LocalCluster`
        """
        self._template = """
#!/bin/bash

#SBATCH -J %(name)s
#SBATCH -n %(processes)d
#SBATCH -p %(queue)s
#SBATCH -A %(project)s
#SBATCH -t %(walltime)s
#SBATCH -e %(name)s.err
#SBATCH -o %(name)s.out

%(base_path)s/dask-worker %(scheduler)s \
    --nthreads %(threads_per_worker)d \
    --nprocs %(processes)s \
    --memory-limit %(memory)s \
    --name %(name)s-%(n)d \
     %(extra)s
""".lstrip()

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        project = project or os.environ.get('SLURM_ACCOUNT')
        if not project:
            raise ValueError("Must specify a project like `project='UCLB1234' "
                             "or set SLURM_ACCOUNT environment variable")
        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)
        memory = memory.replace(' ', '')
        self.config = {
            'name': name,
            'queue': queue,
            'project': project,
            'threads_per_worker': threads_per_worker,
            'processes': processes,
            'scheduler': self.scheduler.address,
            'walltime': walltime,
            'base_path': dirname,
            'memory': memory,
            'extra': extra
        }
        self.jobs = dict()
        self.n = 0
        self._adaptive = None
        self._submitcmd = 'sbatch'
        self._cancelcmd = 'scancel'

        logger.debug("Job script: \n %s" % self.job_script())
Exemple #27
0
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()
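    # A hedged companion sketch: the original snippet shows no teardown, so a
    # matching tearDown that closes the cluster opened in setUp might look
    # like this:
    def tearDown(self):
        self.cluster.close()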
Exemple #28
0
def test_boost_from_prediction(tree_method: str) -> None:
    if tree_method == 'approx':
        pytest.xfail(reason='test_boost_from_prediction[approx] is flaky')

    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)

    X_ = dd.from_array(X, chunksize=100)
    y_ = dd.from_array(y, chunksize=100)

    with LocalCluster(n_workers=4) as cluster:
        with Client(cluster) as _:
            model_0 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_0.fit(X=X_, y=y_)
            margin = model_0.predict(X_, output_margin=True)

            model_1 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_1.fit(X=X_, y=y_, base_margin=margin)
            predictions_1 = model_1.predict(X_, base_margin=margin)
            proba_1 = model_1.predict_proba(X_, base_margin=margin)

            cls_2 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_2.fit(X=X_, y=y_)
            predictions_2 = cls_2.predict(X_)
            proba_2 = cls_2.predict_proba(X_)

            cls_3 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_3.fit(X=X_, y=y_)
            proba_3 = cls_3.predict_proba(X_)

            # compute variance of probability percentages between two of the
            # same model, use this to check to make sure approx is functioning
            # within normal parameters
            expected_variance = np.max(np.abs(proba_3 - proba_2)).compute()

            if expected_variance > 0:
                margin_variance = np.max(np.abs(proba_1 - proba_2)).compute()
                # Ensure the margin variance is less than the expected variance + 10%
                assert np.all(margin_variance <= expected_variance + .1)
            else:
                np.testing.assert_equal(predictions_1.compute(), predictions_2.compute())
                np.testing.assert_almost_equal(proba_1.compute(), proba_2.compute())
Exemple #29
0
def client(tmpdir):
    cluster = LocalCluster(local_dir=str(tmpdir))
    client = Client(cluster)
    yield client
    client.close()
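# A hedged companion sketch: a test that consumes the fixture above
# (dask.array is used purely for illustration):
import dask.array as da

def test_client_can_compute(client):
    x = da.ones((100,), chunks=10)
    total = client.compute(x.sum()).result()
    assert total == 100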
Exemple #30
0
import dask.array as da
from distributed import Client, LocalCluster
from sklearn.datasets import make_regression

import lightgbm as lgb

if __name__ == "__main__":
    print("loading data")

    X, y = make_regression(n_samples=1000, n_features=50)

    print("initializing a Dask cluster")

    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    print("created a Dask LocalCluster")

    print("distributing training data on the Dask cluster")

    dX = da.from_array(X, chunks=(100, 50))
    dy = da.from_array(y, chunks=(100, ))

    print("beginning training")

    dask_model = lgb.DaskLGBMRegressor(n_estimators=10)
    dask_model.fit(dX, dy)
    assert dask_model.fitted_

    print("done training")