#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 17:18:08 2019

@author: cliffk
"""

from dask import compute, delayed
import pylab as pl
import sciris as sc
from distributed import LocalCluster

c = LocalCluster(processes=False)
print(c.scheduler)
print(c.workers)

inputs = [0, 1, 2, 3]

def process(data):
    pl.seed(data)
    output = 0
    for i in pl.arange(1e6):
        this = pl.randn()
        # print('%s: %s' % (i, this))
        output += this
    return output
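# The example above imports `compute` and `delayed` but stops before using
# them; a minimal sketch (an assumption, not part of the original) of how the
# `inputs` list would typically be fanned out with that pairing:
jobs = [delayed(process)(i) for i in inputs]
results = compute(*jobs)  # runs the four seeded simulations in parallel
print(results)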
from distributed import Client, LocalCluster

def client():
    with Client(LocalCluster(n_workers=2)) as client:
        yield client
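# The generator above is shaped like a pytest fixture; a minimal sketch of
# how it would be registered and consumed, assuming pytest (the fixture name
# `dask_client` and the test below are illustrative, not from the source;
# Client and LocalCluster are imported above):
import pytest

@pytest.fixture
def dask_client():
    with Client(LocalCluster(n_workers=2)) as client:
        yield client

def test_cluster_is_usable(dask_client):
    # The fixture argument is the live Client yielded above
    assert dask_client.submit(lambda x: x + 1, 1).result() == 2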
import os
import pandas as pd
import argparse
from distributed import Client, LocalCluster

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()
    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name),
                            sep='\t').transpose()

    cluster = LocalCluster()
    client = Client(cluster)
    print('here')

    network = grnboost2(expression_data=ex_matrix.to_numpy(),
                        gene_names=ex_matrix.columns,
                        client_or_address=client)

    network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name),
                   sep='\t', header=True, index=False)

    client.close()
    cluster.close()
def __init__(self, n_cores=None):
    if n_cores is None:
        n_cores = psutil.cpu_count() - 2
    # n_cores is computed for later use; the cluster itself runs a
    # single multiprocessing worker
    self.cluster = LocalCluster(processes=True, n_workers=1)
    self.client = Client(self.cluster)
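# The initializer above opens a cluster and a client but no teardown is
# shown; a minimal sketch of the matching cleanup (the method name `close`
# is an assumption):
def close(self):
    # Release the client first, then the cluster it points at
    self.client.close()
    self.cluster.close()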
def _simulate_tn(circuit: any, initial_state: any, final_state: any, optimize: any, backend: any, complex_type: any, tensor_only: bool, verbose: bool, **kwargs): import quimb.tensor as tn import cotengra as ctg # Get random leaves_prefix leaves_prefix = ''.join( np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), size=20)) # Initialize info _sim_info = {} # Alias for tn if optimize == 'tn': optimize = 'cotengra' if isinstance(circuit, Circuit): # Get number of qubits qubits = circuit.all_qubits() n_qubits = len(qubits) # If initial/final state is None, set to all .'s initial_state = '.' * n_qubits if initial_state is None else initial_state final_state = '.' * n_qubits if final_state is None else final_state # Initial and final states must be valid strings for state, sname in [(initial_state, 'initial_state'), (final_state, 'final_state')]: # Get alphabet from string import ascii_letters # Check if string if not isinstance(state, str): raise ValueError(f"'{sname}' must be a valid string.") # Deprecated error if any(x in 'xX' for x in state): from hybridq.utils import DeprecationWarning from warnings import warn # Warn the user that '.' is used to represent open qubits warn( "Since '0.6.3', letters in the alphabet are used to " "trace selected qubits (including 'x' and 'X'). " "Instead, '.' is used to represent an open qubit.", DeprecationWarning) # Check only valid symbols are present if set(state).difference('01+-.' + ascii_letters): raise ValueError(f"'{sname}' contains invalid symbols.") # Check number of qubits if len(state) != n_qubits: raise ValueError(f"'{sname}' has the wrong number of qubits " f"(expected {n_qubits}, got {len(state)})") # Check memory if 2**(initial_state.count('.') + final_state.count('.')) > kwargs['max_largest_intermediate']: raise MemoryError("Memory for the given number of open qubits " "exceeds the 'max_largest_intermediate'.") # Compress circuit if kwargs['compress']: if verbose: print( f"Compress circuit (max_n_qubits={kwargs['compress']}): ", end='', file=stderr) _time = time() circuit = utils.compress( circuit, kwargs['compress']['max_n_qubits'] if isinstance( kwargs['compress'], dict) else kwargs['compress'], verbose=verbose, **({ k: v for k, v in kwargs['compress'].items() if k != 'max_n_qubits' } if isinstance(kwargs['compress'], dict) else {})) circuit = Circuit( utils.to_matrix_gate(c, complex_type=complex_type) for c in circuit) if verbose: print(f"Done! 
({time()-_time:1.2f}s)", file=stderr) # Get tensor network representation of circuit tensor, tn_qubits_map = utils.to_tn(circuit, return_qubits_map=True, leaves_prefix=leaves_prefix) # Define basic MPS _mps = { '0': np.array([1, 0]), '1': np.array([0, 1]), '+': np.array([1, 1]) / np.sqrt(2), '-': np.array([1, -1]) / np.sqrt(2) } # Attach initial/final state for state, ext in [(initial_state, 'i'), (final_state, 'f')]: for s, q in ((s, q) for s, q in zip(state, qubits) if s in _mps): inds = [f'{leaves_prefix}_{tn_qubits_map[q]}_{ext}'] tensor &= tn.Tensor(_mps[s], inds=inds, tags=inds) # For each unique letter, apply trace for x in set(initial_state + final_state).difference(''.join(_mps) + '.'): # Get indexes inds = [ f'{leaves_prefix}_{tn_qubits_map[q]}_i' for s, q in zip(initial_state, qubits) if s == x ] inds += [ f'{leaves_prefix}_{tn_qubits_map[q]}_f' for s, q in zip(final_state, qubits) if s == x ] # Apply trace tensor &= tn.Tensor(np.reshape([1] + [0] * (2**len(inds) - 2) + [1], (2, ) * len(inds)), inds=inds) # Simplify if requested if kwargs['simplify_tn']: tensor.full_simplify_(kwargs['simplify_tn']).astype_(complex_type) else: # Otherwise, just convert to the given complex_type tensor.astype_(complex_type) # Get contraction from heuristic if optimize == 'cotengra' and kwargs['max_iterations'] > 0: # Create local client if MPI has been detected (not compatible with Dask at the moment) if _mpi_env and kwargs['parallel']: from distributed import Client, LocalCluster _client = Client(LocalCluster(processes=False)) else: _client = None # Set cotengra parameters cotengra_params = lambda: ctg.HyperOptimizer( methods=kwargs['methods'], max_time=kwargs['max_time'], max_repeats=kwargs['max_repeats'], minimize=kwargs['minimize'], progbar=verbose, parallel=kwargs['parallel'], **kwargs['cotengra']) # Get optimized path opt = cotengra_params() info = tensor.contract(all, optimize=opt, get='path-info') # Get target size tli = kwargs['target_largest_intermediate'] # Repeat for the requested number of iterations for _ in range(1, kwargs['max_iterations']): # Break if largest intermediate is equal or smaller than target if info.largest_intermediate <= tli: break # Otherwise, restart _opt = cotengra_params() _info = tensor.contract(all, optimize=_opt, get='path-info') # Store the best if kwargs['minimize'] == 'size': if _info.largest_intermediate < info.largest_intermediate or ( _info.largest_intermediate == info.largest_intermediate and _opt.best['flops'] < opt.best['flops']): info = _info opt = _opt else: if _opt.best['flops'] < opt.best['flops'] or ( _opt.best['flops'] == opt.best['flops'] and _info.largest_intermediate < info.largest_intermediate): info = _info opt = _opt # Close client if exists if _client: _client.shutdown() _client.close() # Just return tensor if required if tensor_only: if optimize == 'cotengra' and kwargs['max_iterations'] > 0: return tensor, (info, opt) else: return tensor else: # Set tensor tensor = circuit if len(optimize) == 2 and isinstance( optimize[0], PathInfo) and isinstance( optimize[1], ctg.hyper.HyperOptimizer): # Get info and opt from optimize info, opt = optimize # Set optimization optimize = 'cotengra' else: # Get tensor and path tensor = circuit # Print some info if verbose: print( f'Largest Intermediate: 2^{np.log2(float(info.largest_intermediate)):1.2f}', file=stderr) print( f'Max Largest Intermediate: 2^{np.log2(float(kwargs["max_largest_intermediate"])):1.2f}', file=stderr) print(f'Flops: 2^{np.log2(float(info.opt_cost)):1.2f}', file=stderr) if optimize 
== 'cotengra': # Get indexes _inds = tensor.outer_inds() # Get input indexes and output indexes _i_inds = sort([x for x in _inds if x[-2:] == '_i'], key=lambda x: int(x.split('_')[1])) _f_inds = sort([x for x in _inds if x[-2:] == '_f'], key=lambda x: int(x.split('_')[1])) # Get order _inds = [_inds.index(x) for x in _i_inds + _f_inds] # Get slice finder sf = ctg.SliceFinder(info, target_size=kwargs['max_largest_intermediate']) # Find slices with tqdm(kwargs['temperatures'], disable=not verbose, leave=False) as pbar: for _temp in pbar: pbar.set_description(f'Find slices (T={_temp})') ix_sl, cost_sl = sf.search(temperature=_temp) # Get slice contractor sc = sf.SlicedContractor([t.data for t in tensor]) # Update infos _sim_info.update({ 'flops': info.opt_cost, 'largest_intermediate': info.largest_intermediate, 'n_slices': cost_sl.nslices, 'total_flops': cost_sl.total_flops }) # Print some infos if verbose: print( f'Number of slices: 2^{np.log2(float(cost_sl.nslices)):1.2f}', file=stderr) print(f'Flops+Cuts: 2^{np.log2(float(cost_sl.total_flops)):1.2f}', file=stderr) if kwargs['max_n_slices'] and sc.nslices > kwargs['max_n_slices']: raise RuntimeError( f'Too many slices ({sc.nslices} > {kwargs["max_n_slices"]})') # Contract tensor _li = np.log2(float(info.largest_intermediate)) _mli = np.log2(float(kwargs["max_largest_intermediate"])) _tensor = sc.gather_slices((sc.contract_slice( i, backend=backend ) for i in tqdm( range(sc.nslices), desc=f'Contracting tensor (li=2^{_li:1.0f}, mli=2^{_mli:1.1f})', leave=False))) # Create map _map = ''.join([get_symbol(x) for x in range(len(_inds))]) _map += '->' _map += ''.join([get_symbol(x) for x in _inds]) # Reorder tensor tensor = contract(_map, _tensor) # Deprecated ## Reshape tensor #if _inds: # if _i_inds and _f_inds: # tensor = np.reshape(tensor, (2**len(_i_inds), 2**len(_f_inds))) # else: # tensor = np.reshape(tensor, # (2**max(len(_i_inds), len(_f_inds)),)) else: # Contract tensor tensor = tensor.contract(optimize=optimize, backend=backend) if hasattr(tensor, 'inds'): # Get input indexes and output indexes _i_inds = sort([x for x in tensor.inds if x[-2:] == '_i'], key=lambda x: int(x.split('_')[1])) _f_inds = sort([x for x in tensor.inds if x[-2:] == '_f'], key=lambda x: int(x.split('_')[1])) # Transpose tensor tensor.transpose(*(_i_inds + _f_inds), inplace=True) # Deprecated ## Reshape tensor #if _i_inds and _f_inds: # tensor = np.reshape(tensor, (2**len(_i_inds), 2**len(_f_inds))) #else: # tensor = np.reshape(tensor, # (2**max(len(_i_inds), len(_f_inds)),)) if kwargs['return_info']: return tensor, _sim_info else: return tensor
def tall_clutter(files, config,
                 clutter_thresh_min=0.0002,
                 clutter_thresh_max=0.25, radius=1,
                 max_height=2000., write_radar=True,
                 out_file=None, use_dask=False):
    """
    Wind Farm Clutter Calculation

    Parameters
    ----------
    files : list
        List of radar files used for the clutter calculation.
    config : str
        String representing the configuration for the radar.
        Such possible configurations are listed in default_config.py

    Other Parameters
    ----------------
    clutter_thresh_min : float
        Threshold value above which a gate is considered clutter,
        as long as it is also below clutter_thresh_max.
    clutter_thresh_max : float
        Threshold value below which a gate is considered clutter,
        as long as it is also above clutter_thresh_min.
    radius : int
        Radius of the area surrounding the clutter gate that will
        also be flagged as clutter.
    max_height : float
        Maximum height above the radar to mark a gate as clutter.
    write_radar : bool
        Whether or not to write the clutter radar as a netCDF file.
        Default is True.
    out_file : string
        String of location and filename to write the radar object to,
        if write_radar is True.
    use_dask : bool
        Use dask instead of running stats for the calculation;
        this will reduce run time.

    Returns
    -------
    clutter_radar : Radar
        Radar object with the clutter field that was calculated.
        This radar only has the clutter field, but maintains all
        other radar specifications.

    """
    field_names = get_field_names(config)
    refl_field = field_names["reflectivity"]
    vel_field = field_names["velocity"]
    ncp_field = field_names["normalized_coherent_power"]

    def get_reflect_array(file, first_shape):
        """ Retrieves a reflectivity array for a radar volume. """
        try:
            radar = pyart.io.read(
                file, include_fields=[refl_field, ncp_field, vel_field])
            reflect_array = deepcopy(radar.fields[refl_field]['data'])
            ncp = radar.fields[ncp_field]['data']
            height = radar.gate_z["data"]
            up_in_the_air = height > max_height
            the_mask = np.logical_or.reduce(
                (ncp < 0.9, reflect_array.mask, up_in_the_air))
            reflect_array = np.ma.masked_where(the_mask, reflect_array)
            del radar
            if reflect_array.shape == first_shape:
                return reflect_array.filled(fill_value=np.nan)
        except (TypeError, OSError):
            print(file + ' is corrupt...skipping!')
        return np.nan * np.zeros(first_shape)

    if use_dask is False:
        run_stats = _RunningStats()
        first_shape = 0
        for file in files:
            try:
                radar = pyart.io.read(file)
                reflect_array = radar.fields[refl_field]['data']
                ncp = deepcopy(radar.fields[ncp_field]['data'])
                height = radar.gate_z["data"]
                reflect_array = np.ma.masked_where(
                    np.logical_or(height > max_height, ncp < 0.8),
                    reflect_array)
                if first_shape == 0:
                    first_shape = reflect_array.shape
                    clutter_radar = radar
                if reflect_array.shape == first_shape:
                    run_stats.push(reflect_array)
                del radar
            except (TypeError, OSError):
                print(file + ' is corrupt...skipping!')
                continue
        mean = run_stats.mean()
        stdev = run_stats.standard_deviation()
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values_no_mask = clutter_values.filled(
            clutter_thresh_max + 1)
    else:
        cluster = LocalCluster(n_workers=20, processes=True)
        client = Client(cluster)
        first_shape = 0
        i = 0
        while first_shape == 0:
            try:
                radar = pyart.io.read(files[i])
                reflect_array = radar.fields[refl_field]['data']
                first_shape = reflect_array.shape
                clutter_radar = radar
            except (TypeError, OSError):
                print(files[i] + ' is corrupt...skipping!')
                i = i + 1
                continue
        arrays = [delayed(get_reflect_array)(file, first_shape)
                  for file in files]
        array = [da.from_delayed(a, shape=first_shape, dtype=float)
                 for a in arrays]
        array = da.stack(array, axis=0)
        print('## Calculating mean in parallel...')
        mean = np.array(da.nanmean(array, axis=0))
        print('## Calculating standard deviation...')
        count = np.array(da.sum(da.isfinite(array), axis=0))
        stdev = np.array(da.nanstd(array, axis=0))
        clutter_values = stdev / mean
        clutter_values = np.ma.masked_invalid(clutter_values)
        clutter_values = np.ma.masked_where(
            np.logical_or(clutter_values.mask, count < 20),
            clutter_values)
        # Masked arrays can suck
        clutter_values_no_mask = clutter_values.filled(
            (clutter_thresh_max + 1))

    shape = clutter_values.shape
    mask = np.ma.getmask(clutter_values)
    is_clutters = np.argwhere(
        np.logical_and.reduce((
            clutter_values_no_mask > clutter_thresh_min,
            clutter_values_no_mask < clutter_thresh_max,
        )))
    clutter_array = _clutter_marker(is_clutters, shape, mask, radius)
    clutter_radar.fields.clear()
    clutter_array = clutter_array.filled(0)
    clutter_dict = _clutter_to_dict(clutter_array)
    clutter_value_dict = _clutter_to_dict(clutter_values)
    clutter_value_dict["long_name"] = "Clutter value (std. dev/mean Z)"
    clutter_radar.add_field('ground_clutter', clutter_dict,
                            replace_existing=True)
    clutter_radar.add_field('clutter_value', clutter_value_dict,
                            replace_existing=True)
    if write_radar is True:
        pyart.io.write_cfradial(out_file, clutter_radar)
    del clutter_radar
    return
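# The dask branch above builds a lazy 3-D stack out of per-file delayed
# reads; a self-contained sketch of the same delayed -> from_delayed ->
# stack pattern on toy data (the loader and shapes are illustrative):
import numpy as np
import dask.array as da
from dask import delayed

def load(i, shape=(4, 5)):
    # Stand-in for a per-file reader such as get_reflect_array
    return np.full(shape, float(i))

arrays = [delayed(load)(i) for i in range(10)]
stack = da.stack([da.from_delayed(a, shape=(4, 5), dtype=float)
                  for a in arrays], axis=0)
mean = da.nanmean(stack, axis=0).compute()  # triggers the parallel reads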
def test_list():
    with LocalCluster(name="testcluster", scheduler_port=8786) as _:
        output = check_output(["daskctl", "list"])
        assert b"ProxyCluster" in output
        assert b"Running" in output
def run( self, dataset: str, include_raw: bool = False, batch_size: Optional[int] = None, distributed: bool = False, n_workers: int = 10, worker_cpu: int = 8, worker_mem: str = "120GB", overwrite: bool = False, debug: bool = False, **kwargs, ): """ Run a flow with your steps. Parameters ---------- dataset: str The dataset to use for the pipeline. include_raw: bool A boolean option to determine if the raw data should be included in the Quilt package. Default: False (Do not include the raw data) batch_size: Optional[int] An optional batch size to provide to each step for processing their items. Default: None (auto batch size depending on CPU / threads available) distributed: bool A boolean option to determine if the jobs should be distributed to a SLURM cluster when possible. Default: False (Do not distribute) n_workers: int Number of workers to request (when distributed is enabled). Default: 10 worker_cpu: int Number of cores to provide per worker (when distributed is enabled). Default: 8 worker_mem: str Amount of memory to provide per worker (when distributed is enabled). Default: 120GB overwrite: bool If this pipeline has already partially or completely run, should it overwrite the previous files or not. Default: False (Do not overwrite or regenerate files) debug: bool A debug flag for the developer to use to manipulate how much data runs, how it is processed, etc. Additionally, if debug is True, any mapped operation will run on threads instead of processes. Default: False (Do not debug) """ # Initalize steps raw = steps.Raw() standardize_fov_array = steps.StandardizeFOVArray() single_cell_features = steps.SingleCellFeatures() single_cell_images = steps.SingleCellImages() diagnostic_sheets = steps.DiagnosticSheets() # Cluster / distributed defaults distributed_executor_address = None # Choose executor if debug: exe = LocalExecutor() log.info("Debug flagged. 
Will use threads instead of Dask.") else: if distributed: # Create or get log dir # Do not include ms log_dir_name = datetime.now().isoformat().split(".")[0] log_dir = Path(f".dask_logs/{log_dir_name}").expanduser() # Log dir settings log_dir.mkdir(parents=True, exist_ok=True) # Create cluster log.info("Creating SLURMCluster") cluster = SLURMCluster( cores=worker_cpu, memory=worker_mem, queue="aics_cpu_general", walltime="10:00:00", local_directory=str(log_dir), log_directory=str(log_dir), ) # Spawn workers cluster.scale(n_workers) log.info("Created SLURMCluster") # Use the port from the created connector to set executor address distributed_executor_address = cluster.scheduler_address # Only auto batch size if it is not None if batch_size is None: # Batch size is n_workers * worker_cpu * 0.75 # We could just do n_workers * worker_cpu but 3/4 of that is safer batch_size = int(n_workers * worker_cpu * 0.75) # Log dashboard URI log.info( f"Dask dashboard available at: {cluster.dashboard_link}") else: # Create local cluster log.info("Creating LocalCluster") cluster = LocalCluster() log.info("Created LocalCluster") # Set distributed_executor_address distributed_executor_address = cluster.scheduler_address # Log dashboard URI log.info( f"Dask dashboard available at: {cluster.dashboard_link}") # Use dask cluster exe = DaskExecutor(distributed_executor_address) # Configure your flow with Flow("actk") as flow: if include_raw: dataset = raw(dataset, **kwargs) standardized_fov_paths_dataset = standardize_fov_array( dataset=dataset, distributed_executor_address=distributed_executor_address, batch_size=batch_size, overwrite=overwrite, debug=debug, # Allows us to pass `--desired_pixel_sizes [{float},{float},{float}]` **kwargs, ) single_cell_features_dataset = single_cell_features( dataset=standardized_fov_paths_dataset, distributed_executor_address=distributed_executor_address, batch_size=batch_size, overwrite=overwrite, debug=debug, # Allows us to pass `--cell_ceiling_adjustment {int}` **kwargs, ) single_cell_images_dataset = single_cell_images( dataset=single_cell_features_dataset, distributed_executor_address=distributed_executor_address, batch_size=batch_size, overwrite=overwrite, debug=debug, # Allows us to pass `--cell_ceiling_adjustment {int}` **kwargs, ) diagnostic_sheets( dataset=single_cell_images_dataset, distributed_executor_address=distributed_executor_address, overwrite=overwrite, # Allows us to pass `--metadata {str}`, # `--feature {str}'` **kwargs, ) # Run flow and get ending state, log duration start = datetime.now() state = flow.run(executor=exe) duration = datetime.now() - start log.info(f"Total duration of pipeline: " f"{duration.seconds // 60 // 60}:" f"{duration.seconds // 60}:" f"{duration.seconds % 60}") # Get and display any outputs you want to see on your local terminal log.info(single_cell_images_dataset.get_result(state, flow))
import pyart
from netCDF4 import Dataset
import numpy as np
from datetime import datetime, timedelta
from copy import deepcopy
import glob
import math
import dask.array as da
from distributed import Client, LocalCluster
from dask import delayed, compute
import time
import sys
from scipy import ndimage

# Start a cluster with x workers
cluster = LocalCluster(n_workers=int(sys.argv[1]))
client = Client(cluster)

# Input the range of dates and time wanted for the collection of images
start_year = 2006
start_day = 1
start_month = 1
start_hour = 1
start_minute = 0
start_second = 0

end_year = 2006
end_month = 3
end_day = 1
end_hour = 0
end_minute = 0
folder = to_convert[int(folder_num) - 1].rpartition('/')[0].rpartition(visit)[2][1:]
try:
    save_location = os.path.join('/dls', beamline, 'data', year, visit,
                                 'processing', folder)
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    watch_convert(beamline, year, visit, folder)
except Exception as e:
    print('** ERROR processing** \n ', e)


if __name__ == "__main__":
    from distributed import Client, LocalCluster
    cluster = LocalCluster(n_workers=20, memory_limit=100e9)
    client = Client(cluster)
    parser = argparse.ArgumentParser()
    parser.add_argument('beamline', help='Beamline name')
    parser.add_argument('year', help='Year')
    parser.add_argument('visit', help='Session visit code')
    parser.add_argument('folder', nargs='?', default=None,
                        help='Option to add a specific folder within a visit \
                        to look for data, e.g. sample1/dataset1/. If None, the \
                        assumption is to look in the Merlin folder')
    parser.add_argument('folder_num', nargs='?', help='passed by scheduler')
    v_help = "Display all debug log messages"
    parser.add_argument("-v", "--verbose",
def test_empty_dmatrix_hist():
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'hist'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
def run_external_link_checker(
    google_api_credentials_path: str,
    master_spreadsheet_id: Optional[str] = None,
    spreadsheet_ids_str: Optional[str] = None,
):
    """
    Run the external link checker.

    If a list of spreadsheet ids is provided, run the external link checker
    against that list instead of the spreadsheet ids gathered from the
    master spreadsheet.

    Parameters
    ----------
    google_api_credentials_path: str
        The path to the Google API credentials file needed to read
        Google Sheets.
    master_spreadsheet_id: Optional[str]
        The master spreadsheet id.
    spreadsheet_ids_str: Optional[str]
        The list of spreadsheet ids, delimited by comma.
    """
    log.info("Finished external link checker set up, start checking external links.")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("Check external links") as flow:
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(
            master_spreadsheet_id, google_api_credentials_path, spreadsheet_ids_str
        )
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
        )
        # Extract links from list of SheetData
        # Get back list of list of URLData
        links_data = _extract_external_links.map(flatten(spreadsheets_data))
        # Unique the url data
        unique_links_data = _unique_external_links(flatten(links_data))
        # Check external links
        _check_external_link.map(unique_links_data)

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
    # Get the list of CheckedURL
    checked_links = state.result[flow.get_tasks(name="_check_external_link")[0]].result
    log.info("=" * 80)
    # Get error links
    error_links = [link for link in checked_links if link.has_error]
    gs_cells = []
    for error_link in error_links:
        for cell in error_link.url_data.cells:
            gs_cells.append(
                GoogleSheetCell(
                    spreadsheet_title=cell.spreadsheet_title,
                    sheet_title=cell.sheet_title,
                    row_index=cell.row_index,
                    col_index=cell.col_index,
                    url=error_link.url_data.url,
                    msg=error_link.msg,
                )
            )
    sorted_gs_cells = sorted(
        gs_cells,
        key=lambda x: (
            x.spreadsheet_title,
            x.sheet_title,
            x.row_index,
            x.col_index,
            x.url,
        ),
    )
    # Write error links to a csv file
    with open("external_links.csv", mode="w") as csv_file:
        fieldnames = ["spreadsheet_title", "sheet_title", "cell", "url", "reason"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for gs_cell in sorted_gs_cells:
            writer.writerow(
                {
                    "spreadsheet_title": gs_cell.spreadsheet_title,
                    "sheet_title": gs_cell.sheet_title,
                    "cell": convert_rowcol_to_A1_name(
                        gs_cell.row_index, gs_cell.col_index
                    ),
                    "url": gs_cell.url,
                    "reason": f"{gs_cell.msg}",
                }
            )
    log.info("Finished writing external links csv file")
def test_rabit_ops():
    from distributed import Client, LocalCluster
    n_workers = 3
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            run_rabit_ops(client, n_workers)
"""
WSGI config for AutoOut project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import math
import multiprocessing
import os

from distributed import LocalCluster, Client
from django.core.wsgi import get_wsgi_application
from psutil import virtual_memory

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'AutoOut.settings')

# Start dask cluster
no_cpus = multiprocessing.cpu_count()
threads_per_worker = 2
no_workers = math.floor((no_cpus - 2) / threads_per_worker)

mem = virtual_memory()
c = LocalCluster(processes=False, n_workers=no_workers,
                 threads_per_worker=threads_per_worker,
                 memory_limit=mem.free / no_workers)
dask_client = Client(c)

application = get_wsgi_application()
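# For a concrete sense of the sizing arithmetic above: on a hypothetical
# 16-core box with 24 GB of free memory, the same formulas give 7 workers
# of roughly 3.4 GB each (the numbers are illustrative):
import math

no_cpus = 16
threads_per_worker = 2
no_workers = math.floor((no_cpus - 2) / threads_per_worker)  # 7 workers
memory_limit = 24e9 / no_workers                             # ~3.4e9 bytes each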
def test_empty_dmatrix_approx() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            parameters = {'tree_method': 'approx'}
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)
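# The xgboost-dask tests in this collection all share one scaffold: a
# LocalCluster context manager wrapping a Client context manager, with the
# check running inside both so the scheduler and workers are always torn
# down. A hedged generic version (kWorkers and the `check` callable are
# placeholders, not from the source):
from distributed import Client, LocalCluster

kWorkers = 4  # the test modules define their own value

def run_with_local_cluster(check, **params):
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            check(client, params)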
def preprocessing_script(): """ This script will process all the hybridization folders combined in a processing folder. The input parameters are passed using arparse Parameters: ----------- scheduler: string tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). default = False. If False the process will run on the local computer using nCPUs-1 path: string Path to the processing directory """ # Inputs of the function parser = argparse.ArgumentParser(description='Preprocessing script') parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003') parser.add_argument('-path', help='processing directory') args = parser.parse_args() # Directory to process processing_directory = args.path # Dask scheduler address scheduler_address = args.scheduler if scheduler_address: # Start dask client on server or cluster client=Client(scheduler_address) else: # Start dask client on local machine. It will use all the availabe # cores -1 # number of core to use ncores = multiprocessing.cpu_count()-1 cluster = LocalCluster(n_workers=ncores) client=Client(cluster) # Subdirectories of the processing_directory that need to be skipped for the # analysis blocked_directories = ['_logs'] # Starting logger utils.init_file_logger(processing_directory) logger = logging.getLogger() # Determine the operating system running the code os_windows, add_slash = utils.determine_os() # Check training slash in the processing directory processing_directory=utils.check_trailing_slash(processing_directory,os_windows) # Get a list of the hybridization to process processing_hyb_list = next(os.walk(processing_directory))[1] # Remove the blocked directories from the directories to process processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ] for processing_hyb in processing_hyb_list: # Determine the hyb number from the name hybridization_number = processing_hyb.split('_hyb')[-1] hybridization = 'Hybridization' + hybridization_number hyb_dir = processing_directory + processing_hyb + add_slash # Parse the Experimental metadata file (serial) experiment_infos,image_properties, hybridizations_infos, \ converted_positions, microscope_parameters =\ utils.experimental_metadata_parser(hyb_dir) # Parse the configuration file flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir) # ----------------- .nd2 FILE CONVERSION ------------------------------ # Create the temporary subdirectory tree (serial) tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\ hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash) # Get the list of the nd2 files to process inside the directory files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2') # Get the list of genes that are analyzed in the current hybridization gene_list = list(hybridizations_infos[hybridization].keys()) # Organize the file to process in a list which order match the gene_list for # parallel processing organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f ] organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f ] # Each .nd2 file will be processed in a worker part of a different node # Get the addresses of one process/node to use for conversion node_addresses = utils.identify_nodes(client) workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()] # Run the conversion 
futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list, tmp_gene_dirs,processing_hyb=processing_hyb, use_ram=flt_rawcnt_config['use_ram'], max_ram=flt_rawcnt_config['max_ram'], workers=workers_conversion) client.gather(futures_processes) # --------------------------------------------------------------------- # ----------------- FILTERING AND RAW COUNTING ------------------------ # Create directories # Create the directory where to save the filtered images suffix = 'filtered_png' filtered_png_img_dir_path, filtered_png_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) suffix = 'filtered_npy' filtered_img_dir_path, filtered_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Create the directory where to save the counting suffix = 'counting' counting_dir_path, counting_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,flt_rawcnt_config['skip_tags_counting'], flt_rawcnt_config['skip_genes_counting'], analysis_name=flt_rawcnt_config['analysis_name']) if flt_rawcnt_config['illumination_correction']: # Create the directory where to save the counting suffix = 'illumination_funcs' illumination_func_dir_path, illumination_func_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Loop through channels and calculate illumination for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') logger.debug('Create average image for gene %s', gene) # Chunking the image list num_chunks = sum(list(client.ncores().values())) chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks) # Scatter the images sublists to process in parallel futures = client.scatter(chunked_list) # Create dask processing graph output = [] for future in futures: ImgMean = delayed(utils.partial_image_mean)(future) output.append(ImgMean) ImgMean_all = delayed(sum)(output) ImgMean_all = ImgMean_all/float(len(futures)) # Compute the graph ImgMean = ImgMean_all.compute() logger.debug('Create illumination function for gene %s',gene) # Create illumination function Illumination=filters.gaussian(ImgMean,sigma=(20,300,300)) # Normalization of the illumination Illumination_flat=np.amax(Illumination,axis=0) Illumination_norm=Illumination_flat/np.amax(Illumination_flat) logger.debug('Save illumination function for gene %s',gene) # Save the illumination function illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0] illumination_fname=illumination_path+gene+'_illumination_func.npy' np.save(illumination_fname,Illumination_norm,allow_pickle=False) # Broadcast the illumination function to all the cores client.scatter(Illumination_norm, broadcast=True) logger.debug('Filtering %s',gene) # Filtering and counting futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \ illumination_function=Illumination_norm,\ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\ filtered_img_gene_dirs =filtered_img_gene_dirs,\ counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \ 
min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\ skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) else: for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') # filtering logger.debug('Filtering without illumination correction %s',gene) futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \ filtered_img_gene_dirs=filtered_img_gene_dirs, \ counting_gene_dirs=counting_gene_dirs, \ plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\ stringency=flt_rawcnt_config['stringency'],\ skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) # --------------------------------------------------------------------- # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------ # # Combine the filter data in one single .ppf for each hybridization # # This step will run in serial mode and will not need to shuffle data # # between cores because everything is on the common file system # logger.debug('Create .ppf.hdf5 file') # # Create the ppf.hdf5 file that contains the filtered data in uint16 # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb, # hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties) # logger.debug('Write the .npy filtered files into the .ppf file') # # Load and write the .npy tmp images into the hdf5 file # # open the hdf5 file # with h5py.File(preprocessing_file_path) as f_hdl: # # Loop through each gene # for gene in hybridizations_infos[hybridization].keys(): # logger.debug('Writing %s images in .ppf.hdf5',gene) # # list of the files to transfer # filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # loop through the list of file # for f_file in filtered_files_list: # pos = f_file.split('/')[-1].split('_')[-1].split('.')[0] # f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file) # f_hdl.flush() # # --------------------------------------------------------------------- # # ----------------- STITCHING ------------------------ # # Load the stitching parameters from the .yaml file # # Stitch the image in 2D or 3D (3D need more work/testing) # nr_dim = flt_rawcnt_config['nr_dim'] # # Estimated overlapping between images according to the Nikon software # est_overlap = image_properties['Overlapping_percentage'] # # Number of peaks to use for the alignment # nr_peaks = flt_rawcnt_config['nr_peaks'] # # Determine if the coords need to be flipped # y_flip = flt_rawcnt_config['y_flip'] # # Method to use for blending # # can be 'linear' or 'non linear' # # The methods that performs the best is the 'non linear' # blend = flt_rawcnt_config['blend'] # # Reference gene for stitching # reference_gene = flt_rawcnt_config['reference_gene'] # pixel_size = image_properties['PixelSize'] # # Get the list of the filtered files of the reference gene # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # Create pointer of the hdf5 
file that will store the stitched reference image # # for the current hybridization # # Writing # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb # data_name = (tile_file_base_name # + '_' + reference_gene # + '_stitching_data') # stitching_file_name = tile_file_base_name + '.sf.hdf5' # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest') # replace with 'a' as soon as you fix the error # # Determine the tiles organization # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization, # est_overlap = est_overlap, y_flip = False, nr_dim = 2) # # Align the tiles # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples, # filtered_files_list=filtered_files_list,micData=micData, # nr_peaks=nr_peaks) # # Gather the futures # data = client.gather(futures_processes) # # In this case the order of the returned contingency tuples is with # # the order of the input contig_tuples # # P_all = [el for data_single in data for el in data_single[0]] # P_all =[data_single[0] for data_single in data ] # P_all = np.array(P_all) # P_all = P_all.flat[:] # covs_all = [data_single[1] for data_single in data] # alignment = {'P': P_all, # 'covs': covs_all} # # Calculates a shift in global coordinates for each tile (global # # alignment) and then applies these shifts to the corner coordinates # # of each tile and returns and saves these shifted corner coordinates. # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples, # micData, nr_pixels, z_count, # alignment, data_name, # nr_dim=nr_dim) # # Create the hdf5 file structure # stitched_group, linear_blending, blend = hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels, # reference_gene, blend = 'non linear') # # Fill the hdf5 containing the stitched image with empty data and # # create the blending mask # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64) # if blend is not None: # # make mask # stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64) # tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask']) # # Create the subdirectory used to save the blended tiles # suffix = 'blended_tiles' # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Get the directory with the filtered npy images of the reference_gene to use for stitching # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0] # # Create the tmp directory where to save the masks # suffix = 'masks' # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Create and save the mask files # for corn_value,corner_coords in joining['corner_list']: # if not(np.isnan(corner_coords[0])): # cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels), # int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)] # fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value) # np.save(fname,cur_mask) # # Blend all the tiles and save them in a directory # futures_processes = 
client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'], # stitching_files_dir = stitching_files_dir, # blended_tiles_directory = blended_tiles_directory, # masked_tiles_directory = masked_tiles_directory, # analysis_name = flt_rawcnt_config['analysis_name'], # processing_hyb = processing_hyb,reference_gene = reference_gene, # micData = micData,tiles = tiles,nr_pixels=nr_pixels, # linear_blending=linear_blending) # _ = client.gather(futures_processes) # # Write the stitched image # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels) # # close the hdf5 file # stitching_file.close() # # Delete the directories with blended tiles and masks # shutil.rmtree(blended_tiles_directory) # shutil.rmtree(masked_tiles_directory) # ----------------- DELETE FILES ------------------------ # Don't delete the *.npy files here because can be used to # create the final images using the apply stitching related function client.close()
def test_aft_survival() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            run_aft_survival(client, DaskDMatrix)
def run( self, distributed: bool = False, clean: bool = False, debug: bool = False, structs: list = ["Nuc"], flow_viz: bool = False, **kwargs, ): """ Run a flow with your steps. Parameters ---------- distributed: bool A boolean option to determine if the jobs should be distributed to a remote cluster when possible. Default: False (Do not distribute) clean: bool Should the local staging directory be cleaned prior to this run. Default: False (Do not clean) debug: bool A debug flag for the developer to use to manipulate how much data runs, how it is processed, etc. Default: False (Do not debug) structs: List List of structure data to run pipeline on. Currently, only 'Nuc' (nuclear membrane) and 'Cell' (cell membrane) are supported. flow_viz: bool Make flow chart to visualize pipeline - requires conda install of graphviz. Notes ----- Documentation on prefect: https://docs.prefect.io/core/ Basic prefect example: https://docs.prefect.io/core/ """ # Initalize steps if "Nuc" in structs: loaddata_nuc = steps.LoadData() shparam_nuc = steps.Shparam(step_name="shparam_nuc") avgshape_nuc = steps.Avgshape(step_name="avgshape_nuc") nma_nuc = steps.Nma(step_name="nma_nuc") if "Cell" in structs: single_cell = steps.Singlecell(step_name="single_cell") shparam_cell = steps.Shparam(step_name="shparam_cell") avgshape_cell = steps.Avgshape(step_name="avgshape_cell") nma_cell = steps.Nma(step_name="nma_cell") if "Nuc" in structs and "Cell" in structs: compare_nuc_cell = steps.CompareNucCell() # Choose executor if debug: exe = LocalExecutor() distributed_executor_address = None log.info(f"Debug flagged. Will use threads instead of Dask.") else: if distributed: # Create or get log dir # Do not include ms log_dir_name = datetime.now().isoformat().split(".")[0] log_dir = Path(f".dask_logs/{log_dir_name}").expanduser() # Log dir settings log_dir.mkdir(parents=True, exist_ok=True) # Configure dask config dask.config.set({ "scheduler.work-stealing": False, "logging.distributed.worker": "info", }) # Create cluster log.info("Creating SLURMCluster") cluster = SLURMCluster( cores=4, memory="20GB", queue="aics_cpu_general", walltime="10:00:00", local_directory=str(log_dir), log_directory=str(log_dir), ) log.info("Created SLURMCluster") # Scale cluster cluster.scale(60) # Use the port from the created connector to set executor address distributed_executor_address = cluster.scheduler_address # Log dashboard URI log.info( f"Dask dashboard available at: {cluster.dashboard_link}") else: # Create local cluster log.info("Creating LocalCluster") cluster = LocalCluster() log.info("Created LocalCluster") # Set distributed_executor_address distributed_executor_address = cluster.scheduler_address # Log dashboard URI log.info( f"Dask dashboard available at: {cluster.dashboard_link}") # Use dask cluster exe = DaskExecutor(distributed_executor_address) try: # Configure your flow with Flow("mti_nma") as flow: # If your step utilizes dask pass the executor address # If you want to clean the local staging directories pass clean # If you want to utilize some debugging functionality pass debug # If you don't utilize any of these, just pass the parameters you need. 
if "Nuc" in structs: struct = "Nuc" ld_nuc_df = loaddata_nuc(distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) sh_nuc_df = shparam_nuc(sc_df=ld_nuc_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) avg_nuc_df = avgshape_nuc(sh_df=sh_nuc_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) nma_nuc_df = nma_nuc(avg_df=avg_nuc_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) if "Cell" in structs: struct = "Cell" sc_cell_df = single_cell(distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) sh_cell_df = shparam_cell(sc_df=sc_cell_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) avg_cell_df = avgshape_cell(sh_df=sh_cell_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) nma_cell_df = nma_cell(avg_df=avg_cell_df, distributed_executor_address= distributed_executor_address, clean=clean, debug=debug, struct=struct, **kwargs) # If nucleus and cell membrane were anlyzed, draw comparison plot if "Nuc" in structs and "Cell" in structs: compare_nuc_cell(nma_nuc_df, nma_cell_df) # Run flow, get ending state, and visualize pipeline flow.run(executor=exe) # Create pipeline visualization if flag is True # Note: # Flag False by default as a required package is not pip-installable # To use this feature, first `conda install graphviz` if flow_viz: flow.visualize() # Catch any error and kill the remote dask cluster except Exception as err: log.error(f"Something went wrong during pipeline run: {err}")
def test_autocompletion():
    with LocalCluster(scheduler_port=8786) as _:
        assert len(autocomplete_cluster_names(None, None, "")) == 1
        assert len(autocomplete_cluster_names(None, None, "proxy")) == 1
        assert len(autocomplete_cluster_names(None, None, "local")) == 0
def __init__(self,
             name=dask.config.get('jobqueue.name'),
             threads=dask.config.get('jobqueue.threads'),
             processes=dask.config.get('jobqueue.processes'),
             memory=dask.config.get('jobqueue.memory'),
             interface=dask.config.get('jobqueue.interface'),
             death_timeout=dask.config.get('jobqueue.death-timeout'),
             local_directory=dask.config.get('jobqueue.local-directory'),
             extra=dask.config.get('jobqueue.extra'),
             env_extra=dask.config.get('jobqueue.env-extra'),
             **kwargs):
    """ """
    # """
    # This initializer should be considered as Abstract, and never used
    # directly.
    # """
    if not self.cancel_command or not self.submit_command:
        raise NotImplementedError('JobQueueCluster is an abstract class '
                                  'that should not be instantiated.')

    # This attribute should be overridden
    self.job_header = None

    if interface:
        host = get_ip_interface(interface)
        extra += ' --interface %s ' % interface
    else:
        host = socket.gethostname()

    self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

    # Keep information on process, threads and memory, for use in
    # subclasses
    self.worker_memory = parse_bytes(memory) if memory is not None else None
    self.worker_processes = processes
    self.worker_threads = threads
    self.name = name

    self.jobs = dict()
    self.n = 0
    self._adaptive = None

    self._env_header = '\n'.join(env_extra)

    # dask-worker command line build
    dask_worker_command = ('%(python)s -m distributed.cli.dask_worker'
                           % dict(python=sys.executable))
    self._command_template = ' '.join(
        [dask_worker_command, self.scheduler.address])
    if threads is not None:
        self._command_template += " --nthreads %d" % threads
    if processes is not None:
        self._command_template += " --nprocs %d" % processes
    if memory is not None:
        self._command_template += " --memory-limit %s" % memory
    if name is not None:
        self._command_template += " --name %s" % name
        self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
    if death_timeout is not None:
        self._command_template += " --death-timeout %s" % death_timeout
    if local_directory is not None:
        self._command_template += " --local-directory %s" % local_directory
    if extra is not None:
        self._command_template += extra
def __init__(self, name=None, cores=None, memory=None, processes=None, interface=None, death_timeout=None, local_directory=None, extra=None, env_extra=None, log_directory=None, threads=None, shebang=None, python=sys.executable, config_name=None, **kwargs): """ """ # """ # This initializer should be considered as Abstract, and never used directly. # """ super(JobQueueCluster, self).__init__() if threads is not None: raise ValueError(threads_deprecation_message) if config_name is None: raise NotImplementedError( "JobQueueCluster is an abstract class that should not be instantiated." ) if name is None: name = dask.config.get("jobqueue.%s.name" % config_name) if cores is None: cores = dask.config.get("jobqueue.%s.cores" % config_name) if memory is None: memory = dask.config.get("jobqueue.%s.memory" % config_name) if processes is None: processes = dask.config.get("jobqueue.%s.processes" % config_name) if interface is None: interface = dask.config.get("jobqueue.%s.interface" % config_name) if death_timeout is None: death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name) if local_directory is None: local_directory = dask.config.get("jobqueue.%s.local-directory" % config_name) if extra is None: extra = dask.config.get("jobqueue.%s.extra" % config_name) if env_extra is None: env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name) if log_directory is None: log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name) if shebang is None: shebang = dask.config.get("jobqueue.%s.shebang" % config_name) if dask.config.get("jobqueue.%s.threads", None): warnings.warn(threads_deprecation_message) if cores is None: raise ValueError( "You must specify how many cores to use per job like ``cores=8``" ) if memory is None: raise ValueError( "You must specify how much memory to use per job like ``memory='24 GB'``" ) # This attribute should be overridden self.job_header = None if interface: extra += ["--interface", interface] kwargs.setdefault("ip", get_ip_interface(interface)) else: kwargs.setdefault("ip", "") # Bokeh diagnostics server should listen on all interfaces kwargs.setdefault("dashboard_address", ("", 8787)) self.local_cluster = LocalCluster(n_workers=0, **kwargs) # Keep information on process, cores, and memory, for use in subclasses self.worker_memory = parse_bytes( memory) if memory is not None else None self.worker_processes = processes self.worker_cores = cores self.name = name # plugin for tracking job status self._scheduler_plugin = JobQueuePlugin() self.local_cluster.scheduler.add_plugin(self._scheduler_plugin) self._adaptive = None self.shebang = shebang self._env_header = "\n".join(env_extra) # dask-worker command line build dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict( python=python) command_args = [dask_worker_command, self.scheduler.address] command_args += ["--nthreads", self.worker_process_threads] if processes is not None and processes > 1: command_args += ["--nprocs", processes] command_args += ["--memory-limit", self.worker_process_memory] command_args += ["--name", "%s--${JOB_ID}--" % name] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] if local_directory is not None: command_args += ["--local-directory", local_directory] if extra is not None: command_args += extra self._command_template = " ".join(map(str, command_args)) self.log_directory = log_directory if self.log_directory is not None: if not os.path.exists(self.log_directory): os.makedirs(self.log_directory)
from copy import deepcopy
import math
import dask.array as da
from distributed import Client, LocalCluster
from dask import delayed, compute
import time
import sys
from scipy import ndimage
import pandas
import time_procedures
import matplotlib
matplotlib.use('Agg')
import pyart

# Start a cluster with x workers
cluster = LocalCluster(n_workers=int(sys.argv[1]), processes=False)
client = Client(cluster)

# Input the range of dates and time wanted for the collection of images
start_year = 2005
start_month = 11
start_day = 1
start_hour = 1
start_minute = 0
start_second = 0

end_year = 2011
end_month = 5
end_day = 2
end_hour = 0
end_minute = 0
def run_sigla_pipeline(master_spreadsheet_id: str,
                       google_api_credentials_path: str,
                       db_connection_url: str):
    """
    Run the SIGLA ETL pipeline

    Parameters
    ----------
    master_spreadsheet_id: str
        The master spreadsheet id.
    google_api_credentials_path: str
        The path to Google API credentials file needed to read Google Sheets.
    db_connection_url: str
        The DB's connection url str.
    """
    log.info("Finished pipeline set up, start running pipeline")
    log.info("=" * 80)
    # Spawn local dask cluster
    cluster = LocalCluster()
    # Log the dashboard link
    log.info(f"Dashboard available at: {cluster.dashboard_link}")
    # Setup workflow
    with Flow("SIGLA Data Pipeline") as flow:
        # Delete all documents from db
        clean_up_task = _clean_up(db_connection_url)
        # Get spreadsheet ids
        spreadsheet_ids = _get_spreadsheet_ids(master_spreadsheet_id,
                                               google_api_credentials_path)
        # Extract sheets data.
        # Get back list of list of SheetData
        spreadsheets_data = _extract.map(
            spreadsheet_ids,
            unmapped(google_api_credentials_path),
            upstream_tasks=[unmapped(clean_up_task)],
        )
        # Transform list of SheetData into FormattedSheetData
        formatted_spreadsheets_data = _transform.map(
            flatten(spreadsheets_data))
        # Create institution filter
        gs_institution_filter = _create_filter_task([
            gs_format.standard_institution,
            gs_format.multiple_sigla_answer_variable,
        ])
        # Filter to list of institutional formatted sheet data
        gs_institutions_data = gs_institution_filter(
            formatted_spreadsheets_data)
        # Create composite filter
        gs_composite_filter = _create_filter_task([
            gs_format.composite_variable,
            gs_format.institution_and_composite_variable,
        ])
        # Filter to list of composite formatted sheet data
        gs_composites_data = gs_composite_filter(formatted_spreadsheets_data)
        # Load institutional data
        load_institutions_data_task = _load_institutions_data.map(
            gs_institutions_data, unmapped(db_connection_url))
        # Load composite data
        load_composites_data_task = _load_composites_data.map(
            gs_composites_data,
            unmapped(db_connection_url),
            upstream_tasks=[unmapped(load_institutions_data_task)],
        )
        # Log spreadsheets that were loaded
        _log_spreadsheets(spreadsheets_data,
                          upstream_tasks=[load_composites_data_task])

    # Run the flow
    state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
    if state.is_failed():
        raise PrefectFlowFailure(ErrorInfo({"flow_name": flow.name}))
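# Both Prefect pipelines above follow the same recipe: spawn a LocalCluster,
# build a Flow, and run it with a DaskExecutor pointed at the cluster's
# scheduler address. A minimal self-contained sketch, assuming Prefect 0.x/1.x
# (the import path for DaskExecutor varies across Prefect versions):
from distributed import LocalCluster
from prefect import Flow, task
from prefect.executors import DaskExecutor

@task
def double(x):
    return 2 * x

cluster = LocalCluster()
with Flow("demo") as flow:
    doubled = double.map(list(range(10)))

state = flow.run(executor=DaskExecutor(cluster.scheduler_address))
print(state.result[doubled].result)  # [0, 2, 4, ..., 18]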
def create_client_and_cluster(n_jobs, num_tasks, dask_kwargs, entityset_size):
    cluster = None
    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # diagnostics_port sets the default port to launch the bokeh web
        # interface; if it is set to None the web interface will not be
        # launched
        diagnostics_port = None
        if 'diagnostics_port' in dask_kwargs:
            diagnostics_port = dask_kwargs['diagnostics_port']
            del dask_kwargs['diagnostics_port']

        cpu_workers = n_jobs_to_workers(n_jobs)
        workers = min(cpu_workers, num_tasks)
        if n_jobs != -1 and workers < n_jobs:
            warning_string = "{} workers requested, but only {} workers created."
            warning_string = warning_string.format(n_jobs, workers)
            if cpu_workers < n_jobs:
                warning_string += " Not enough cpu cores ({}).".format(cpu_workers)
            if num_tasks < n_jobs:
                chunk_warning = " Not enough chunks ({}), consider reducing the chunk size"
                warning_string += chunk_warning.format(num_tasks)
            warnings.warn(warning_string)

        # Distributed's default memory_limit for a worker is 'auto'. It
        # calculates the worker memory limit as total virtual memory divided
        # by the number of cores available to the workers (always 1 for the
        # featuretools setup). This means reducing the number of workers does
        # not increase the memory limit for other workers. The featuretools
        # default is to calculate the memory limit as total virtual memory
        # divided by the number of workers. To use distributed's default
        # memory limit, set dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in dask_kwargs:
            memory_limit = dask_kwargs['memory_limit']
            del dask_kwargs['memory_limit']
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **dask_kwargs)

        # if the cluster has a bokeh port, notify the user if the port
        # number is unexpected
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)
    warned_of_memory = False
    for worker in list(client.scheduler_info()['workers'].values()):
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            logger.warn("Worker memory is between 1 to 2 times the memory"
                        " size of the EntitySet. If errors occur that do"
                        " not occur with n_jobs equals 1, this may be the "
                        "cause. See https://docs.featuretools.com/guides/parallel.html"
                        " for more information.")
            warned_of_memory = True

    return client, cluster
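# A hedged sketch of how the helper above would be driven through
# dask_kwargs (all values are illustrative):
client, cluster = create_client_and_cluster(
    n_jobs=3,
    num_tasks=12,
    dask_kwargs={'memory_limit': '4GB', 'diagnostics_port': None},
    entityset_size=int(1e9),
)
try:
    pass  # ... dispatch the feature-matrix computation through `client` ...
finally:
    client.close()
    cluster.close()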
def run_espei(run_settings): """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary. Parameters ---------- run_settings : dict Dictionary of input settings Returns ------- Either a Database (for generate parameters only) or a tuple of (Database, sampler) """ run_settings = get_run_settings(run_settings) system_settings = run_settings['system'] output_settings = run_settings['output'] generate_parameters_settings = run_settings.get('generate_parameters') mcmc_settings = run_settings.get('mcmc') # handle verbosity verbosity = { 0: logging.WARNING, 1: logging.INFO, 2: TRACE, 3: logging.DEBUG } logging.basicConfig(level=verbosity[output_settings['verbosity']], filename=output_settings['logfile']) log_version_info() # load datasets and handle i/o logging.log(TRACE, 'Loading and checking datasets.') dataset_path = system_settings['datasets'] datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json'))) if len(datasets.all()) == 0: logging.warning( 'No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.' .format(dataset_path)) apply_tags(datasets, system_settings.get('tags', dict())) add_ideal_exclusions(datasets) logging.log(TRACE, 'Finished checking datasets') with open(system_settings['phase_models']) as fp: phase_models = json.load(fp) if generate_parameters_settings is not None: refdata = generate_parameters_settings['ref_state'] excess_model = generate_parameters_settings['excess_model'] ridge_alpha = generate_parameters_settings['ridge_alpha'] aicc_penalty = generate_parameters_settings['aicc_penalty_factor'] input_dbf = generate_parameters_settings.get('input_db', None) if input_dbf is not None: input_dbf = Database(input_dbf) dbf = generate_parameters( phase_models, datasets, refdata, excess_model, ridge_alpha=ridge_alpha, dbf=input_dbf, aicc_penalty_factor=aicc_penalty, ) dbf.to_file(output_settings['output_db'], if_exists='overwrite') if mcmc_settings is not None: tracefile = output_settings['tracefile'] probfile = output_settings['probfile'] # check that the MCMC output files do not already exist # only matters if we are actually running MCMC if os.path.exists(tracefile): raise OSError( 'Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.' .format(tracefile)) if os.path.exists(probfile): raise OSError( 'Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.' .format(probfile)) # scheduler setup if mcmc_settings['scheduler'] == 'dask': _raise_dask_work_stealing() # check for work-stealing from distributed import LocalCluster cores = mcmc_settings.get('cores', multiprocessing.cpu_count()) if (cores > multiprocessing.cpu_count()): cores = multiprocessing.cpu_count() logging.warning( "The number of cores chosen is larger than available. " "Defaulting to run on the {} available cores.".format( cores)) # TODO: make dask-scheduler-verbosity a YAML input so that users can debug. 
            # Should have the same log levels as verbosity.
            scheduler = LocalCluster(n_workers=cores,
                                     threads_per_worker=1,
                                     processes=True,
                                     memory_limit=0)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info(
                )['services']['bokeh']
                logging.info(
                    "bokeh server for dask scheduler at localhost:{}".format(
                        bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info(
                "Not using a parallel scheduler. ESPEI is running MCMC on a single core."
            )
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (client.scheduler, sum(client.ncores().values())))
        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))
        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None
        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')
        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(
            dbf)
        optimizer.fit(all_symbols,
                      datasets,
                      prior=prior,
                      iterations=iterations,
                      chains_per_parameter=chains_per_parameter,
                      chain_std_deviation=chain_std_deviation,
                      deterministic=deterministic,
                      restart_trace=restart_trace,
                      tracefile=tracefile,
                      probfile=probfile,
                      mcmc_data_weights=data_weights)
        optimizer.commit()
        optimizer.dbf.to_file(output_settings['output_db'],
                              if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf
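# Stripped-down sketch of the scheduler setup above, using a stock
# dask.distributed Client in place of ESPEI's ImmediateClient (an assumption;
# ImmediateClient is ESPEI-specific). client.run() executes a function once on
# every worker, which is how the snippet pushes its logging configuration out
# to the cluster.
import logging
from distributed import Client, LocalCluster

if __name__ == "__main__":
    # memory_limit=0 disables distributed's per-worker memory management.
    cluster = LocalCluster(n_workers=2, threads_per_worker=1, memory_limit=0)
    client = Client(cluster)
    client.run(logging.basicConfig, level=logging.INFO)
    print("total cores:", sum(client.ncores().values()))
    client.close()
    cluster.close()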
def __init__(self,
             name='dask',
             queue='dav',
             project=None,
             threads_per_worker=4,
             processes=8,
             memory='7GB',
             walltime='00:30:00',
             interface=None,
             extra='',
             **kwargs):
    """
    Initialize a SLURM Cluster

    Parameters
    ----------
    name : str
        Name of worker jobs. Passed to the `#SBATCH -J` option.
    queue : str
        Destination queue for each worker job. Passed to the `#SBATCH -p`
        option.
    project : str
        Accounting string associated with each worker job. Passed to the
        `#SBATCH -A` option.
    threads_per_worker : int
        Number of threads per process.
    processes : int
        Number of processes per node.
    memory : str
        Bytes of memory that the worker can use. This should be a string
        like "7GB" that can be interpreted both by SLURM and Dask.
    walltime : str
        Walltime for each worker job.
    interface : str
        Network interface like 'eth0' or 'ib0'.
    extra : str
        Additional arguments to pass to `dask-worker`.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`.
    """
    self._template = """
#!/bin/bash

#SBATCH -J %(name)s
#SBATCH -n %(processes)d
#SBATCH -p %(queue)s
#SBATCH -A %(project)s
#SBATCH -t %(walltime)s
#SBATCH -e %(name)s.err
#SBATCH -o %(name)s.out

%(base_path)s/dask-worker %(scheduler)s \
    --nthreads %(threads_per_worker)d \
    --nprocs %(processes)s \
    --memory-limit %(memory)s \
    --name %(name)s-%(n)d \
    %(extra)s
""".lstrip()

    if interface:
        host = get_ip_interface(interface)
        extra += ' --interface %s ' % interface
    else:
        host = socket.gethostname()

    project = project or os.environ.get('SLURM_ACCOUNT')
    if not project:
        raise ValueError("Must specify a project like `project='UCLB1234'` "
                         "or set the SLURM_ACCOUNT environment variable")

    self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

    memory = memory.replace(' ', '')
    self.config = {
        'name': name,
        'queue': queue,
        'project': project,
        'threads_per_worker': threads_per_worker,
        'processes': processes,
        'scheduler': self.scheduler.address,
        'walltime': walltime,
        'base_path': dirname,
        'memory': memory,
        'extra': extra
    }
    self.jobs = dict()
    self.n = 0
    self._adaptive = None
    self._submitcmd = 'sbatch'
    self._cancelcmd = 'scancel'

    logger.debug("Job script: \n %s" % self.job_script())
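# Tiny sketch of how the job_script() logged above presumably renders: the
# class's %-style template filled in from the config dict plus a per-job
# worker index n. The template and keys here are trimmed-down assumptions,
# not the class's exact fields.
template = """#!/bin/bash
#SBATCH -J %(name)s
#SBATCH -p %(queue)s
dask-worker %(scheduler)s --name %(name)s-%(n)d
"""

config = {"name": "dask", "queue": "dav", "scheduler": "tcp://127.0.0.1:8786"}
print(template % dict(config, n=0))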
def setUp(self):
    # Load the example DAGs and spin up a local Dask cluster for the tests.
    self.dagbag = DagBag(include_examples=True)
    self.cluster = LocalCluster()
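# A matching tearDown (assumed here, not shown in the original) would keep
# the test cluster from leaking between test cases:
def tearDown(self):
    self.cluster.close()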
def test_boost_from_prediction(tree_method: str) -> None:
    if tree_method == 'approx':
        pytest.xfail(reason='test_boost_from_prediction[approx] is flaky')

    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)
    X_ = dd.from_array(X, chunksize=100)
    y_ = dd.from_array(y, chunksize=100)

    with LocalCluster(n_workers=4) as cluster:
        with Client(cluster) as _:
            model_0 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_0.fit(X=X_, y=y_)
            margin = model_0.predict(X_, output_margin=True)

            model_1 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_1.fit(X=X_, y=y_, base_margin=margin)
            predictions_1 = model_1.predict(X_, base_margin=margin)
            proba_1 = model_1.predict_proba(X_, base_margin=margin)

            cls_2 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_2.fit(X=X_, y=y_)
            predictions_2 = cls_2.predict(X_)
            proba_2 = cls_2.predict_proba(X_)

            cls_3 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_3.fit(X=X_, y=y_)
            proba_3 = cls_3.predict_proba(X_)

            # Take the largest absolute difference in predicted probabilities
            # between two identically configured models as the expected
            # run-to-run spread; use it to check that the boosted-from-margin
            # model stays within normal parameters.
            expected_variance = np.max(np.abs(proba_3 - proba_2)).compute()

            if expected_variance > 0:
                margin_variance = np.max(np.abs(proba_1 - proba_2)).compute()
                # Ensure the margin spread stays within the expected spread
                # plus an absolute slack of 0.1 (10 percentage points).
                assert np.all(margin_variance <= expected_variance + .1)
            else:
                np.testing.assert_equal(predictions_1.compute(),
                                        predictions_2.compute())
                np.testing.assert_almost_equal(proba_1.compute(),
                                               proba_2.compute())
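# Hedged single-machine sketch of the "boost from prediction" idea the test
# above exercises: a second model continues boosting from the raw margins of
# the first, passed via base_margin. The plain (non-Dask) sklearn wrapper is
# used here to keep the example self-contained.
import xgboost as xgb
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
model_0 = xgb.XGBClassifier(n_estimators=4).fit(X, y)
margin = model_0.predict(X, output_margin=True)  # raw scores, not labels
model_1 = xgb.XGBClassifier(n_estimators=4)
model_1.fit(X, y, base_margin=margin)  # continue from model_0's margins
proba = model_1.predict_proba(X, base_margin=margin)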
@pytest.fixture
def client(tmpdir):
    # Spin up a throw-away cluster whose worker files live under pytest's
    # tmpdir, hand a client to the test, then tear everything down.
    cluster = LocalCluster(local_dir=str(tmpdir))
    client = Client(cluster)
    yield client
    client.close()
    cluster.close()
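# Hypothetical test consuming the fixture above: pytest injects `client`,
# and the computation runs on the temporary local cluster.
def test_submit_roundtrip(client):
    future = client.submit(sum, [1, 2, 3])
    assert future.result() == 6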
import dask.array as da from distributed import Client, LocalCluster from sklearn.datasets import make_regression import lightgbm as lgb if __name__ == "__main__": print("loading data") X, y = make_regression(n_samples=1000, n_features=50) print("initializing a Dask cluster") cluster = LocalCluster(n_workers=2) client = Client(cluster) print("created a Dask LocalCluster") print("distributing training data on the Dask cluster") dX = da.from_array(X, chunks=(100, 50)) dy = da.from_array(y, chunks=(100, )) print("beginning training") dask_model = lgb.DaskLGBMRegressor(n_estimators=10) dask_model.fit(dX, dy) assert dask_model.fitted_ print("done training")
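# Possible follow-on to the script above (same dask_model and dX names
# assumed): predictions from a Dask model come back as a lazy dask array,
# and .compute() pulls the result into the local process.
preds = dask_model.predict(dX)
print(preds.compute()[:5])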