def __invert__(self) -> FutureWrapper:
    """Return a new wrapper around the future of ``op.not_`` applied to this one.

    The operation is submitted to the current Dask client; nothing is
    evaluated locally.
    """
    negated = get_client().submit(op.not_, self)
    return FutureWrapper.from_future(negated)
def __xor__(self, other) -> FutureWrapper:
    """Return a new wrapper around the future of ``op.xor(self, other)``.

    The operation is submitted to the current Dask client; nothing is
    evaluated locally.
    """
    xored = get_client().submit(op.xor, self, other)
    return FutureWrapper.from_future(xored)
def cross_validation(model, horizon, period=None, initial=None, parallel=None, cutoffs=None, disable_tqdm=False):
    """Cross-Validation for time series.

    Computes forecasts from historical cutoff points, which user can input.
    If not provided, begins from (end - horizon) and works backwards, making
    cutoffs with a spacing of period until initial is reached.

    When period is equal to the time interval of the data, this is the
    technique described in https://robjhyndman.com/hyndsight/tscv/ .

    Parameters
    ----------
    model: Prophet class object. Fitted Prophet model.
    horizon: string with pd.Timedelta compatible style, e.g., '5 days',
        '3 hours', '10 seconds'.
    period: string with pd.Timedelta compatible style. Simulated forecast will
        be done at every this period. If not provided, 0.5 * horizon is used.
    initial: string with pd.Timedelta compatible style. The first training
        period will include at least this much data. If not provided,
        3 * horizon is used.
    cutoffs: list of pd.Timestamp specifying cutoffs to be used during
        cross validation. If not provided, they are generated as described
        above. The list does not need to be sorted.
    parallel : {None, 'processes', 'threads', 'dask', object}
        How to parallelize the forecast computation. By default no parallelism
        is used.

        * None : No parallelism.
        * 'processes' : Parallelize with concurrent.futures.ProcessPoolExecutor.
        * 'threads' : Parallelize with concurrent.futures.ThreadPoolExecutor.
            Note that some operations currently hold Python's Global Interpreter
            Lock, so parallelizing with threads may be slower than training
            sequentially.
        * 'dask': Parallelize with Dask.
           This requires that a dask.distributed Client be created.
        * object : Any instance with a `.map` method. This method will
          be called with :func:`single_cutoff_forecast` and a sequence of
          iterables where each element is the tuple of arguments to pass to
          :func:`single_cutoff_forecast`

          .. code-block::

             class MyBackend:
                 def map(self, func, *iterables):
                     results = [
                        func(*args)
                        for args in zip(*iterables)
                     ]
                     return results
    disable_tqdm: if True it disables the progress bar that would otherwise
        show up when parallel=None

    Returns
    -------
    A pd.DataFrame with the forecast, actual value and cutoff.

    Raises
    ------
    ValueError
        If user-supplied cutoffs fall outside the usable history, or if
        ``parallel`` is neither a recognised name nor an object with ``map``.
    ImportError
        If ``parallel='dask'`` is requested but dask is not installed.
    """
    df = model.history.copy().reset_index(drop=True)
    horizon = pd.Timedelta(horizon)

    predict_columns = ['ds', 'yhat']
    if model.uncertainty_samples:
        predict_columns.extend(['yhat_lower', 'yhat_upper'])

    # Identify largest seasonality period
    period_max = 0.
    for s in model.seasonalities.values():
        period_max = max(period_max, s['period'])
    seasonality_dt = pd.Timedelta(str(period_max) + ' days')

    if cutoffs is None:
        # Set period
        period = 0.5 * horizon if period is None else pd.Timedelta(period)

        # Set initial
        initial = (max(3 * horizon, seasonality_dt)
                   if initial is None else pd.Timedelta(initial))

        # Compute Cutoffs
        cutoffs = generate_cutoffs(df, horizon, initial, period)
    else:
        # The earliest cutoff must be strictly after the first date in history.
        first_cutoff = min(cutoffs)
        if first_cutoff <= df['ds'].min():
            raise ValueError(
                "Minimum cutoff value is not strictly greater than min date in history"
            )
        # max value of cutoffs is <= (end date minus horizon)
        end_date_minus_horizon = df['ds'].max() - horizon
        if max(cutoffs) > end_date_minus_horizon:
            raise ValueError(
                "Maximum cutoff value is greater than end date minus horizon, no value for cross-validation remaining"
            )
        # Use the earliest cutoff (not merely the first listed one) so the
        # initial window is right even when the caller passes unsorted cutoffs.
        initial = first_cutoff - df['ds'].min()

    # Check if the initial window
    # (that is, the amount of time between the start of the history and the first cutoff)
    # is less than the maximum seasonality period
    if initial < seasonality_dt:
        msg = 'Seasonality has period of {} days '.format(period_max)
        msg += 'which is larger than initial window. '
        msg += 'Consider increasing initial.'
        logger.warning(msg)

    if parallel:
        valid = {"threads", "processes", "dask"}

        if parallel == "threads":
            pool = concurrent.futures.ThreadPoolExecutor()
        elif parallel == "processes":
            pool = concurrent.futures.ProcessPoolExecutor()
        elif parallel == "dask":
            try:
                from dask.distributed import get_client
            except ImportError as e:
                raise ImportError("parallel='dask' requires the optional "
                                  "dependency dask.") from e

            pool = get_client()
            # delay df and model to avoid large objects in task graph.
            df, model = pool.scatter([df, model])
        elif hasattr(parallel, "map"):
            pool = parallel
        else:
            # sorted() keeps the message deterministic (sets are unordered).
            msg = ("'parallel' should be one of {} or an instance with a "
                   "'map' method".format(', '.join(sorted(valid))))
            raise ValueError(msg)

        # One argument tuple per cutoff, transposed into per-argument
        # iterables as expected by ``map(func, *iterables)``.
        iterables = ((df, model, cutoff, horizon, predict_columns)
                     for cutoff in cutoffs)
        iterables = zip(*iterables)

        logger.info("Applying in parallel with %s", pool)
        predicts = pool.map(single_cutoff_forecast, *iterables)
        if parallel == "dask":
            # convert Futures to DataFrames
            predicts = pool.gather(predicts)
    else:
        predicts = [
            single_cutoff_forecast(df, model, cutoff, horizon, predict_columns)
            for cutoff in (tqdm(cutoffs) if not disable_tqdm else cutoffs)
        ]

    # Combine all predicted pd.DataFrame into one pd.DataFrame
    return pd.concat(predicts, axis=0).reset_index(drop=True)
def __getitem__(self, item) -> FutureWrapper:
    """Return a new wrapper around the future of ``op.getitem(self, item)``.

    The lookup is submitted to the current Dask client; nothing is evaluated
    locally.
    """
    indexed = get_client().submit(op.getitem, self, item)
    return FutureWrapper.from_future(indexed)
def session_run_at_end():
    """Look up the Dask client for ``address``, report it, and tear it down.

    ``address`` is read from the enclosing scope. The message is printed
    before the shutdown/close calls are issued.
    """
    dask_client = get_client(address)
    message = "Closed Dask client={}\n".format(dask_client)
    print(message)
    dask_client.shutdown()
    dask_client.close()
    del dask_client
def get_chunking(filelist, chunksize, treename="Events", workers=12, skip_bad_files=False, xrootd=False, client=None, use_dask=False):
    """
    Return 2-tuple of
    - chunks: triplets of (filename,entrystart,entrystop) calculated with input `chunksize` and `filelist`
    - total_nevents: total event count over `filelist`

    Parameters
    ----------
    filelist : list of str
        Files to scan for entry counts.
    chunksize : int (or anything ``int()`` accepts)
        Maximum number of entries per chunk. Must be positive.
    treename : str
        Name of the tree whose entries are counted.
    workers : int
        Upper bound on threads used for the non-dask, non-skipping path
        (only used when there are at least 5 files).
    skip_bad_files : bool
        If True, unreadable files are reported and skipped instead of
        aborting the scan.
    xrootd : bool
        If True, rewrite each path to an xrootd URL before scanning.
    client : dask.distributed.Client, optional
        Client to use when ``use_dask`` is True; the current client is looked
        up when omitted.
    use_dask : bool
        If True, count entries on the dask cluster.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive.
    RuntimeError
        In dask mode, if a file cannot be read and ``skip_bad_files`` is False.

    Notes
    -----
    Chunks never have zero length: a file whose entry count is an exact
    multiple of ``chunksize`` (including an empty file) no longer yields a
    trailing ``(fn, n, n)`` chunk.
    """
    import uproot3
    from tqdm.auto import tqdm
    import concurrent.futures

    if xrootd:
        filelist = [
            fname.replace("/hadoop/cms", "root://redirector.t2.ucsd.edu/")
            if fname.startswith("/hadoop/cms")
            else fname.replace("/store/", "root://xrootd.t2.ucsd.edu:2040//store/")
            for fname in filelist
        ]

    chunksize = int(chunksize)
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer, got {}".format(chunksize))

    chunks = []
    nevents = 0

    def file_chunks(fn, nentries):
        # Split [0, nentries) into consecutive non-empty ranges of at most chunksize.
        return [(fn, start, min(start + chunksize, nentries))
                for start in range(0, nentries, chunksize)]

    if use_dask:
        # Imported here because dask is only required for this branch.
        from dask.distributed import as_completed
        if not client:
            from dask.distributed import get_client
            client = get_client()

        def numentries(fname):
            import uproot3
            try:
                return (fname, uproot3.numentries(fname, treename))
            except Exception:
                # -1 marks the file as unreadable; handled by the caller below.
                return (fname, -1)

        futures = client.map(numentries, filelist)
        info = [
            result
            for _, result in tqdm(as_completed(futures, with_results=True),
                                  total=len(futures))
        ]
        for fn, nentries in info:
            if nentries < 0:
                if skip_bad_files:
                    print("Skipping bad file: {}".format(fn))
                    continue
                raise RuntimeError("Bad file: {}".format(fn))
            nevents += nentries
            chunks.extend(file_chunks(fn, nentries))
    elif skip_bad_files:
        # slightly slower (serial loop), but can skip bad files
        for fname in tqdm(filelist):
            try:
                items = uproot3.numentries(fname, treename, total=False).items()
            except (IndexError, ValueError):
                print("Skipping bad file", fname)
                continue
            for fn, nentries in items:
                nevents += nentries
                chunks.extend(file_chunks(fn, nentries))
    else:
        # Only spin up a thread pool when there are enough files to benefit.
        executor = None
        if len(filelist) >= 5:
            executor = concurrent.futures.ThreadPoolExecutor(
                min(workers, len(filelist)))
        try:
            counts = uproot3.numentries(filelist, treename, total=False,
                                        executor=executor)
        finally:
            # Release the worker threads once the counts are in.
            if executor is not None:
                executor.shutdown()
        for fn, nentries in counts.items():
            nevents += nentries
            chunks.extend(file_chunks(fn, nentries))

    return chunks, nevents
def __setstate__(self, state):
    """Restore pickled state via the parent class, then re-attach a client.

    The ``client`` attribute is re-acquired with ``get_client()`` after the
    parent's ``__setstate__`` has run.
    """
    parent = super(Dask, self)
    parent.__setstate__(state)
    self.client = get_client()
def set_window(shear_zbins={}, f_sky=0.3, nside=256, mask_start_pix=0, window_cl_fact=None, unit_win=False, scheduler_info=None, mask=None, delta_W=True):
    """Attach a window map (and its noise counterpart) to every bin of ``shear_zbins``.

    For each bin ``i`` in ``range(shear_zbins['n_bins'])`` the galaxy-galaxy
    auto power spectrum is computed through a ``Skylens`` instance, and the
    entries ``'window_cl0'``, ``'window'`` and ``'window_N'`` of
    ``shear_zbins[i]`` are set. ``shear_zbins`` is modified in place and also
    returned.

    Parameters
    ----------
    shear_zbins : dict
        Bin dictionary; must provide ``'n_bins'``, ``'SN'['galaxy']`` and one
        entry per bin index.
        NOTE(review): the ``{}`` default cannot work (``'n_bins'`` is read
        unconditionally); kept only to leave the signature unchanged.
    f_sky : float
        Fraction of pixels set in the default mask.
    nside : int
        HEALPix nside of the generated maps.
    mask_start_pix : int
        First pixel of the contiguous default mask.
    window_cl_fact : float, optional
        Multiplicative factor applied to the window power spectrum.
    unit_win : bool
        If True, use a window map of ones instead of a random realisation.
    scheduler_info : dict, optional
        If given, its ``'address'`` selects the Dask client; otherwise the
        current client is used.
    mask : boolean array, optional
        Pixels to keep. When omitted, a contiguous block of
        ``int(npix * f_sky)`` pixels starting at ``mask_start_pix`` is used.
    delta_W : bool
        If True, ``window_N`` is ``sqrt(window)``. Otherwise ``window_N`` is
        ``sqrt(1 / window)`` and ``window`` is reset to 1 inside the mask.

    Returns
    -------
    dict
        The same ``shear_zbins`` object, updated.

    Raises
    ------
    ValueError
        If the computed power spectrum of a bin contains NaN.
    """
    from skylens.skylens_main import Skylens

    w_lmax = 3 * nside
    l0 = np.arange(w_lmax, dtype='int')
    corr = ('galaxy', 'galaxy')

    kappa0 = Skylens(galaxy_zbins=shear_zbins, do_cov=False, bin_cl=False,
                     l_bins=None, l=l0, use_window=False, corrs=[corr],
                     f_sky=f_sky, scheduler_info=scheduler_info)
    cl0G = kappa0.cl_tomo()

    npix0 = hp.nside2npix(nside)
    # Builtin int: the ``np.int`` alias was removed from NumPy.
    npix = int(npix0 * f_sky)
    if mask is None:
        mask = np.zeros(npix0, dtype='bool')
        mask[mask_start_pix:mask_start_pix + npix] = 1

    if scheduler_info is None:
        client = get_client()
    else:
        client = get_client(address=scheduler_info['address'])

    for i in np.arange(shear_zbins['n_bins']):
        cl_i = client.compute(cl0G['cl'][corr][(i, i)]).result()
        if np.any(np.isnan(cl_i)):
            print('survey utils, set_window:', cl_i)
            # Abort explicitly instead of relying on an undefined name.
            raise ValueError(
                'survey utils, set_window: NaN in power spectrum of bin {}'.format(i))

        if unit_win:
            cl_map = hp.ma(np.ones(12 * nside * nside))
        else:
            cl_i += shear_zbins['SN']['galaxy'][:, i, i]
            if window_cl_fact is not None:
                cl_i *= window_cl_fact
            cl_map = hp.ma(1 + hp.synfast(cl_i, nside=nside))
            # Replace non-positive window values with a small positive number.
            cl_map[cl_map <= 0] = 1.e-4
        cl_map[~mask] = hp.UNSEEN

        shear_zbins[i]['window_cl0'] = cl_i
        shear_zbins[i]['window'] = cl_map
        if delta_W:
            shear_zbins[i]['window_N'] = np.sqrt(shear_zbins[i]['window'])
            shear_zbins[i]['window_N'][~mask] = hp.UNSEEN
        else:
            print('not using delta_W window')
            shear_zbins[i]['window_N'] = np.sqrt(1. / shear_zbins[i]['window'])
            shear_zbins[i]['window_N'][~mask] = hp.UNSEEN
            shear_zbins[i]['window'][:] = 1
            shear_zbins[i]['window'][~mask] = hp.UNSEEN

    del cl0G, kappa0
    return shear_zbins
def check_sq_variance(dataset_path, dataset_id, variable, pngpath, pbar, debug=False):
    """Run ``run_chunk`` over time segments of a dataset and collect its issues.

    Opens every ``*.nc`` file under ``dataset_path`` as one dataset, reduces
    ``variable`` over the spatial/level dimensions present, computes 120-step
    rolling mean and standard deviation on the cluster, and submits up to
    ``num_segments`` (10) time slices to ``run_chunk``. The per-segment
    values returned by ``run_chunk`` are written into one array that is
    passed to ``plot_minmaxmean``.

    Parameters
    ----------
    dataset_path : str
        Directory containing the netCDF files.
    dataset_id : str
        Identifier forwarded to ``run_chunk`` and ``plot_minmaxmean``.
    variable : str
        Name of the variable to check.
    pngpath : str
        Forwarded to ``plot_minmaxmean``.
    pbar : progress bar
        Object with ``total``, ``n``, ``last_print_n`` and ``update``
        (presumably a tqdm bar; verify against caller).
    debug : bool
        Forwarded to ``plot_minmaxmean``.

    Returns
    -------
    list
        Issues reported by all segments.
        NOTE(review): when the dataset has no ``time`` coordinate, the tuple
        ``([], dataset_id)`` is returned instead, as before; confirm which
        shape callers expect.
    """
    issues = []
    client = get_client()
    num_segments = 10

    # Reset the progress bar for this dataset.
    pbar.total = num_segments + 2
    if pbar.n != 0:
        pbar.n = 0
        pbar.last_print_n = 0
    pbar.update()

    with xr.open_mfdataset(dataset_path + '/*.nc') as ds:
        # Nothing to check without a time coordinate; test this before any
        # use of ds['time'].
        if 'time' not in ds.coords:
            return [], dataset_id

        n_time = ds['time'].size
        # At most num_segments start offsets. The step is clamped to 1 so
        # short series do not produce a zero step, and the list is truncated
        # so the last segment absorbs any remainder instead of overrunning.
        step = max(n_time // num_segments, 1)
        starts = list(range(0, n_time, step))[:num_segments]
        ends = starts[1:] + [n_time]

        vmax = np.zeros(n_time)
        futures = []

        possible_dims = ['depth', 'lat', 'lon', 'plev', 'tau', 'lev', 'sector']
        # A 'sector' dimension is reduced under the name 'basin'.
        dims = tuple('basin' if name == 'sector' else name
                     for name in possible_dims if name in ds.dims)

        if 'lat' not in dims and 'lon' not in dims:
            ds['means'] = ds[variable]
        else:
            ds['means'] = client.compute(ds[variable].mean(dim=dims)).result()

        ds['means'] = ds['means'][~np.isnan(ds['means'])]
        pbar.update(1)

        # These stay as futures and are handed to every run_chunk task.
        maxrollingvar = client.compute(
            ds['means'].rolling({'time': 120}, min_periods=1).mean())
        maxrollingstd = client.compute(
            ds['means'].rolling({'time': 120}, min_periods=1).std())

        for idx, (seg, seg_end) in enumerate(zip(starts, ends)):
            chunk = ds.sel(time=ds['time'][seg:seg_end])
            futures.append(
                client.submit(run_chunk, chunk, dataset_id, maxrollingvar,
                              maxrollingstd, (seg, seg_end), idx))

        for f in as_completed(futures):
            pbar.update(1)
            vx, chunk_issues, seg, _threshold = f.result()
            vmax[seg[0]:seg[1]] = vx
            # Accumulate rather than overwrite: completion order is arbitrary,
            # so keeping only the last result would be nondeterministic.
            # Assumes run_chunk returns a list of issues (TODO confirm).
            if chunk_issues:
                issues.extend(chunk_issues)

        plot_minmaxmean(pngpath, ds, vmax, dataset_id, debug=debug)

    return issues
def _cluster_mode():
    """Return True when ``get_client()`` succeeds, False when it raises ValueError."""
    try:
        get_client()
    except ValueError:
        return False
    else:
        return True
def retry_with_timeout(func=None, retry_freq=10, n_tries=1, use_dask=True):
    """Execute ``func`` ``n_tries`` times, each time only allowing ``retry_freq``
    seconds for the function to complete.

    Can be used either by wrapping a callable directly
    (``retry_with_timeout(f, retry_freq=1)``) or as a decorator with keyword
    arguments (``@retry_with_timeout(retry_freq=1)``).

    There are two main cases where this could be useful:

    1. You have a function that you know should execute quickly, but you may get
       occasional errors when running it simultaneously on a large number of workers. An
       example of this is massively parallelized I/O operations of netcdfs on GCS.
    2. You have a function that may or may not take a long time, but you want to skip it
       if it takes too long.

    There are two possible ways that this timeout function is implemented, each with
    pros and cons:

    1. Using python's native ``threading`` module. If you are executing ``func`` outside
       of a ``dask`` worker, you likely will want this approach. It may be slightly
       faster and has the benefit of starting the timeout clock when the function starts
       executing (rather than when the function is *submitted* to a dask scheduler).
       **Note**: This approach will also work if calling ``func`` *from* a dask worker,
       but only if the cluster was set up such that ``threads_per_worker=1``. Otherwise,
       this may cause issues if used from a dask worker.
    2. Using ``dask``. If you would like a dask worker to execute this function, you
       likely will want this approach. It can be executed from a dask worker regardless
       of the number of threads per worker (see above), but has the downside that the
       timeout clock begins once ``func`` is submitted, rather than when it begins
       executing.

    Parameters
    ----------
    func : callable, optional
        The function you would like to execute with a timeout backoff. When omitted, a
        decorator configured with the remaining arguments is returned.
    retry_freq : float
        The number of seconds to wait between successive retries of ``func``.
    n_tries : int
        The number of retries to attempt before raising an error if none were successful
    use_dask : bool
        If true, will try to use the ``dask``-based implementation (see description
        above). If no ``Client`` instance is present, will fall back to
        ``use_dask=False``.

    Returns
    -------
    The return value of ``func``

    Raises
    ------
    dask.distributed.TimeoutError :
        If the function does not execute successfully in the specified ``retry_freq``,
        after trying ``n_tries`` times.
    RuntimeError :
        In the threaded implementation, if ``func`` itself raised. The original
        exception is attached as ``__cause__``.
    ValueError :
        If ``use_dask=True``, and a ``Client`` instance is present, but this function is
        executed from the client (rather than as a task submitted to a worker), you will
        get ``ValueError("No workers found")``.

    Examples
    --------
    .. code-block:: python

        >>> import time
        >>> @retry_with_timeout(retry_freq=.5, n_tries=1)
        ... def wait_func(timeout):
        ...     time.sleep(timeout)
        >>> wait_func(.1)
        >>> wait_func(1)
        Traceback (most recent call last):
            ...
        asyncio.exceptions.TimeoutError: Func did not complete successfully in allowed time/number of retries.
    """
    # Decorator-with-arguments form: hand back a decorator that remembers the options.
    if func is None:
        return functools.partial(retry_with_timeout,
                                 retry_freq=retry_freq,
                                 n_tries=n_tries,
                                 use_dask=use_dask)

    # if use_dask specified, check if there is an active client, otherwise set to false
    if use_dask:
        try:
            dd.get_client()
        except ValueError:
            use_dask = False

    @functools.wraps(func)
    def inner(*args, **kwargs):
        if use_dask:
            # dask version
            # NOTE(review): identical resubmissions may share a task key under
            # dask's default ``pure`` setting; confirm retries really re-run.
            with dd.worker_client() as client:
                for _ in range(n_tries):
                    fut = client.submit(func, *args, **kwargs)
                    try:
                        return fut.result(timeout=retry_freq)
                    except dd.TimeoutError:
                        continue
        else:
            # non-dask version
            def run(outcome):
                # Report either the result or the failure, so the caller never
                # has to infer an error from an empty queue.
                try:
                    outcome.put((True, func(*args, **kwargs)))
                except BaseException as exc:
                    outcome.put((False, exc))

            for _ in range(n_tries):
                outcome = queue.Queue()
                worker = threading.Thread(target=run, args=(outcome, ))
                worker.start()
                worker.join(timeout=retry_freq)
                if worker.is_alive():
                    # Timed out; abandon this attempt and try again.
                    continue
                succeeded, payload = outcome.get()
                if not succeeded:
                    raise RuntimeError(
                        "``func`` raised before producing a result."
                    ) from payload
                return payload

        raise dd.TimeoutError(
            "Func did not complete successfully in allowed time/number of retries."
        )

    return inner
def __init__(self):
    """Bind this object to the current Dask client.

    Raises
    ------
    AssertionError
        If ``get_client()`` raises ``ValueError``. Raised explicitly rather
        than through an ``assert`` statement, so the check still fires when
        Python runs with ``-O``.
    """
    try:
        self._client = get_client()
    except ValueError as err:
        raise AssertionError("Should connect to Dask scheduler before"
                             " initializing this object.") from err
def get_distributed_client():
    """Return the result of ``get_client()``, or None if it raises ValueError."""
    try:
        active_client = get_client()
    except ValueError:
        active_client = None
    return active_client