def __init__(self, width, node_config: NodeConfig, node_count: int):
    # Relies on names assumed to be defined elsewhere in the module/class:
    # row_count, row.Row, margin_ratio, bound_radius_scale, and toolz's pipe/curried.
    self.width = width
    self.node_config = node_config
    self.node_count = node_count
    self.row_count = row_count(
        self.width,
        self.node_config.margin_function,
        self.node_config.radius_scale,
        self.node_count,
    )
    self.radius_scale = self.node_config.radius_scale.domain([0, self.row_count])
    self.rows = pipe(
        range(self.row_count),
        curried.map(self.bound_radius_scale),
        curried.map(lambda radius: row.Row(width, margin_ratio * radius, radius)),
        list,
    )
    # Map each node index to its row: cumulative row lengths give the
    # half-open index range covered by each row.
    self.coordinates_lookup = pipe(
        self.rows,
        curried.map(operator.attrgetter("length")),  # per-row node counts
        curried.accumulate(operator.add),
        lambda it: chain([0], it),
        curried.sliding_window(2),
        curried.map(lambda t: range(*t)),
        list,
    )
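# A minimal, self-contained sketch (with made-up row lengths) of what the
# coordinates_lookup pipeline above computes: the half-open index range of
# nodes covered by each row.
import operator
from itertools import chain
from toolz import pipe, curried

row_lengths = [3, 5, 7]
lookup = pipe(
    row_lengths,
    curried.accumulate(operator.add),  # 3, 8, 15
    lambda it: chain([0], it),         # 0, 3, 8, 15
    curried.sliding_window(2),         # (0, 3), (3, 8), (8, 15)
    curried.map(lambda t: range(*t)),  # range(0, 3), range(3, 8), range(8, 15)
    list,
)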
def getem(arr, blockdims=None, blockshape=None, shape=None):
    """ Dask getting various chunks from an array-like

    >>> getem('X', blockshape=(2, 3), shape=(4, 6))  # doctest: +SKIP
    {('X', 0, 0): (getitem, 'X', (slice(0, 2), slice(0, 3))),
     ('X', 1, 0): (getitem, 'X', (slice(2, 4), slice(0, 3))),
     ('X', 1, 1): (getitem, 'X', (slice(2, 4), slice(3, 6))),
     ('X', 0, 1): (getitem, 'X', (slice(0, 2), slice(3, 6)))}

    >>> getem('X', blockdims=((2, 2), (3, 3)))  # doctest: +SKIP
    {('X', 0, 0): (getitem, 'X', (slice(0, 2), slice(0, 3))),
     ('X', 1, 0): (getitem, 'X', (slice(2, 4), slice(0, 3))),
     ('X', 1, 1): (getitem, 'X', (slice(2, 4), slice(3, 6))),
     ('X', 0, 1): (getitem, 'X', (slice(0, 2), slice(3, 6)))}
    """
    if not blockdims:
        blockdims = blockdims_from_blockshape(shape, blockshape)

    cumdims = [list(accumulate(add, (0,) + bds[:-1])) for bds in blockdims]
    keys = list(product([arr], *[range(len(bds)) for bds in blockdims]))

    shapes = product(*blockdims)
    starts = product(*cumdims)

    values = ((getitem, arr) + (tuple(slice(s, s + dim)
                                      for s, dim in zip(start, shape)),)
              for start, shape in zip(starts, shapes))

    return dict(zip(keys, values))
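# The offset arithmetic inside getem, pulled out on its own (a hedged
# sketch, not part of the original source): cumulative chunk sizes give
# each block's start offset, and pairing starts with shapes yields slices.
from itertools import product
from operator import add
from toolz import accumulate

blockdims = ((2, 2), (3, 3))                       # chunk sizes per axis
cumdims = [list(accumulate(add, (0,) + bds[:-1]))  # chunk start offsets -> [[0, 2], [0, 3]]
           for bds in blockdims]
starts = list(product(*cumdims))                   # (0, 0), (0, 3), (2, 0), (2, 3)
shapes = list(product(*blockdims))                 # (2, 3) for every block here
slices = [tuple(slice(s, s + dim) for s, dim in zip(start, shape))
          for start, shape in zip(starts, shapes)]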
def insert_to_ooc(out, arr):
    from threading import Lock
    lock = Lock()
    # Start/stop offsets of every block along each axis.
    locs = [[0] + list(accumulate(add, bl)) for bl in arr.blockdims]

    def store(x, *args):
        # Write one computed block into its slice of `out`, serialized by the lock.
        with lock:
            ind = tuple(slice(loc[i], loc[i + 1]) for i, loc in zip(args, locs))
            out[ind] = np.asanyarray(x)
        return None

    name = 'store-%s' % arr.name
    return dict(((name,) + t[1:], (store, t) + t[1:])
                for t in core.flatten(arr._keys()))
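# Hedged usage sketch (not from the original source): insert_to_ooc only
# builds the task graph; running it with a scheduler performs the writes.
# Assumes the old dask API shown in this file (from_array with blockshape=).
import numpy as np
from toolz import merge
from dask.threaded import get

x = from_array(np.ones((4, 4)), blockshape=(2, 2))
out = np.empty(x.shape, dtype='f8')        # any target supporting __setitem__
dsk = insert_to_ooc(out, x)
get(merge(x.dask, dsk), list(dsk.keys()))  # each task writes one block into `out`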
def fromfunction(func, shape=None, blockshape=None, blockdims=None, dtype=None):
    name = next(fromfunction_names)
    if shape and blockshape and not blockdims:
        blockdims = blockdims_from_blockshape(shape, blockshape)

    keys = list(product([name], *[range(len(bd)) for bd in blockdims]))
    aggdims = [list(accumulate(add, (0,) + bd[:-1])) for bd in blockdims]
    offsets = list(product(*aggdims))
    shapes = list(product(*blockdims))
    values = [(np.fromfunction, offset_func(func, offset), shape)
              for offset, shape in zip(offsets, shapes)]

    dsk = dict(zip(keys, values))

    return Array(dsk, name, blockdims=blockdims, dtype=dtype)
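# Hedged usage sketch (assumes the surrounding module context above): each
# block's task wraps func with offset_func so that every per-block
# np.fromfunction call sees global, not block-local, coordinates.
x = fromfunction(lambda i, j: i + j, shape=(4, 4), blockshape=(2, 2), dtype='f8')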
def spatial_learning_curve_splitter(train_data: pd.DataFrame,
                                    space_column: str,
                                    time_column: str,
                                    training_limit: DateType,
                                    holdout_gap: timedelta = timedelta(days=0),
                                    train_percentages: Iterable[float] = (0.25, 0.5, 0.75, 1.0),
                                    random_state: int = None) -> SplitterReturnType:
    """
    Splits the data for a spatial learning curve. Progressively adds more and
    more examples to the training in order to verify the impact of having more
    data available on a validation set.

    The validation set starts after the training set, with an optional time gap.

    Similar to the temporal learning curves, but with spatial increases in the
    training set.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split for learning curve estimation.

    space_column : str
        The name of the ID column of `train_data`.

    time_column : str
        The name of the temporal column of `train_data`.

    training_limit : datetime or str
        The date limiting the training (after which the holdout begins).

    holdout_gap : timedelta
        The gap between the end of training and the start of the holdout.
        If you have censored data, use a gap similar to the censor time.

    train_percentages : list or tuple of floats
        A list containing the percentages of IDs to use in the training.
        Defaults to (0.25, 0.5, 0.75, 1.0). For example: for the default value,
        there would be four model trainings, containing respectively 25%, 50%,
        75%, and 100% of the IDs that are not part of the held out set.

    random_state : int
        A seed for the random number generator that shuffles the IDs.
    """
    if np.min(train_percentages) < 0 or np.max(train_percentages) > 1:
        raise ValueError('Train percentages must be between 0 and 1')

    if isinstance(training_limit, str):
        training_limit = datetime.strptime(training_limit, "%Y-%m-%d")

    if training_limit < train_data[time_column].min() or training_limit > train_data[time_column].max():
        raise ValueError('Temporal training limit should be within datasets temporal bounds (min and max times)')

    if timedelta(days=0) > holdout_gap:
        raise ValueError('Holdout gap cannot be negative')

    if holdout_gap >= (train_data[time_column].max() - training_limit):
        raise ValueError('After taking the gap into account, there should be enough time for the holdout set')

    train_data = train_data.reset_index()

    # We need to sample the space column before getting its unique values so
    # their order in the DF won't matter here
    spatial_ids = train_data[space_column].sample(frac=1, random_state=random_state).unique()

    cumulative_ids = pipe(
        spatial_ids,
        lambda ids: (np.array(train_percentages) * len(ids)).astype(int),  # Get the corresponding indices for each %
        lambda idx: np.split(spatial_ids, idx)[:-1],  # Split spatial ids by the indices
        lambda l: map(lambda x: x.tolist(), l),  # Transform sub-arrays into sub-lists
        lambda l: filter(None, l),  # Drop empty sub-lists
        accumulate(operator.add)  # Cumulative sum of lists
    )

    validation_set = train_data[train_data[time_column] > (training_limit + holdout_gap)]
    train_data = train_data[train_data[time_column] <= training_limit]

    folds = [(train_data[train_data[space_column].isin(ids)][time_column],
              validation_set[time_column]) for ids in cumulative_ids]

    folds_indices = _lc_fold_to_indexes(folds)  # final formatting with idx

    logs = [assoc(learner, "percentage", p)
            for learner, p in zip(map(_log_time_fold, folds), train_percentages)]

    return folds_indices, logs
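# Hedged usage sketch with hypothetical column names and data (the helpers
# _lc_fold_to_indexes and _log_time_fold are assumed available, as above):
import pandas as pd

df = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3],
    "date": pd.to_datetime(["2020-01-01", "2020-02-01", "2020-01-15",
                            "2020-03-01", "2020-02-10", "2020-04-01"]),
})
folds_indices, logs = spatial_learning_curve_splitter(
    df, space_column="id", time_column="date",
    training_limit="2020-02-15", random_state=42)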
def concatenate(seq, axis=0):
    """
    Concatenate arrays along an existing axis

    Given a sequence of dask Arrays form a new dask Array by stacking them
    along an existing dimension (axis=0 by default)

    Example
    -------

    Create slices

    >>> import dask.array as da
    >>> import numpy as np

    >>> data = [from_array(np.ones((4, 4)), blockshape=(2, 2))
    ...          for i in range(3)]

    >>> x = da.concatenate(data, axis=0)
    >>> x.shape
    (12, 4)

    >>> da.concatenate(data, axis=1).shape
    (4, 12)

    Result is a new dask Array

    See Also:
        stack
    """
    n = len(seq)
    ndim = len(seq[0].shape)
    if axis < 0:
        axis = ndim + axis
    if axis >= ndim:
        raise ValueError("Axis must be less than the number of dimensions"
                         "\nData has %d dimensions, but got axis=%d" % (ndim, axis))

    bds = [a.blockdims for a in seq]

    if not all(len(set(bds[i][j] for i in range(n))) == 1
               for j in range(len(bds[0])) if j != axis):
        raise ValueError("Block shapes do not align")

    shape = (seq[0].shape[:axis]
             + (sum(a.shape[axis] for a in seq),)
             + seq[0].shape[axis + 1:])
    blockdims = (seq[0].blockdims[:axis]
                 + (sum([bd[axis] for bd in bds], ()),)
                 + seq[0].blockdims[axis + 1:])

    name = next(concatenate_names)
    keys = list(product([name], *[range(len(bd)) for bd in blockdims]))

    cum_dims = [0] + list(accumulate(add, [len(a.blockdims[axis]) for a in seq]))
    names = [a.name for a in seq]
    values = [(names[bisect(cum_dims, key[axis + 1]) - 1],)
              + key[1:axis + 1]
              + (key[axis + 1] - cum_dims[bisect(cum_dims, key[axis + 1]) - 1],)
              + key[axis + 2:]
              for key in keys]

    dsk = dict(zip(keys, values))
    dsk2 = merge(dsk, *[a.dask for a in seq])

    if all(a._dtype is not None for a in seq):
        dt = reduce(np.promote_types, [a._dtype for a in seq])
    else:
        dt = None

    return Array(dsk2, name, shape, blockdims=blockdims, dtype=dt)
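# To see how concatenate routes output block indices back to source arrays,
# here is the cum_dims/bisect arithmetic in isolation (plain itertools here
# rather than the toolz accumulate used above):
from bisect import bisect
from itertools import accumulate

cum_dims = [0] + list(accumulate([2, 2, 2]))  # three inputs, 2 blocks each -> [0, 2, 4, 6]
for out_block in range(6):
    src = bisect(cum_dims, out_block) - 1     # which input array owns this block
    local = out_block - cum_dims[src]         # block index within that input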