Example 1
    # Assumes: import operator; from itertools import chain;
    # from toolz import pipe, curried; plus the local row module,
    # NodeConfig, and the row_count helper.
    def __init__(self, width, node_config: NodeConfig, node_count: int):
        self.width = width
        self.node_config = node_config
        self.node_count = node_count

        self.row_count = row_count(
            self.width,
            self.node_config.margin_function,
            self.node_config.radius_scale,
            self.node_count,
        )

        # Rescale the radius scale so its domain covers the computed rows.
        self.radius_scale = self.node_config.radius_scale.domain([0, self.row_count])

        # Build one Row per row index, each with a radius taken from the
        # scale. margin_ratio is assumed to be defined in an enclosing scope.
        self.rows = pipe(
            range(self.row_count),
            curried.map(self.bound_radius_scale),
            curried.map(lambda radius: row.Row(width, margin_ratio * radius, radius)),
            list,
        )
        # Map each row to the range of node indices it holds: take every
        # row's length, accumulate the running totals, prepend 0, and pair
        # consecutive totals into half-open ranges.
        self.coordinates_lookup = pipe(
            self.rows,
            curried.map(operator.attrgetter("length")),
            curried.accumulate(operator.add),
            lambda it: chain([0], it),
            curried.sliding_window(2),
            curried.map(lambda t: range(*t)),
            list,
        )
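
For reference, a minimal self-contained sketch of the lookup-building pattern above, using plain toolz; the row lengths here are made up for illustration:

import operator
from itertools import chain

from toolz import pipe
from toolz import curried

# Hypothetical row lengths, standing in for the Row.length values above.
lengths = [1, 6, 12]

lookup = pipe(
    lengths,
    curried.accumulate(operator.add),  # 1, 7, 19
    lambda it: chain([0], it),         # 0, 1, 7, 19
    curried.sliding_window(2),         # (0, 1), (1, 7), (7, 19)
    curried.map(lambda t: range(*t)),  # range(0, 1), range(1, 7), range(7, 19)
    list,
)
assert lookup[1] == range(1, 7)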
Example 2
from itertools import product
from operator import add, getitem

from toolz import accumulate


# Assumes blockdims_from_blockshape is defined elsewhere in the module.
def getem(arr, blockdims=None, blockshape=None, shape=None):
    """ Build a dask graph for getting the various chunks out of an array-like

    >>> getem('X', blockshape=(2, 3), shape=(4, 6))  # doctest: +SKIP
    {('X', 0, 0): (getitem, 'X', (slice(0, 2), slice(0, 3))),
     ('X', 1, 0): (getitem, 'X', (slice(2, 4), slice(0, 3))),
     ('X', 1, 1): (getitem, 'X', (slice(2, 4), slice(3, 6))),
     ('X', 0, 1): (getitem, 'X', (slice(0, 2), slice(3, 6)))}

    >>> getem('X', blockdims=((2, 2), (3, 3)))  # doctest: +SKIP
    {('X', 0, 0): (getitem, 'X', (slice(0, 2), slice(0, 3))),
     ('X', 1, 0): (getitem, 'X', (slice(2, 4), slice(0, 3))),
     ('X', 1, 1): (getitem, 'X', (slice(2, 4), slice(3, 6))),
     ('X', 0, 1): (getitem, 'X', (slice(0, 2), slice(3, 6)))}
    """
    if not blockdims:
        blockdims = blockdims_from_blockshape(shape, blockshape)

    # Starting offset of each block along each axis: cumulative sums of the
    # block sizes, shifted right by one so every axis starts at 0.
    cumdims = [list(accumulate(add, (0,) + bds[:-1])) for bds in blockdims]
    # One key per block: (array name, index along axis 0, index along axis 1, ...)
    keys = list(product([arr], *[range(len(bds)) for bds in blockdims]))

    shapes = product(*blockdims)
    starts = product(*cumdims)

    # One getitem task per block, slicing from each start to start + size.
    values = ((getitem, arr) +
              (tuple(slice(s, s + dim) for s, dim in zip(start, shp)),)
              for start, shp in zip(starts, shapes))

    return dict(zip(keys, values))
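
A quick sketch of how one task in the returned graph can be evaluated by hand, assuming only NumPy and the function above; each value is a tuple of (callable, argument, index):

import numpy as np
from operator import getitem

x = np.arange(24).reshape(4, 6)
dsk = getem('X', blockshape=(2, 3), shape=(4, 6))

# Run one task manually, substituting the real array for its name 'X'.
func, _, index = dsk[('X', 1, 0)]
block = func(x, index)  # equivalent to x[2:4, 0:3]
assert block.shape == (2, 3)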
Example 3
# Assumes: import numpy as np; from operator import add;
# from toolz import accumulate; plus the dask core module.
def insert_to_ooc(out, arr):
    from threading import Lock
    lock = Lock()

    # Block boundary offsets along each axis, e.g. blockdims (2, 2) -> [0, 2, 4].
    locs = [[0] + list(accumulate(add, bl)) for bl in arr.blockdims]

    def store(x, *args):
        # Write one computed block into its slice of the target. The lock
        # serializes writes, since `out` may not be safe to mutate concurrently.
        with lock:
            ind = tuple([slice(loc[i], loc[i + 1]) for i, loc in zip(args, locs)])
            out[ind] = np.asanyarray(x)
        return None

    name = 'store-%s' % arr.name
    # One store task per block, keyed like the source block but renamed.
    return dict(((name,) + t[1:], (store, t) + t[1:])
                for t in core.flatten(arr._keys()))
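
A hypothetical usage sketch, under the old dask API this example targets (arr.dask holds the graph, arr._keys() its block keys) and using dask.core.get as a simple scheduler:

import numpy as np
from toolz import merge
from dask import core

out = np.empty(arr.shape)  # any writable, sliceable target, e.g. an h5py dataset
store_dsk = insert_to_ooc(out, arr)

# Merge the store tasks with the array's own graph and run them all;
# executing the tasks fills `out` as a side effect.
core.get(merge(arr.dask, store_dsk), list(store_dsk))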
Example 4
# Assumes: import numpy as np; from itertools import product;
# from operator import add; from toolz import accumulate; plus the module's
# Array class, blockdims_from_blockshape, offset_func, and the
# fromfunction_names iterator of fresh names.
def fromfunction(func, shape=None, blockshape=None, blockdims=None, dtype=None):
    name = next(fromfunction_names)
    if shape and blockshape and not blockdims:
        blockdims = blockdims_from_blockshape(shape, blockshape)

    # One key per block, plus each block's starting offset and shape.
    keys = list(product([name], *[range(len(bd)) for bd in blockdims]))
    aggdims = [list(accumulate(add, (0,) + bd[:-1])) for bd in blockdims]
    offsets = list(product(*aggdims))
    shapes = list(product(*blockdims))

    # Each block is built by np.fromfunction with func shifted to the
    # block's global offset.
    values = [(np.fromfunction, offset_func(func, offset), shape)
              for offset, shape in zip(offsets, shapes)]

    dsk = dict(zip(keys, values))

    return Array(dsk, name, blockdims=blockdims, dtype=dtype)
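
offset_func itself is not shown in these examples; a plausible minimal version (an assumption, not necessarily the library's actual helper) shifts each index array by the block's offset before calling func:

def offset_func(func, offset):
    # Hypothetical helper: wrap func so block-local indices are shifted
    # into global coordinates before func sees them.
    def _offset(*args):
        return func(*(arg + off for arg, off in zip(args, offset)))
    return _offset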
Example 5
# Assumes: import operator; import numpy as np; import pandas as pd;
# from datetime import datetime, timedelta; from typing import Iterable;
# from toolz import pipe, assoc; from toolz.curried import accumulate;
# plus the module's DateType, SplitterReturnType, _lc_fold_to_indexes
# and _log_time_fold helpers.
def spatial_learning_curve_splitter(train_data: pd.DataFrame,
                                    space_column: str,
                                    time_column: str,
                                    training_limit: DateType,
                                    holdout_gap: timedelta = timedelta(days=0),
                                    train_percentages: Iterable[float] = (0.25, 0.5, 0.75, 1.0),
                                    random_state: int = None) -> SplitterReturnType:
    """
    Splits the data for a spatial learning curve. Progressively adds more
    examples to the training set in order to measure, on a validation set,
    the impact of having more data available.

    The validation set starts after the training set, with an optional time gap.

    Similar to the temporal learning curves, but with spatial increases in the training set.

    Parameters
    ----------

    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split for learning curve estimation.

    space_column : str
        The name of the ID column of `train_data`.

    time_column : str
        The name of the temporal column of `train_data`.

    training_limit: datetime or str
        The date limiting the training (after which the holdout begins).

    holdout_gap: timedelta
        The gap between the end of training and the start of the holdout.
        If you have censored data, use a gap similar to the censor time.

    train_percentages: list or tuple of floats
        A list containing the percentages of IDs to use in the training.
        Defaults to (0.25, 0.5, 0.75, 1.0). With the default value, there
        would be four model trainings, using respectively 25%, 50%, 75%,
        and 100% of the IDs that are not part of the held-out set.

    random_state : int
        A seed for the random number generator that shuffles the IDs.
    """
    if np.min(train_percentages) < 0 or np.max(train_percentages) > 1:
        raise ValueError('Train percentages must be between 0 and 1')

    if isinstance(training_limit, str):
        training_limit = datetime.strptime(training_limit, "%Y-%m-%d")

    if training_limit < train_data[time_column].min() or training_limit > train_data[time_column].max():
        raise ValueError("Temporal training limit should be within the dataset's temporal bounds (min and max times)")
    if timedelta(days=0) > holdout_gap:
        raise ValueError('Holdout gap cannot be negative')
    if holdout_gap >= (train_data[time_column].max() - training_limit):
        raise ValueError('After taking the gap into account, there should be enough time for the holdout set')

    train_data = train_data.reset_index()

    # We need to sample the space column before getting its unique values so their order in the DF won't matter here
    spatial_ids = train_data[space_column].sample(frac=1, random_state=random_state).unique()

    cumulative_ids = pipe(
        spatial_ids,
        lambda ids: (np.array(train_percentages) * len(ids)).astype(int),  # Get the corresponding indices for each %
        lambda idx: np.split(spatial_ids, idx)[:-1],  # Split spatial ids by the indices
        lambda l: map(lambda x: x.tolist(), l),  # Transform sub-arrays into sub-lists
        lambda l: filter(None, l),  # Drop empty sub-lists
        accumulate(operator.add)  # Cumulative sum of lists
    )

    validation_set = train_data[train_data[time_column] > (training_limit + holdout_gap)]
    train_data = train_data[train_data[time_column] <= training_limit]

    folds = [(train_data[train_data[space_column].isin(ids)][time_column], validation_set[time_column])
             for ids in cumulative_ids]

    folds_indices = _lc_fold_to_indexes(folds)  # final formatting with idx

    logs = [assoc(learner, "percentage", p) for learner, p in zip(map(_log_time_fold, folds), train_percentages)]

    return folds_indices, logs
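
The cumulative_ids pipe is the heart of the splitter; here is a minimal sketch of the same pattern on made-up IDs (the names are illustrative only):

import operator

import numpy as np
from toolz import pipe
from toolz.curried import accumulate

spatial_ids = np.array(['a', 'b', 'c', 'd'])
train_percentages = (0.25, 0.5, 0.75, 1.0)

cumulative_ids = pipe(
    spatial_ids,
    lambda ids: (np.array(train_percentages) * len(ids)).astype(int),  # [1 2 3 4]
    lambda idx: np.split(spatial_ids, idx)[:-1],  # [['a'], ['b'], ['c'], ['d']]
    lambda l: map(lambda x: x.tolist(), l),       # sub-arrays -> sub-lists
    lambda l: filter(None, l),                    # drop empty sub-lists
    accumulate(operator.add),                     # cumulative concatenation
)
print(list(cumulative_ids))
# [['a'], ['a', 'b'], ['a', 'b', 'c'], ['a', 'b', 'c', 'd']]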
Example 6
# Assumes: import numpy as np; from bisect import bisect;
# from functools import reduce; from itertools import product;
# from operator import add; from toolz import accumulate, merge; plus the
# module's Array class and the concatenate_names iterator of fresh names.
def concatenate(seq, axis=0):
    """
    Concatenate arrays along an existing axis

    Given a sequence of dask Arrays, form a new dask Array by stacking them
    along an existing dimension (axis=0 by default)

    Example
    -------

    Create slices

    >>> import dask.array as da
    >>> import numpy as np

    >>> data = [from_array(np.ones((4, 4)), blockshape=(2, 2))
    ...          for i in range(3)]

    >>> x = da.concatenate(data, axis=0)
    >>> x.shape
    (12, 4)

    >>> da.concatenate(data, axis=1).shape
    (4, 12)

    Result is a new dask Array

    See Also:
        stack
    """
    n = len(seq)
    ndim = len(seq[0].shape)
    if axis < 0:
        axis = ndim + axis
    if axis >= ndim:
        raise ValueError("Axis must be less than the number of dimensions"
                         "\nData has %d dimensions, but got axis=%d" % (ndim, axis))

    bds = [a.blockdims for a in seq]

    # All inputs must have identical block sizes on every non-concatenated axis.
    if not all(len(set(bds[i][j] for i in range(n))) == 1
               for j in range(len(bds[0])) if j != axis):
        raise ValueError("Block shapes do not align")

    # The result's shape and blockdims: unchanged off-axis, summed on-axis.
    shape = (seq[0].shape[:axis]
             + (sum(a.shape[axis] for a in seq),)
             + seq[0].shape[axis + 1:])
    blockdims = (seq[0].blockdims[:axis]
                 + (sum([bd[axis] for bd in bds], ()),)
                 + seq[0].blockdims[axis + 1:])

    name = next(concatenate_names)
    keys = list(product([name], *[range(len(bd)) for bd in blockdims]))

    # cum_dims[k] is the number of blocks along `axis` contributed by the
    # first k inputs; bisect finds which input a given block index falls in.
    cum_dims = [0] + list(accumulate(add, [len(a.blockdims[axis]) for a in seq]))
    names = [a.name for a in seq]
    # Each output block is an alias for the matching input block: swap in
    # the source array's name and re-base the on-axis block index.
    values = [(names[bisect(cum_dims, key[axis + 1]) - 1],)
              + key[1:axis + 1]
              + (key[axis + 1] - cum_dims[bisect(cum_dims, key[axis + 1]) - 1],)
              + key[axis + 2:]
              for key in keys]

    dsk = dict(zip(keys, values))
    dsk2 = merge(dsk, *[a.dask for a in seq])

    # Promote dtypes if every input declares one; otherwise leave it unknown.
    if all(a._dtype is not None for a in seq):
        dt = reduce(np.promote_types, [a._dtype for a in seq])
    else:
        dt = None

    return Array(dsk2, name, shape, blockdims=blockdims, dtype=dt)
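
The bisect-based remapping is the subtle part; a small self-contained illustration with made-up block counts:

from bisect import bisect
from operator import add

from toolz import accumulate

# Three inputs contributing 2, 3, and 1 blocks along the concatenation axis.
block_counts = [2, 3, 1]
cum_dims = [0] + list(accumulate(add, block_counts))  # [0, 2, 5, 6]

for i in range(6):  # global block index along the concatenated axis
    src = bisect(cum_dims, i) - 1  # which input owns block i
    local = i - cum_dims[src]      # block index within that input
    print(i, '-> input', src, 'block', local)
# 0 and 1 map to input 0, 2-4 map to input 1, 5 maps to input 2.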