Exemple #1
0
def test_rechunk():
    """Check that `chunks` agrees with `array.chunks` after rechunking.

    Seven random (4, 4) dask arrays are stacked and the result is rechunked
    to several chunk shapes in turn.
    """
    blocks = [da.random.random(size=(4, 4)) for _ in range(7)]
    sa = StackedArray(blocks)

    for target_chunks in ((1, 1), (2, 2), (4, 4)):
        rechunked = sa.rechunk(target_chunks)
        assert rechunked.chunks == rechunked.array.chunks
Exemple #2
0
def test_mean_non_consistent_shape(shapes):
    """Compare `sa.mean` with `sa.array.mean` for arrays of differing shapes.

    Parameters
    ----------
    shapes : iterable
        One shape per underlying random dask array.
    """
    sa = StackedArray([da.random.random(shape) for shape in shapes])

    # Axes under test: None, -1, and every prefix list [0, ..., k-1] for
    # k in range(ndim) (this includes the empty list for k == 0).
    axes = [None, -1]
    axes.extend(list(range(k)) for k in range(len(sa.shape)))

    for axis in axes:
        note(f"shapes: {shapes}, axis: {axis}")
        actual = sa.mean(axis=axis)
        expected = sa.array.mean(axis=axis)
        np.testing.assert_array_almost_equal(actual, expected)
Exemple #3
0
def test_dot_non_consistent_shape(shapes):
    """Compare `sa.dot(y)` with `sa.array.dot(y)` for differing sub-array shapes.

    Parameters
    ----------
    shapes : iterable
        One shape per underlying random dask array; examples where any shape
        has no dimension larger than 1 are rejected via `assume`.
    """
    assume(all(max(shape) > 1 for shape in shapes))

    sa = StackedArray([da.random.random(shape) for shape in shapes])
    _, n_cols = sa.shape  # the stacked array is expected to be 2-D here
    y = da.random.random(n_cols)

    note(f"shapes: {shapes}, y shape: {y.shape}")

    actual = sa.dot(y)
    expected = sa.array.dot(y)
    np.testing.assert_array_almost_equal(actual, expected, decimal=12)
Exemple #4
0
def test_reshape():
    """Check that reshaping keeps the requested shape, the mean and the std."""
    blocks = [da.random.random(size=(4, 4)) for _ in range(7)]
    sa = StackedArray(blocks)

    for target_shape in ((16, ), (2, 8), (4, 4), (8, 2)):
        reshaped = sa.reshape(target_shape)

        assert reshaped.shape == target_shape
        # Global statistics must be unaffected by a pure reshape.
        np.testing.assert_array_almost_equal(reshaped.mean(),
                                             sa.mean(),
                                             decimal=12)
        np.testing.assert_array_almost_equal(reshaped.std(),
                                             sa.std(),
                                             decimal=12)
Exemple #5
0
def test_dot_constant_shape_2D(shape):
    """Compare `sa.dot(y)` with `sa.array.dot(y)` for identically shaped 2-D arrays.

    Parameters
    ----------
    shape : tuple
        Two-element (N, P) shape used for each of the seven sub-arrays.
    """
    N, P = shape
    sa = StackedArray([da.random.random(size=(N, P)) for _ in range(7)])

    # First a 1-D right-hand side of length P, then a 2-D one of shape (P, 2).
    for y_size in (P, (P, 2)):
        y = da.random.random(y_size)
        actual = sa.dot(y)
        expected = sa.array.dot(y)
        np.testing.assert_array_almost_equal(actual, expected, decimal=12)
Exemple #6
0
def test_dot_2D_1D(shape):
    """Compare `sa.dot(y)` with `sa.array.dot(y)` for mixed 2-D / 1-D sub-arrays.

    The stacked array is built from sub-arrays of shapes (N, P), (N, 1),
    (P, ) and (1, P).

    Parameters
    ----------
    shape : tuple
        Two-element (N, P) shape; examples with N <= 1 or P <= 1 are rejected
        via `assume`.
    """
    N, P = shape
    assume(N > 1)
    assume(P > 1)
    sa = StackedArray([
        da.random.random(size=(N, P)),
        da.random.random(size=(N, 1)),
        da.random.random(size=(P, )),
        da.random.random(size=(1, P))
    ])

    for size in [(P, 2), (P, )]:
        y = da.random.random(size=size)
        # The assertion lives inside the loop so that every right-hand-side
        # shape is checked, not just the last one generated.
        np.testing.assert_array_almost_equal(sa.dot(y),
                                             sa.array.dot(y),
                                             decimal=12)
Exemple #7
0
def test_mask_imputation(shape, axis, clip_value):
    """Check that the filled array plus its mask reproduces the expected array.

    Entries of a random array below `clip_value` are set to NaN, then
    `utils.mask_imputation` is asked to impute them with ones along `axis`.
    Stacking the returned filled array and mask must equal the same random
    array with those entries set to 1.

    Parameters
    ----------
    shape : tuple
        Two-element (n, p) shape; both entries must exceed 1.
    axis : int-like
        Mask axis; converted with `int`.
    clip_value : float
        Threshold below which entries are treated as missing.
    """
    assume(shape[0] > 1 and shape[1] > 1)
    axis = int(axis)

    n, p = shape

    with_nans = da.random.random(size=shape)
    expected = with_nans.copy()

    with_nans[with_nans < clip_value] = float('nan')
    expected[expected < clip_value] = 1

    # One imputation value per row when axis is truthy, else one per column.
    values = da.ones(n) if axis else da.ones(p)

    try:
        filled, mask = utils.mask_imputation(with_nans,
                                             mask_values=values,
                                             mask_axis=axis)
    except ValueError:
        # A ValueError is only acceptable when there was nothing to mask.
        assert np.count_nonzero(np.isnan(with_nans)) == 0
    else:
        assume(mask.compute().data.size > 0)

        combined = StackedArray((filled, mask))

        np.testing.assert_array_equal(expected, combined.array)
Exemple #8
0
def test___getitem__():
    """Check that indexing a StackedArray matches indexing its `.array`."""
    sa = StackedArray([da.random.random(size=(10, 10)) for _ in range(7)])

    everything = slice(None)
    # Scalar element, first row, first column, and the full array.
    selections = [
        (0, 0),
        (0, everything),
        (everything, 0),
        (everything, everything),
    ]
    for selection in selections:
        np.testing.assert_array_almost_equal(sa[selection],
                                             sa.array[selection],
                                             decimal=12)
Exemple #9
0
def load_stacked_array(file: Path, **kwargs):
    """Load every entry directly under `file` and stack them.

    Parameters
    ----------
    file : Path
        Location whose first-level directories and files are the sub-arrays.
    **kwargs
        Forwarded unchanged to `load_array_from_disk` for each sub-array.

    Returns
    -------
    StackedArray
        Built from the loaded sub-arrays, in the order produced by
        `sort_files_by_int`.
    """
    dirs, files = walk_one_level(file)
    ordered_names = sort_files_by_int(dirs + files)

    loaded = []
    for name in ordered_names:
        loaded.append(load_array_from_disk(Path(file, name), **kwargs))
    return StackedArray(loaded)
Exemple #10
0
def test_underlying_arrays():
    """Check that `sa.arrays` holds the very objects passed to the constructor."""
    inputs = [da.random.random(size=(4, )) for _ in range(3)]

    sa = StackedArray(inputs)

    # Compare by identity, not by value.
    stored_ids = {id(stored) for stored in sa.arrays}

    for original in inputs:
        assert id(original) in stored_ids
Exemple #11
0
def test__getitem__non_consistent_shape(shapes_indicies):
    """Check indexing of a StackedArray built from differently shaped arrays.

    Parameters
    ----------
    shapes_indicies : tuple
        Three-element tuple ``(shapes, indices, shape)``: the shapes of the
        underlying arrays, the index expression to apply, and the expected
        shape of the combined array.
    """
    shapes, indices, shape = shapes_indicies
    # A distinct loop variable avoids visually shadowing the expected `shape`.
    sa = StackedArray([da.random.random(sub_shape) for sub_shape in shapes])
    note(f"shapes: {shapes}, indices: {indices}")
    np.testing.assert_array_equal(sa.array.shape, shape)
    # Only compare values when the selection is non-empty. `np.prod` is used
    # because the `np.product` alias was removed in NumPy 2.0.
    if np.prod(sa.array[indices].shape) > 0:
        np.testing.assert_array_almost_equal(sa[indices],
                                             sa.array[indices],
                                             decimal=12)
Exemple #12
0
def test_reduce_arrays_sizeN():
    """Check a StackedArray of N arrays against their `da.add` reduction.

    N runs from 3 to 14 and both `tree_reduce` settings are exercised.
    """
    for n_arrays in range(3, 15):
        parts = [da.random.random(size=(4, )) for _ in range(n_arrays)]

        expected = reduce(da.add, parts)

        for use_tree in (True, False):
            sa = StackedArray(parts, tree_reduce=use_tree)

            np.testing.assert_array_almost_equal(sa, expected, decimal=12)
Exemple #13
0
def test_reduce_arrays_size3():
    """Check a StackedArray of three arrays against their plain sum.

    Both `tree_reduce` settings are exercised.
    """
    parts = [da.random.random(size=(4, )) for _ in range(3)]
    expected = parts[0] + parts[1] + parts[2]

    for use_tree in (True, False):
        sa = StackedArray(parts, tree_reduce=use_tree)

        np.testing.assert_array_almost_equal(sa, expected, decimal=14)
Exemple #14
0
def test_reduce_arrays_size2():
    """Check a StackedArray of two arrays equals their plain sum exactly.

    Both `tree_reduce` settings are exercised.
    """
    first = da.random.random(size=(4, ))
    second = da.random.random(size=(4, ))
    expected = first + second

    for use_tree in (True, False):
        sa = StackedArray([first, second], tree_reduce=use_tree)

        np.testing.assert_array_equal(sa, expected)
Exemple #15
0
def test_persist():
    """Check that computing on a persisted StackedArray is much faster.

    Each underlying array has an expensive, value-neutral term attached, so
    the un-persisted graph is slow to evaluate. The mean of the persisted
    array must take at most a tenth of the time of the un-persisted mean.

    NOTE(review): this is a wall-clock comparison and is therefore sensitive
    to machine load.
    """
    def delay_array(x):
        # slow operation takes about 2 seconds on my computer
        # Integer size (10**9) rather than the float literal 1e9, since array
        # sizes are integers.
        d = da.mean(da.random.random(10**9))
        return d + x - d

    arrays = [delay_array(da.random.random(size=(4, ))) for _ in range(2)]

    sa = StackedArray(arrays)
    sa_persist = sa.persist()

    # perf_counter is monotonic and intended for measuring short durations,
    # unlike time.time(), which can jump if the system clock is adjusted.
    start = time.perf_counter()
    sa_persist.mean().compute()
    persist_took = time.perf_counter() - start

    start_mean = time.perf_counter()
    sa.mean().compute()
    mean_took = time.perf_counter() - start_mean
    assert persist_took <= mean_took / 10
Exemple #16
0
def test_T_2D_1D(shape):
    """Check the transpose of a StackedArray built from mixed 2-D / 1-D arrays.

    `sa.T` must match `sa.array.T`, and transposing twice must give back `sa`.

    Parameters
    ----------
    shape : tuple
        Two-element (N, P) shape; examples with N <= 1 or P <= 1 are rejected
        via `assume`.
    """
    N, P = shape
    assume(N > 1)
    assume(P > 1)

    sub_shapes = [(N, P), (N, 1), (P, ), (1, P)]
    sa = StackedArray(
        [da.random.random(size=sub_shape) for sub_shape in sub_shapes])

    transposed = sa.T
    np.testing.assert_array_almost_equal(transposed, sa.array.T)
    np.testing.assert_array_almost_equal(sa, sa.T.T)
Exemple #17
0
def test_T_dot_2D_1D(shape):
    """Check `sa.T.dot(y)` for a StackedArray built from mixed 2-D / 1-D arrays.

    Parameters
    ----------
    shape : tuple
        Two-element (N, P) shape; examples with N <= 1 or P <= 1 are rejected
        via `assume`.
    """
    N, P = shape
    assume(N > 1)
    assume(P > 1)

    sub_shapes = [(N, P), (N, 1), (P, ), (1, P)]
    sa = StackedArray(
        [da.random.random(size=sub_shape) for sub_shape in sub_shapes])

    n_rows, _ = sa.shape
    # A 2-D right-hand side with two columns, then a 1-D one.
    for y_size in ((n_rows, 2), (n_rows, )):
        y = da.random.random(size=y_size)
        # The combined array of the transpose must match exactly ...
        np.testing.assert_array_equal(sa.T.array.dot(y), sa.array.T.dot(y))
        # ... while the StackedArray's own dot only needs to match closely.
        np.testing.assert_array_almost_equal(sa.T.dot(y), sa.array.T.dot(y))
Exemple #18
0
def test_fallback_methods():
    """Check attributes and reductions of a StackedArray against its `.array`.

    `shape` and `chunks` must be equal; `std`, `mean`, `max` and `min` must
    agree to 12 decimals for axis None, 0 and 1.
    """
    sa = StackedArray([da.random.random(size=(4, 4)) for _ in range(7)])

    assert sa.shape == sa.array.shape
    assert sa.chunks == sa.array.chunks

    reductions = ('std', 'mean', 'max', 'min')
    for axis in (None, 0, 1):
        for name in reductions:
            actual = getattr(sa, name)(axis=axis)
            expected = getattr(sa.array, name)(axis=axis)
            np.testing.assert_array_almost_equal(actual,
                                                 expected,
                                                 decimal=12)
Exemple #19
0
def test_T_constant_shape_2D(shape):
    """Check the transpose of a StackedArray of identically shaped 2-D arrays.

    Parameters
    ----------
    shape : tuple
        Two-element (N, P) shape used for each of the seven sub-arrays.
    """
    N, P = shape
    blocks = [da.random.random(size=(N, P)) for _ in range(7)]
    sa = StackedArray(blocks)

    transposed = sa.T
    np.testing.assert_array_almost_equal(transposed, sa.array.T)
    # Transposing twice must be the identity.
    np.testing.assert_array_almost_equal(sa, sa.T.T)
Exemple #20
0
def test_bad_stacked_types():
    """Check that stacking a string with a dask array raises ValueError."""
    mixed_inputs = ['4', da.random.random(size=(10, 10))]
    with pytest.raises(ValueError):
        StackedArray(mixed_inputs)
Exemple #21
0
def test_bad_stacked_shapes():
    """Check that stacking 1-D arrays of lengths 2 and 3 raises ValueError."""
    mismatched = [da.random.random(size=(length, )) for length in (2, 3)]
    with pytest.raises(ValueError):
        StackedArray(mismatched)
Exemple #22
0
def test_non_stacked():
    """Check that passing a bare dask array (not a sequence) raises ValueError."""
    bare_array = da.random.random(size=(10, 10))
    with pytest.raises(ValueError):
        StackedArray(bare_array)
Exemple #23
0
def make_snp_array(array: da.core.Array,
                   mean: bool = True,
                   std: bool = True,
                   std_method: str = 'binom',
                   dtype='int8',
                   rechunk: Union[bool, str, dict] = 'auto',
                   mask_method: str = 'mean',
                   mask_nan: bool = True) -> da.core.Array:
    """Build a SNP array from `array`.

    The returned array:
        1. has NaN values replaced by imputed values following 'mask_method',
           if 'mask_nan' is True
        2. is cast to 'dtype'
        3. is rechunked according to 'rechunk'
        4. has the column means subtracted, if 'mean' is True
        5. is scaled by the inverse standard deviation computed according to
           'std_method', if 'std' is True

    Parameters
    ----------
    array : da.core.Array, shape (N, P)
        Base array to be masked, centered, and scaled (as requested).

    mean : bool
        Flag whether to center 'array'.

    std : bool
        Flag whether to scale 'array'.

    std_method : str
        Specification for how the standard deviation is computed; passed to
        `get_array_moments`.

    dtype : str
        Numpy datatype the array is cast into, after NaN masking if that was
        requested by 'mask_nan'.

    rechunk : bool, str, dict
        If truthy, the array is rechunked after the cast. A dict is expanded
        as keyword arguments to `rechunk`; any other value is passed as the
        single positional argument.

        See https://docs.dask.org/en/latest/array-api.html#dask.array.rechunk

    mask_method : str
        Method for imputing values to replace NaN values.

        If 'mask_method' == 'mean' and 'mean' is True, the already computed
        column means are handed to `mask_imputation`; otherwise
        'mask_method' itself is forwarded.

    mask_nan : bool
        Flag whether to attempt NaN masking.

    Returns
    -------
    array : da.core.Array
        NOTE(review): when masking, centering or scaling is applied, the
        value returned is a StackedArray / ChainedArray wrapper rather than a
        bare dask array.

    Notes
    -----
    It is assumed that NaN values are sparse, if they exist.
    If this is not the case, the performance will be quite slow.

    `mask_imputation` is called with fill_value=0. Per the original notes,
    NaN locations are recorded as coords for a COO sparse array (behaviour of
    `mask_imputation`, not visible in this function).

    SNP = ((array + mask) - U)D
            ^^^^^^^^^^^^    |  |
            Masked Array    |  |
            ^^^^^^^^^^^^^^^^^  |
            Centered Array     |
            ^^^^^^^^^^^^^^^^^^^^
            Scaled Array

    """
    mean_array, std_array = get_array_moments(array,
                                              mean=mean,
                                              std=std,
                                              std_method=std_method,
                                              axis=0)

    imputed = False
    if mask_nan:
        try:
            if mask_method == 'mean' and mean:
                # Reuse the column means computed above as imputation values.
                array, mask_array = mask_imputation(array,
                                                    mean_array,
                                                    fill_value=0,
                                                    mask_axis=0)
            else:
                array, mask_array = mask_imputation(array,
                                                    mask_method=mask_method,
                                                    fill_value=0,
                                                    mask_axis=0)
        except ValueError:
            # Deliberate best-effort: a ValueError from mask_imputation leaves
            # the array unmasked (presumably there was nothing to mask).
            pass
        else:
            imputed = True

    array = array.astype(dtype)

    if rechunk:
        array = (array.rechunk(**rechunk)
                 if isinstance(rechunk, dict) else array.rechunk(rechunk))

    # Layer the corrections on top of the compact base array:
    # mask first, then centering, then scaling.
    if imputed:
        array = StackedArray((array, mask_array))

    if mean:
        array = StackedArray((array, -mean_array))

    if std:
        array = ChainedArray((array, 1 / std_array))

    return array
Exemple #24
0
def test_StackedArray_of_StackedArrays():
    """Check that a StackedArray can be constructed from StackedArrays."""
    inner_stacks = []
    for _ in range(2):
        blocks = [da.random.random(size=(4, 4)) for _ in range(2)]
        inner_stacks.append(StackedArray(blocks))

    # Construction alone is the test: it must not raise.
    StackedArray(inner_stacks)
Exemple #25
0
def test_mean_consistent_shape():
    """Compare `sa.mean` with `sa.array.mean` over many axis choices.

    Seven random (10, 10, 10) arrays are stacked; single axes, axis tuples,
    a negative axis and None are all exercised.
    """
    blocks = [da.random.random(size=(10, 10, 10)) for _ in range(7)]
    sa = StackedArray(blocks)

    axes = (None, 0, 1, 2, (0, 1), -1, (1, 2), (0, 1, 2))
    for axis in axes:
        actual = sa.mean(axis=axis)
        expected = sa.array.mean(axis=axis)
        np.testing.assert_array_almost_equal(actual, expected)