Exemple #1
0
Fichier : ds.py Projet : elaeon/ML
    def stadistics(self):
        """Build a text table of summary statistics, one row per group.

        For groups whose dtype equals ``np.dtype(float)`` or ``np.dtype(int)``
        the mean, standard deviation, minimum and maximum are computed and
        rounded to 3 decimals. Any of those that comes back as NaN is
        recomputed with the matching NaN-aware reduction. Percentiles
        (25/50/75) are only computed when ``self.shape[group]`` has length 1;
        otherwise they are reported as ``"-"``. For every other dtype only the
        non-null count and the number of unique values are reported.

        Side effects: assigns ``self.chunksize`` and prints the row count
        (``self.shape[0]``) to stdout.

        Returns
        -------
        The value returned by ``tabulate(table, headers)``.
        """
        headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max", "nonzero", "nonan", "unique", "dtype"]
        self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
        # (column name, NaN-aware fallback), in the order the plain
        # reductions are computed below.
        nan_aware = (
            ("mean", da.nanmean),
            ("std dev", da.nanstd),
            ("min", da.nanmin),
            ("max", da.nanmax),
        )
        table = []
        for group, (dtype, _) in self.dtypes.fields.items():
            values = {"dtype": dtype, "group": group}
            darray = self.data[group].da
            if dtype == np.dtype(float) or dtype == np.dtype(int):
                da_mean = da.around(darray.mean(), decimals=3)
                da_std = da.around(darray.std(), decimals=3)
                da_min = da.around(darray.min(), decimals=3)
                da_max = da.around(darray.max(), decimals=3)
                # A single compute call evaluates the four reductions together.
                result = dask.compute([da_mean, da_std, da_min, da_max])[0]
                for (column, fallback), value in zip(nan_aware, result):
                    # Each statistic is checked against itself; the fallback
                    # is only evaluated when that statistic is NaN.
                    if np.isnan(value):
                        value = da.around(fallback(darray), decimals=3).compute()
                    values[column] = value
                if len(self.shape[group]) == 1:
                    da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                    result = da_percentile.compute()
                    values["25%"] = result[0]
                    values["50%"] = result[1]
                    values["75%"] = result[2]
                else:
                    values["25%"] = "-"
                    values["50%"] = "-"
                    values["75%"] = "-"
                values["nonzero"] = da.count_nonzero(darray).compute()
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                values["unique"] = "-"
            else:
                for column in ("mean", "std dev", "min", "max", "25%", "50%", "75%", "nonzero"):
                    values[column] = "-"
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                # Nulls are replaced by '' before counting distinct values.
                vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
                values["unique"] = vunique

            table.append([values[column] for column in headers])

        print("# rows {}".format(self.shape[0]))
        return tabulate(table, headers)
Exemple #2
0
 def _band_hist(band_data):
     """Map ``band_data`` onto its cumulative distribution, lazily.

     Percentile values of the band are taken at ``nwidth`` evenly spaced
     levels in [0, 1) and the data is interpolated against them. ``nwidth``
     and ``approximate`` are free variables taken from the enclosing scope.
     The result is a dask array declared with the shape and dtype of
     ``band_data``.
     """
     levels = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
     percents = levels * 100.
     if not approximate:
         # Exact path: defer to NumPy's NaN-aware percentile.
         edges = dask.delayed(np.nanpercentile)(band_data, percents)
         edges = da.from_delayed(edges, shape=(nwidth,), dtype=levels.dtype)
     else:
         # da.percentile needs a 1D array and is not NaN-aware (dask < 0.17
         # returns all NaNs), so flatten and drop nulls first. Replace with
         # a nanpercentile in the future, if available.
         flattened = band_data.ravel()
         valid = flattened[da.notnull(flattened)]
         edges = da.percentile(valid, percents)
     stretched = dask.delayed(np.interp)(band_data, edges, levels)
     return da.from_delayed(stretched, shape=band_data.shape,
                            dtype=band_data.dtype)
Exemple #3
0
 def _band_hist(band_data):
     """Lazily interpolate ``band_data`` against its own percentile values.

     ``nwidth`` (number of levels) and ``approximate`` (choice of percentile
     implementation) are free variables taken from the enclosing scope. The
     returned dask array is declared with ``band_data``'s shape and dtype.
     """
     fractions = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
     as_percent = fractions * 100.
     if not approximate:
         # Exact path: NumPy's NaN-aware percentile, wrapped as a delayed call.
         delayed_bins = dask.delayed(np.nanpercentile)(band_data, as_percent)
         bin_values = da.from_delayed(delayed_bins,
                                      shape=(nwidth, ),
                                      dtype=fractions.dtype)
     else:
         # da.percentile needs a 1D array; nulls are removed beforehand
         # because dask < 0.17 returns all NaNs otherwise. Replace with a
         # nanpercentile in the future, if available.
         one_dim = band_data.ravel()
         not_null = one_dim[da.notnull(one_dim)]
         bin_values = da.percentile(not_null, as_percent)
     delayed_result = dask.delayed(np.interp)(band_data, bin_values, fractions)
     return da.from_delayed(delayed_result,
                            shape=band_data.shape,
                            dtype=band_data.dtype)
Exemple #4
0
def test_dtype_complex():
    """Check the ``_dtype`` dask reports for a range of array operations.

    Three 4x6 arrays (float32, int64, int16) are wrapped as dask arrays and
    the dtype reported after arithmetic, slicing, reductions, ufuncs and
    structured-field access is compared with a reference dtype.
    """
    np_f4 = np.arange(24).reshape((4, 6)).astype('f4')
    np_i8 = np.arange(24).reshape((4, 6)).astype('i8')
    np_i2 = np.arange(24).reshape((4, 6)).astype('i2')

    da_f4 = da.from_array(np_f4, chunks=(2, 3))
    da_i8 = da.from_array(np_i8, chunks=(2, 3))
    da_i2 = da.from_array(np_i2, chunks=(2, 3))

    def same_dtype(left, right):
        # Both sides must be real np.dtype objects with the same string form.
        if not isinstance(left, np.dtype):
            return False
        if not isinstance(right, np.dtype):
            return False
        return str(left) == str(right)

    # Plain wrapping
    assert same_dtype(da_f4._dtype, np_f4.dtype)
    assert same_dtype(da_i8._dtype, np_i8.dtype)

    # Arithmetic, transpose, slicing, dot
    assert same_dtype((da_f4 + 1)._dtype, (np_f4 + 1).dtype)
    assert same_dtype((da_f4 + da_i8)._dtype, (np_f4 + np_i8).dtype)
    assert same_dtype(da_f4.T._dtype, np_f4.T.dtype)
    assert same_dtype(da_f4[:3]._dtype, np_f4[:3].dtype)
    assert same_dtype((da_f4.dot(da_i8.T))._dtype, (np_f4.dot(np_i8.T)).dtype)

    # Combining arrays
    assert same_dtype(stack([da_f4, da_i8])._dtype,
                      np.vstack([np_f4, np_i8]).dtype)
    assert same_dtype(concatenate([da_f4, da_i8])._dtype,
                      np.concatenate([np_f4, np_i8]).dtype)

    # Reductions
    assert same_dtype(da_i8.std()._dtype, np_i8.std().dtype)
    assert same_dtype(da_i2.sum()._dtype, np_i2.sum().dtype)
    assert same_dtype(da_f4.min()._dtype, da_f4.min().dtype)
    assert same_dtype(da_i8.std()._dtype, da_i8.std().dtype)
    assert same_dtype(da_f4.argmin(axis=0)._dtype, da_f4.argmin(axis=0).dtype)

    # Element-wise functions
    assert same_dtype(da.sin(da_i2)._dtype, np.sin(np_i2).dtype)
    assert same_dtype(da.exp(da_i8)._dtype, np.exp(np_i8).dtype)
    assert same_dtype(da.floor(da_f4)._dtype, np.floor(np_f4).dtype)
    assert same_dtype(da.isnan(da_i8)._dtype, np.isnan(np_i8).dtype)
    # An ImportError raised by isnull/notnull is tolerated here.
    with ignoring(ImportError):
        assert da.isnull(da_i8)._dtype == 'bool'
        assert da.notnull(da_i8)._dtype == 'bool'

    # Structured (record) dtype: single-field and multi-field selection
    records = np.array([('a', 1)], dtype=[('text', 'S1'), ('numbers', 'i4')])
    da_records = da.from_array(records, chunks=(1,))

    assert same_dtype(da_records['text']._dtype, records['text'].dtype)
    assert same_dtype(da_records[['numbers', 'text']]._dtype,
                      records[['numbers', 'text']].dtype)
Exemple #5
0
def test_dtype_complex():
    """Verify dask's ``_dtype`` bookkeeping across many kinds of operations.

    Float32, int64 and int16 NumPy arrays of shape (4, 6) are mirrored as
    dask arrays; after each operation the dask ``_dtype`` is compared with a
    reference dtype.
    """
    floats = np.arange(24).reshape((4, 6)).astype('f4')
    longs = np.arange(24).reshape((4, 6)).astype('i8')
    shorts = np.arange(24).reshape((4, 6)).astype('i2')

    lazy_floats = da.from_array(floats, chunks=(2, 3))
    lazy_longs = da.from_array(longs, chunks=(2, 3))
    lazy_shorts = da.from_array(shorts, chunks=(2, 3))

    def dtypes_match(first, second):
        # Require genuine np.dtype instances whose string forms agree.
        both_are_dtypes = (isinstance(first, np.dtype)
                           and isinstance(second, np.dtype))
        return both_are_dtypes and str(first) == str(second)

    assert dtypes_match(lazy_floats._dtype, floats.dtype)
    assert dtypes_match(lazy_longs._dtype, longs.dtype)

    assert dtypes_match((lazy_floats + 1)._dtype, (floats + 1).dtype)
    assert dtypes_match((lazy_floats + lazy_longs)._dtype,
                        (floats + longs).dtype)
    assert dtypes_match(lazy_floats.T._dtype, floats.T.dtype)
    assert dtypes_match(lazy_floats[:3]._dtype, floats[:3].dtype)
    assert dtypes_match((lazy_floats.dot(lazy_longs.T))._dtype,
                        (floats.dot(longs.T)).dtype)

    assert dtypes_match(stack([lazy_floats, lazy_longs])._dtype,
                        np.vstack([floats, longs]).dtype)
    assert dtypes_match(concatenate([lazy_floats, lazy_longs])._dtype,
                        np.concatenate([floats, longs]).dtype)

    assert dtypes_match(lazy_longs.std()._dtype, longs.std().dtype)
    assert dtypes_match(lazy_shorts.sum()._dtype, shorts.sum().dtype)
    assert dtypes_match(lazy_floats.min()._dtype, lazy_floats.min().dtype)
    assert dtypes_match(lazy_longs.std()._dtype, lazy_longs.std().dtype)
    assert dtypes_match(lazy_floats.argmin(axis=0)._dtype,
                        lazy_floats.argmin(axis=0).dtype)

    assert dtypes_match(da.sin(lazy_shorts)._dtype, np.sin(shorts).dtype)
    assert dtypes_match(da.exp(lazy_longs)._dtype, np.exp(longs).dtype)
    assert dtypes_match(da.floor(lazy_floats)._dtype, np.floor(floats).dtype)
    assert dtypes_match(da.isnan(lazy_longs)._dtype, np.isnan(longs).dtype)
    # An ImportError raised inside this block is swallowed rather than failing.
    with ignoring(ImportError):
        assert da.isnull(lazy_longs)._dtype == 'bool'
        assert da.notnull(lazy_longs)._dtype == 'bool'

    # Structured dtype: field access by name and by list of names.
    table = np.array([('a', 1)], dtype=[('text', 'S1'), ('numbers', 'i4')])
    lazy_table = da.from_array(table, chunks=(1, ))

    assert dtypes_match(lazy_table['text']._dtype, table['text'].dtype)
    assert dtypes_match(lazy_table[['numbers', 'text']]._dtype,
                        table[['numbers', 'text']].dtype)
Exemple #6
0
def push(array, n, axis):
    """
    Dask-aware bottleneck.push

    Forward-fills NaN entries of ``array`` along ``axis``. When ``n`` is not
    None and ``0 < n < array.shape[axis] - 1``, a filled value is kept only
    where it lies at most ``n`` positions after the last non-null entry;
    everything further away is set back to NaN. Otherwise the fill is
    unlimited.
    """
    import bottleneck
    import dask.array as da
    import numpy as np

    def _carry_previous(a, b):
        # cumreduction applies the push func within every block first, so the
        # only thing left is to fill what is still NaN with the last value
        # carried over from the previous chunk.
        return np.where(np.isnan(b), a, b)

    if n is None or not (0 < n < array.shape[axis] - 1):
        # The method parameter makes that the tests for python 3.7 fails.
        return da.reductions.cumreduction(
            func=bottleneck.push,
            binop=_carry_previous,
            ident=np.nan,
            x=array,
            axis=axis,
            dtype=array.dtype,
        )

    # Positions 0..len-1 along `axis`, shaped to broadcast against `array`.
    index_shape = tuple(
        size if i == axis else 1 for i, size in enumerate(array.shape))
    positions_1d = da.arange(array.shape[axis],
                             chunks=array.chunks[axis],
                             dtype=array.dtype)
    positions = da.broadcast_to(
        positions_1d.reshape(index_shape),
        array.shape,
        array.chunks,
    )
    # Keep the position only where data is present, then forward-fill it so
    # each cell knows the index of the last valid value before it.
    last_valid = push(da.where(da.notnull(array), positions, np.nan),
                      None, axis)
    within_limit = (positions - last_valid) <= n
    # omit the forward fill that violate the limit
    filled = push(array, None, axis)
    return da.where(within_limit, filled, np.nan)
def test_isnull():
    """``da.isnull`` / ``da.notnull`` agree with NumPy's NaN mask."""
    values = np.array([1, np.nan])
    lazy_values = da.from_array(values, chunks=(2, ))
    nan_mask = np.isnan(values)
    # An ImportError raised inside this block is ignored instead of failing
    # (presumably isnull/notnull rely on an optional dependency).
    with ignoring(ImportError):
        assert_eq(da.isnull(lazy_values), nan_mask)
        assert_eq(da.notnull(lazy_values), ~nan_mask)
Exemple #8
0
def test_isnull():
    """Compare dask's null detection with ``np.isnan`` on a two-element array."""
    sample = np.array([1, np.nan])
    wrapped = da.from_array(sample, chunks=(2,))
    expected_null = np.isnan(sample)
    expected_not_null = ~expected_null
    # ImportError is tolerated here (presumably an optional dependency backs
    # isnull/notnull; confirm against the dask version in use).
    with ignoring(ImportError):
        assert_eq(da.isnull(wrapped), expected_null)
        assert_eq(da.notnull(wrapped), expected_not_null)
Exemple #9
0
    def boundary_weights(self,
                         mode='reflect',
                         mask=None,
                         drop_dims=[],
                         compute=False):
        """
        Compute the boundary weights

        Parameters
        ----------
        mode : {'reflect', 'periodic', 'any-constant'}, optional
            The mode parameter determines how the array borders are handled.
            Default is 'reflect'.
        mask : array-like, optional
            Specify the mask, if None the mask is inferred from missing values
        drop_dims : list, optional
            Specify dimensions along which the weights do not need to be
            computed. The default list is only read, never mutated.
        compute : bool, optional
            If True, the computation is performed after the dask graph has
            been made. If False, only the dask graph is made and the
            computation will be performed later on.

        Returns
        -------
        weights : xarray.DataArray
            Return a DataArray containing the weights; entries equal to
            zero are masked out.
        """
        # The convolution call gets 'wrap' where the caller said 'periodic';
        # every other mode is passed through unchanged. Compare by value:
        # identity (`is`) on a string only works by accident of interning.
        mode_conv = 'wrap' if mode == 'periodic' else mode
        # Normalize coefficients
        coeffs = self.coefficients / self.coefficients.sum()
        if drop_dims:
            new_coeffs = da.squeeze(
                coeffs, axis=[self.obj.get_axis_num(di) for di in drop_dims])
        else:
            new_coeffs = coeffs
        # Take the first index along each dropped dimension.
        new_obj = self.obj.isel(**{di: 0 for di in drop_dims}).squeeze()
        # NOTE(review): `self._obj` is used here while `self.obj` is used
        # above; presumably both refer to the same object - confirm.
        # The overlap boundary receives the caller's `mode`, not `mode_conv`.
        boundary = {self._obj.get_axis_num(di): mode for di in self.dims}
        if mask is None:
            mask = da.notnull(new_obj.data)

        def conv(x):
            # Convolve one (overlapped) block with the normalized window.
            return im.convolve(x, new_coeffs, mode=mode_conv)

        weights = mask.astype(float).map_overlap(conv,
                                                 depth=self._depth,
                                                 boundary=boundary,
                                                 trim=True)

        res = xr.DataArray(mask * weights,
                           dims=new_obj.dims,
                           coords=new_obj.coords,
                           name='boundary_weights')
        # Zero weights are replaced by missing values.
        res = res.where(res != 0)
        if compute:
            with ProgressBar():
                out = res.compute()
        else:
            out = res
        return out