Example #1
def put_fake_dataset(store, prefix, shape, chunk_overrides=None):
    """Write a fake dataset into the chunk store."""
    data = {
        'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j),
        'flags': np.ones(shape, dtype=np.uint8),
        'weights': ramp(shape, slope=255. / np.prod(shape), dtype=np.uint8),
        'weights_channel': ramp(shape[:-1], dtype=np.float32)
    }
    if chunk_overrides is None:
        chunk_overrides = {}
    ddata = {
        k: to_dask_array(array, chunk_overrides.get(k))
        for k, array in data.items()
    }
    chunk_info = {
        k: {
            'prefix': prefix,
            'chunks': darray.chunks,
            'dtype': darray.dtype,
            'shape': darray.shape
        }
        for k, darray in ddata.items()
    }
    push = [
        store.put_dask_array(store.join(prefix, k), darray)
        for k, darray in ddata.items()
    ]
    da.compute(*push)
    return data, chunk_info
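The pattern above builds one deferred write per array and evaluates them all at once. A minimal self-contained sketch of the same idea, with plain NumPy buffers standing in for the chunk store (store.put_dask_array is project-specific, not dask API):

# Batched-write pattern: collect lazy writes, run them in one da.compute call.
import numpy as np
import dask.array as da

arrays = {'a': da.ones((4, 8), chunks=(2, 4)),
          'b': da.zeros((4, 8), chunks=(2, 4))}
targets = {k: np.empty(v.shape, dtype=v.dtype) for k, v in arrays.items()}
push = [da.store(v, targets[k], compute=False) for k, v in arrays.items()]
da.compute(*push)   # all deferred writes run in a single graph evaluation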
Example #2
 def range(cls, dataset, dimension):
     if dataset._binned and dimension in dataset.kdims:
         expanded = cls.irregular(dataset, dimension)
         column = cls.coords(dataset,
                             dimension,
                             expanded=expanded,
                             edges=True)
     else:
         column = cls.values(dataset, dimension, expanded=False, flat=False)
     if column.dtype.kind == 'M':
         dmin, dmax = column.min(), column.max()
         if da and isinstance(column, da.Array):
             return da.compute(dmin, dmax)
         return dmin, dmax
     elif len(column) == 0:
         return np.nan, np.nan
     else:
         try:
             dmin, dmax = (np.nanmin(column), np.nanmax(column))
             if da and isinstance(column, da.Array):
                 return da.compute(dmin, dmax)
             return dmin, dmax
         except TypeError:
             column.sort()
             return column[0], column[-1]
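For a dask-backed column, passing both reductions to a single da.compute call (as above) walks the underlying data only once. A toy illustration:

# Compute min and max of a dask column together, in one pass.
import numpy as np
import dask.array as da

column = da.from_array(np.arange(10.0), chunks=3)
dmin, dmax = da.compute(column.min(), column.max())
print(dmin, dmax)   # 0.0 9.0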
Example #3
def put_fake_dataset(store, prefix, shape, chunk_overrides=None, array_overrides=None, flags_only=False):
    """Write a fake dataset into the chunk store."""
    if flags_only:
        data = {'flags': np.random.RandomState(1).randint(0, 7, shape, dtype=np.uint8)}
    else:
        data = {'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j),
                'flags': np.random.RandomState(2).randint(0, 7, shape, dtype=np.uint8),
                'weights': ramp(shape, slope=255. / np.prod(shape), dtype=np.uint8),
                'weights_channel': ramp(shape[:-1], dtype=np.float32)}
    if array_overrides is not None:
        for name in data:
            if name in array_overrides:
                data[name] = array_overrides[name]
    if chunk_overrides is None:
        chunk_overrides = {}
    ddata = {k: to_dask_array(array, chunk_overrides.get(k)) for k, array in data.items()}
    chunk_info = {k: {'prefix': prefix, 'chunks': darray.chunks,
                      'dtype': np.lib.format.dtype_to_descr(darray.dtype),
                      'shape': darray.shape}
                  for k, darray in ddata.items()}
    for k, darray in ddata.items():
        store.create_array(store.join(prefix, k))
    push = [store.put_dask_array(store.join(prefix, k), darray)
            for k, darray in ddata.items()]
    da.compute(*push)
    return data, chunk_info
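ramp and to_dask_array are helpers defined elsewhere in this test module. A plausible minimal version of to_dask_array, assuming it only wraps da.from_array and honours an optional per-array chunk override, might look like:

# Hypothetical stand-in for the to_dask_array helper used above.
import dask.array as da

def to_dask_array(array, chunks=None):
    if chunks is None:
        chunks = array.shape    # single chunk spanning the whole array
    return da.from_array(array, chunks=chunks)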
Example #4
def test_multireadwrite(ms, group_cols, index_cols):
    xds = xds_from_ms(ms, group_cols=group_cols, index_cols=index_cols)

    nds = [ds.copy() for ds in xds]
    writes = [xds_to_table(sds, ms, sds.data_vars.keys()) for sds in nds]

    da.compute(writes)
Example #5
    def test_get_angles_satpos_preference(self, forced_preference):
        """Test that 'actual' satellite position is used for generating sensor angles."""
        from satpy.modifiers.angles import get_angles

        input_data1 = _get_angle_test_data()
        # add additional satellite position metadata
        input_data1.attrs["orbital_parameters"]["nadir_longitude"] = 9.0
        input_data1.attrs["orbital_parameters"]["nadir_latitude"] = 0.01
        input_data1.attrs["orbital_parameters"][
            "satellite_actual_longitude"] = 9.5
        input_data1.attrs["orbital_parameters"][
            "satellite_actual_latitude"] = 0.005
        input_data1.attrs["orbital_parameters"][
            "satellite_actual_altitude"] = 12345679
        input_data2 = input_data1.copy(deep=True)
        input_data2.attrs = deepcopy(input_data1.attrs)
        input_data2.attrs["orbital_parameters"]["nadir_longitude"] = 9.1
        input_data2.attrs["orbital_parameters"]["nadir_latitude"] = 0.02
        input_data2.attrs["orbital_parameters"][
            "satellite_actual_longitude"] = 9.5
        input_data2.attrs["orbital_parameters"][
            "satellite_actual_latitude"] = 0.005
        input_data2.attrs["orbital_parameters"][
            "satellite_actual_altitude"] = 12345679

        from pyorbital.orbital import get_observer_look
        with mock.patch("satpy.modifiers.angles.get_observer_look", wraps=get_observer_look) as gol, \
                satpy.config.set(sensor_angles_position_preference=forced_preference):
            angles1 = get_angles(input_data1)
            da.compute(angles1)
            angles2 = get_angles(input_data2)
            da.compute(angles2)

        # get_observer_look should have been called once per array chunk
        assert gol.call_count == input_data1.data.blocks.size * 2
        if forced_preference == "actual":
            exp_call = mock.call(9.5, 0.005, 12345.679,
                                 input_data1.attrs["start_time"], mock.ANY,
                                 mock.ANY, 0)
            all_same_calls = [exp_call] * gol.call_count
            gol.assert_has_calls(all_same_calls)
            # the dask arrays should have the same name to prove they are the same computation
            for angle_arr1, angle_arr2 in zip(angles1, angles2):
                assert angle_arr1.data.name == angle_arr2.data.name
        else:
            # nadir 1
            gol.assert_any_call(9.0, 0.01, 12345.679,
                                input_data1.attrs["start_time"], mock.ANY,
                                mock.ANY, 0)
            # nadir 2
            gol.assert_any_call(9.1, 0.02, 12345.679,
                                input_data1.attrs["start_time"], mock.ANY,
                                mock.ANY, 0)
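The call-count assertion above relies on dask invoking the wrapped function once per chunk when the result is computed. A self-contained sketch of that counting pattern (observer and per_block are made-up names, not satpy API):

# Count how often a patched helper runs: once per chunk of the mapped array.
import numpy as np
import dask.array as da
from unittest import mock

def observer(block):
    return block * 2

def per_block(block):
    return observer(block)   # call through the module-level name so the patch applies

arr = da.ones((4, 4), chunks=(2, 2))
with mock.patch(__name__ + ".observer", wraps=observer) as wrapped:
    res = arr.map_blocks(per_block, dtype=arr.dtype,
                         meta=np.empty((0, 0), dtype=arr.dtype))
    da.compute(res, scheduler="synchronous")
assert wrapped.call_count == arr.blocks.size   # 2 x 2 chunks -> 4 calls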
Example #6
def test_save_future(comm):

    cosmo = cosmology.Planck15

    import tempfile
    import shutil

    tmpfile = tempfile.mkdtemp()

    data = numpy.ones(100,
                      dtype=[('Position', ('f4', 3)), ('Velocity', ('f4', 3)),
                             ('Mass', ('f4'))])

    data['Mass'] = numpy.arange(len(data))
    data['Position'] = numpy.arange(len(data) * 3).reshape(
        data['Position'].shape)
    data['Velocity'] = numpy.arange(len(data) * 3).reshape(
        data['Velocity'].shape)

    import dask.array as da
    source = ArrayCatalog(data, BoxSize=100, Nmesh=32, comm=comm)
    source['Rogue'] = da.ones((3, len(data)), chunks=(1, 1)).T

    # add a non-array attrs (saved as JSON)
    source.attrs['empty'] = None

    # save to a BigFile
    d = source.save(tmpfile, dataset='1', compute=False)

    # load as a BigFileCatalog; only attributes are saved
    source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm)

    # check sources
    for k in source.attrs:
        assert_array_equal(source2.attrs[k], source.attrs[k])

    da.compute(d)

    # reload as a BigFileCatalog, data is saved
    source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm)

    # check the data
    def allconcat(data):
        return numpy.concatenate(comm.allgather(data), axis=0)

    assert_allclose(allconcat(source['Position']),
                    allconcat(source2['Position']))
    assert_allclose(allconcat(source['Velocity']),
                    allconcat(source2['Velocity']))
    assert_allclose(allconcat(source['Mass']), allconcat(source2['Mass']))
Example #7
 def _cache_results(self, res, zarr_format):
     os.makedirs(os.path.dirname(zarr_format), exist_ok=True)
     new_res = []
     for idx, sub_res in enumerate(res):
         if not isinstance(sub_res, da.Array):
             raise ValueError("Zarr caching currently only supports dask "
                              f"arrays. Got {type(sub_res)}")
         zarr_path = zarr_format.format(idx)
         # See https://github.com/dask/dask/issues/8380
         with dask.config.set({"optimization.fuse.active": False}):
             new_sub_res = sub_res.to_zarr(zarr_path, compute=False)
         new_res.append(new_sub_res)
     # actually compute the storage to zarr
     da.compute(new_res)
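A plain-dask sketch of the same defer-then-compute zarr write, assuming the zarr package is installed (the output path is illustrative):

# Defer a zarr write with compute=False, then trigger it with da.compute.
import dask
import dask.array as da

arr = da.ones((100, 100), chunks=(50, 50))
with dask.config.set({"optimization.fuse.active": False}):   # see dask#8380
    delayed_write = arr.to_zarr("/tmp/example_cache.zarr",
                                compute=False, overwrite=True)
da.compute(delayed_write)   # nothing is written to disk until here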
Example #8
def test_save_future(comm):

    cosmo = cosmology.Planck15

    import tempfile
    import shutil

    tmpfile = tempfile.mkdtemp()

    data = numpy.ones(100, dtype=[
            ('Position', ('f4', 3)),
            ('Velocity', ('f4', 3)),
            ('Mass', ('f4'))]
            )

    data['Mass'] = numpy.arange(len(data))
    data['Position'] = numpy.arange(len(data) * 3).reshape(data['Position'].shape)
    data['Velocity'] = numpy.arange(len(data) * 3).reshape(data['Velocity'].shape)

    import dask.array as da
    source = ArrayCatalog(data, BoxSize=100, Nmesh=32, comm=comm)
    source['Rogue'] = da.ones((3, len(data)), chunks=(1, 1)).T

    # add a non-array attrs (saved as JSON)
    source.attrs['empty'] = None

    # save to a BigFile
    d = source.save(tmpfile, dataset='1', compute=False)

    # load as a BigFileCatalog; only attributes are saved
    source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm)

    # check sources
    for k in source.attrs:
        assert_array_equal(source2.attrs[k], source.attrs[k])

    da.compute(d)

    # reload as a BigFileCatalog, data is saved
    source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm)

    # check the data
    def allconcat(data):
        return numpy.concatenate(comm.allgather(data), axis=0)

    assert_allclose(allconcat(source['Position']), allconcat(source2['Position']))
    assert_allclose(allconcat(source['Velocity']), allconcat(source2['Velocity']))
    assert_allclose(allconcat(source['Mass']), allconcat(source2['Mass']))
Example #9
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(str(output), mode="w")

    # copy variables that are to be rechunked
    # NOTE: that this uses _to_zarr function defined here that is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"

        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(str(output)) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        output_zarr.attrs.update(first_zarr_group.attrs)
Example #10
def _co_realise_lazy_arrays(arrays):
    """
    Compute multiple lazy arrays and return a list of real values.

    All the arrays are computed together, so they can share results for common
    graph elements.

    Casts all results with `np.asanyarray`, and converts any MaskedConstants
    appearing into masked arrays, to ensure that all return values are
    writeable NumPy array objects.

    Any non-lazy arrays are passed through, as they are by `da.compute`.
    They undergo the same result standardisation.

    """
    computed_arrays = da.compute(*arrays)
    results = []
    for lazy_in, real_out in zip(arrays, computed_arrays):
        # Ensure we always have arrays.
        # Note : in some cases dask (and numpy) will return a scalar
        # numpy.int/numpy.float object rather than an ndarray.
        # Recorded in https://github.com/dask/dask/issues/2111.
        real_out = np.asanyarray(real_out)
        if isinstance(real_out, ma.core.MaskedConstant):
            # Convert any masked constants into NumPy masked arrays.
            # NOTE: in this case, also apply the original lazy-array dtype, as
            # masked constants *always* have dtype float64.
            real_out = ma.masked_array(real_out.data, mask=real_out.mask,
                                       dtype=lazy_in.dtype)
        results.append(real_out)
    return results
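The point of realising the arrays together is graph sharing: results that depend on the same intermediate reuse it rather than recomputing it. A small example evaluated in one da.compute call:

# Two results that share an intermediate are computed in a single pass.
import dask.array as da

base = da.random.random((1000, 1000), chunks=(250, 250))
shared = (base ** 2).sum(axis=0)             # common graph element
total, scaled = da.compute(shared.sum(), shared / shared.max())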
Example #11
def finite_range(column, cmin, cmax):
    try:
        min_inf = np.isinf(cmin)
    except TypeError:
        min_inf = False
    try:
        max_inf = np.isinf(cmax)
    except TypeError:
        max_inf = False
    if (min_inf or max_inf):
        column = column[np.isfinite(column)]
        if len(column):
            cmin = np.nanmin(column) if min_inf else cmin
            cmax = np.nanmax(column) if max_inf else cmax
            if is_dask(column):
                import dask.array as da
                if min_inf and max_inf:
                    cmin, cmax = da.compute(cmin, cmax)
                elif min_inf:
                    cmin = cmin.compute()
                else:
                    cmax = cmax.compute()
    else:
        return cmin, cmax
    if isinstance(cmin, np.ndarray) and cmin.shape == ():
        cmin = cmin[()]
    if isinstance(cmax, np.ndarray) and cmax.shape == ():
        cmax = cmax[()]
    cmin = cmin if np.isscalar(cmin) or isinstance(
        cmin, util.datetime_types) else cmin.item()
    cmax = cmax if np.isscalar(cmax) or isinstance(
        cmax, util.datetime_types) else cmax.item()
    return cmin, cmax
Example #12
    def test_get_bucket_indices(self):
        """Test calculation of array indices."""
        # Ensure nothing is calculated
        with dask.config.set(scheduler=CustomScheduler(max_computes=0)):
            self.resampler._get_indices()
        x_idxs, y_idxs = da.compute(self.resampler.x_idxs,
                                    self.resampler.y_idxs)
        np.testing.assert_equal(x_idxs, np.array([1710, 1710, 1707, 1705]))
        np.testing.assert_equal(y_idxs, np.array([465, 465, 459, 455]))

        # Additional small test case
        adef = create_area_def(area_id='test',
                               projection={'proj': 'latlong'},
                               width=2,
                               height=2,
                               center=(0, 0),
                               resolution=10)
        lons = da.from_array(np.array(
            [-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, -10.1, 0]),
                             chunks=2)
        lats = da.from_array(np.array(
            [-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, 0, 10.1]),
                             chunks=2)
        resampler = bucket.BucketResampler(source_lats=lats,
                                           source_lons=lons,
                                           target_area=adef)
        resampler._get_indices()
        np.testing.assert_equal(resampler.x_idxs,
                                np.array([-1, 0, 0, 1, 1, 1, -1, -1, -1]))
        np.testing.assert_equal(resampler.y_idxs,
                                np.array([-1, 1, 1, 1, 0, 0, -1, -1, -1]))
Example #13
    def get_dataset_with_area_def(self, arr, dataset_id):
        """Get dataset with an AreaDefinition."""
        if dataset_id['name'] in ['latitude', 'longitude']:
            self.__setattr__(dataset_id['name'], arr)
            xarr = xr.DataArray(arr, dims=["y"])
        else:
            lons_1d, lats_1d, data_1d = da.compute(self.longitude,
                                                   self.latitude, arr)

            self._area_def = self._construct_area_def(dataset_id)
            icol, irow = self._area_def.get_array_indices_from_lonlat(
                lons_1d, lats_1d)

            data_2d = np.empty(self._area_def.shape)
            data_2d[:] = np.nan
            data_2d[irow.compressed(), icol.compressed()] = data_1d[~irow.mask]

            xarr = xr.DataArray(da.from_array(data_2d, CHUNK_SIZE),
                                dims=('y', 'x'))

            ntotal = len(icol)
            nvalid = len(icol.compressed())
            if nvalid < ntotal:
                logging.warning(
                    f'{ntotal-nvalid} out of {ntotal} data points could not be put on '
                    f'the grid {self._area_def.area_id}.')

        return xarr
Example #14
def test_predict_proba(dataset, datatype, n_neighbors,
                       n_parts, batch_size, client):
    X_train, X_test, y_train, y_test = dataset

    l_model = lKNNClf(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_probas = l_model.predict_proba(X_test)

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    d_model = dKNNClf(client=client, n_neighbors=n_neighbors)
    d_model.fit(X_train, y_train)
    d_probas = d_model.predict_proba(X_test, convert_dtype=True)
    d_probas = da.compute(d_probas)[0]

    if datatype == 'dask_cudf':
        d_probas = list(map(lambda o: o.as_matrix()
                            if isinstance(o, DataFrame)
                            else o.to_array()[..., np.newaxis],
                            d_probas))

    check_probabilities(l_probas, d_probas)
Example #15
def process(input_path, pedestal_path, output_path):
    reader = TIOReader(input_path)
    wf_calib = WaveformCalibrator(
        pedestal_path, reader.n_pixels, reader.n_samples
    )

    wfs = get_da(reader, wf_calib)

    mean, std, mean_pix, std_pix, (hist, edges) = da.compute(
        wfs.mean(),
        wfs.std(),
        wfs.mean(axis=(0, 2)),
        wfs.std(axis=(0, 2)),
        da.histogram(wfs, bins=1000, range=(-10, 10))
    )

    np.savez(
        output_path,
        mean=mean,
        std=std,
        mean_pix=mean_pix,
        std_pix=std_pix,
        hist=hist,
        edges=edges
    )
Example #16
    def _create_resample_kdtree(self):
        """Set up kd tree on input"""
        # Get input information
        valid_input_index, source_lons, source_lats = \
            _get_valid_input_index_dask(self.source_geo_def,
                                        self.target_geo_def,
                                        self.reduce_data,
                                        self.radius_of_influence,
                                        nprocs=self.nprocs)

        # FIXME: Is dask smart enough to only compute the pixels we end up
        #        using even with this complicated indexing
        input_coords = lonlat2xyz(source_lons, source_lats)
        valid_input_index = da.ravel(valid_input_index)
        input_coords = input_coords[valid_input_index, :]
        input_coords = input_coords.compute()
        # Build kd-tree on input
        input_coords = input_coords.astype(np.float64)
        valid_input_index, input_coords = da.compute(valid_input_index,
                                                     input_coords)
        if kd_tree_name == 'pykdtree':
            resample_kdtree = KDTree(input_coords)
        else:
            resample_kdtree = sp.cKDTree(input_coords)

        return valid_input_index, resample_kdtree
Example #17
    def _box_stats(self, vals):
        is_finite = isfinite
        is_dask = is_dask_array(vals)
        is_cupy = is_cupy_array(vals)
        if is_cupy:
            import cupy
            percentile = cupy.percentile
            is_finite = cupy.isfinite
        elif is_dask:
            import dask.array as da
            percentile = da.percentile
        else:
            percentile = np.percentile

        vals = vals[is_finite(vals)]

        if len(vals):
            q1, q2, q3 = (percentile(vals, q=q) for q in range(25, 100, 25))
            iqr = q3 - q1
            upper = vals[vals <= q3 + 1.5 * iqr].max()
            lower = vals[vals >= q1 - 1.5 * iqr].min()
        else:
            q1, q2, q3 = 0, 0, 0
            upper, lower = 0, 0
        outliers = vals[(vals > upper) | (vals < lower)]

        if is_cupy:
            return (q1.item(), q2.item(), q3.item(), upper.item(),
                    lower.item(), cupy.asnumpy(outliers))
        elif is_dask:
            return da.compute(q1, q2, q3, upper, lower, outliers)
        else:
            return q1, q2, q3, upper, lower, outliers
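da.percentile, as used in the dask branch above, operates on one-dimensional dask arrays. A toy quartile computation evaluated in a single da.compute call:

# Quartiles of a 1-D dask array, computed together.
import dask.array as da

vals = da.random.random(10_000, chunks=1_000)
q1, q2, q3 = da.compute(*(da.percentile(vals, [p])[0] for p in (25, 50, 75)))
iqr = q3 - q1
print(q1, q2, q3, iqr)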
Example #18
def black_scholes(nopt, price, strike, t, rate, vol, schd=None):
    mr = -rate
    sig_sig_two = vol * vol * 2

    P = price
    S = strike
    T = t

    a = log(P / S)
    b = T * mr

    z = T * sig_sig_two
    c = 0.25 * z
    y = da.map_blocks(invsqrt, z)

    w1 = (a - b + c) * y
    w2 = (a - b - c) * y

    d1 = 0.5 + 0.5 * da.map_blocks(erf, w1)
    d2 = 0.5 + 0.5 * da.map_blocks(erf, w2)

    Se = exp(b) * S

    call = P * d1 - Se * d2
    put = call - P + Se

    return da.compute(da.stack((put, call)), scheduler=schd)
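invsqrt and erf above are assumed to be elementwise, NumPy-compatible callables suitable for map_blocks; a minimal pair under that assumption (erf from scipy.special is a real ufunc, invsqrt is hypothetical):

# Possible elementwise helpers for the map_blocks calls above.
import numpy as np
from scipy.special import erf  # elementwise ufunc, applied block-wise

def invsqrt(x):
    return 1.0 / np.sqrt(x)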
Example #19
def test_no_shared_keys_with_different_depths():
    da.random.seed(0)
    a = da.random.random((9, 9), chunks=(3, 3))

    def check(x):
        assert x.shape == (3, 3)
        return x

    r = [a.map_overlap(lambda a: a + 1,
                       dtype=a.dtype,
                       depth={j: int(i == j) for j in range(a.ndim)},
                       boundary="none").map_blocks(check, dtype=a.dtype)
         for i in range(a.ndim)]

    assert set(r[0].dask) & set(r[1].dask) == set(a.dask)
    da.compute(*r, scheduler='single-threaded')
Example #20
def _co_realise_lazy_arrays(arrays):
    """
    Compute multiple lazy arrays and return a list of real values.

    All the arrays are computed together, so they can share results for common
    graph elements.

    Casts all results with `np.asanyarray`, and converts any MaskedConstants
    appearing into masked arrays, to ensure that all return values are
    writeable NumPy array objects.

    Any non-lazy arrays are passed through, as they are by `da.compute`.
    They undergo the same result standardisation.

    """
    computed_arrays = da.compute(*arrays)
    results = []
    for lazy_in, real_out in zip(arrays, computed_arrays):
        # Ensure we always have arrays.
        # Note : in some cases dask (and numpy) will return a scalar
        # numpy.int/numpy.float object rather than an ndarray.
        # Recorded in https://github.com/dask/dask/issues/2111.
        real_out = np.asanyarray(real_out)
        if isinstance(real_out, ma.core.MaskedConstant):
            # Convert any masked constants into NumPy masked arrays.
            # NOTE: in this case, also apply the original lazy-array dtype, as
            # masked constants *always* have dtype float64.
            real_out = ma.masked_array(
                real_out.data, mask=real_out.mask, dtype=lazy_in.dtype
            )
        results.append(real_out)
    return results
Example #21
def find_start_end(data: Data, distributed: bool = False) -> (int, int):
    """
    Find the shutter open and shutter close timestamps.

    Args:
        data:  A LATRD data dictionary (a dictionary with data set names as keys
               and Dask arrays as values).  Must contain one entry for cue id
               messages and one for cue timestamps.  The two arrays are assumed
               to have the same length.
        distributed:  Whether the computation uses the Dask distributed scheduler,
                      in which case a progress bar will be displayed.

    Returns:
        The shutter open and shutter close timestamps, in clock cycles.
    """
    start_time = first_cue_time(data, shutter_open)
    end_time = first_cue_time(data, shutter_close)

    # If we are using the distributed scheduler (for multiple images), show progress.
    if distributed:
        print("Finding detector shutter open and close times.")
        progress(start_time.persist(), end_time.persist())
    start_time, end_time = da.compute(start_time, end_time)
    print()

    return start_time, end_time
Example #22
def test_svd_supported_array_shapes(chunks, shape):
    x = np.random.random(shape)
    dx = da.from_array(x, chunks=chunks)

    def svd_flip(u, v):
        """Sign correction to ensure deterministic output from SVD.

        See:
        - https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py#L504
        - https://github.com/dask/dask/issues/6599
        """
        max_abs_cols = np.argmax(np.abs(u), axis=0)
        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
        u *= signs
        v *= signs[:, np.newaxis]
        return u, v

    du, ds, dv = da.linalg.svd(dx)
    du, dv = da.compute(du, dv)
    # Workaround for no `full_matrices=False`
    # https://github.com/dask/dask/issues/3576
    k = min(du.shape + dv.shape)
    du, dv = du[:, :k], dv[:k, :]

    nu, ns, nv = np.linalg.svd(x)

    # Correct signs before comparison
    du, dv = svd_flip(du, dv)
    nu, nv = svd_flip(nu, nv)

    assert_eq(du, nu)
    assert_eq(ds, ns)
    assert_eq(dv, nv)
Example #23
def test_no_shared_keys_with_different_depths():
    da.random.seed(0)
    a = da.random.random((9, 9), chunks=(3, 3))

    def check(x):
        assert x.shape == (3, 3)
        return x

    r = [a.map_overlap(lambda a: a + 1,
                       dtype=a.dtype,
                       depth={j: int(i == j) for j in range(a.ndim)},
                       boundary="none").map_blocks(check, dtype=a.dtype)
         for i in range(a.ndim)]

    assert set(r[0].dask) & set(r[1].dask) == set(a.dask)
    da.compute(*r, scheduler='single-threaded')
Example #24
def test_svd_compressed_deterministic():
    m, n = 30, 25
    x = da.random.RandomState(1234).random_sample(size=(m, n), chunks=(5, 5))
    u, s, vt = svd_compressed(x, 3, seed=1234)
    u2, s2, vt2 = svd_compressed(x, 3, seed=1234)

    assert all(da.compute((u == u2).all(), (s == s2).all(), (vt == vt2).all()))
Example #25
    def test_data_with_area_definition(self, input_file):
        """Test data loaded with AreaDefinition."""
        bufr_obj = SeviriL2BufrData(input_file, with_adef=True)
        _ = bufr_obj.get_data(
            DATASET_INFO_LAT)  # We need to load the lat/lon data in order to
        _ = bufr_obj.get_data(
            DATASET_INFO_LON)  # populate the file handler with these data
        z = bufr_obj.get_data(DATASET_INFO)

        ad = bufr_obj.fh.get_area_def(None)
        assert ad == AREA_DEF
        data_1d = np.concatenate((DATA, DATA), axis=0)

        # Put BUFR data on 2D grid that the 2D array returned by get_dataset should correspond to
        lons_1d, lats_1d = da.compute(bufr_obj.fh.longitude,
                                      bufr_obj.fh.latitude)
        icol, irow = ad.get_array_indices_from_lonlat(lons_1d, lats_1d)

        data_2d = np.empty(ad.shape)
        data_2d[:] = np.nan
        data_2d[irow.compressed(), icol.compressed()] = data_1d[~irow.mask]
        np.testing.assert_array_equal(z.values, data_2d)

        # Test that the correct AreaDefinition is identified for products with 3 pixel segements
        bufr_obj.fh.seg_size = 3
        ad_ext = bufr_obj.fh._construct_area_def(
            make_dataid(name='dummmy', resolution=9000))
        assert ad_ext == AREA_DEF_EXT
Example #26
    def compute(self, **kwargs):
        """
        Calls dask compute on the dask arrays in this Dataset,
        returning a new Dataset.

        Returns
        -------
        :class:`~daskms.dataset.Dataset`
            Dataset containing computed arrays.
        """

        # Compute dask arrays separately
        dask_data = {}
        data_vars = {}

        # Split variables into dask and other data
        for k, v in self._data_vars.items():
            if isinstance(v.data, da.Array):
                dask_data[k] = v
            else:
                data_vars[k] = v

        # Compute dask arrays if present and add them to data variables
        if len(dask_data) > 0:
            data_vars.update(da.compute(dask_data, **kwargs)[0])

        return Dataset(data_vars,
                       coords=self._coords,
                       attrs=self._attrs.copy())
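da.compute accepts nested containers: any dask collections found inside a dict are computed in one pass and the dict is returned with realised values, which is the behaviour the compute method above relies on. A toy example:

# da.compute traverses dicts/lists and realises any dask arrays inside them.
import dask.array as da

lazy = {"x": da.arange(5, chunks=2), "y": da.ones(3, chunks=3)}
(realised,) = da.compute(lazy)       # returns a 1-tuple holding the dict
print(realised["x"], realised["y"])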
Example #27
def materialize_as_ndarray(a):
    """Convert distributed arrays to ndarrays."""
    if type(a) in (list, tuple):
        if da is not None and any(isinstance(arr, da.Array) for arr in a):
            return da.compute(*a, sync=True)
        return tuple(np.asarray(arr) for arr in a)
    return np.asarray(a)
Example #28
def test_svd_compressed():
    m, n = 2000, 250
    r = 10
    np.random.seed(4321)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)
    data = da.from_array(mat, chunks=(500, 50))

    u, s, vt = svd_compressed(data, r, seed=4321, n_power_iter=2)
    u, s, vt = da.compute(u, s, vt)

    usvt = np.dot(u, np.dot(np.diag(s), vt))

    tol = 0.2
    assert_eq(np.linalg.norm(mat - usvt),
              np.linalg.norm(mat),
              rtol=tol, atol=tol)  # average accuracy check

    u = u[:, :r]
    s = s[:r]
    vt = vt[:r, :]

    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert_eq(np.eye(r, r), np.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(r, r), np.dot(vt, vt.T))  # v must be orthonormal
    assert_eq(s, s_exact)  # s must contain the singular values
Example #29
def test_svd_compressed():
    m, n = 2000, 250
    r = 10
    np.random.seed(4321)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)
    data = da.from_array(mat, chunks=(500, 50))

    u, s, vt = svd_compressed(data, r, seed=4321, n_power_iter=2)
    u, s, vt = da.compute(u, s, vt)

    usvt = np.dot(u, np.dot(np.diag(s), vt))

    tol = 0.2
    assert_eq(np.linalg.norm(mat - usvt),
              np.linalg.norm(mat),
              rtol=tol,
              atol=tol)  # average accuracy check

    u = u[:, :r]
    s = s[:r]
    vt = vt[:r, :]

    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert_eq(np.eye(r, r), np.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(r, r), np.dot(vt, vt.T))  # v must be orthonormal
    assert_eq(s, s_exact)  # s must contain the singular values
Example #30
def _estimate_gaussian_parameters(signal, x1, x2, only_current):
    axis = signal.axes_manager.signal_axes[0]
    i1, i2 = axis.value_range_to_indices(x1, x2)
    X = axis.axis[i1:i2]

    if only_current is True:
        data = signal()[i1:i2]
        X_shape = (len(X),)
        i = 0
        centre_shape = (1,)
    else:
        i = axis.index_in_array
        data_gi = [slice(None), ] * len(signal.data.shape)
        data_gi[axis.index_in_array] = slice(i1, i2)
        data = signal.data[tuple(data_gi)]
        X_shape = [1, ] * len(signal.data.shape)
        X_shape[axis.index_in_array] = data.shape[i]
        centre_shape = list(data.shape)
        centre_shape[i] = 1

    centre = np.sum(X.reshape(X_shape) * data, i) / np.sum(data, i)
    sigma = np.sqrt(np.abs(np.sum((X.reshape(X_shape) - centre.reshape(
        centre_shape)) ** 2 * data, i) / np.sum(data, i)))
    height = data.max(i)

    if isinstance(data, da.Array):
        return da.compute(centre, height, sigma)
    else:
        return centre, height, sigma
Example #31
def test_make_regression(n_samples, n_features, n_informative, n_targets, bias,
                         effective_rank, tail_strength, noise, shuffle, coef,
                         random_state, n_parts, cluster):
    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_regression

        result = make_regression(n_samples=n_samples,
                                 n_features=n_features,
                                 n_informative=n_informative,
                                 n_targets=n_targets,
                                 bias=bias,
                                 effective_rank=effective_rank,
                                 noise=noise,
                                 shuffle=shuffle,
                                 coef=coef,
                                 random_state=random_state,
                                 n_parts=n_parts)

        if coef:
            out, values, coefs = result
        else:
            out, values = result

        assert out.shape == (n_samples, n_features), "out shape mismatch"

        if n_targets > 1:
            assert values.shape == (n_samples, n_targets), \
                   "values shape mismatch"
        else:
            assert values.shape == (n_samples, ), "values shape mismatch"

        assert len(out.chunks[0]) == n_parts
        assert len(out.chunks[1]) == 1

        if coef:
            if n_targets > 1:
                assert coefs.shape == (n_features, n_targets), \
                       "coefs shape mismatch"
                assert len(coefs.chunks[1]) == 1
            else:
                assert coefs.shape == (n_features, ), "coefs shape mismatch"
                assert len(coefs.chunks[0]) == 1

            test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)

            std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

            test1, std_test2 = da.compute(test1, std_test2)

            diff = cp.abs(1.0 - std_test2)
            test2 = cp.all(diff < 1.5 * 10**(-1.))

            assert test1, \
                "Unexpected number of informative features"

            assert test2, "Unexpectedly incongruent outputs"

    finally:
        c.close()
Example #32
def pearson_1xn(
    x: da.Array,
    data: da.Array,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parameters
    ----------
    x : da.Array
    data : da.Array
    value_range : Optional[Tuple[float, float]] = None
    k : Optional[int] = None
    """
    _, ncols = data.shape

    corrs = []
    for j in range(ncols):
        mask = ~(da.isnan(x) | da.isnan(data[:, j]))
        _, (corr,
            _) = da.corrcoef(np.array(x)[mask],
                             np.array(data[:, j])[mask])
        corrs.append(corr)

    (corrs, ) = da.compute(corrs)
    corrs = np.asarray(corrs)

    return corr_filter(corrs, value_range, k)
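A compact version of the per-pair Pearson computation, using da.corrcoef on synthetic data (not the original library's code path):

# Pearson correlation of two dask columns via the 2x2 correlation matrix.
import dask.array as da

x = da.random.random(1_000, chunks=250)
y = da.random.random(1_000, chunks=250)
r, = da.compute(da.corrcoef(x, y)[0, 1])   # off-diagonal entry is Pearson's r
print(r)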
Example #33
def _estimate_lorentzian_parameters(signal, x1, x2, only_current):
    axis = signal.axes_manager.signal_axes[0]
    i1, i2 = axis.value_range_to_indices(x1, x2)
    X = axis.axis[i1:i2]

    if only_current is True:
        data = signal()[i1:i2]
        i = 0
        centre_shape = (1,)
    else:
        i = axis.index_in_array
        data_gi = [slice(None), ] * len(signal.data.shape)
        data_gi[axis.index_in_array] = slice(i1, i2)
        data = signal.data[tuple(data_gi)]
        centre_shape = list(data.shape)
        centre_shape[i] = 1

    cdf = np.cumsum(data, i)
    cdfnorm = cdf / np.max(cdf, i).reshape(centre_shape)

    icentre = np.argmin(abs(0.5 - cdfnorm), i)
    igamma1 = np.argmin(abs(0.75 - cdfnorm), i)
    igamma2 = np.argmin(abs(0.25 - cdfnorm), i)

    if isinstance(data, da.Array):
        icentre, igamma1, igamma2 = da.compute(icentre, igamma1, igamma2)

    centre = X[icentre]
    gamma = (X[igamma1] - X[igamma2]) / 2
    height = data.max(i)

    return centre, height, gamma
Example #34
def kendall_tau_1xn(
    x: da.Array,
    data: da.Array,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parameters
    ----------
    x : da.Array
    data : da.Array
    value_range : Optional[Tuple[float, float]] = None
    k : Optional[int] = None
    """

    _, ncols = data.shape

    corrs = []
    for j in range(ncols):
        mask = ~(da.isnan(x) | da.isnan(data[:, j]))
        corr = dask.delayed(lambda a, b: kendalltau(a, b)[0])(
            np.array(x)[mask], np.array(data[:, j])[mask])
        corrs.append(corr)

    (corrs, ) = da.compute(corrs)
    corrs = np.asarray(corrs)
    return corr_filter(corrs, value_range, k)
Example #35
def test_svd_compressed_deterministic():
    m, n = 30, 25
    x = da.random.RandomState(1234).random_sample(size=(m, n), chunks=(5, 5))
    u, s, vt = svd_compressed(x, 3, seed=1234)
    u2, s2, vt2 = svd_compressed(x, 3, seed=1234)

    assert all(da.compute((u == u2).all(), (s == s2).all(), (vt == vt2).all()))
Example #36
def test_dask_svd_self_consistent(m, n):
    a = np.random.rand(m, n)
    d_a = da.from_array(a, chunks=(3, n), name='A')

    d_u, d_s, d_vt = da.linalg.svd(d_a)
    u, s, vt = da.compute(d_u, d_s, d_vt)

    for d_e, e in zip([d_u, d_s, d_vt], [u, s, vt]):
        assert d_e.shape == e.shape
        assert d_e.dtype == e.dtype
Example #37
 def _calculate_summary_statistics(self):
     data = self._lazy_data()
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #38
def dasky_scotts_bin_width(data, return_bins=True):
    r"""Dask version of scotts_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using Scott's rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is:

    .. math::

        \Delta_b = \frac{3.5\sigma}{n^{1/3}}

    where :math:`\sigma` is the standard deviation of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    freedman_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    sigma = da.nanstd(data)

    dx = 3.5 * sigma * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
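A usage sketch for dasky_scotts_bin_width on synthetic data (assumes the function above is in scope):

# Scott's-rule bin edges for a dask array of normally distributed samples.
import numpy as np
import dask.array as da

samples = da.from_array(np.random.default_rng(0).normal(size=100_000),
                        chunks=10_000)
width, edges = dasky_scotts_bin_width(samples, return_bins=True)
print(width, len(edges))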
Example #39
def dasky_freedman_bin_width(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N` percent quartile of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    scotts_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
Example #40
def test_svd_compressed():
    m, n = 300, 250
    r = 10
    np.random.seed(4321)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)
    data = da.from_array(mat, chunks=(50, 50))

    n_iter = 6
    for i in range(n_iter):
        u, s, vt = svd_compressed(data, r, seed=4321)
        u, s, vt = da.compute(u, s, vt)
        if i == 0:
            usvt = np.dot(u, np.dot(np.diag(s), vt))
        else:
            usvt += np.dot(u, np.dot(np.diag(s), vt))
    usvt /= n_iter

    tol = 2e-1
    assert np.allclose(np.linalg.norm(mat - usvt),
                       np.linalg.norm(mat),
                       rtol=tol, atol=tol)  # average accuracy check

    u, s, vt = svd_compressed(data, r, seed=4321)
    u, s, vt = da.compute(u, s, vt)
    u = u[:, :r]
    s = s[:r]
    vt = vt[:r, :]

    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert np.allclose(np.eye(r, r), np.dot(u.T, u))  # u must be orthonormal
    assert np.allclose(np.eye(r, r), np.dot(vt, vt.T))  # v must be orthonormal
    assert np.allclose(s, s_exact)  # s must contain the singular values
Example #41
 def _calculate_summary_statistics(self, rechunk=True):
     if rechunk is True:
         # Use dask auto rechunk instead of HyperSpy's one, what should be
         # better for these operations
         rechunk = "dask_auto"
     data = self._lazy_data(rechunk=rechunk)
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #42
def dasky_histogram(a, bins=10, **kwargs):
    """Enhanced histogram for dask arrays.
    The range keyword is ignored. Reads the data at most two times - once to
    determine best bins (if required), and second time to actually calculate
    the histogram.

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed
    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-Diaconis rule to determine bins
    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram. See `normed` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    See Also
    --------
    numpy.histogram
    astroML.plotting.hist
    """
    if not isinstance(a, da.Array):
        raise TypeError('the given array has to be a dask.Array')
    if a.ndim != 1:
        a = a.flatten()

    if bins == 'scotts':
        _, bins = dasky_scotts_bin_width(a, True)
    elif bins == 'freedman':
        _, bins = dasky_freedman_bin_width(a, True)
    elif isinstance(bins, str):
        raise ValueError("unrecognized bin code: '%s'" % bins)
    elif not np.iterable(bins):
        with ProgressBar():
            kwargs['range'] = da.compute(a.min(), a.max())

    h, bins = da.histogram(a, bins=bins, **kwargs)
    with ProgressBar():
        return h.compute(), bins
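And a matching usage sketch for dasky_histogram, again assuming the helpers above are in scope:

# Histogram a dask array with Freedman-Diaconis bins chosen from the data.
import numpy as np
import dask.array as da

a = da.from_array(np.random.default_rng(1).normal(size=50_000), chunks=5_000)
hist, edges = dasky_histogram(a, bins='freedman')
print(hist.sum(), len(edges))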
Example #43
    def _create_resample_kdtree(self):
        """Set up kd tree on input"""
        # Get input information
        valid_input_index, source_lons, source_lats = \
            _get_valid_input_index_dask(self.source_geo_def,
                                        self.target_geo_def,
                                        self.reduce_data,
                                        self.radius_of_influence,
                                        nprocs=self.nprocs)

        # FIXME: Is dask smart enough to only compute the pixels we end up
        #        using even with this complicated indexing
        input_coords = lonlat2xyz(source_lons, source_lats)
        valid_input_index = da.ravel(valid_input_index)
        input_coords = input_coords[valid_input_index, :]
        input_coords = input_coords.compute()
        # Build kd-tree on input
        input_coords = input_coords.astype(np.float64)
        valid_input_index, input_coords = da.compute(valid_input_index,
                                                     input_coords)
        return valid_input_index, KDTree(input_coords)
Example #44
    def compute(self, progressbar=True, close_file=False):
        """Attempt to store the full signal in memory.

        close_file: bool
            If True, attempt to close the file associated with the dask
            array data if any. Note that closing the file will make all other
            associated lazy signals inoperative.

        """
        if progressbar:
            cm = ProgressBar
        else:
            cm = dummy_context_manager
        with cm():
            da = self.data
            data = da.compute()
            if close_file:
                self.close_file()
            self.data = data
        self._lazy = False
        self._assign_subclass()
Example #45
def _estimate_gaussian_parameters(signal, x1, x2, only_current):
    axis = signal.axes_manager.signal_axes[0]
    i1, i2 = axis.value_range_to_indices(x1, x2)
    X = axis.axis[i1:i2]
    if only_current is True:
        data = signal()[i1:i2]
        X_shape = (len(X),)
        i = 0
        centre_shape = (1,)
    else:
        i = axis.index_in_array
        data_gi = [slice(None), ] * len(signal.data.shape)
        data_gi[axis.index_in_array] = slice(i1, i2)
        data = signal.data[tuple(data_gi)]
        X_shape = [1, ] * len(signal.data.shape)
        X_shape[axis.index_in_array] = data.shape[i]
        centre_shape = list(data.shape)
        centre_shape[i] = 1

    if isinstance(data, da.Array):
        _sum = da.sum
        _sqrt = da.sqrt
        _abs = da.numpy_compat.builtins.abs
    else:
        _sum = np.sum
        _sqrt = np.sqrt
        _abs = np.abs

    centre = _sum(X.reshape(X_shape) * data, i) / _sum(data, i)

    sigma = _sqrt(_abs(_sum((X.reshape(X_shape) - centre.reshape(
        centre_shape)) ** 2 * data, i) / _sum(data, i)))
    height = data.max(i)
    if isinstance(data, da.Array):
        return da.compute(centre, height, sigma)
    else:
        return centre, height, sigma
Example #46
    def decomposition(self,
                      output_dimension,
                      normalize_poissonian_noise=False,
                      algorithm='PCA',
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        output_dimension : int
            the number of significant components to keep
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA
            from scikit-learn is run.
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        bounds : {tuple, bool}
            The (min, max) values of the data to normalize before learning.
            If tuple (min, max), those values will be used for normalization.
            If True, extremes will be looked up (expensive), default.
            If False, no normalization is done (learning may be very slow).
            If normalize_poissonian_noise is True, this cannot be True.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks

        ## LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        else:
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                if bounds is True:
                    bounds = False
                    # warnings.warn?
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=range(ndim)),
                    data.sum(axis=range(ndim, ndim + sdim)))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, )*rbH.ndim] *\
                        rbH[(None, )*raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # normalize the data for learning algs:
            if bounds:
                if bounds is True:
                    _min, _max = da.compute(self.data.min(), self.data.max())
                else:
                    _min, _max = bounds
                self.data = (self.data - _min) / (_max - _min)

            # LEARN
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform
                    post = lambda a: np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []
                    post = lambda a: obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project
                    post = lambda a: np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio
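The `bounds` branch near the top of this fragment computes the global minimum and maximum with a single `da.compute` call before rescaling the data. A minimal, self-contained sketch of that idiom, using an arbitrary small array and chunking purely for illustration:

import numpy as np
import dask.array as da

# Hypothetical small array, chunked so that min() and max() become lazy task graphs
x = da.from_array(np.arange(12, dtype=float).reshape(3, 4), chunks=(1, 4))

# A single da.compute call evaluates both reductions in one pass over the chunks,
# rather than two separate .compute() calls traversing the data twice
_min, _max = da.compute(x.min(), x.max())
x_scaled = (x - _min) / (_max - _min)

print(_min, _max)                  # 0.0 11.0
print(x_scaled.compute().max())    # 1.0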
Beispiel #47
0
def estimate_image_shift(ref, image, roi=None, sobel=True,
                         medfilter=True, hanning=True, plot=False,
                         dtype='float', normalize_corr=False,
                         return_maxval=True):
    """Estimate the shift in a image using phase correlation

    This method can only estimate the shift by comparing
    bidimensional features that should not change the position
    in the given axis. To decrease the memory usage, the time of
    computation and the accuracy of the results it is convenient
    to select a region of interest by setting the roi keyword.

    Parameters
    ----------

    roi : tuple of ints (top, bottom, left, right)
        Define the region of interest
    sobel : bool
        Apply a Sobel filter for edge enhancement
    medfilter : bool
        Apply a median filter for noise reduction
    hanning : bool
        Apply a 2D Hanning filter
    plot : bool | matplotlib.Figure
        If True, plots the images after applying the filters and the phase
        correlation. If a figure instance, the images will be plotted to the
        given figure.
    reference : 'current' | 'cascade'
        If 'current' (default) the image at the current
        coordinates is taken as reference. If 'cascade' each image
        is aligned with the previous one.
    dtype : str or dtype
        Typecode or data-type in which the calculations must be
        performed.
    normalize_corr : bool
        If True use phase correlation instead of standard correlation

    Returns
    -------

    shifts : np.array
        The estimated shifts
    max_value : float
        The maximum value of the correlation

    """
    ref, image = da.compute(ref, image)
    # Make a copy of the images to avoid modifying them
    ref = ref.copy().astype(dtype)
    image = image.copy().astype(dtype)
    if roi is not None:
        top, bottom, left, right = roi
    else:
        top, bottom, left, right = [None, ] * 4

    # Select region of interest
    ref = ref[top:bottom, left:right]
    image = image[top:bottom, left:right]

    # Apply filters
    for im in (ref, image):
        if hanning is True:
            im *= hanning2d(*im.shape)
        if medfilter is True:
            im[:] = sp.signal.medfilt(im)
        if sobel is True:
            im[:] = sobel_filter(im)

    phase_correlation = fft_correlation(ref, image,
                                        normalize=normalize_corr)

    # Estimate the shift by getting the coordinates of the maximum
    argmax = np.unravel_index(np.argmax(phase_correlation),
                              phase_correlation.shape)
    threshold = (phase_correlation.shape[0] / 2 - 1,
                 phase_correlation.shape[1] / 2 - 1)
    shift0 = argmax[0] if argmax[0] < threshold[0] else  \
        argmax[0] - phase_correlation.shape[0]
    shift1 = argmax[1] if argmax[1] < threshold[1] else \
        argmax[1] - phase_correlation.shape[1]
    max_val = phase_correlation.max()

    # Plot on demand
    if plot is True or isinstance(plot, plt.Figure):
        if isinstance(plot, plt.Figure):
            f = plot
            axarr = plot.axes
            if len(axarr) < 3:
                for i in range(3):
                    f.add_subplot(1, 3, i + 1)
                axarr = plot.axes
        else:
            f, axarr = plt.subplots(1, 3)
        full_plot = len(axarr[0].images) == 0
        if full_plot:
            axarr[0].set_title('Reference')
            axarr[1].set_title('Image')
            axarr[2].set_title('Phase correlation')
            axarr[0].imshow(ref)
            axarr[1].imshow(image)
            d = (np.array(phase_correlation.shape) - 1) // 2
            extent = [-d[1], d[1], -d[0], d[0]]
            axarr[2].imshow(np.fft.fftshift(phase_correlation),
                            extent=extent)
            plt.show()
        else:
            axarr[0].images[0].set_data(ref)
            axarr[1].images[0].set_data(image)
            axarr[2].images[0].set_data(np.fft.fftshift(phase_correlation))
            # TODO: Renormalize images
            f.canvas.draw()
    # Free the memory. This is especially necessary if the input is a
    # memory map.
    del ref
    del image
    if return_maxval:
        return -np.array((shift0, shift1)), max_val
    else:
        return -np.array((shift0, shift1))
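For reference, the core idea used above (a normalized cross-power spectrum whose inverse FFT peaks at the relative shift) can be sketched with plain NumPy. The helper name `phase_correlation_shift` and the synthetic test data are illustrative only; the ROI, filtering and memory-handling steps of the real function are omitted:

import numpy as np

def phase_correlation_shift(ref, image):
    # Normalized cross-power spectrum -> sharp peak at the integer shift
    cross_power = np.fft.fft2(image) * np.fft.fft2(ref).conj()
    cross_power /= np.abs(cross_power) + 1e-12
    corr = np.fft.ifft2(cross_power).real
    peak = np.array(np.unravel_index(np.argmax(corr), corr.shape), dtype=float)
    # Indices beyond half the image size wrap around to negative shifts
    peak[peak > np.array(corr.shape) / 2] -= np.array(corr.shape)
    return peak

rng = np.random.default_rng(0)
ref = rng.random((64, 64))
image = np.roll(ref, shift=(5, -3), axis=(0, 1))    # known shift of (5, -3)
print(phase_correlation_shift(ref, image))          # approximately [ 5. -3.]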
Beispiel #48
0
    def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None):
        """Get the reflectance from the three sun-sat angles"""
        # Get wavelength in nm for band:
        if isinstance(bandname, float):
            LOG.warning('A wavelength is provided instead of a band name - '
                        'disregarding the relative spectral responses and assuming '
                        'it is the effective wavelength: %f (micrometer)', bandname)
            wvl = bandname * 1000.0
        else:
            wvl = self.get_effective_wavelength(bandname)
            wvl = wvl * 1000.0

        rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = self.get_reflectance_lut()

        # force dask arrays
        compute = False
        if HAVE_DASK and not isinstance(sun_zenith, Array):
            compute = True
            sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape)
            sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape)
            azidiff = from_array(azidiff, chunks=azidiff.shape)
            if redband is not None:
                redband = from_array(redband, chunks=redband.shape)

        clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max()))
        sun_zenith = clip(sun_zenith, 0, clip_angle)
        sunzsec = 1. / cos(deg2rad(sun_zenith))
        clip_angle = rad2deg(arccos(1. / satz_sec_coord.max()))
        sat_zenith = clip(sat_zenith, 0, clip_angle)
        satzsec = 1. / cos(deg2rad(sat_zenith))
        shape = sun_zenith.shape

        if not(wvl_coord.min() < wvl < wvl_coord.max()):
            LOG.warning(
                "Effective wavelength for band %s outside 400-800 nm range!",
                str(bandname))
            LOG.info(
                "Set the rayleigh/aerosol reflectance contribution to zero!")
            if HAVE_DASK:
                chunks = sun_zenith.chunks if redband is None else redband.chunks
                res = zeros(shape, chunks=chunks)
                return res.compute() if compute else res
            else:
                return zeros(shape)

        idx = np.searchsorted(wvl_coord, wvl)
        wvl1 = wvl_coord[idx - 1]
        wvl2 = wvl_coord[idx]

        fac = (wvl2 - wvl) / (wvl2 - wvl1)
        raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :]
        tic = time.time()

        smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]]
        smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]]
        orders = [
            len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)]
        f_3d_grid = atleast_2d(raylwvl.ravel())

        if HAVE_DASK and isinstance(smin[0], Array):
            # compute all of these at the same time before passing to the interpolator
            # otherwise they are computed separately
            smin, smax, orders, f_3d_grid = da.compute(smin, smax, orders, f_3d_grid)
        minterp = MultilinearInterpolator(smin, smax, orders)
        minterp.set_values(f_3d_grid)

        if HAVE_DASK:
            ipn = map_blocks(self._do_interp, minterp, sunzsec, azidiff,
                             satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks)
        else:
            ipn = self._do_interp(minterp, sunzsec, azidiff, satzsec)

        LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic))

        ipn *= 100
        res = ipn
        if redband is not None:
            res = where(redband < 20., res,
                        (1 - (redband - 20) / 80) * res)

        res = clip(res, 0, 100)
        if compute:
            res = res.compute()
        return res
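The wavelength handling above selects the two LUT slices bracketing the effective wavelength and blends them linearly. A tiny sketch of that step in isolation, with a made-up wavelength grid and a scalar stand-in for the 4-D `rayl` table:

import numpy as np

wvl_coord = np.array([440.0, 470.0, 500.0, 550.0, 640.0, 860.0])  # hypothetical grid (nm)
rayl = np.arange(len(wvl_coord), dtype=float)                      # stand-in for the LUT

wvl = 531.0                                     # effective wavelength to interpolate at
idx = np.searchsorted(wvl_coord, wvl)           # first grid point >= wvl
wvl1, wvl2 = wvl_coord[idx - 1], wvl_coord[idx]

fac = (wvl2 - wvl) / (wvl2 - wvl1)              # weight of the lower slice
raylwvl = fac * rayl[idx - 1] + (1 - fac) * rayl[idx]
print(wvl1, wvl2, round(fac, 2), raylwvl)       # 500.0 550.0 0.38 2.62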
Beispiel #49
0
def estimate_image_shift(ref, image, roi=None, sobel=True,
                         medfilter=True, hanning=True, plot=False,
                         dtype='float', normalize_corr=False,
                         sub_pixel_factor=1,
                         return_maxval=True):
    """Estimate the shift in a image using phase correlation

    This method can only estimate the shift by comparing
    bidimensional features that should not change the position
    in the given axis. To decrease the memory usage, the time of
    computation and the accuracy of the results it is convenient
    to select a region of interest by setting the roi keyword.

    Parameters
    ----------

    ref : 2D numpy.ndarray
        Reference image
    image : 2D numpy.ndarray
        Image to register
    roi : tuple of ints (top, bottom, left, right)
        Define the region of interest
    sobel : bool
        Apply a Sobel filter for edge enhancement
    medfilter : bool
        Apply a median filter for noise reduction
    hanning : bool
        Apply a 2D Hanning filter
    plot : bool | matplotlib.Figure
        If True, plots the images after applying the filters and the phase
        correlation. If a figure instance, the images will be plotted to the
        given figure.
    reference : 'current' | 'cascade'
        If 'current' (default) the image at the current
        coordinates is taken as reference. If 'cascade' each image
        is aligned with the previous one.
    dtype : str or dtype
        Typecode or data-type in which the calculations must be
        performed.
    normalize_corr : bool
        If True use phase correlation instead of standard correlation
    sub_pixel_factor : float
        Estimate shifts with a sub-pixel accuracy of 1/sub_pixel_factor parts
        of a pixel. Default is 1, i.e. no sub-pixel accuracy.

    Returns
    -------

    shifts : np.array
        The estimated shifts
    max_value : float
        The maximum value of the correlation

    Notes
    -----

    The statistical analysis approach to the translation estimation
    when using `reference`='stat' roughly follows [1]_ . If you use
    it please cite their article.

    References
    ----------

    .. [1] Bernhard Schaffer, Werner Grogger and Gerald
        Kothleitner. “Automated Spatial Drift Correction for EFTEM
        Image Series.”
        Ultramicroscopy 102, no. 1 (December 2004): 27–36.


    """

    ref, image = da.compute(ref, image)
    # Make a copy of the images to avoid modifying them
    ref = ref.copy().astype(dtype)
    image = image.copy().astype(dtype)
    if roi is not None:
        top, bottom, left, right = roi
    else:
        top, bottom, left, right = [None, ] * 4

    # Select region of interest
    ref = ref[top:bottom, left:right]
    image = image[top:bottom, left:right]

    # Apply filters
    for im in (ref, image):
        if hanning is True:
            im *= hanning2d(*im.shape)
        if medfilter is True:
            im[:] = sp.signal.medfilt(im)
        if sobel is True:
            im[:] = sobel_filter(im)
    phase_correlation, image_product = fft_correlation(
        ref, image, normalize=normalize_corr)

    # Estimate the shift by getting the coordinates of the maximum
    argmax = np.unravel_index(np.argmax(phase_correlation),
                              phase_correlation.shape)
    threshold = (phase_correlation.shape[0] / 2 - 1,
                 phase_correlation.shape[1] / 2 - 1)
    shift0 = argmax[0] if argmax[0] < threshold[0] else  \
        argmax[0] - phase_correlation.shape[0]
    shift1 = argmax[1] if argmax[1] < threshold[1] else \
        argmax[1] - phase_correlation.shape[1]
    max_val = phase_correlation.real.max()
    shifts = np.array((shift0, shift1))

    # The following code is more or less copied from
    # skimage.feature.register_translation, to gain access to the maximum value:
    if sub_pixel_factor != 1:
        # Initial shift estimate in upsampled grid
        shifts = np.round(shifts * sub_pixel_factor) / sub_pixel_factor
        upsampled_region_size = np.ceil(sub_pixel_factor * 1.5)
        # Center of output array at dftshift + 1
        dftshift = np.fix(upsampled_region_size / 2.0)
        sub_pixel_factor = np.array(sub_pixel_factor, dtype=np.float64)
        normalization = (image_product.size * sub_pixel_factor ** 2)
        # Matrix multiply DFT around the current shift estimate
        sample_region_offset = dftshift - shifts * sub_pixel_factor
        correlation = _upsampled_dft(image_product.conj(),
                                     upsampled_region_size,
                                     sub_pixel_factor,
                                     sample_region_offset).conj()
        correlation /= normalization
        # Locate maximum and map back to original pixel grid
        maxima = np.array(np.unravel_index(
            np.argmax(np.abs(correlation)),
            correlation.shape),
            dtype=np.float64)
        maxima -= dftshift
        shifts = shifts + maxima / sub_pixel_factor
        max_val = correlation.real.max()

    # Plot on demand
    if plot is True or isinstance(plot, plt.Figure):
        if isinstance(plot, plt.Figure):
            fig = plot
            axarr = plot.axes
            if len(axarr) < 3:
                for i in range(3):
                    fig.add_subplot(1, 3, i + 1)
                axarr = fig.axes
        else:
            fig, axarr = plt.subplots(1, 3)
        full_plot = len(axarr[0].images) == 0
        if full_plot:
            axarr[0].set_title('Reference')
            axarr[1].set_title('Image')
            axarr[2].set_title('Phase correlation')
            axarr[0].imshow(ref)
            axarr[1].imshow(image)
            d = (np.array(phase_correlation.shape) - 1) // 2
            extent = [-d[1], d[1], -d[0], d[0]]
            axarr[2].imshow(np.fft.fftshift(phase_correlation),
                            extent=extent)
            plt.show()
        else:
            axarr[0].images[0].set_data(ref)
            axarr[1].images[0].set_data(image)
            axarr[2].images[0].set_data(np.fft.fftshift(phase_correlation))
            # TODO: Renormalize images
            fig.canvas.draw_idle()
    # Free the memory. This is especially necessary if the input is a
    # memory map.
    del ref
    del image
    if return_maxval:
        return -shifts, max_val
    else:
        return -shifts
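The sub-pixel branch above is adapted from scikit-image. Where that dependency is acceptable, a comparable result can be obtained directly from `skimage.registration.phase_cross_correlation`, whose `upsample_factor` plays the role of `sub_pixel_factor`. A rough sketch with synthetic data (the exact return signature may vary slightly between scikit-image versions):

import numpy as np
from scipy.ndimage import shift as nd_shift
from skimage.registration import phase_cross_correlation

rng = np.random.default_rng(1)
ref = rng.random((64, 64))
# Apply a known sub-pixel shift; 'wrap' keeps the image periodic for FFT registration
moving = nd_shift(ref, shift=(2.25, -1.5), order=3, mode='wrap')

# Shifts are refined to 1/upsample_factor of a pixel
shift, error, phasediff = phase_cross_correlation(ref, moving, upsample_factor=20)
print(shift)   # approximately [-2.25  1.5], the shift that maps `moving` back onto `ref`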
Beispiel #50
0
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm='svd',
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=False,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
            lazy SVD decomposition from dask.
        output_dimension : int
            the number of significant components to keep. If None, keep all
            (only valid for SVD)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        if bounds:
            msg = (
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.")
            warnings.warn(msg, VisibleDeprecationWarning)
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if algorithm != "svd" and output_dimension is None:
            raise ValueError("With the %s the output_dimension "
                             "must be specified" % algorithm)
        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks
        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)
        elif algorithm != "svd":
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff = coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd
                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask is not None or signal_mask is not None:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD."
                        )
                    U, S, V = svd(self.data)
                    factors = V.T
                    explained_variance = S ** 2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition = False
            else:
                this_data = []
                try:
                    for chunk in progressbar(
                            self._block_iterator(
                                flat_signal=True,
                                get=get,
                                signal_mask=signal_mask,
                                navigation_mask=navigation_mask),
                            total=nblocks,
                            leave=True,
                            desc='Learn'):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a): return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a): return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a): return np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(
                        loadings,
                        ndim,
                        (output_dimension,),
                        nav_chunks).reshape((-1, output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
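A NumPy-only sketch of the Poissonian-noise normalization and the final rescaling performed above, with a plain SVD standing in for the incremental algorithms; the array sizes and the number of kept components are illustrative:

import numpy as np

rng = np.random.default_rng(2)
data = rng.poisson(lam=50.0, size=(100, 32)).astype(float)   # navigation x signal

aG = data.sum(axis=1)          # total counts per navigation position
bH = data.sum(axis=0)          # total counts per signal channel
coeff = np.sqrt(aG)[:, None] * np.sqrt(bH)[None, :]
coeff[coeff == 0] = 1.0
scaled = data / coeff          # decompose this instead of the raw data

factors = np.linalg.svd(scaled, full_matrices=False)[2].T[:, :3]
loadings = scaled @ factors

# Undo the normalization exactly as in the last two lines above
factors = factors * np.sqrt(bH)[:, np.newaxis]
loadings = loadings * np.sqrt(aG)[:, np.newaxis]
print(factors.shape, loadings.shape)   # (32, 3) (100, 3)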