def get_observer_look(sat_lon, sat_lat, sat_alt, utc_time, lon, lat, alt):
    """Calculate the observer's look angle to a satellite.

    http://celestrak.com/columns/v02n02/

    utc_time: Observation time (datetime object)
    lon: Longitude of observer position on ground in degrees east
    lat: Latitude of observer position on ground in degrees north
    alt: Altitude above sea-level (geoid) of observer position on ground in km

    Return: (Azimuth, Elevation)
    """
    (pos_x, pos_y, pos_z), (vel_x, vel_y, vel_z) = astronomy.observer_position(
        utc_time, sat_lon, sat_lat, sat_alt)

    (opos_x, opos_y, opos_z), (ovel_x, ovel_y, ovel_z) = \
        astronomy.observer_position(utc_time, lon, lat, alt)

    lon = np.deg2rad(lon)
    lat = np.deg2rad(lat)

    theta = (astronomy.gmst(utc_time) + lon) % (2 * np.pi)

    rx = pos_x - opos_x
    ry = pos_y - opos_y
    rz = pos_z - opos_z

    sin_lat = np.sin(lat)
    cos_lat = np.cos(lat)
    sin_theta = np.sin(theta)
    cos_theta = np.cos(theta)

    top_s = sin_lat * cos_theta * rx + \
        sin_lat * sin_theta * ry - cos_lat * rz
    top_e = -sin_theta * rx + cos_theta * ry
    top_z = cos_lat * cos_theta * rx + \
        cos_lat * sin_theta * ry + sin_lat * rz

    az_ = np.arctan(-top_e / top_s)

    if has_xarray and isinstance(az_, xr.DataArray):
        az_data = az_.data
    else:
        az_data = az_

    if has_dask and isinstance(az_data, da.Array):
        az_data = da.where(top_s > 0, az_data + np.pi, az_data)
        az_data = da.where(az_data < 0, az_data + 2 * np.pi, az_data)
    else:
        az_data[np.where(top_s > 0)] += np.pi
        az_data[np.where(az_data < 0)] += 2 * np.pi

    if has_xarray and isinstance(az_, xr.DataArray):
        az_.data = az_data
    else:
        az_ = az_data

    rg_ = np.sqrt(rx * rx + ry * ry + rz * rz)
    el_ = np.arcsin(top_z / rg_)

    return np.rad2deg(az_), np.rad2deg(el_)
def test_where_incorrect_args():
    a = da.ones(5, chunks=3)

    for kwd in ["x", "y"]:
        kwargs = {kwd: a}
        try:
            da.where(a > 0, **kwargs)
        except ValueError as e:
            assert 'either both or neither of x and y should be given' in str(e)
def _mask_coordinates_dask(lons, lats):
    """Mask invalid coordinate values"""
    # lons = da.ravel(lons)
    # lats = da.ravel(lats)
    idxs = ((lons < -180.) | (lons > 180.) |
            (lats < -90.) | (lats > 90.))
    lons = da.where(idxs, np.nan, lons)
    lats = da.where(idxs, np.nan, lats)

    return lons, lats
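# A minimal usage sketch for the helper above; the coordinate values are made
# up for illustration. Any element with an out-of-range longitude or latitude
# becomes NaN in both outputs.
import numpy as np
import dask.array as da

lons = da.from_array(np.array([-200., -90., 0., 190.]), chunks=2)
lats = da.from_array(np.array([10., 95., -45., 20.]), chunks=2)
masked_lons, masked_lats = _mask_coordinates_dask(lons, lats)
print(masked_lons.compute())  # [nan nan   0. nan]
print(masked_lats.compute())  # [nan nan -45. nan]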
def test_where_scalar_dtype():
    x = np.int32(3)
    y1 = np.array([4, 5, 6], dtype=np.int16)
    c1 = np.array([1, 0, 1])
    y2 = da.from_array(y1, chunks=2)
    c2 = da.from_array(c1, chunks=2)
    w1 = np.where(c1, x, y1)
    w2 = da.where(c2, x, y2)
    assert_eq(w1, w2)
    # Test again for the bool optimization
    w3 = np.where(True, x, y1)
    w4 = da.where(True, x, y1)
    assert_eq(w3, w4)
def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")
    n = a.shape[axis]  # noqa; for bias
    m2 = moment(a, 2, axis)
    m4 = moment(a, 4, axis)
    zero = (m2 == 0)
    olderr = np.seterr(all='ignore')
    try:
        vals = da.where(zero, 0, m4 / m2**2.0)
    finally:
        np.seterr(**olderr)

    if not bias:
        # need a version of np.place
        raise NotImplementedError("bias=False is not implemented.")

    if vals.ndim == 0:
        return vals
        # TODO: scalar
        # vals = vals.item()  # array scalar

    if fisher:
        return vals - 3
    else:
        return vals
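# A minimal usage sketch, assuming the function above is the lazy kurtosis
# from dask.array.stats (it mirrors scipy.stats.kurtosis): the result is a
# dask array and only evaluates on compute().
import numpy as np
import dask.array as da
from dask.array.stats import kurtosis as da_kurtosis
from scipy.stats import kurtosis as sp_kurtosis

x = np.random.RandomState(0).normal(size=10_000)
d = da.from_array(x, chunks=1_000)
np.testing.assert_allclose(da_kurtosis(d).compute(), sp_kurtosis(x), rtol=1e-6)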
def periodic_distance(a, b, periodic):
    '''Periodic distance between two arrays.

    Periodic is a 3 dimensional array containing the 3 box sizes.
    '''
    delta = abs(a - b)
    delta = da.where(delta > 0.5 * periodic, periodic - delta, delta)
    return da.sqrt((delta ** 2).sum(axis=-1))
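# A minimal usage sketch with made-up coordinates: distances wrap around the
# periodic box, so points near opposite faces are close to each other.
import numpy as np
import dask.array as da

box = np.array([10.0, 10.0, 10.0])  # box sizes in x, y, z
p1 = da.from_array(np.array([[0.5, 0.5, 0.5]]), chunks=1)
p2 = da.from_array(np.array([[9.5, 0.5, 0.5]]), chunks=1)
print(periodic_distance(p1, p2, box).compute())  # [1.0], not [9.0]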
def _unequal_var_ttest_denom(v1, n1, v2, n2):
    vn1 = v1 / n1
    vn2 = v2 / n2
    with np.errstate(divide='ignore', invalid='ignore'):
        df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

    # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
    # Hence it doesn't matter what df is as long as it's not NaN.
    df = da.where(da.isnan(df), 1, df)  # XXX: np -> da
    denom = da.sqrt(vn1 + vn2)
    return df, denom
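# A minimal usage sketch: the Welch-Satterthwaite degrees of freedom and the
# t-statistic denominator for two samples with unequal variances, fed with
# lazy per-group sample variances. The numbers are illustrative only.
import numpy as np
import dask.array as da

v1 = da.from_array(np.array([2.5]), chunks=1)  # sample variance, group 1
v2 = da.from_array(np.array([4.0]), chunks=1)  # sample variance, group 2
df, denom = _unequal_var_ttest_denom(v1, 30, v2, 40)
print(df.compute(), denom.compute())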
def _solve_quadratic_dask(a__, b__, c__, min_val=0.0, max_val=1.0):
    """Solve quadratic equation and return the valid roots from interval
    [*min_val*, *max_val*]

    """
    discriminant = b__ * b__ - 4 * a__ * c__

    # Solve the quadratic polynomial
    x_1 = (-b__ + da.sqrt(discriminant)) / (2 * a__)
    x_2 = (-b__ - da.sqrt(discriminant)) / (2 * a__)

    # Find valid solutions, ie. 0 <= t <= 1
    idxs = (x_1 < min_val) | (x_1 > max_val)
    x__ = da.where(idxs, x_2, x_1)

    idxs = (x__ < min_val) | (x__ > max_val)
    x__ = da.where(idxs, np.nan, x__)

    return x__
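# A minimal usage sketch with illustrative coefficients: when the first root
# lies outside [min_val, max_val] the second one is taken, and NaN is
# returned when neither root fits the interval.
import numpy as np
import dask.array as da

a = da.from_array(np.array([1.0, 1.0]), chunks=1)
b = da.from_array(np.array([-1.0, -5.0]), chunks=1)
c = da.from_array(np.array([0.21, 6.0]), chunks=1)
print(_solve_quadratic_dask(a, b, c).compute())  # [0.7 nan]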
def _get_ts_parallellogram_dask(pt_1, pt_2, pt_3, out_y, out_x):
    """Get parameters for the case where uprights are parallel"""
    # Pairwise longitudinal separations between reference points
    x_21 = pt_2[:, 0] - pt_1[:, 0]
    x_31 = pt_3[:, 0] - pt_1[:, 0]

    # Pairwise latitudinal separations between reference points
    y_21 = pt_2[:, 1] - pt_1[:, 1]
    y_31 = pt_3[:, 1] - pt_1[:, 1]

    t__ = (x_21 * (out_y - pt_1[:, 1]) -
           y_21 * (out_x - pt_1[:, 0])) / (x_21 * y_31 - y_21 * x_31)
    idxs = (t__ < 0.) | (t__ > 1.)
    t__ = da.where(idxs, np.nan, t__)

    s__ = (out_x - pt_1[:, 0] + x_31 * t__) / x_21
    idxs = (s__ < 0.) | (s__ > 1.)
    s__ = da.where(idxs, np.nan, s__)

    return t__, s__
def _get_ts_dask(pt_1, pt_2, pt_3, pt_4, out_x, out_y):
    """Calculate vertical and horizontal fractional distances t and s"""

    # General case, ie. where the corners form an irregular rectangle
    t__, s__ = _get_ts_irregular_dask(pt_1, pt_2, pt_3, pt_4, out_y, out_x)

    # Cases where verticals are parallel
    idxs = da.isnan(t__) | da.isnan(s__)
    # Remove extra dimensions
    idxs = da.ravel(idxs)

    if da.any(idxs):
        t_new, s_new = _get_ts_uprights_parallel_dask(pt_1, pt_2,
                                                      pt_3, pt_4,
                                                      out_y, out_x)

        t__ = da.where(idxs, t_new, t__)
        s__ = da.where(idxs, s_new, s__)

    # Cases where both verticals and horizontals are parallel
    idxs = da.isnan(t__) | da.isnan(s__)
    # Remove extra dimensions
    idxs = da.ravel(idxs)

    if da.any(idxs):
        t_new, s_new = _get_ts_parallellogram_dask(pt_1, pt_2, pt_3,
                                                   out_y, out_x)

        t__ = da.where(idxs, t_new, t__)
        s__ = da.where(idxs, s_new, s__)

    idxs = (t__ < 0) | (t__ > 1) | (s__ < 0) | (s__ > 1)
    t__ = da.where(idxs, np.nan, t__)
    s__ = da.where(idxs, np.nan, s__)

    return t__, s__
def test_where_nonzero():
    for shape, chunks in [(0, ()), ((0, 0), (0, 0)), ((15, 16), (4, 5))]:
        x = np.random.randint(10, size=shape)
        d = da.from_array(x, chunks=chunks)

        x_w = np.where(x)
        d_w = da.where(d)

        assert isinstance(d_w, type(x_w))
        assert len(d_w) == len(x_w)

        for i in range(len(x_w)):
            assert_eq(d_w[i], x_w[i])
def _solve_another_fractional_distance_dask(f__, y_1, y_2, y_3, y_4, out_y):
    """Solve parameter t__ from s__, or vice versa.  For solving s__,
    switch order of y_2 and y_3."""
    y_21 = y_2 - y_1
    y_43 = y_4 - y_3

    g__ = ((out_y - y_1 - y_21 * f__) /
           (y_3 + y_43 * f__ - y_1 - y_21 * f__))

    # Limit values to interval [0, 1]
    idxs = (g__ < 0) | (g__ > 1)
    g__ = da.where(idxs, np.nan, g__)

    return g__
def test_where_bool_optimization():
    x = np.random.randint(10, size=(15, 16))
    d = da.from_array(x, chunks=(4, 5))
    y = np.random.randint(10, size=(15, 16))
    e = da.from_array(y, chunks=(4, 5))

    for c in [True, False, np.True_, np.False_, 1, 0]:
        w1 = da.where(c, d, e)
        w2 = np.where(c, x, y)
        assert_eq(w1, w2)

        ex_w1 = d if c else e
        assert w1 is ex_w1
def test_where():
    x = np.random.randint(10, size=(15, 14))
    x[5, 5] = x[4, 4] = 0  # Ensure some false elements
    d = da.from_array(x, chunks=(4, 5))
    y = np.random.randint(10, size=15).astype(np.uint8)
    e = da.from_array(y, chunks=(4,))

    for c1, c2 in [(d > 5, x > 5),
                   (d, x),
                   (1, 1),
                   (0, 0),
                   (5, 5),
                   (True, True),
                   (np.True_, np.True_),
                   (False, False),
                   (np.False_, np.False_)]:
        for b1, b2 in [(0, 0), (-e[:, None], -y[:, None]), (e[:14], y[:14])]:
            w1 = da.where(c1, d, b1)
            w2 = np.where(c2, x, b2)
            assert_eq(w1, w2)
def skew(a, axis=0, bias=True, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")

    n = a.shape[axis]  # noqa; for bias
    m2 = moment(a, 2, axis)
    m3 = moment(a, 3, axis)
    zero = (m2 == 0)
    vals = da.where(~zero, m3 / m2**1.5, 0.)
    # vals = da.where(~zero, (m2, m3),
    #                 lambda m2, m3: m3 / m2**1.5,
    #                 0.)
    if not bias:
        # Need a version of np.place
        raise NotImplementedError("bias=False is not implemented.")

    if vals.ndim == 0:
        return vals
        # TODO: scalar
        # return vals.item()

    return vals
def _residual(ms, stack, **kw): args = OmegaConf.create(kw) OmegaConf.set_struct(args, True) pyscilog.log_to_file(args.output_filename + '.log') pyscilog.enable_memory_logging(level=3) # number of threads per worker if args.nthreads is None: if args.host_address is not None: raise ValueError( "You have to specify nthreads when using a distributed scheduler" ) import multiprocessing nthreads = multiprocessing.cpu_count() args.nthreads = nthreads else: nthreads = args.nthreads # configure memory limit if args.mem_limit is None: if args.host_address is not None: raise ValueError( "You have to specify mem-limit when using a distributed scheduler" ) import psutil mem_limit = int(psutil.virtual_memory()[0] / 1e9) # 100% of memory by default args.mem_limit = mem_limit else: mem_limit = args.mem_limit nband = args.nband if args.nworkers is None: nworkers = nband args.nworkers = nworkers else: nworkers = args.nworkers if args.nthreads_per_worker is None: nthreads_per_worker = 1 args.nthreads_per_worker = nthreads_per_worker else: nthreads_per_worker = args.nthreads_per_worker # the number of chunks being read in simultaneously is equal to # the number of dask threads nthreads_dask = nworkers * nthreads_per_worker if args.ngridder_threads is None: if args.host_address is not None: ngridder_threads = nthreads // nthreads_per_worker else: ngridder_threads = nthreads // nthreads_dask args.ngridder_threads = ngridder_threads else: ngridder_threads = args.ngridder_threads ms = list(ms) print('Input Options:', file=log) for key in kw.keys(): print(' %25s = %s' % (key, args[key]), file=log) # numpy imports have to happen after this step from pfb import set_client set_client(nthreads, mem_limit, nworkers, nthreads_per_worker, args.host_address, stack, log) import numpy as np from pfb.utils.misc import chan_to_band_mapping import dask from dask.graph_manipulation import clone from dask.distributed import performance_report from daskms import xds_from_storage_ms as xds_from_ms from daskms import xds_from_storage_table as xds_from_table import dask.array as da from africanus.constants import c as lightspeed from africanus.gridding.wgridder.dask import residual as im2residim from ducc0.fft import good_size from pfb.utils.misc import stitch_images, plan_row_chunk from pfb.utils.fits import set_wcs, save_fits # chan <-> band mapping freqs, freq_bin_idx, freq_bin_counts, freq_out, band_mapping, chan_chunks = chan_to_band_mapping( ms, nband=nband) # gridder memory budget max_chan_chunk = 0 max_freq = 0 for ims in ms: for spw in freqs[ims]: counts = freq_bin_counts[ims][spw].compute() freq = freqs[ims][spw].compute() max_chan_chunk = np.maximum(max_chan_chunk, counts.max()) max_freq = np.maximum(max_freq, freq.max()) # assumes measurement sets have the same columns, # number of correlations etc. 
xds = xds_from_ms(ms[0]) ncorr = xds[0].dims['corr'] nrow = xds[0].dims['row'] data_bytes = getattr(xds[0], args.data_column).data.itemsize bytes_per_row = max_chan_chunk * ncorr * data_bytes memory_per_row = bytes_per_row # real valued weights wdims = getattr(xds[0], args.weight_column).data.ndim if wdims == 2: # WEIGHT memory_per_row += ncorr * data_bytes / 2 else: # WEIGHT_SPECTRUM memory_per_row += bytes_per_row / 2 # flags (uint8 or bool) memory_per_row += np.dtype(np.uint8).itemsize * max_chan_chunk * ncorr # UVW memory_per_row += xds[0].UVW.data.itemsize * 3 # ANTENNA1/2 memory_per_row += xds[0].ANTENNA1.data.itemsize * 2 columns = (args.data_column, args.weight_column, args.flag_column, 'UVW', 'ANTENNA1', 'ANTENNA2') # flag row if 'FLAG_ROW' in xds[0]: columns += ('FLAG_ROW', ) memory_per_row += xds[0].FLAG_ROW.data.itemsize # imaging weights if args.imaging_weight_column is not None: columns += (args.imaging_weight_column, ) memory_per_row += bytes_per_row / 2 # Mueller term (complex valued) if args.mueller_column is not None: columns += (args.mueller_column, ) memory_per_row += bytes_per_row # get max uv coords over all fields uvw = [] u_max = 0.0 v_max = 0.0 for ims in ms: xds = xds_from_ms(ims, columns=('UVW'), chunks={'row': -1}) for ds in xds: uvw = ds.UVW.data u_max = da.maximum(u_max, abs(uvw[:, 0]).max()) v_max = da.maximum(v_max, abs(uvw[:, 1]).max()) uv_max = da.maximum(u_max, v_max) uv_max = uv_max.compute() del uvw # image size cell_N = 1.0 / (2 * uv_max * max_freq / lightspeed) if args.cell_size is not None: cell_size = args.cell_size cell_rad = cell_size * np.pi / 60 / 60 / 180 if cell_N / cell_rad < 1: raise ValueError( "Requested cell size too small. " "Super resolution factor = ", cell_N / cell_rad) print("Super resolution factor = %f" % (cell_N / cell_rad), file=log) else: cell_rad = cell_N / args.super_resolution_factor cell_size = cell_rad * 60 * 60 * 180 / np.pi print("Cell size set to %5.5e arcseconds" % cell_size, file=log) if args.nx is None: fov = args.field_of_view * 3600 npix = int(fov / cell_size) if npix % 2: npix += 1 nx = good_size(npix) ny = good_size(npix) else: nx = args.nx ny = args.ny if args.ny is not None else nx print("Image size set to (%i, %i, %i)" % (nband, nx, ny), file=log) # get approx image size # this is not a conservative estimate when multiple SPW's map to a single # imaging band pixel_bytes = np.dtype(args.output_type).itemsize band_size = nx * ny * pixel_bytes if args.host_address is None: # full image on single node row_chunk = plan_row_chunk(mem_limit / nworkers, band_size, nrow, memory_per_row, nthreads_per_worker) else: # single band per node row_chunk = plan_row_chunk(mem_limit, band_size, nrow, memory_per_row, nthreads_per_worker) if args.row_chunks is not None: row_chunk = int(args.row_chunks) if row_chunk == -1: row_chunk = nrow print( "nrows = %i, row chunks set to %i for a total of %i chunks per node" % (nrow, row_chunk, int(np.ceil(nrow / row_chunk))), file=log) chunks = {} for ims in ms: chunks[ims] = [] # xds_from_ms expects a list per ds for spw in freqs[ims]: chunks[ims].append({ 'row': row_chunk, 'chan': chan_chunks[ims][spw]['chan'] }) dirties = [] radec = None # assumes we are only imaging field 0 of first MS for ims in ms: xds = xds_from_ms(ims, chunks=chunks[ims], columns=columns) # subtables ddids = xds_from_table(ims + "::DATA_DESCRIPTION") fields = xds_from_table(ims + "::FIELD") spws = xds_from_table(ims + "::SPECTRAL_WINDOW") pols = xds_from_table(ims + "::POLARIZATION") # subtable data ddids = 
dask.compute(ddids)[0] fields = dask.compute(fields)[0] spws = dask.compute(spws)[0] pols = dask.compute(pols)[0] for ds in xds: field = fields[ds.FIELD_ID] # check fields match if radec is None: radec = field.PHASE_DIR.data.squeeze() if not np.array_equal(radec, field.PHASE_DIR.data.squeeze()): continue # this is not correct, need to use spw spw = ds.DATA_DESC_ID uvw = clone(ds.UVW.data) data = getattr(ds, args.data_column).data dataxx = data[:, :, 0] datayy = data[:, :, -1] weights = getattr(ds, args.weight_column).data if len(weights.shape) < 3: weights = da.broadcast_to(weights[:, None, :], data.shape, chunks=data.chunks) if args.imaging_weight_column is not None: imaging_weights = getattr(ds, args.imaging_weight_column).data if len(imaging_weights.shape) < 3: imaging_weights = da.broadcast_to(imaging_weights[:, None, :], data.shape, chunks=data.chunks) weightsxx = imaging_weights[:, :, 0] * weights[:, :, 0] weightsyy = imaging_weights[:, :, -1] * weights[:, :, -1] else: weightsxx = weights[:, :, 0] weightsyy = weights[:, :, -1] # apply adjoint of mueller term. # Phases modify data amplitudes modify weights. if args.mueller_column is not None: mueller = getattr(ds, args.mueller_column).data dataxx *= da.exp(-1j * da.angle(mueller[:, :, 0])) datayy *= da.exp(-1j * da.angle(mueller[:, :, -1])) weightsxx *= da.absolute(mueller[:, :, 0]) weightsyy *= da.absolute(mueller[:, :, -1]) # weighted sum corr to Stokes I weights = weightsxx + weightsyy data = (weightsxx * dataxx + weightsyy * datayy) # TODO - turn off this stupid warning data = da.where(weights, data / weights, 0.0j) # MS may contain auto-correlations if 'FLAG_ROW' in xds[0]: frow = ds.FLAG_ROW.data | (ds.ANTENNA1.data == ds.ANTENNA2.data) else: frow = (ds.ANTENNA1.data == ds.ANTENNA2.data) # only keep data where both corrs are unflagged flag = getattr(ds, args.flag_column).data flagxx = flag[:, :, 0] flagyy = flag[:, :, -1] # ducc0 uses uint8 mask not flag mask = ~da.logical_or((flagxx | flagyy), frow[:, None]) dirty = vis2im(uvw, freqs[ims][spw], data, freq_bin_idx[ims][spw], freq_bin_counts[ims][spw], nx, ny, cell_rad, weights=weights, flag=mask.astype(np.uint8), nthreads=ngridder_threads, epsilon=args.epsilon, do_wstacking=args.wstack, double_accum=args.double_accum) dirties.append(dirty) # dask.visualize(dirties, filename=args.output_filename + '_graph.pdf', optimize_graph=False) if not args.mock: # result = dask.compute(dirties, wsum, optimize_graph=False) with performance_report(filename=args.output_filename + '_per.html'): result = dask.compute(dirties, optimize_graph=False) dirties = result[0] dirty = stitch_images(dirties, nband, band_mapping) hdr = set_wcs(cell_size / 3600, cell_size / 3600, nx, ny, radec, freq_out) save_fits(args.output_filename + '_dirty.fits', dirty, hdr, dtype=args.output_type) print("All done here.", file=log)
def from_lon_360(lon_var: Union[np.ndarray, da.Array, xr.DataArray]):
    if isinstance(lon_var, xr.DataArray):
        return lon_var.where(lon_var <= 180.0, lon_var - 360.0)
    else:
        lon_var = da.asarray(lon_var)
        return da.where(lon_var <= 180.0, lon_var, lon_var - 360.0)
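# A minimal usage sketch: converting longitudes from the 0..360 convention to
# -180..180, for both plain/dask arrays and xarray DataArrays.
import numpy as np
import xarray as xr

print(from_lon_360(np.array([0.0, 90.0, 180.0, 270.0])).compute())
# [  0.  90. 180. -90.]
print(from_lon_360(xr.DataArray([350.0, 10.0])).values)
# [-10.  10.]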
def execute_node_nullif_scalar_series(op, value, series, **kwargs):
    # TODO - not preserving the index
    return dd.from_array(da.where(series.eq(value).values, np.nan, value))
def __call__(self, signal, out=None, axes=None): """Slice the signal according to the ROI, and return it. Arguments --------- signal : Signal The signal to slice with the ROI. out : Signal, default = None If the 'out' argument is supplied, the sliced output will be put into this instead of returning a Signal. See Signal.__getitem__() for more details on 'out'. axes : specification of axes to use, default = None The axes argument specifies which axes the ROI will be applied on. The items in the collection can be either of the following: * a tuple of: - DataAxis. These will not be checked with signal.axes_manager. - anything that will index signal.axes_manager * For any other value, it will check whether the navigation space can fit the right number of axis, and use that if it fits. If not, it will try the signal space. """ if axes is None and signal in self.signal_map: axes = self.signal_map[signal][1] else: axes = self._parse_axes(axes, signal.axes_manager) natax = signal.axes_manager._get_axes_in_natural_order() # Slice original data with a circumscribed rectangle cx = self.cx + 0.5001 * axes[0].scale cy = self.cy + 0.5001 * axes[1].scale ranges = [[cx - self.r, cx + self.r], [cy - self.r, cy + self.r]] slices = self._make_slices(natax, axes, ranges) ir = [slices[natax.index(axes[0])], slices[natax.index(axes[1])]] vx = axes[0].axis[ir[0]] - cx vy = axes[1].axis[ir[1]] - cy gx, gy = np.meshgrid(vx, vy) gr = gx**2 + gy**2 mask = gr > self.r**2 if self.r_inner != t.Undefined: mask |= gr < self.r_inner**2 tiles = [] shape = [] chunks = [] for i in range(len(slices)): if signal._lazy: chunks.append(signal.data.chunks[i][0]) if i == natax.index(axes[0]): thisshape = mask.shape[0] tiles.append(thisshape) shape.append(thisshape) elif i == natax.index(axes[1]): thisshape = mask.shape[1] tiles.append(thisshape) shape.append(thisshape) else: tiles.append(signal.axes_manager._axes[i].size) shape.append(1) mask = mask.reshape(shape) nav_axes = [ax.navigate for ax in axes] nav_dim = signal.axes_manager.navigation_dimension if True in nav_axes: if False in nav_axes: slicer = signal.inav[slices[:nav_dim]].isig.__getitem__ slices = slices[nav_dim:] else: slicer = signal.inav.__getitem__ slices = slices[0:nav_dim] else: slicer = signal.isig.__getitem__ slices = slices[nav_dim:] roi = slicer(slices, out=out) roi = out or roi if roi._lazy: import dask.array as da mask = da.from_array(mask, chunks=chunks) mask = da.broadcast_to(mask, tiles) # By default promotes dtype to float if required roi.data = da.where(mask, np.nan, roi.data) else: mask = np.broadcast_to(mask, tiles) roi.data = np.ma.masked_array(roi.data, mask, hard_mask=True) if out is None: return roi else: out.events.data_changed.trigger(out)
def decomposition(self, normalize_poissonian_noise=False, algorithm='svd', output_dimension=None, signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=False, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd', lazy SVD decomposition from dask. output_dimension : int the number of significant components to keep. If None, keep all (only valid for SVD) get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ if bounds: msg = ("The `bounds` keyword is deprecated and will be removed " "in v2.0. Since version > 1.3 this has no effect.") warnings.warn(msg, VisibleDeprecationWarning) explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if algorithm != "svd" and output_dimension is None: raise ValueError("With the %s the output_dimension " "must be specified" % algorithm) if output_dimension and blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks # LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) elif algorithm != "svd": raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros(self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array(navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros(self.axes_manager.signal_shape[::-1], 
chunks=sig_chunks) if signal_mask is None else to_array(signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=tuple(range(ndim))), data.sum(axis=tuple(range(ndim, ndim + sdim)))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, ) * rbH.ndim] *\ rbH[(None, ) * raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # LEARN if algorithm == "svd": reproject = False from dask.array.linalg import svd try: self._unfolded4decomposition = self.unfold() # TODO: implement masking if navigation_mask or signal_mask: raise NotImplemented( "Masking is not yet implemented for lazy SVD.") U, S, V = svd(self.data) factors = V.T explained_variance = S**2 / self.data.shape[0] loadings = U * S finally: if self._unfolded4decomposition is True: self.fold() self._unfolded4decomposition is False else: this_data = [] try: for chunk in progressbar(self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform def post(a): return np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] def post(a): return obj.finish()[4] elif algorithm == 'ONMF': method = obj.project def post(a): return np.concatenate(a, axis=1).T _map = map( lambda thing: method(thing), self._block_iterator(flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar(_map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension if algorithm != "svd": # Only needed for online algorithms try: loadings = _reshuffle_mixed_blocks(loadings, ndim, (output_dimension, ), nav_chunks).reshape( (-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension if algorithm != "svd": target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio # Rescale the results if the noise was normalized if normalize_poissonian_noise is True: target.factors = target.factors * rbH.ravel()[:, np.newaxis] target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
def norm_topo(self, data, elev, solar_za, solar_az, slope=None, aspect=None, method='empirical-rotation', slope_thresh=2, nodata=0, elev_nodata=-32768, scale_factor=1, angle_scale=0.01, n_jobs=1, robust=False, min_samples=100, slope_kwargs=None, aspect_kwargs=None, band_coeffs=None): """ Applies topographic normalization Args: data (2d or 3d DataArray): The data to normalize, in the range 0-1. elev (2d DataArray): The elevation data. solar_za (2d DataArray): The solar zenith angles (degrees). solar_az (2d DataArray): The solar azimuth angles (degrees). slope (2d DataArray): The slope data. If not given, slope is calculated from ``elev``. aspect (2d DataArray): The aspect data. If not given, aspect is calculated from ``elev``. method (Optional[str]): The method to apply. Choices are ['c', 'empirical-rotation']. slope_thresh (Optional[float or int]): The slope threshold. Any samples with values < ``slope_thresh`` are not adjusted. nodata (Optional[int or float]): The 'no data' value for ``data``. elev_nodata (Optional[float or int]): The 'no data' value for ``elev``. scale_factor (Optional[float]): A scale factor to apply to the input data. angle_scale (Optional[float]): The angle scale factor. n_jobs (Optional[int]): The number of parallel workers for ``LinearRegression.fit``. robust (Optional[bool]): Whether to fit a robust regression. min_samples (Optional[int]): The minimum number of samples required to fit a regression. slope_kwargs (Optional[dict]): Keyword arguments passed to ``gdal.DEMProcessingOptions`` to calculate the slope. aspect_kwargs (Optional[dict]): Keyword arguments passed to ``gdal.DEMProcessingOptions`` to calculate the aspect. band_coeffs (Optional[dict]): Slope and intercept coefficients for each band. References: See :cite:`teillet_etal_1982` for the C-correction method. See :cite:`tan_etal_2010` for the Empirical Rotation method. Returns: ``xarray.DataArray`` Examples: >>> import geowombat as gw >>> from geowombat.radiometry import Topo >>> >>> topo = Topo() >>> >>> # Example where pixel angles are stored in separate GeoTiff files >>> with gw.config.update(sensor='l7', scale_factor=0.0001, nodata=0): >>> >>> with gw.open('landsat.tif') as src, >>> gw.open('srtm') as elev, >>> gw.open('solarz.tif') as solarz, >>> gw.open('solara.tif') as solara: >>> >>> src_norm = topo.norm_topo(src, elev, solarz, solara, n_jobs=-1) """ method = method.strip().lower() if method not in ['c', 'empirical-rotation']: logger.exception( " Currently, the only supported methods are 'c' and 'empirical-rotation'." 
) raise NameError attrs = data.attrs.copy() if not nodata: nodata = data.gw.nodata if scale_factor == 1.0: scale_factor = data.gw.scale_factor # Scale the reflectance data if scale_factor != 1: data = data * scale_factor if not slope_kwargs: slope_kwargs = dict(format='MEM', computeEdges=True, alg='ZevenbergenThorne', slopeFormat='degree') if not aspect_kwargs: aspect_kwargs = dict(format='MEM', computeEdges=True, alg='ZevenbergenThorne', trigonometric=False, zeroForFlat=True) slope_kwargs['format'] = 'MEM' slope_kwargs['slopeFormat'] = 'degree' aspect_kwargs['format'] = 'MEM' # Force to SRTM resolution proc_dims = (int((data.gw.ncols * data.gw.cellx) / 30.0), int((data.gw.nrows * data.gw.celly) / 30.0)) w = int((5 * 30.0) / data.gw.celly) if w % 2 == 0: w += 1 if isinstance(slope, xr.DataArray): slope_deg_fd = slope.squeeze().data else: slope_deg = calc_slope_delayed(elev.squeeze().data, proc_dims=proc_dims, w=w, **slope_kwargs) slope_deg_fd = da.from_delayed(slope_deg, (data.gw.nrows, data.gw.ncols), dtype='float64') if isinstance(aspect, xr.DataArray): aspect_deg_fd = aspect.squeeze().data else: aspect_deg = calc_aspect_delayed(elev.squeeze().data, proc_dims=proc_dims, w=w, **aspect_kwargs) aspect_deg_fd = da.from_delayed(aspect_deg, (data.gw.nrows, data.gw.ncols), dtype='float64') nodata_samps = da.where( (elev.data == elev_nodata) | (data.max(dim='band').data == nodata) | (slope_deg_fd < slope_thresh), 1, 0) slope_rad = da.deg2rad(slope_deg_fd) aspect_rad = da.deg2rad(aspect_deg_fd) # Convert degrees to radians solar_za = da.deg2rad(solar_za.squeeze().data * angle_scale) solar_az = da.deg2rad(solar_az.squeeze().data * angle_scale) cos_z = da.cos(solar_za) # Calculate the illumination angle il = da.cos(slope_rad) * cos_z + da.sin(slope_rad) * da.sin( solar_za) * da.cos(solar_az - aspect_rad) sr_adj = list() for band in data.band.values.tolist(): if method == 'c': sr_adj.append( self._method_c( data.sel(band=band).data, il, cos_z, nodata_samps, min_samples, n_jobs, robust, band_coeffs, band)) else: sr_adj.append( self._method_empirical_rotation( data.sel(band=band).data, il, cos_z, nodata_samps, min_samples, n_jobs, robust, band_coeffs, band)) adj_data = xr.DataArray(data=da.concatenate(sr_adj).reshape( (data.gw.nbands, data.gw.nrows, data.gw.ncols)), coords={ 'band': data.band.values.tolist(), 'y': data.y.values, 'x': data.x.values }, dims=('band', 'y', 'x'), attrs=data.attrs) attrs['calibration'] = 'Topographic-adjusted' attrs['nodata'] = nodata attrs['drange'] = (0, 1) adj_data.attrs = attrs return adj_data
def decomposition(self, normalize_poissonian_noise=False, algorithm='svd', output_dimension=None, signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=False, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd', lazy SVD decomposition from dask. output_dimension : int the number of significant components to keep. If None, keep all (only valid for SVD) get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ if bounds: msg = ( "The `bounds` keyword is deprecated and will be removed " "in v2.0. Since version > 1.3 this has no effect.") warnings.warn(msg, VisibleDeprecationWarning) explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if algorithm != "svd" and output_dimension is None: raise ValueError("With the %s the output_dimension " "must be specified" % algorithm) if output_dimension and blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks # LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) elif algorithm != "svd": raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros( self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array( navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros( self.axes_manager.signal_shape[::-1], 
chunks=sig_chunks) if signal_mask is None else to_array( signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=tuple(range(ndim))), data.sum(axis=tuple(range(ndim, ndim + sdim)))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, ) * rbH.ndim] *\ rbH[(None, ) * raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # LEARN if algorithm == "svd": reproject = False from dask.array.linalg import svd try: self._unfolded4decomposition = self.unfold() # TODO: implement masking if navigation_mask or signal_mask: raise NotImplemented( "Masking is not yet implemented for lazy SVD." ) U, S, V = svd(self.data) factors = V.T explained_variance = S ** 2 / self.data.shape[0] loadings = U * S finally: if self._unfolded4decomposition is True: self.fold() self._unfolded4decomposition is False else: this_data = [] try: for chunk in progressbar( self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform def post(a): return np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] def post(a): return obj.finish()[4] elif algorithm == 'ONMF': method = obj.project def post(a): return np.concatenate(a, axis=1).T _map = map(lambda thing: method(thing), self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar( _map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension if algorithm != "svd": # Only needed for online algorithms try: loadings = _reshuffle_mixed_blocks( loadings, ndim, (output_dimension,), nav_chunks).reshape((-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension if algorithm != "svd": target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio # Rescale the results if the noise was normalized if normalize_poissonian_noise is True: target.factors = target.factors * rbH.ravel()[:, np.newaxis] target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
def _transform_array(image: da.Array,
                     scale: Tuple[float, ...],
                     offset: Tuple[float, ...],
                     shape: Tuple[int, ...],
                     chunks: Optional[Tuple[int, ...]],
                     spline_order: int,
                     recover_nan: bool) -> da.Array:
    """
    Apply affine transformation to ND-image.

    :param image: ND-image with shape (..., size_y, size_x)
    :param scale: Scaling factors (1, ..., 1, sy, sx)
    :param offset: Offset values (0, ..., 0, oy, ox)
    :param shape: (..., size_y, size_x)
    :param chunks: (..., chunk_size_y, chunk_size_x)
    :param spline_order: 0 ... 5
    :param recover_nan: True/False
    :return: Transformed ND-image.
    """
    assert_true(len(scale) == image.ndim, 'invalid scale')
    assert_true(len(offset) == image.ndim, 'invalid offset')
    assert_true(len(shape) == image.ndim, 'invalid shape')
    assert_true(chunks is None or len(chunks) == image.ndim,
                'invalid chunks')
    if _is_no_op(image, scale, offset, shape):
        return image
    # As of scipy 0.18, matrix = scale is no longer supported.
    # Therefore we use the diagonal matrix form here,
    # where scale is the diagonal.
    matrix = np.diag(scale)
    at_kwargs = dict(
        offset=offset,
        order=spline_order,
        output_shape=shape,
        output_chunks=chunks,
        mode='constant',
    )
    if recover_nan and spline_order > 0:
        # We can "recover" values that are neighbours to NaN values
        # that would otherwise become NaN too.
        mask = da.isnan(image)
        # First check if there are NaN values at all
        if da.any(mask):
            # Yes, then
            # 1. replace NaN by zero
            filled_im = da.where(mask, 0.0, image)
            # 2. transform the zero-filled image
            scaled_im = ndinterp.affine_transform(filled_im,
                                                  matrix,
                                                  **at_kwargs,
                                                  cval=0.0)
            # 3. transform the inverted mask
            scaled_norm = ndinterp.affine_transform(1.0 - mask,
                                                    matrix,
                                                    **at_kwargs,
                                                    cval=0.0)
            # 4. put back NaN where there was zero,
            #    otherwise decode using scaled mask
            return da.where(da.isclose(scaled_norm, 0.0),
                            np.nan, scaled_im / scaled_norm)

    # No dealing with NaN required
    return ndinterp.affine_transform(image, matrix, **at_kwargs, cval=np.nan)
def test_where_has_informative_error():
    x = da.ones(5, chunks=3)
    try:
        result = da.where(x > 0)
    except Exception as e:
        assert 'dask' in str(e)
def update_W_da(M, H, W):
    denominator = da.dot(W, da.dot(H, H.T))
    denominator_new = da.where(
        da.fabs(denominator) < EPSILON, EPSILON, denominator)
    W_new = W * da.dot(M, H.T) / denominator_new
    return W_new
def update_H_da(M, H, W):
    denominator = da.dot(W.T, da.dot(W, H))
    denominator_new = da.where(
        da.fabs(denominator) < EPSILON, EPSILON, denominator)
    H_new = H * da.dot(W.T, M) / denominator_new
    return H_new
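# A minimal sketch of how the two multiplicative-update steps above might be
# combined into a small non-negative matrix factorization loop. EPSILON, the
# rank, and the random initialisation are illustrative assumptions, not taken
# from the source.
import dask.array as da

EPSILON = 1e-12  # guard against division by zero (assumed value)
rank = 5

M = da.random.random((100, 80), chunks=(50, 40))    # data to factorize
W = da.random.random((100, rank), chunks=(50, rank))
H = da.random.random((rank, 80), chunks=(rank, 40))

for _ in range(20):
    W = update_W_da(M, H, W)
    H = update_H_da(M, H, W)

# Frobenius norm of the reconstruction error
residual = da.sqrt(((M - da.dot(W, H)) ** 2).sum()).compute()
print(residual)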
def read_band(self, key, info): """Read the data.""" tic = datetime.now() header = {} with open(self.filename, "rb") as fp_: header['block1'] = np.fromfile(fp_, dtype=_BASIC_INFO_TYPE, count=1) header["block2"] = np.fromfile(fp_, dtype=_DATA_INFO_TYPE, count=1) header["block3"] = np.fromfile(fp_, dtype=_PROJ_INFO_TYPE, count=1) header["block4"] = np.fromfile(fp_, dtype=_NAV_INFO_TYPE, count=1) header["block5"] = np.fromfile(fp_, dtype=_CAL_INFO_TYPE, count=1) logger.debug("Band number = " + str(header["block5"]['band_number'][0])) logger.debug('Time_interval: %s - %s', str(self.start_time), str(self.end_time)) band_number = header["block5"]['band_number'][0] if band_number < 7: cal = np.fromfile(fp_, dtype=_VISCAL_INFO_TYPE, count=1) else: cal = np.fromfile(fp_, dtype=_IRCAL_INFO_TYPE, count=1) header['calibration'] = cal header["block6"] = np.fromfile(fp_, dtype=_INTER_CALIBRATION_INFO_TYPE, count=1) header["block7"] = np.fromfile(fp_, dtype=_SEGMENT_INFO_TYPE, count=1) header["block8"] = np.fromfile( fp_, dtype=_NAVIGATION_CORRECTION_INFO_TYPE, count=1) # 8 The navigation corrections: ncorrs = header["block8"]['numof_correction_info_data'][0] dtype = np.dtype([ ("line_number_after_rotation", "<u2"), ("shift_amount_for_column_direction", "f4"), ("shift_amount_for_line_direction", "f4"), ]) corrections = [] for i in range(ncorrs): corrections.append(np.fromfile(fp_, dtype=dtype, count=1)) fp_.seek(40, 1) header['navigation_corrections'] = corrections header["block9"] = np.fromfile(fp_, dtype=_OBS_TIME_INFO_TYPE, count=1) numobstimes = header["block9"]['number_of_observation_times'][0] dtype = np.dtype([ ("line_number", "<u2"), ("observation_time", "f8"), ]) lines_and_times = [] for i in range(numobstimes): lines_and_times.append(np.fromfile(fp_, dtype=dtype, count=1)) header['observation_time_information'] = lines_and_times fp_.seek(40, 1) header["block10"] = np.fromfile(fp_, dtype=_ERROR_INFO_TYPE, count=1) dtype = np.dtype([ ("line_number", "<u2"), ("numof_error_pixels_per_line", "<u2"), ]) num_err_info_data = header["block10"]['number_of_error_info_data'][ 0] err_info_data = [] for i in range(num_err_info_data): err_info_data.append(np.fromfile(fp_, dtype=dtype, count=1)) header['error_information_data'] = err_info_data fp_.seek(40, 1) np.fromfile(fp_, dtype=_SPARE_TYPE, count=1) nlines = int(header["block2"]['number_of_lines'][0]) ncols = int(header["block2"]['number_of_columns'][0]) res = da.from_array(np.memmap(self.filename, offset=fp_.tell(), dtype='<u2', shape=(nlines, ncols), mode='r'), chunks=CHUNK_SIZE) res = da.where(res == 65535, np.float32(np.nan), res) self._header = header logger.debug("Reading time " + str(datetime.now() - tic)) res = self.calibrate(res, key.calibration) new_info = dict( units=info['units'], standard_name=info['standard_name'], wavelength=info['wavelength'], resolution='resolution', id=key, name=key.name, scheduled_time=self.scheduled_time, platform_name=self.platform_name, sensor=self.sensor, satellite_longitude=float(self.nav_info['SSP_longitude']), satellite_latitude=float(self.nav_info['SSP_latitude']), satellite_altitude=float( self.nav_info['distance_earth_center_to_satellite'] - self.proj_info['earth_equatorial_radius']) * 1000) res = xr.DataArray(res, attrs=new_info, dims=['y', 'x']) res = res.where( header['block5']["count_value_outside_scan_pixels"][0] != res) res = res.where(header['block5']["count_value_error_pixels"][0] != res) res = res.where(self.geo_mask()) return res
def density_flux(population, total_population, carrying_capacity, distance, csx, csy, **kwargs): """ 'density-based dispersion' Dispersal is calculated using the following sequence of methods: Portions of populations at each element (node, or grid cell) in the study area array (raster) are moved to surrounding elements (a neighbourhood) within a radius that is defined by the input distance (:math:`d`), as presented in the conceptual figure below. .. image:: images/density_flux_neighbourhood.png :align: center .. attention:: No dispersal will occur if the provided distance is less than the distance between elements (grid cells) in the model domain, as none will be included in the neighbourhood The mean density (:math:`\\rho`) of all elements in the neighbourhood is calculated as: .. math:: \\rho=\\frac{\\sum_{i=1}^{n} \\frac{pop_T(i)}{k_T(i)}}{n} where, :math:`pop_T` is the total population (of the entire species) at each element (:math:`i`); and\n :math:`k_T` is the total carrying capacity for the species The density gradient at each element (:math:`\\Delta`) with respect to the mean is calculated as: .. math:: \\Delta(i)=\\frac{pop_T(i)}{k_T(i)}-\\rho If the centroid element is above the mean :math:`[\\Delta(i_0) > 0]`, it is able to release a portion of its population to elements in the neighbourhood. The eligible population to be received by surrounding elements is equal to the sum of populations at elements with negative density gradients, the :math:`candidates`: .. math:: candidates=\\sum_{i=1}^{n} \\Delta(i)[\\Delta(i) < 0]k_T(i) The minimum of either the population above the mean at the centroid element - :math:`source=\\Delta(i_0)*k_T(i_0)`, or the :math:`candidates` are used to determine the total population that is dispersed from the centroid element to the other elements in the neighbourhood: .. math:: dispersal=min\{source, candidates\} The population at the centroid element becomes: .. math:: pop_a(i_0)=pop_a(i_0)-\\frac{pop_a(i_0)}{pop_T(i_0)}dispersal where, :math:`pop_a` is the age (stage) group population, which is a sub-population of the total. The populations of the candidate elements in the neighbourhood become (a net gain due to negative gradients): .. math:: pop_a(i)=pop_a(i)-\\frac{\\Delta(i)[\\Delta(i) < 0]k_T(i)}{candidates}dispersal\\frac{pop_a(i)}{pop_T(i)} :param da.Array population: Sub-population to redistribute (subset of the ``total_population``) :param da.Array total_population: Total population :param da.Array carrying_capacity: Total Carrying Capacity (k) :param float distance: Maximum dispersal distance :param float csx: Cell size of the domain in the x-direction :param float csy: Cell size of the domain in the y-direction .. Attention:: Ensure the cell sizes are in the same units as the specified direction :Keyword Arguments: **mask** (*array*) -- A weighting mask that scales dispersal based on the normalized mask value (default: None) :return: Redistributed population """ if any([ not isinstance(a, da.Array) for a in [population, total_population, carrying_capacity] ]): raise DispersalError('Inputs must be a dask arrays') if distance == 0: # Don't do anything return population chunks = tuple(c[0] if c else 0 for c in population.chunks)[:2] mask = kwargs.get('mask', None) if mask is None: mask = da.ones(shape=population.shape, dtype='float32', chunks=chunks) # Normalize the mask mask_min = da.min(mask) _range = da.max(mask) - mask_min mask = da.where(_range > 0, (mask - mask_min) / _range, 1.) 
# Calculate the kernel indices and shape kernel = calculate_kernel(distance, csx, csy) if kernel is None: # Not enough distance to cover a grid cell return population kernel, m, n = kernel m = int(m) n = int(n) a = da.pad(da.dstack( [population, total_population, carrying_capacity, mask]), ((m, m), (n, n), (0, 0)), 'constant', constant_values=0) _m = -m if m == 0: _m = None _n = -n if n == 0: _n = None output = delayed(density_flux_task)(a, kernel, m, n)[m:_m, n:_n, 0] output = da.from_delayed(output, population.shape, np.float32) return output.rechunk(chunks)
def get_bil_info(self):
    """Return neighbour info.

    Returns
    -------
    t__ : numpy array
        Vertical fractional distances from corner to the new points
    s__ : numpy array
        Horizontal fractional distances from corner to the new points
    valid_input_index : numpy array
        Valid indices in the input data
    index_array : numpy array
        Mapping array from valid source points to target points

    """
    if self.source_geo_def.size < self.neighbours:
        warnings.warn('Searching for %s neighbours in %s data points' %
                      (self.neighbours, self.source_geo_def.size))

    # Create kd-tree
    valid_input_index, resample_kdtree = self._create_resample_kdtree()
    # This is a numpy array
    self.valid_input_index = valid_input_index

    if resample_kdtree.n == 0:
        # Handle if all input data is reduced away
        bilinear_t, bilinear_s, valid_input_index, index_array = \
            _create_empty_bil_info(self.source_geo_def,
                                   self.target_geo_def)
        self.bilinear_t = bilinear_t
        self.bilinear_s = bilinear_s
        self.valid_input_index = valid_input_index
        self.index_array = index_array

        return bilinear_t, bilinear_s, valid_input_index, index_array

    target_lons, target_lats = self.target_geo_def.get_lonlats()
    valid_output_idx = ((target_lons >= -180) & (target_lons <= 180) &
                        (target_lats <= 90) & (target_lats >= -90))

    index_array, distance_array = self._query_resample_kdtree(
        resample_kdtree, target_lons, target_lats, valid_output_idx)

    # Reduce index reference
    input_size = da.sum(self.valid_input_index)
    index_mask = index_array == input_size
    index_array = da.where(index_mask, 0, index_array)

    # Get output projection as pyproj object
    proj = Proj(self.target_geo_def.proj_str)

    # Get output x/y coordinates
    out_x, out_y = self.target_geo_def.get_proj_coords(chunks=CHUNK_SIZE)
    out_x = da.ravel(out_x)
    out_y = da.ravel(out_y)

    # Get input x/y coordinates
    in_x, in_y = _get_input_xy_dask(self.source_geo_def, proj,
                                    self.valid_input_index, index_array)

    # Get the four closest corner points around each output location
    pt_1, pt_2, pt_3, pt_4, index_array = \
        _get_bounding_corners_dask(in_x, in_y, out_x, out_y,
                                   self.neighbours, index_array)

    # Calculate vertical and horizontal fractional distances t and s
    t__, s__ = _get_ts_dask(pt_1, pt_2, pt_3, pt_4, out_x, out_y)
    self.bilinear_t, self.bilinear_s = t__, s__

    self.valid_output_index = valid_output_idx
    self.index_array = index_array
    self.distance_array = distance_array

    self._get_slices()

    return (self.bilinear_t, self.bilinear_s, self.slices,
            self.mask_slices, self.out_coords)
def _get_valid_lonlats(self, vis):
    lons, lats = vis.attrs['area'].get_lonlats(chunks=vis.data.chunks)
    lons = da.where(lons >= 1e30, np.nan, lons)
    lats = da.where(lats >= 1e30, np.nan, lats)
    return lons, lats
def decomposition(self, output_dimension, normalize_poissonian_noise=False, algorithm='PCA', signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=True, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- output_dimension : int the number of significant components to keep normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA from scikit-learn is run. get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. bounds : {tuple, bool} The (min, max) values of the data to normalize before learning. If tuple (min, max), those values will be used for normalization. If True, extremes will be looked up (expensive), default. If False, no normalization is done (learning may be very slow). If normalize_poissonian_noise is True, this cannot be True. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks ## LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) else: raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: if bounds is True: bounds = False # warnings.warn? 
data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros( self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array( navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros( self.axes_manager.signal_shape[::-1], chunks=sig_chunks) if signal_mask is None else to_array( signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=range(ndim)), data.sum(axis=range(ndim, ndim + sdim))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, )*rbH.ndim] *\ rbH[(None, )*raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # normalize the data for learning algs: if bounds: if bounds is True: _min, _max = da.compute(self.data.min(), self.data.max()) else: _min, _max = bounds self.data = (self.data - _min) / (_max - _min) # LEARN this_data = [] try: for chunk in progressbar( self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform post = lambda a: np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] post = lambda a: obj.finish()[4] elif algorithm == 'ONMF': method = obj.project post = lambda a: np.concatenate(a, axis=1).T _map = map(lambda thing: method(thing), self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar( _map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension try: loadings = _reshuffle_mixed_blocks( loadings, ndim, (output_dimension,), nav_chunks).reshape((-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio
def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None): """Get the reflectance from the three sun-sat angles""" # Get wavelength in nm for band: if isinstance(bandname, float): LOG.warning( 'A wavelength is provided instead of band name - ' + 'disregard the relative spectral responses and assume ' + 'it is the effective wavelength: %f (micro meter)', bandname) wvl = bandname * 1000.0 else: wvl = self.get_effective_wavelength(bandname) * 1000.0 rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = \ self.get_reflectance_lut() # force dask arrays compute = False if HAVE_DASK and not isinstance(sun_zenith, Array): compute = True sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape) sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape) azidiff = from_array(azidiff, chunks=azidiff.shape) if redband is not None: redband = from_array(redband, chunks=redband.shape) clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max())) sun_zenith = clip(sun_zenith, 0, clip_angle) sunzsec = 1. / cos(deg2rad(sun_zenith)) clip_angle = rad2deg(arccos(1. / satz_sec_coord.max())) sat_zenith = clip(sat_zenith, 0, clip_angle) satzsec = 1. / cos(deg2rad(sat_zenith)) shape = sun_zenith.shape if not (wvl_coord.min() < wvl < wvl_coord.max()): LOG.warning( "Effective wavelength for band %s outside 400-800 nm range!", str(bandname)) LOG.info( "Set the rayleigh/aerosol reflectance contribution to zero!") if HAVE_DASK: chunks = sun_zenith.chunks if redband is None \ else redband.chunks res = zeros(shape, chunks=chunks) return res.compute() if compute else res else: return zeros(shape) idx = np.searchsorted(wvl_coord, wvl) wvl1 = wvl_coord[idx - 1] wvl2 = wvl_coord[idx] fac = (wvl2 - wvl) / (wvl2 - wvl1) raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :] tic = time.time() smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]] smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]] orders = [len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)] minterp = MultilinearInterpolator(smin, smax, orders) f_3d_grid = raylwvl minterp.set_values(atleast_2d(f_3d_grid.ravel())) def _do_interp(minterp, sunzsec, azidiff, satzsec): interp_points2 = np.vstack( (sunzsec.ravel(), 180 - azidiff.ravel(), satzsec.ravel())) res = minterp(interp_points2) return res.reshape(sunzsec.shape) if HAVE_DASK: ipn = map_blocks(_do_interp, minterp, sunzsec, azidiff, satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks) else: ipn = _do_interp(minterp, sunzsec, azidiff, satzsec) LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic)) ipn *= 100 res = ipn if redband is not None: res = where(redband < 20., res, (1 - (redband - 20) / 80) * res) res = clip(res, 0, 100) if compute: res = res.compute() return res
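# Sketch of the wavelength interpolation step in get_reflectance above: the two LUT
# slices bracketing the effective wavelength are blended linearly. The coordinate
# values and LUT array below are made up for illustration.
import numpy as np

wvl_coord = np.array([440., 500., 560., 640., 700.])   # nm, hypothetical
rayl = np.random.rand(len(wvl_coord), 3, 4, 5)          # hypothetical reflectance LUT
wvl = 605.0                                             # effective wavelength in nm

idx = np.searchsorted(wvl_coord, wvl)
wvl1, wvl2 = wvl_coord[idx - 1], wvl_coord[idx]
fac = (wvl2 - wvl) / (wvl2 - wvl1)                      # weight of the lower slice
raylwvl = fac * rayl[idx - 1] + (1 - fac) * rayl[idx]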
def _bt_threshold(band_data):
    # expects dask array to be passed
    return da.where(band_data >= threshold,
                    high_offset - high_factor * band_data,
                    low_offset - low_factor * band_data)
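# Hypothetical usage of _bt_threshold, assuming it lives in the same module as the
# (made-up) calibration constants it closes over.
import dask.array as da
import numpy as np

threshold = 242.0                     # count above which the "high" branch applies
high_offset, high_factor = 660.0, 2.0
low_offset, low_factor = 418.0, 1.0

counts = da.from_array(np.array([100.0, 200.0, 300.0]), chunks=2)
bt = _bt_threshold(counts)            # piecewise linear count -> brightness temperature
print(bt.compute())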
def _vis_calibrate(data, chn, calib_type, pre_launch_coeffs=False, calib_coeffs=None, mask=True): """Calibrate visible channel data. *calib_type* in count, reflectance, radiance. """ # Calibration count to albedo, the calibration is performed separately for # two value ranges. if calib_type not in ['counts', 'radiance', 'reflectance']: raise ValueError('Calibration ' + calib_type + ' unknown!') channel = da.from_array(data["hrpt"][:, :, chn], chunks=(LINE_CHUNK, 2048)) mask &= channel != 0 if calib_type == 'counts': return channel channel = channel.astype(np.float64) if calib_type == 'radiance': logger.info("Radiances are not yet supported for " + "the VIS/NIR channels!") if pre_launch_coeffs: coeff_idx = 2 else: # check that coeffs are valid if np.all(data["calvis"][:, chn, 0, 4] == 0): logger.info( "No valid operational coefficients, fall back to pre-launch") coeff_idx = 2 else: coeff_idx = 0 intersection = da.from_array(data["calvis"][:, chn, coeff_idx, 4], chunks=LINE_CHUNK) if calib_coeffs is not None: logger.info("Updating from external calibration coefficients.") slope1 = da.from_array(calib_coeffs[0], chunks=LINE_CHUNK) intercept1 = da.from_array(calib_coeffs[1], chunks=LINE_CHUNK) slope2 = da.from_array(calib_coeffs[2], chunks=LINE_CHUNK) intercept2 = da.from_array(calib_coeffs[3], chunks=LINE_CHUNK) else: slope1 = da.from_array(data["calvis"][:, chn, coeff_idx, 0], chunks=LINE_CHUNK) * 1e-10 intercept1 = da.from_array(data["calvis"][:, chn, coeff_idx, 1], chunks=LINE_CHUNK) * 1e-7 slope2 = da.from_array(data["calvis"][:, chn, coeff_idx, 2], chunks=LINE_CHUNK) * 1e-10 intercept2 = da.from_array(data["calvis"][:, chn, coeff_idx, 3], chunks=LINE_CHUNK) * 1e-7 if chn == 1: # In the level 1b file, the visible coefficients are stored as 4-byte integers. Scaling factors then convert # them to real numbers which are applied to the measured counts. The coefficient is different depending on # whether the counts are less than or greater than the high-gain/low-gain transition value (nominally 500). # The slope for visible channels should always be positive (reflectance increases with count). With the # pre-launch coefficients the channel 2 slope is always positive but with the operational coefs the stored # number in the high-reflectance regime overflows the maximum 2147483647, i.e. it is negative when # interpreted as a signed integer. So you have to modify it. slope2 = da.where(slope2 < 0, slope2 + 0.4294967296, slope2) channel = da.where(channel <= intersection[:, None], channel * slope1[:, None] + intercept1[:, None], channel * slope2[:, None] + intercept2[:, None]) channel = channel.clip(min=0) return da.where(mask, channel, np.nan)
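# Stand-alone sketch of the dual-gain calibration applied in _vis_calibrate above:
# counts at or below the per-scanline intersection use (slope1, intercept1), counts
# above it use (slope2, intercept2). Shapes and coefficient values are illustrative.
import dask.array as da
import numpy as np

channel = da.from_array(np.array([[100., 400., 800.],
                                  [ 50., 600., 900.]]), chunks=(1, 3))
intersection = da.from_array(np.array([500., 500.]), chunks=1)   # per scanline
slope1 = da.from_array(np.array([0.1, 0.1]), chunks=1)
intercept1 = da.from_array(np.array([-4., -4.]), chunks=1)
slope2 = da.from_array(np.array([0.3, 0.3]), chunks=1)
intercept2 = da.from_array(np.array([-104., -104.]), chunks=1)

albedo = da.where(channel <= intersection[:, None],
                  channel * slope1[:, None] + intercept1[:, None],
                  channel * slope2[:, None] + intercept2[:, None])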
def _stage_2( YP: Array, X: Array, Y: Array, alphas: Optional[ndarray] = None, normalize: bool = True, _glow_adj_alpha: bool = False, _glow_adj_scaling: bool = False, ) -> Tuple[Array, Array]: """Stage 2 - WGR Meta Regression This stage will train separate ridge regression models for each outcome using the predictions from stage 1 for that same outcome as features. These predictions are then evaluated based on R2 score to determine an optimal "meta" estimator (see `_stage_1` for the "base" estimator description). Results then include only predictions and coefficients from this optimal model. For more details, see the level 1 regression model described in step 1 of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2). """ assert YP.ndim == 4 assert X.ndim == 2 assert Y.ndim == 2 # Check that chunking across samples is the same for all arrays assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0] assert YP.chunks[2] == X.chunks[0] == Y.chunks[0] # Assert single chunks for covariates and outcomes assert X.numblocks[1] == Y.numblocks[1] == 1 # Extract shape statistics n_variant_block, n_alpha_1 = YP.shape[:2] n_sample_block = Y.numblocks[0] n_sample, n_outcome = Y.shape n_covar = X.shape[1] n_indvar = n_covar + n_variant_block * n_alpha_1 sample_chunks = Y.chunks[0] if normalize: assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1) assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome) # See: https://github.com/projectglow/glow/issues/260 if _glow_adj_scaling: YP = da.map_blocks( lambda x: (x - x.mean(axis=2, keepdims=True)) / x.std(axis=2, keepdims=True), YP, ) else: YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True) # Tranpose for refit on level 1 predictions YP = YP.transpose((3, 2, 0, 1)) assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1) if alphas is None: # See: https://github.com/projectglow/glow/issues/255 if _glow_adj_alpha: alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome) else: alphas = get_alphas(n_variant_block * n_alpha_1) n_alpha_2 = alphas.size YR = [] BR = [] for i in range(n_outcome): # Slice and reshape to new 2D covariate matrix; # The order of raveling in trailing dimensions is important # and later reshapes will assume variants, alphas order XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1)) # Prepend covariates and chunk along first dim only XPB = da.concatenate((X, XPB), axis=1) XPB = XPB.rechunk(chunks=(None, -1)) assert_array_shape(XPB, n_sample, n_indvar) assert XPB.numblocks == (n_sample_block, 1) # Extract outcome vector YB = Y[:, [i]] assert XPB.ndim == YB.ndim == 2 # Fit and predict folds for each parameter BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:] assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1) assert_array_shape(YPB, n_alpha_2, n_sample, 1) BR.append(BB) YR.append(YPB) # Concatenate predictions along outcome dimension YR = da.concatenate(YR, axis=2) assert_block_shape(YR, 1, n_sample_block, n_outcome) assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1) assert_array_shape(YR, n_alpha_2, n_sample, n_outcome) # Move samples to last dim so all others are batch # dims for R2 calculations YR = da.transpose(YR, (0, 2, 1)) assert_array_shape(YR, n_alpha_2, n_outcome, n_sample) YR = YR.rechunk((-1, -1, None)) assert_block_shape(YR, 1, 1, n_sample_block) assert YR.shape[1:] == Y.T.shape # Concatenate betas along outcome dimension BR = da.concatenate(BR, axis=2) assert_block_shape(BR, 1, n_sample_block, 
n_outcome) assert_chunk_shape(BR, n_alpha_2, n_indvar, 1) assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome) # Compute R2 scores within each sample block for each outcome + alpha R2 = da.stack( [ r2_score(YR.blocks[..., i], Y.T.blocks[..., i]) # Avoid warnings on R2 calculations for blocks with single rows if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan) for i in range(n_sample_block) ] ) assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome) # Coerce to finite or nan before nan-aware mean R2 = da.where(da.isfinite(R2), R2, np.nan) # Find highest mean alpha score for each outcome across blocks R2M = da.nanmean(R2, axis=0) assert_array_shape(R2M, n_alpha_2, n_outcome) # Identify index for the alpha value with the highest mean score R2I = da.argmax(R2M, axis=0) assert_array_shape(R2I, n_outcome) # Choose the predictions corresponding to the model with best score YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1) YRM = YRM.rechunk((None, -1)) assert_block_shape(YRM, n_sample_block, 1) assert_chunk_shape(YRM, sample_chunks[0], n_outcome) assert_array_shape(YRM, n_sample, n_outcome) # Choose the betas corresponding to the model with the best score BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1) BRM = BRM.rechunk((None, -1)) assert_block_shape(BRM, n_sample_block, 1) assert_chunk_shape(BRM, n_indvar, n_outcome) assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome) return BRM, YRM
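# Minimal sketch of the model-selection step at the end of _stage_2: non-finite
# scores are masked before averaging across sample blocks, then the alpha with the
# highest mean score is picked per outcome. Shapes are illustrative
# (blocks x alphas x outcomes).
import dask.array as da
import numpy as np

R2 = da.from_array(np.array([[[0.2, 0.5], [np.inf, 0.4]],
                             [[0.3, 0.6], [0.1, np.nan]]]), chunks=-1)
R2 = da.where(da.isfinite(R2), R2, np.nan)   # coerce inf to nan before nanmean
R2M = da.nanmean(R2, axis=0)                 # mean score over sample blocks
R2I = da.argmax(R2M, axis=0)                 # best alpha index per outcome
print(R2I.compute())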
def randn(shape, chunks=None, nan=False, seed=0):
    rng = da.random.RandomState(seed)
    x = 5 + 3 * rng.standard_normal(shape, chunks=chunks)
    if nan:
        x = da.where(x < 0, np.nan, x)
    return x
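# Example call of randn above: draws from N(5, 3^2) that fall below zero are replaced
# with NaN when nan=True, so the surviving minimum is non-negative.
x = randn((10, 10), chunks=(5, 5), nan=True)
print(da.nanmin(x).compute())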
def read_band(self, key, info): """Read the data""" tic = datetime.now() header = {} with open(self.filename, "rb") as fp_: header['block1'] = np.fromfile( fp_, dtype=_BASIC_INFO_TYPE, count=1) header["block2"] = np.fromfile(fp_, dtype=_DATA_INFO_TYPE, count=1) header["block3"] = np.fromfile(fp_, dtype=_PROJ_INFO_TYPE, count=1) header["block4"] = np.fromfile(fp_, dtype=_NAV_INFO_TYPE, count=1) header["block5"] = np.fromfile(fp_, dtype=_CAL_INFO_TYPE, count=1) logger.debug("Band number = " + str(header["block5"]['band_number'][0])) logger.debug('Time_interval: %s - %s', str(self.start_time), str(self.end_time)) band_number = header["block5"]['band_number'][0] if band_number < 7: cal = np.fromfile(fp_, dtype=_VISCAL_INFO_TYPE, count=1) else: cal = np.fromfile(fp_, dtype=_IRCAL_INFO_TYPE, count=1) header['calibration'] = cal header["block6"] = np.fromfile( fp_, dtype=_INTER_CALIBRATION_INFO_TYPE, count=1) header["block7"] = np.fromfile( fp_, dtype=_SEGMENT_INFO_TYPE, count=1) header["block8"] = np.fromfile( fp_, dtype=_NAVIGATION_CORRECTION_INFO_TYPE, count=1) # 8 The navigation corrections: ncorrs = header["block8"]['numof_correction_info_data'][0] dtype = np.dtype([ ("line_number_after_rotation", "<u2"), ("shift_amount_for_column_direction", "f4"), ("shift_amount_for_line_direction", "f4"), ]) corrections = [] for i in range(ncorrs): corrections.append(np.fromfile(fp_, dtype=dtype, count=1)) fp_.seek(40, 1) header['navigation_corrections'] = corrections header["block9"] = np.fromfile(fp_, dtype=_OBS_TIME_INFO_TYPE, count=1) numobstimes = header["block9"]['number_of_observation_times'][0] dtype = np.dtype([ ("line_number", "<u2"), ("observation_time", "f8"), ]) lines_and_times = [] for i in range(numobstimes): lines_and_times.append(np.fromfile(fp_, dtype=dtype, count=1)) header['observation_time_information'] = lines_and_times fp_.seek(40, 1) header["block10"] = np.fromfile(fp_, dtype=_ERROR_INFO_TYPE, count=1) dtype = np.dtype([ ("line_number", "<u2"), ("numof_error_pixels_per_line", "<u2"), ]) num_err_info_data = header["block10"][ 'number_of_error_info_data'][0] err_info_data = [] for i in range(num_err_info_data): err_info_data.append(np.fromfile(fp_, dtype=dtype, count=1)) header['error_information_data'] = err_info_data fp_.seek(40, 1) np.fromfile(fp_, dtype=_SPARE_TYPE, count=1) nlines = int(header["block2"]['number_of_lines'][0]) ncols = int(header["block2"]['number_of_columns'][0]) res = da.from_array(np.memmap(self.filename, offset=fp_.tell(), dtype='<u2', shape=(nlines, ncols)), chunks=CHUNK_SIZE) res = da.where(res == 65535, np.float32(np.nan), res) self._header = header logger.debug("Reading time " + str(datetime.now() - tic)) res = self.calibrate(res, key.calibration) new_info = dict(units=info['units'], standard_name=info['standard_name'], wavelength=info['wavelength'], resolution='resolution', id=key, name=key.name, platform_name=self.platform_name, sensor=self.sensor, satellite_longitude=float( self.nav_info['SSP_longitude']), satellite_latitude=float( self.nav_info['SSP_latitude']), satellite_altitude=float(self.nav_info['distance_earth_center_to_satellite'] - self.proj_info['earth_equatorial_radius']) * 1000) res = xr.DataArray(res, attrs=new_info, dims=['y', 'x']) res = res.where(header['block5']["count_value_outside_scan_pixels"][0] != res) res = res.where(header['block5']["count_value_error_pixels"][0] != res) res = res.where(self.geo_mask()) return res
def get_sample_from_neighbour_info(self, data): # flatten x and y in the source array output_shape = [] chunks = [] source_dims = data.dims for dim in source_dims: if dim == 'y': output_shape += [self.target_geo_def.y_size] chunks += [1000] elif dim == 'x': output_shape += [self.target_geo_def.x_size] chunks += [1000] else: output_shape += [data[dim].size] chunks += [10] new_dims = [] xy_dims = [] source_shape = [1, 1] chunks = [1, 1] for i, dim in enumerate(data.dims): if dim not in ['x', 'y']: new_dims.append(dim) source_shape[1] *= data.shape[i] chunks[1] *= 10 else: xy_dims.append(dim) source_shape[0] *= data.shape[i] chunks[0] *= 1000 new_dims = xy_dims + new_dims target_shape = [np.prod(self.target_geo_def.shape), source_shape[1]] source_data = data.transpose(*new_dims).data.reshape(source_shape) input_size = self.valid_input_index.sum() index_mask = (self.index_array == input_size) new_index_array = da.where( index_mask, 0, self.index_array).ravel().astype(int).compute() valid_targets = self.valid_output_index.ravel() target_lines = [] for line in range(target_shape[1]): #target_data_line = target_data[:, line] new_data = source_data[:, line][self.valid_input_index.ravel()] # could this be a bug in dask ? we have to compute to avoid errors result = new_data.compute()[new_index_array] result[index_mask.ravel()] = np.nan #target_data_line = da.full(target_shape[0], np.nan, chunks=1000000) target_data_line = np.full(target_shape[0], np.nan) target_data_line[valid_targets] = result target_lines.append(target_data_line[:, np.newaxis]) target_data = np.hstack(target_lines) new_shape = [] for dim in new_dims: if dim == 'x': new_shape.append(self.target_geo_def.x_size) elif dim == 'y': new_shape.append(self.target_geo_def.y_size) else: new_shape.append(data[dim].size) output_arr = DataArray(da.from_array(target_data.reshape(new_shape), chunks=[1000] * len(new_shape)), dims=new_dims) for dim in source_dims: if dim == 'x': output_arr['x'] = self.target_geo_def.proj_x_coords elif dim == 'y': output_arr['y'] = self.target_geo_def.proj_y_coords else: output_arr[dim] = data[dim] return output_arr.transpose(*source_dims)
def get_sample_from_bil_info(self, data, fill_value=np.nan, output_shape=None): if fill_value is None: fill_value = np.nan # FIXME: can be this made into a dask construct ? cols, lines = np.meshgrid(np.arange(data['x'].size), np.arange(data['y'].size)) cols = da.ravel(cols) lines = da.ravel(lines) try: self.valid_input_index = self.valid_input_index.compute() except AttributeError: pass vii = self.valid_input_index.squeeze() try: self.index_array = self.index_array.compute() except AttributeError: pass # ia contains reduced (valid) indices of the source array, and has the # shape of the destination array ia = self.index_array rlines = lines[vii][ia] rcols = cols[vii][ia] slices = [] mask_slices = [] mask_2d_added = False coords = {} try: # FIXME: Use same chunk size as input data coord_x, coord_y = self.target_geo_def.get_proj_vectors_dask() except AttributeError: coord_x, coord_y = None, None for _, dim in enumerate(data.dims): if dim == 'y': slices.append(rlines) if not mask_2d_added: mask_slices.append(ia >= self.target_geo_def.size) mask_2d_added = True if coord_y is not None: coords[dim] = coord_y elif dim == 'x': slices.append(rcols) if not mask_2d_added: mask_slices.append(ia >= self.target_geo_def.size) mask_2d_added = True if coord_x is not None: coords[dim] = coord_x else: slices.append(slice(None)) mask_slices.append(slice(None)) try: coords[dim] = data.coords[dim] except KeyError: pass res = data.values[slices] res[mask_slices] = fill_value try: p_1 = res[:, :, 0] p_2 = res[:, :, 1] p_3 = res[:, :, 2] p_4 = res[:, :, 3] except IndexError: p_1 = res[:, 0] p_2 = res[:, 1] p_3 = res[:, 2] p_4 = res[:, 3] s__, t__ = self.bilinear_s, self.bilinear_t res = (p_1 * (1 - s__) * (1 - t__) + p_2 * s__ * (1 - t__) + p_3 * (1 - s__) * t__ + p_4 * s__ * t__) epsilon = 1e-6 data_min = da.nanmin(data) - epsilon data_max = da.nanmax(data) + epsilon idxs = (res > data_max) | (res < data_min) res = da.where(idxs, fill_value, res) shp = self.target_geo_def.shape if data.ndim == 3: res = da.reshape(res, (res.shape[0], shp[0], shp[1])) else: res = da.reshape(res, (shp[0], shp[1])) res = DataArray(da.from_array(res, chunks=CHUNK_SIZE), dims=data.dims, coords=coords) return res
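# Sketch of the core bilinear blend used in get_sample_from_bil_info above: four
# corner values are mixed with the fractional distances s and t, and results that
# fall outside the data range (numerical overshoot) are replaced with the fill value.
# All values are toy data.
import dask.array as da
import numpy as np

fill_value = np.nan
p_1 = da.from_array(np.array([1.0, 10.0]), chunks=1)
p_2 = da.from_array(np.array([2.0, 20.0]), chunks=1)
p_3 = da.from_array(np.array([3.0, 30.0]), chunks=1)
p_4 = da.from_array(np.array([4.0, 40.0]), chunks=1)
s__ = da.from_array(np.array([0.25, 0.5]), chunks=1)
t__ = da.from_array(np.array([0.75, 0.5]), chunks=1)

res = (p_1 * (1 - s__) * (1 - t__) +
       p_2 * s__ * (1 - t__) +
       p_3 * (1 - s__) * t__ +
       p_4 * s__ * t__)

# guard against interpolation overshoot outside the range of this toy data
data_min, data_max = 1.0 - 1e-6, 40.0 + 1e-6
res = da.where((res > data_max) | (res < data_min), fill_value, res)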
def gradient(image):
    return da.where(
        da.fabs(image) <= huber['threshold'],
        2 * image,
        2 * huber['threshold'] * da.sign(image))
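# Hypothetical usage of the Huber gradient above: `huber` is assumed to be a
# module-level dict holding the transition threshold.
import dask.array as da
import numpy as np

huber = {'threshold': 1.0}
image = da.from_array(np.linspace(-3, 3, 7), chunks=4)
g = gradient(image)      # 2*x inside [-1, 1], clipped to +/-2 outside
print(g.compute())       # [-2. -2. -2.  0.  2.  2.  2.]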
def decomposition(self, normalize_poissonian_noise=False, algorithm="SVD", output_dimension=None, signal_mask=None, navigation_mask=None, get=dask.threaded.get, num_chunks=None, reproject=True, print_info=True, **kwargs): """Perform Incremental (Batch) decomposition on the data. The results are stored in ``self.learning_results``. Read more in the :ref:`User Guide <big_data.decomposition>`. Parameters ---------- normalize_poissonian_noise : bool, default False If True, scale the signal to normalize Poissonian noise using the approach described in [KeenanKotula2004]_. algorithm : {'SVD', 'PCA', 'ORPCA', 'ORNMF'}, default 'SVD' The decomposition algorithm to use. output_dimension : int or None, default None Number of components to keep/calculate. If None, keep all (only valid for 'SVD' algorithm) get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int or None, default None the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain at least ``output_dimension`` signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decomposition. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool, default True Reproject data on the learnt components (factors) after learning. print_info : bool, default True If True, print information about the decomposition being performed. In the case of sklearn.decomposition objects, this includes the values of all arguments of the chosen sklearn algorithm. **kwargs passed to the partial_fit/fit functions. References ---------- .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson noise in the multivariate analysis of ToF-SIMS spectrum images", Surf. Interface Anal 36(3) (2004): 203-212. See Also -------- * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals * :py:func:`dask.array.linalg.svd` * :py:class:`sklearn.decomposition.IncrementalPCA` * :py:class:`~.learn.rpca.ORPCA` * :py:class:`~.learn.ornmf.ORNMF` """ if kwargs.get("bounds", False): warnings.warn( "The `bounds` keyword is deprecated and will be removed " "in v2.0. Since version > 1.3 this has no effect.", VisibleDeprecationWarning, ) kwargs.pop("bounds", None) # Deprecate 'ONMF' for 'ORNMF' if algorithm == "ONMF": warnings.warn( "The argument `algorithm='ONMF'` has been deprecated and will " "be removed in future. 
Please use `algorithm='ORNMF'` instead.", VisibleDeprecationWarning, ) algorithm = "ORNMF" # Check algorithms requiring output_dimension algorithms_require_dimension = ["PCA", "ORPCA", "ORNMF"] if algorithm in algorithms_require_dimension and output_dimension is None: raise ValueError( "`output_dimension` must be specified for '{}'".format( algorithm)) explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if output_dimension and blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks # Initialize print_info to_print = [ "Decomposition info:", " normalize_poissonian_noise={}".format( normalize_poissonian_noise), " algorithm={}".format(algorithm), " output_dimension={}".format(output_dimension) ] # LEARN if algorithm == "PCA": if not import_sklearn.sklearn_installed: raise ImportError("algorithm='PCA' requires scikit-learn") obj = import_sklearn.sklearn.decomposition.IncrementalPCA( n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True to_print.extend(["scikit-learn estimator:", obj]) elif algorithm == "ORPCA": from hyperspy.learn.rpca import ORPCA batch_size = kwargs.pop("batch_size", None) obj = ORPCA(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) elif algorithm == "ORNMF": from hyperspy.learn.ornmf import ORNMF batch_size = kwargs.pop("batch_size", None) obj = ORNMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) elif algorithm != "SVD": raise ValueError("'algorithm' not recognised") original_data = self.data try: _logger.info("Performing decomposition analysis") if normalize_poissonian_noise: _logger.info("Scaling the data to normalize Poissonian noise") data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros(self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array(navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros(self.axes_manager.signal_shape[::-1], chunks=sig_chunks) if signal_mask is None else to_array(signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=tuple(range(ndim))), data.sum(axis=tuple(range(ndim, ndim + sdim))), ) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, ) * rbH.ndim] * rbH[(None, ) * raG.ndim + (..., )] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # LEARN if algorithm == "SVD": reproject = False from dask.array.linalg import svd try: self._unfolded4decomposition = self.unfold() # TODO: implement masking if navigation_mask or signal_mask: raise NotImplementedError( "Masking is not yet implemented for lazy SVD") U, S, V = svd(self.data) if output_dimension is None: min_shape = min(min(U.shape), min(V.shape)) else: min_shape = output_dimension U = U[:, :min_shape] S = S[:min_shape] V = V[:min_shape] factors = V.T explained_variance = S**2 / self.data.shape[0] loadings = U * S 
finally: if self._unfolded4decomposition is True: self.fold() self._unfolded4decomposition is False else: this_data = [] try: for chunk in progressbar( self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask, ), total=nblocks, leave=True, desc="Learn", ): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: # pragma: no cover pass # GET ALREADY CALCULATED RESULTS if algorithm == "PCA": explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == "ORPCA": factors, loadings = obj.finish() loadings = loadings.T elif algorithm == "ORNMF": factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == "PCA": method = obj.transform def post(a): return np.concatenate(a, axis=0) elif algorithm == "ORPCA": method = obj.project def post(a): return np.concatenate(a, axis=1).T elif algorithm == "ORNMF": method = obj.project def post(a): return np.concatenate(a, axis=1).T _map = map( lambda thing: method(thing), self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask, ), ) H = [] try: for thing in progressbar(_map, total=nblocks, desc="Project"): H.append(thing) except KeyboardInterrupt: # pragma: no cover pass loadings = post(H) if explained_variance is not None and explained_variance_ratio is None: explained_variance_ratio = explained_variance / explained_variance.sum( ) # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension if algorithm != "SVD": # Only needed for online algorithms try: loadings = _reshuffle_mixed_blocks(loadings, ndim, (output_dimension, ), nav_chunks).reshape( (-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension if algorithm != "SVD": target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio # Rescale the results if the noise was normalized if normalize_poissonian_noise is True: target.factors = target.factors * rbH.ravel()[:, np.newaxis] target.loadings = target.loadings * raG.ravel()[:, np.newaxis] # Print details about the decomposition we just performed if print_info: print("\n".join([str(pr) for pr in to_print]))
def _get_abs_max_from_min_max(min_, max_):
    """From array of min and array of max, get array of abs max."""
    return da.where(-min_ > max_, min_, max_)
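# Example: per-element signed "absolute maximum" -- whichever of min/max has the
# larger magnitude is kept, preserving its sign.
import dask.array as da
import numpy as np

min_ = da.from_array(np.array([-5., -1., 0.]), chunks=2)
max_ = da.from_array(np.array([3., 2., 4.]), chunks=2)
print(_get_abs_max_from_min_max(min_, max_).compute())   # [-5.  2.  4.]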
def _mask_invalid(self, data, header):
    """Mask invalid data"""
    invalid = da.logical_or(
        data == header['block5']["count_value_outside_scan_pixels"][0],
        data == header['block5']["count_value_error_pixels"][0])
    return da.where(invalid, np.float32(np.nan), data)
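# Stand-alone sketch of the same masking logic as _mask_invalid above, with a
# minimal stand-in for the header structure; the sentinel count values are made up.
import dask.array as da
import numpy as np

header = {'block5': {"count_value_outside_scan_pixels": [65534],
                     "count_value_error_pixels": [65535]}}
counts = da.from_array(np.array([100, 65534, 200, 65535], dtype=np.uint16),
                       chunks=2)
clean = da.where(
    da.logical_or(counts == header['block5']["count_value_outside_scan_pixels"][0],
                  counts == header['block5']["count_value_error_pixels"][0]),
    np.float32(np.nan), counts)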
def get_bil_info(self): """Return neighbour info. Returns ------- t__ : numpy array Vertical fractional distances from corner to the new points s__ : numpy array Horizontal fractional distances from corner to the new points input_idxs : numpy array Valid indices in the input data idx_arr : numpy array Mapping array from valid source points to target points """ if self.source_geo_def.size < self.neighbours: warnings.warn('Searching for %s neighbours in %s data points' % (self.neighbours, self.source_geo_def.size)) # Create kd-tree valid_input_idx, resample_kdtree = self._create_resample_kdtree() # This is a numpy array self.valid_input_index = valid_input_idx if resample_kdtree.n == 0: # Handle if all input data is reduced away bilinear_t, bilinear_s, valid_input_index, index_array = \ _create_empty_bil_info(self.source_geo_def, self.target_geo_def) self.bilinear_t = bilinear_t self.bilinear_s = bilinear_s self.valid_input_index = valid_input_idx self.index_array = index_array return bilinear_t, bilinear_s, valid_input_index, index_array target_lons, target_lats = self.target_geo_def.get_lonlats() valid_output_idx = ((target_lons >= -180) & (target_lons <= 180) & (target_lats <= 90) & (target_lats >= -90)) index_array, distance_array = self._query_resample_kdtree( resample_kdtree, target_lons, target_lats, valid_output_idx) # Reduce index reference input_size = da.sum(self.valid_input_index) index_mask = index_array == input_size index_array = da.where(index_mask, 0, index_array) # Get output projection as pyproj object proj = Proj(self.target_geo_def.proj_str) # Get output x/y coordinates out_x, out_y = _get_output_xy_dask(self.target_geo_def, proj) # Get input x/y coordinates in_x, in_y = _get_input_xy_dask(self.source_geo_def, proj, self.valid_input_index, index_array) # Get the four closest corner points around each output location pt_1, pt_2, pt_3, pt_4, index_array = \ _get_bounding_corners_dask(in_x, in_y, out_x, out_y, self.neighbours, index_array) # Calculate vertical and horizontal fractional distances t and s t__, s__ = _get_ts_dask(pt_1, pt_2, pt_3, pt_4, out_x, out_y) self.bilinear_t, self.bilinear_s = t__, s__ self.valid_output_index = valid_output_idx self.index_array = index_array self.distance_array = distance_array return (self.bilinear_t, self.bilinear_s, self.valid_input_index, self.index_array)
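# Sketch of the index-clamping trick used in get_bil_info above: kd-tree queries
# return `input_size` for "no neighbour found", which would be out of bounds when
# used for fancy indexing, so those entries are temporarily pointed at index 0 and
# masked out afterwards. Values are illustrative.
import dask.array as da
import numpy as np

input_size = 4                                   # number of valid input points
index_array = da.from_array(np.array([0, 2, 4, 1]), chunks=2)
index_mask = index_array == input_size           # True where no neighbour was found
index_array = da.where(index_mask, 0, index_array)

source = np.array([10., 20., 30., 40.])
resampled = source[index_array.compute()]
resampled[index_mask.compute()] = np.nan         # restore the "no data" locations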
def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None): """Get the reflectance from the three sun-sat angles""" # Get wavelength in nm for band: if isinstance(bandname, float): LOG.warning('A wavelength is provided instead of band name - ' + 'disregard the relative spectral responses and assume ' + 'it is the effective wavelength: %f (micro meter)', bandname) wvl = bandname * 1000.0 else: wvl = self.get_effective_wavelength(bandname) wvl = wvl * 1000.0 rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = self.get_reflectance_lut() # force dask arrays compute = False if HAVE_DASK and not isinstance(sun_zenith, Array): compute = True sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape) sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape) azidiff = from_array(azidiff, chunks=azidiff.shape) if redband is not None: redband = from_array(redband, chunks=redband.shape) clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max())) sun_zenith = clip(sun_zenith, 0, clip_angle) sunzsec = 1. / cos(deg2rad(sun_zenith)) clip_angle = rad2deg(arccos(1. / satz_sec_coord.max())) sat_zenith = clip(sat_zenith, 0, clip_angle) satzsec = 1. / cos(deg2rad(sat_zenith)) shape = sun_zenith.shape if not(wvl_coord.min() < wvl < wvl_coord.max()): LOG.warning( "Effective wavelength for band %s outside 400-800 nm range!", str(bandname)) LOG.info( "Set the rayleigh/aerosol reflectance contribution to zero!") if HAVE_DASK: chunks = sun_zenith.chunks if redband is None else redband.chunks res = zeros(shape, chunks=chunks) return res.compute() if compute else res else: return zeros(shape) idx = np.searchsorted(wvl_coord, wvl) wvl1 = wvl_coord[idx - 1] wvl2 = wvl_coord[idx] fac = (wvl2 - wvl) / (wvl2 - wvl1) raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :] tic = time.time() smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]] smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]] orders = [ len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)] f_3d_grid = atleast_2d(raylwvl.ravel()) if HAVE_DASK and isinstance(smin[0], Array): # compute all of these at the same time before passing to the interpolator # otherwise they are computed separately smin, smax, orders, f_3d_grid = da.compute(smin, smax, orders, f_3d_grid) minterp = MultilinearInterpolator(smin, smax, orders) minterp.set_values(f_3d_grid) if HAVE_DASK: ipn = map_blocks(self._do_interp, minterp, sunzsec, azidiff, satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks) else: ipn = self._do_interp(minterp, sunzsec, azidiff, satzsec) LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic)) ipn *= 100 res = ipn if redband is not None: res = where(redband < 20., res, (1 - (redband - 20) / 80) * res) res = clip(res, 0, 100) if compute: res = res.compute() return res
def segment( image, channels, model_type, diameter, fast_mode=False, use_anisotropy=True, iou_depth=2, iou_threshold=0.7, ): """Use cellpose to segment nuclei in fluorescence data. Parameters ---------- image : array of shape (z, y, x, channel) Image used for detection of objects channels : array of int with size 2 See cellpose model_type : str "cyto" or "nuclei" diameter : tuple of size 3 Approximate diameter (in pixels) of a segmented region, i.e. cell width fast_mode : bool In fast mode, network averaging, tiling, and augmentation are turned off. use_anisotropy : bool If true, use anisotropy parameter of cellpose iou_depth: dask depth parameter Number of pixels of overlap to use in intersection-over-union calculation when linking segments across neighboring, overlapping dask chunk regions. iou_threshold: float Minimum intersection-over-union in neighboring, overlapping dask chunk regions to be considered the same segment. The region for calculating IOU is given by the iou_depth parameter. Returns: segments : array of int32 with same shape as input Each segmented cell is assigned a number and all its pixels contain that value (0 is background) """ assert image.ndim == 4, image.ndim assert image.shape[-1] in {1, 2}, image.shape assert diameter[1] == diameter[2], diameter diameter_yx = diameter[1] anisotropy = diameter[0] / diameter[1] if use_anisotropy else None image = da.asarray(image) image = image.rechunk({-1: -1}) # color channel is chunked together depth = tuple(np.ceil(diameter).astype(np.int64)) boundary = "reflect" # No chunking in channel direction image = da.overlap.overlap(image, depth + (0, ), boundary) block_iter = zip( np.ndindex(*image.numblocks), map( functools.partial(operator.getitem, image), da.core.slices_from_chunks(image.chunks), ), ) labeled_blocks = np.empty(image.numblocks[:-1], dtype=object) total = None for index, input_block in block_iter: labeled_block, n = dask.delayed(segment_chunk, nout=2)( input_block, channels, model_type, diameter_yx, anisotropy, fast_mode, index, ) shape = input_block.shape[:-1] labeled_block = da.from_delayed(labeled_block, shape=shape, dtype=np.int32) n = dask.delayed(np.int32)(n) n = da.from_delayed(n, shape=(), dtype=np.int32) total = n if total is None else total + n block_label_offset = da.where(labeled_block > 0, total, np.int32(0)) labeled_block += block_label_offset labeled_blocks[index[:-1]] = labeled_block total += n # Put all the blocks together block_labeled = da.block(labeled_blocks.tolist()) depth = da.overlap.coerce_depth(len(depth), depth) if np.prod(block_labeled.numblocks) > 1: iou_depth = da.overlap.coerce_depth(len(depth), iou_depth) if any(iou_depth[ax] > depth[ax] for ax in depth.keys()): raise DistSegError("iou_depth (%s) > depth (%s)" % (iou_depth, depth)) trim_depth = {k: depth[k] - iou_depth[k] for k in depth.keys()} block_labeled = da.overlap.trim_internal(block_labeled, trim_depth, boundary=boundary) block_labeled = link_labels( block_labeled, total, iou_depth, iou_threshold=iou_threshold, ) block_labeled = da.overlap.trim_internal(block_labeled, iou_depth, boundary=boundary) else: block_labeled = da.overlap.trim_internal(block_labeled, depth, boundary=boundary) return block_labeled
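# Minimal sketch of the label-offset trick in `segment` above: per-chunk labels start
# at 1, so adding the running total only where the label is non-zero keeps the
# background at 0 while making labels globally unique across blocks.
import dask.array as da
import numpy as np

labeled_block = da.from_array(np.array([[0, 1, 1],
                                        [0, 2, 0]]), chunks=(1, 3))
total = np.int32(5)    # labels already assigned by previously processed blocks
offset = da.where(labeled_block > 0, total, np.int32(0))
unique_labels = labeled_block + offset
print(unique_labels.compute())   # [[0 6 6]
                                 #  [0 7 0]]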
def invalid_to_nan(t__, s__):
    idxs = (t__ < 0) | (t__ > 1) | (s__ < 0) | (s__ > 1)
    t__ = da.where(idxs, np.nan, t__)
    s__ = da.where(idxs, np.nan, s__)
    return t__, s__
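# Example: a fractional distance outside [0, 1] in either t or s invalidates both,
# so the corresponding target pixels end up as NaN.
import dask.array as da
import numpy as np

t__ = da.from_array(np.array([0.2, 1.3, 0.5]), chunks=2)
s__ = da.from_array(np.array([0.4, 0.6, -0.1]), chunks=2)
t__, s__ = invalid_to_nan(t__, s__)
print(t__.compute())   # [0.2 nan nan]
print(s__.compute())   # [0.4 nan nan]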
def individual_heterozygosity( ds: Dataset, *, call_allele_count: Hashable = variables.call_allele_count, merge: bool = True, ) -> Dataset: """Compute per call individual heterozygosity. Individual heterozygosity is the probability that two alleles drawn at random without replacement, from an individual at a given site, are not identical in state. Therefore, individual heterozygosity is defined for diploid and polyploid calls but will return nan in the case of haploid calls. Parameters ---------- ds Dataset containing genotype calls. call_allele_count Input variable name holding call_allele_count as defined by :data:`sgkit.variables.call_allele_count_spec`. If the variable is not present in ``ds``, it will be computed using :func:`count_call_alleles`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing :data:`sgkit.variables.call_heterozygosity_spec` of per genotype observed heterozygosity with shape (variants, samples) containing values within the interval [0, 1] or nan if ploidy < 2. Examples -------- >>> import sgkit as sg >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1) >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE samples S0 S1 variants 0 1/0 1/0 1 1/0 1/1 2 0/1 1/0 3 0/0 0/0 >>> sg.individual_heterozygosity(ds)["call_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE array([[1., 1.], [1., 0.], [1., 1.], [0., 0.]]) """ ds = define_variable_if_absent(ds, variables.call_allele_count, call_allele_count, count_call_alleles) variables.validate(ds, {call_allele_count: variables.call_allele_count_spec}) AC = da.asarray(ds.call_allele_count) K = AC.sum(axis=-1) # use nan denominator to avoid divide by zero with K - 1 K2 = da.where(K > 1, K, np.nan) AF = AC / K2[..., None] HI = (1 - da.sum(AF**2, axis=-1)) * (K / (K2 - 1)) new_ds = create_dataset( {variables.call_heterozygosity: (("variants", "samples"), HI)}) return conditional_merge_datasets(ds, new_ds, merge)
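# Worked sketch of the per-call heterozygosity formula above on raw allele counts:
# a diploid heterozygote gives 1, a homozygote 0, and a haploid call NaN because the
# denominator is forced to NaN when the ploidy is below 2.
import dask.array as da
import numpy as np

AC = da.from_array(np.array([[1, 1],     # 0/1 -> heterozygous
                             [2, 0],     # 0/0 -> homozygous
                             [1, 0]]),   # haploid call
                   chunks=(1, 2))
K = AC.sum(axis=-1)
K2 = da.where(K > 1, K, np.nan)          # NaN denominator avoids divide by zero
AF = AC / K2[..., None]
HI = (1 - da.sum(AF ** 2, axis=-1)) * (K / (K2 - 1))
print(HI.compute())                      # [ 1.  0. nan]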
        lambda df: df.dup_strings.where(df.dup_strings != 'a'),
        tm.assert_series_equal,
        id='series_literal',
    ),
    pytest.param(
        lambda t: t.dup_strings,
        lambda t: t.dup_strings,
        lambda df: df.dup_strings.where(df.dup_strings != df.dup_strings),
        tm.assert_series_equal,
        id='series_series',
    ),
    pytest.param(
        lambda t: ibis.literal('a'),
        lambda t: t.dup_strings,
        lambda df: dd.from_array(
            da.where(df.dup_strings.eq('a').values, np.nan, 'a')
        ),
        tm.assert_series_equal,
        id='literal_series',
    ),
],
)
def test_nullif(t, df, left, right, expected, compare):
    expr = left(t).nullif(right(t))
    result = execute(expr)
    if isinstance(result, (dd.Series, dd.DataFrame)):
        compare(result.compute(), expected(df).compute())
    else:
        compare(result, expected(df))
def _correct_slope(self, slope):
    # 0 slope is invalid. Note: slope can be a scalar or array.
    return da.where(slope == 0, 1, slope)
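# Illustration of the zero-slope guard: replacing 0 with 1 keeps downstream scaling
# (here a hypothetical division by the slope) from producing inf for invalid
# coefficients.
import dask.array as da
import numpy as np

slope = da.from_array(np.array([0.5, 0.0, 2.0]), chunks=2)
safe_slope = da.where(slope == 0, 1, slope)
print((da.ones(3, chunks=2) / safe_slope).compute())   # [2.  1.  0.5]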