def __init__(self, plink_file, scratch_dir, overwrite=False):
    self.options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)
    self.plink_file = plink_file
    self.scratch_dir = scratch_dir

    # read plink data
    print('\nReading PLINK data...')
    self.bim, self.fam, G = read_plink(plink_file)
    print('Done')

    # write tf.records
    if overwrite:
        G_df = dd.from_dask_array(da.transpose(G))
        G_df = G_df.fillna(value=1)
        G_df = G_df.astype(np.int8)
        tf_records_filenames = G_df.apply(self._write_records, axis=1).compute()
        print('Done')
    else:
        root, dirs, files = next(os.walk(scratch_dir))
        tf_records_filenames = [
            root + f for f in files if f.endswith('.tfrecords')
        ]

    # split into training and test batches
    self.train_files, self.test_files = train_test_split(
        tf_records_filenames, test_size=0.20, random_state=42)
def __data_generation(self, indexes):
    'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
    # Fetch features and normalize them
    Xnorm = da.from_zarr(self.features)[indexes, ]
    Xnorm = (Xnorm - self.norm_data['mean']) / self.norm_data['std']

    # Transpose if needed
    if self.transpose:
        Xnorm = da.transpose(Xnorm, axes=[0, 2, 1])

    # Generate data
    X = da.reshape(Xnorm, [len(indexes), *self.dim]).compute()
    y = self.labels[indexes].copy()

    # Return X,y pairs now (without mixup)
    if not self.use_mixup or (self.set_type != 'train'):
        return X, self.to_categorical(y, num_classes=self.num_classes)

    # Mixup
    mixed_x, y_a, y_b, lamb = self.mixup_batch(X, y, alpha=self.mixup_alpha)
    batch_data_in = mixed_x  # X_mixup
    y_a = self.to_categorical(y_a, num_classes=self.num_classes)
    y_b = self.to_categorical(y_b, num_classes=self.num_classes)
    batch_data_out = lamb * y_a + (1 - lamb) * y_b  # y_mixup

    return batch_data_in, batch_data_out
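The helper `self.mixup_batch` is not shown in this snippet. Below is a minimal sketch of a typical mixup helper, assuming a single Beta(alpha, alpha) mixing coefficient per batch and mixing each sample with a randomly permuted partner; the generator's real implementation may differ.

import numpy as np

def mixup_batch(X, y, alpha=0.2):
    # Mixing coefficient drawn from Beta(alpha, alpha); alpha <= 0 disables mixing.
    lamb = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    # Pair every sample with a randomly chosen partner from the same batch.
    index = np.random.permutation(X.shape[0])
    mixed_x = lamb * X + (1 - lamb) * X[index]
    # Return both label sets so the caller can mix the one-hot targets.
    return mixed_x, y, y[index], lamb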
def cal(x, client):
    st = time.time()

    # Distributed scheduler
    # (the original used the pre-0.19 API `dask.set_options(get=client.get)`;
    # `dask.config.set` is the current equivalent)
    with dask.config.set(scheduler=client.get):
        A = da.transpose(x)
        B = da.dot(x, A)
        C = da.dot(B, B)
        print(C.compute())

    # Default threaded scheduler:
    # with dask.config.set(scheduler='threads'):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute())

    # Manually set a global thread pool:
    # from multiprocessing.pool import ThreadPool
    # with dask.config.set(pool=ThreadPool(4)):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute(num_workers=4))

    print('time: ', time.time() - st)
    return 0
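A small usage sketch for `cal`, assuming a local `dask.distributed` cluster; the array size and chunking below are illustrative only.

import dask.array as da
from dask.distributed import Client

if __name__ == '__main__':
    client = Client()  # starts a local cluster when no address is given
    x = da.random.random((10000, 1000), chunks=(1000, 1000))
    cal(x, client)
    client.close()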
def test_reshape_data_kwargs_values(
    data,
    given_dims,
    return_dims,
    other_args,
    getitem_ops_for_expected,
    transposer,
):
    actual = reshape_data(
        data=data,
        given_dims=given_dims,
        return_dims=return_dims,
        **other_args,
    )

    expected = data[getitem_ops_for_expected]
    if transposer is not None:
        if isinstance(data, np.ndarray):
            expected = np.transpose(expected, transposer)
        else:
            expected = da.transpose(expected, transposer)

    # Check that the output data is the same type as the input
    assert type(actual) == type(expected)

    if isinstance(data, da.core.Array):
        actual = actual.compute()
        expected = expected.compute()

    # Check actual data
    np.testing.assert_array_equal(actual, expected)
def power_dask(data, x_init):
    # Materialize the Gram matrix once; the original computed it but discarded
    # the result, leaving the iteration to build an ever-growing task graph.
    A = da.matmul(data, da.transpose(data))
    A = A.compute()
    T = 150
    y = x_init
    for t in range(T):
        v = np.matmul(A, y)
        y = v / np.linalg.norm(v)
    return y
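An illustrative call to `power_dask`, assuming `data` is a tall dask matrix and `x_init` is a NumPy vector with one entry per row; sizes are arbitrary.

import numpy as np
import dask.array as da

data = da.random.random((500, 50), chunks=(100, 50))
x_init = np.random.random(500)
x_init /= np.linalg.norm(x_init)
# Runs 150 power iterations on the 500 x 500 Gram matrix data @ data.T
leading_vector = power_dask(data, x_init)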
def train_set_minibatches(self, batch_size=10):
    '''
    Yield batches of samples from the studies in the test and train datasets.
    '''
    for study, (bim, fam, G) in self.train_studies.items():
        for batch in minibatch(da.transpose(G), batch_size=batch_size):
            gene_matrix = batch.to_dask_dataframe()
            yield gene_matrix.fillna(gene_matrix.mean(axis=0), axis=0)
def reconstruct_im(data_3D, dark_ref, return_dask=False):
    core_count = multiprocessing.cpu_count()
    data_shape = data_3D.shape
    con_shape = tuple((np.asarray(data_shape[0:2]) * np.asarray((0.5, 2))).astype(int))
    xvals = int(data_shape[-1] ** 0.5)
    data3d_dask = da.from_array(data_3D, chunks=(-1, -1, "auto"))
    data_shape = data3d_dask.shape
    # np.float is removed in recent NumPy; use the builtin float (float64)
    mean_dark_ref = np.mean(dark_ref.astype(float), axis=-1)
    d3r = da.transpose(data3d_dask, (2, 0, 1))
    d3s = d3r - mean_dark_ref
    d3D_dref = da.transpose(d3s, (1, 2, 0))
    top_part = d3D_dref[0:con_shape[0], :, :]
    bot_part = d3D_dref[con_shape[0]:data_shape[0], :, :]
    top_part_rs = top_part[::-1, ::-1, :]
    data3d_arranged = da.concatenate([bot_part, top_part_rs], axis=1)
    shape4d = (con_shape[0], con_shape[1], xvals, xvals)
    data4d_dask = da.reshape(data3d_arranged, shape4d)
    if return_dask:
        return data4d_dask
    else:
        data4D = data4d_dask.compute(num_workers=core_count)
        return data4D
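A hedged usage sketch for `reconstruct_im` with synthetic data; it assumes the last axis of `data_3D` holds a perfect-square number of frames (here 16 = 4 x 4 scan positions) and that `dark_ref` is a stack of dark frames averaged over its last axis.

import numpy as np

rng = np.random.default_rng(0)
data_3D = rng.random((64, 64, 16), dtype=np.float32)   # detector rows x cols x frames
dark_ref = rng.random((64, 64, 8), dtype=np.float32)   # stack of dark reference frames
data4D = reconstruct_im(data_3D, dark_ref)
print(data4D.shape)  # (32, 128, 4, 4): halved rows, doubled columns, 4 x 4 scan grid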
def shape(cls, dataset, gridded=False):
    array = dataset.data[dataset.vdims[0].name]
    if not any(cls.irregular(dataset, kd) for kd in dataset.kdims):
        names = [kd.name for kd in dataset.kdims if kd.name in array.dims][::-1]
        if not all(d in names for d in array.dims):
            array = np.squeeze(array)
        array = array.transpose(*names)
    shape = array.shape
    if gridded:
        return shape
    else:
        # np.product is a deprecated alias of np.prod
        return (np.prod(shape), len(dataset.dimensions()))
def update_velocities(position, velocity, mass, G, epsilon):
    """Calculate the interactions between all particles and update the velocities.

    Args:
        position (dask array): dask array of all particle positions in cartesian coordinates.
        velocity (dask array): dask array of all particle velocities in cartesian coordinates.
        mass (dask array): dask array of all particle masses.
        G (float): gravitational constant.
        epsilon (float): softening parameter.

    Returns:
        velocity: updated particle velocities in cartesian coordinates.
    """
    dx = da.subtract.outer(position[:, 0], position[:, 0])
    dy = da.subtract.outer(position[:, 1], position[:, 1])
    dz = da.subtract.outer(position[:, 2], position[:, 2])

    r2 = da.square(dx) + da.square(dy) + da.square(dz) + da.square(epsilon)

    coef = -G * mass[:]
    ax = coef * dx
    ay = coef * dy
    az = coef * dz

    ax_scaled = da.divide(ax, r2)
    ay_scaled = da.divide(ay, r2)
    az_scaled = da.divide(az, r2)

    total_ax = da.nansum(ax_scaled, axis=1)
    total_ay = da.nansum(ay_scaled, axis=1)
    total_az = da.nansum(az_scaled, axis=1)

    velocity_x = da.diag(da.add.outer(da.transpose(velocity)[0], total_ax))
    velocity_y = da.diag(da.add.outer(da.transpose(velocity)[1], total_ay))
    velocity_z = da.diag(da.add.outer(da.transpose(velocity)[2], total_az))

    velocity = np.column_stack((velocity_x.compute(), velocity_y.compute(), velocity_z.compute()))

    return velocity
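An illustrative call to `update_velocities` with a few random particles; the values of `G` and `epsilon` are placeholders rather than physical recommendations.

import dask.array as da

n = 100
position = da.random.random((n, 3), chunks=(50, 3))
velocity = da.random.random((n, 3), chunks=(50, 3))
mass = da.random.random((n,), chunks=(50,))
new_velocity = update_velocities(position, velocity, mass, G=6.674e-11, epsilon=1e-3)
print(new_velocity.shape)  # (100, 3) NumPy array, since the function computes internally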
def transpose_grid_array(
    grid: GridArray,
    axes: Optional[list] = None,
) -> GridArray:
    """Reverses or permutes the axes of a `GridArray`.

    Parameters
    ----------
    axes: ``list``, optional
        List of integers and/or strings that identify the permutation of the
        axes. The i'th axis of the returned `GridArray` will correspond to the
        axis numbered/labeled axes[i] of the input. If not specified, the
        order of the axes is reversed.

    Returns
    -------
    :class:`nata.containers.GridArray`:
        Transpose of ``grid``.

    Examples
    --------
    Transpose a three-dimensional array.

    >>> from nata.containers import GridArray
    >>> import numpy as np
    >>> data = np.arange(96).reshape((8, 4, 3))
    >>> grid = GridArray.from_array(data)
    >>> grid.transpose().shape
    (3, 4, 8)
    >>> grid.transpose(axes=[0,2,1]).shape
    (8, 3, 4)
    """
    # get transpose axes
    tr_axes = get_transpose_axes(grid, axes)

    # use `!=` for the integer comparison; `is not` checks identity, not equality
    if len(set(tr_axes)) != grid.ndim:
        raise ValueError("invalid transpose axes")

    return GridArray.from_array(
        da.transpose(grid.to_dask(), axes=tr_axes),
        name=grid.name,
        label=grid.label,
        unit=grid.unit,
        axes=[grid.axes[axis] for axis in tr_axes],
        time=grid.time,
    )
def get_activations(activation_function, batch_gen):
    """
    Computes the activations of a data set at one layer of the model in a
    "delayed" way (for memory and computation efficiency) and return them as
    a dask array.

    See: https://docs.dask.org/en/latest/delayed.html
    """
    layer_shape = K.int_shape(activation_function.outputs[0])[1:]
    layer_dim = np.prod(K.int_shape(activation_function.outputs[0])[1:])
    n_images = batch_gen.n_images
    n_aug = batch_gen.aug_per_im
    batch_size = batch_gen.batch_size

    # Delayed computation of the activations of a batch
    @dask.delayed
    def batch_activation():
        batch_images, _ = next(batch_gen())
        return activation_function([batch_images, 0])[0]

    # Delayed iteration over the data set
    activations_delayed = [batch_activation()
                           for _ in range(batch_gen.n_batches)]
    activations_da_list = [da.from_delayed(
        activation_delayed,
        shape=(batch_size * n_aug, ) + layer_shape,
        dtype=K.floatx())
        for activation_delayed in activations_delayed]
    activations_da = da.concatenate(activations_da_list, axis=0)

    # The last batch can be smaller
    activations_da = activations_da[:n_images * n_aug]

    # Reshape the activations such that
    # shape = (n_diff_images, layer_dim, n_aug)
    activations_da = da.reshape(activations_da,
                                (activations_da.shape[0], layer_dim))
    activations_da = da.transpose(da.reshape(activations_da.T,
                                             (layer_dim, n_images, n_aug)),
                                  (1, 0, 2))

    return activations_da
def calc_moments(self):
    with h5py.File(self.infile, 'r', rdcc_nbytes=1000 * 1000 * 1000) as f:
        data = da.from_array(f['data'], chunks=(-1, 256, -1, -1))

        # CNHW layout
        data = da.transpose(data, (1, 2, 3, 0))
        dtype = data.dtype
        if dtype != np.float32:
            print('WARNING: data will be saved as float32 but input is float64!')

        if self.mean is None:
            arr = data
            with ProgressBar():
                self.mean, self.std = da.compute(arr.mean(axis=[0, 1, 2]),
                                                 arr.std(axis=[0, 1, 2]),
                                                 num_workers=8)
        else:
            self.mean, self.std = np.asarray(self.mean, dtype=dtype), np.asarray(self.std, dtype=dtype)
        print('mean: {}, std: {}'.format(list(self.mean), list(self.std)))

        if self.log1p_norm:
            data_z_norm = (data - self.mean) / self.std
            data_log1p = da.sign(data_z_norm) * da.log1p(da.fabs(data_z_norm))

            if self.mean_log1p is None:
                arr = data_log1p
                with ProgressBar():
                    self.mean_log1p, self.std_log1p = da.compute(arr.mean(axis=[0, 1, 2]),
                                                                 arr.std(axis=[0, 1, 2]),
                                                                 num_workers=8)
            else:
                self.mean_log1p, self.std_log1p = np.asarray(self.mean_log1p, dtype=dtype), np.asarray(self.std_log1p, dtype=dtype)
            print('mean_log1p: {}, std_log1p: {}'.format(list(self.mean_log1p), list(self.std_log1p)))
def test_linear_operators():
    A = da.random.random((100, 50), chunks=20)
    Adlo = linop.DaskLinearOperator(A)
    assert Adlo.size == A.size
    assert Adlo.shape == A.shape
    assert Adlo.chunks == A.chunks
    assert Adlo.numblocks == A.numblocks
    assert Adlo.dtype == A.dtype

    try:
        linop.DLOSymmetric(A)
    except AssertionError:
        print('fail on dims')

    try:
        linop.DLOSymmetric(da.random.random((100, 100), chunks=(10, 20)))
    except AssertionError:
        print('fail on chunks')

    Asymm = linop.DLOSymmetric(da.random.random((100, 100), chunks=10))
    assert Asymm.numblocks == (10, 10)

    Adn = linop.DLODense(A)
    assert Adn.numblocks == A.numblocks

    Adiag = linop.DLODiagonal(da.diag(da.random.random((100, 100), chunks=50)))
    assert Adiag.numblocks == (2, 2)
    assert Adiag.data.numblocks == (2, )

    Agm = linop.DLOGram(A)
    assert Agm.numblocks == (A.numblocks[1], A.numblocks[1])
    Agm2 = linop.DLOGram(da.transpose(A))
    assert Agm.shape == Agm2.shape

    Agm = linop.DLORegularizedGram(A)
    assert Agm.regularization == 1
print('using dummy annot') with h5py.File(alnfile +'.h5', 'r') as hf: align_array = hf['MSA2array'] print('array shape' ,align_array.shape) dummy_annot = {'dummy_gene': { 'qstart':1 , 'qend':align_array.shape[1]-1 , 'evalue':0 }} annotation = pd.DataFrame.from_dict( dummy_annot , orient = 'index') print('selecting informative sites') def retcounts( row ): return np.unique( row , return_counts=True) with h5py.File(alnfile +'.h5', 'r') as hf: align_array = hf['MSA2array'] array = da.from_array(align_array) array = da.transpose(array) #create a df of the columns daskdf = dd.from_dask_array(array) daskdf['unique']= daskdf.apply( retcounts , axis =1) res = list( daskdf['unique'].compute() ) print('compiling sites') sites= { col : dict(zip(list(unique[0]), list(unique[1]))) for col,unique in enumerate(res) } informativesites = set([ s for s in sites if len( set( sites[s].keys()) -set([b'-',b'N']) ) > 1 ] ) print('done') print('informative columns:' , len(informativesites)) #associate informative sites to a codon codon_dict = {} print( 'grouping codons') for i,r in annotation.iterrows():
def cov_mult(conv_matrix, cov_matrix):
    conv_matrix = da.transpose(conv_matrix)
    return da.matmul(da.matmul(conv_matrix, cov_matrix), da.transpose(conv_matrix))
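A quick shape check for `cov_mult`: given a `conv_matrix` of shape (k, d) and a (k, k) covariance, it returns the (d, d) propagated covariance `conv_matrix.T @ cov_matrix @ conv_matrix`. The sizes below are arbitrary.

import dask.array as da

J = da.random.random((9, 16), chunks=(9, 16))       # maps 9 inputs to 16 outputs
sigma = da.random.random((9, 9), chunks=(9, 9))     # covariance over the 9 inputs
sigma_out = cov_mult(J, sigma)
print(sigma_out.shape)  # (16, 16)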
nny = np.random.uniform(0, 10)
nnz = np.random.uniform(0, 10)
theta = np.random.uniform(0, 2 * np.pi)
nx = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nnx
ny = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nny
nz = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nnz
R = rotation(nx, ny, nz, theta)
mesh2 = np.dot(R, mesh1)
sys.exit()

#mesh2 = da.transpose(da.dot(R,da.transpose(subcat['Position'])))
mesh2 = da.transpose(da.dot(R, da.transpose(mesh1)))
sys.exit()

proj1 = np.fft.fftshift(mesh1.preview(axes=[0, 1], Nmesh=nmesh))
proj1 = proj1[num:-num, num:-num]

# Generate a random projection angle
theta = np.random.uniform(0, np.pi, size=num_maps)
phi = np.random.uniform(0, 2 * np.pi, size=num_maps)
theta_hat = np.array(
    [np.cos(theta) * np.cos(phi), np.cos(theta) * np.sin(phi), -np.sin(theta)]).T
phi_hat = np.array([-np.sin(phi), np.cos(phi), np.zeros(num_maps)]).T
def load_single(file, drop_ghost=True, use_dask=True, var_list="all", ini_file=None): """Load a single step file and generate an xarray Dataset Parameters ---------- file : str or Path Location of the file to load drop_ghost : bool, optional Drop all of the ghost cells, by default True var_list : List, optional Load only a specific set of variables, by default 'all' Returns ------- xarray Dataset """ if var_list == "all": var_list = [ "density", "pressure", "sound_speed", "x_velocity", "y_velocity", "ghost_cell", "deposited_energy", "deposited_power", ] data_vars = {} space_dims = ("i", "j") if not file.endswith(".h5"): raise Exception("Step files must be .h5 files") h5 = h5py.File(file, "r") for v in var_list: try: h5[f"/{v}"].shape except KeyError: continue if use_dask: chunk_size = h5[f"/{v}"].shape array = da.from_array(h5[f"/{v}"], chunks=chunk_size) array = da.transpose(array) else: array = h5[f"/{v}"][()].T.astype(np.float32) try: long_name = var_dict[v]["long_name"] except Exception: long_name = "" try: description = h5[f"/{v}"].attrs["description"].decode("utf-8") except Exception: description = "" try: standard_name = var_dict[v]["standard_name"] except Exception: standard_name = "" try: units = h5[f"/{v}"].attrs["units"].decode("utf-8") except Exception: units = "" data_vars[f"{v}"] = xr.Variable( space_dims, array, attrs={ "units": units, "description": description, "long_name": long_name, "standard_name": standard_name, }, ) x = h5[f"/x"][()].T.astype(np.float32) x_units = h5[f"/x"].attrs["units"].decode("utf-8") y = h5[f"/y"][()].T.astype(np.float32) # Get the cell centers dy = (np.diff(x[0, :]) / 2.0)[0] dx = (np.diff(y[:, 0]) / 2.0)[0] # cell center locations xc = x[:-1, 0] + dx yc = y[0, :-1] + dy coords = { "time": h5[f"/time"][()].astype(np.float32), "x": (["i"], xc), "y": (["j"], yc), } time_units = h5[f"/time"].attrs["units"].decode("utf-8") # Get the details about the CATO build info_attr = {} info = [ "build_type", "compile_hostname", "compile_os", "compiler_flags", "compiler_version", "git_changes", "git_hash", "git_ref", "version", ] for v in info: try: info_attr[v] = h5["/cato_info"].attrs[f"{v}"].decode("utf-8") except Exception: pass attr_dict = info_attr attr_dict["time_units"] = time_units attr_dict["space_units"] = x_units if ini_file: input_dict = read_ini(ini_file) attr_dict.update(input_dict) ds = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attr_dict) if ini_file: try: ds.attrs["title"] = ds.attrs["general_title"] except Exception: pass if drop_ghost: try: ds = ds.where(ds["ghost_cell"] == 0, drop=True) return ds.drop("ghost_cell") except KeyError: return ds else: return ds
kernels_mean = np.random.random((total_kernels, 3**2 * input_channels)) cov_list = [ random_cov(3**2 * input_channels) for number in range(total_kernels) ] kernels_cov = np.stack(cov_list) X = da.from_array(X) kernels_mean = da.from_array(kernels_mean) kernels_cov = da.from_array(kernels_cov) batch_out = [] for i in range(batch_size): kernel_out = [] for j in range(total_kernels): mean = da.matmul(kernels_mean[j, :], X[i, :, :]) cov = da.matmul(da.transpose(X[i, :, :]), da.matmul(kernels_cov[j, :, :], X[i, :, :])) z = mvn_random_DASK(mean, cov, total_samples, input_size**2) g = relu(z) mean_g = da.mean(g, axis=1) kernel_out.append(mean_g) kernels_out = da.stack(kernel_out, axis=0) batch_out.append(kernels_out) batches_out = da.stack(batch_out, axis=0) print('task graph complete') mean_g.visualize(rankdir="LR", filename="task_graph_mean_g.pdf", cmap='viridis') kernels_out.visualize(rankdir="LR", filename="task_graph_conv_out.pdf") batches_out.visualize(rankdir="LR", filename="task_graph_batches_out.pdf")
def _stage_3( B: Array, YP: Array, X: Array, Y: Array, contigs: Array, variant_chunk_start: NDArray, ) -> Optional[Array]: """Stage 3 - Leave-one-chromosome-out (LOCO) Estimation This stage will use the coefficients for the optimal model in stage 2 to re-estimate predictions in a LOCO scheme. This scheme involves omitting coefficients that correspond to all variant blocks for a single chromosome in the stage 2 model and then recomputing predictions without those coefficients. For more details, see the "LOCO predictions" section of the Supplementary Methods in [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2). """ assert B.ndim == 2 assert YP.ndim == 4 assert X.ndim == 2 assert Y.ndim == 2 # Check that chunking across samples is the same for all arrays assert B.numblocks[0] == YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0] assert YP.chunks[2] == X.chunks[0] == Y.chunks[0] # Extract shape statistics sample_chunks = Y.chunks[0] n_covar = X.shape[1] n_variant_block, n_alpha_1 = YP.shape[:2] n_indvar = n_covar + n_variant_block * n_alpha_1 n_sample_block = Y.numblocks[0] n_sample, n_outcome = Y.shape # Determine unique contigs to create LOCO estimates for contigs = np.asarray(contigs, like=contigs) unique_contigs = np.unique(contigs) # type: ignore[no-untyped-call] if hasattr(unique_contigs, "compute"): unique_contigs = unique_contigs.compute() n_contig = len(unique_contigs) if n_contig <= 1: # Return nothing w/o at least 2 contigs return None assert n_variant_block == len(variant_chunk_start) # Create vector of size `n_variant_block` where value # at index i corresponds to contig for variant block i variant_block_contigs = contigs[variant_chunk_start] # Transform coefficients (B) such that trailing dimensions # contain right half of matrix product for prediction: # (n_sample_block * n_indvar, n_outcome) -> # (n_outcome, n_sample_block, n_indvar) B = da.stack([B.blocks[i] for i in range(n_sample_block)], axis=0) assert_block_shape(B, n_sample_block, 1, 1) assert_chunk_shape(B, 1, n_indvar, n_outcome) assert_array_shape(B, n_sample_block, n_indvar, n_outcome) B = da.transpose(B, (2, 0, 1)) assert_block_shape(B, 1, n_sample_block, 1) assert_chunk_shape(B, n_outcome, 1, n_indvar) assert_array_shape(B, n_outcome, n_sample_block, n_indvar) # Decompose coefficients (B) so that variant blocks can be sliced: # BX -> (n_outcome, n_sample_block, n_covar) # BYP -> (n_outcome, n_sample_block, n_variant_block, n_alpha_1) BX = B[..., :n_covar] assert_array_shape(BX, n_outcome, n_sample_block, n_covar) BYP = B[..., n_covar:] assert_array_shape(BYP, n_outcome, n_sample_block, n_variant_block * n_alpha_1) BYP = BYP.reshape((n_outcome, n_sample_block, n_variant_block, n_alpha_1)) assert_block_shape(BYP, 1, n_sample_block, 1, 1) assert_chunk_shape(BYP, n_outcome, 1, n_variant_block, n_alpha_1) assert_array_shape(BYP, n_outcome, n_sample_block, n_variant_block, n_alpha_1) # Transform base predictions (YP) such that trailing dimensions # contain left half of matrix product for prediction as well # as variant blocks to slice on: # (n_variant_block, n_alpha_1, n_sample, n_outcome) -> # (n_outcome, n_sample, n_variant_block, n_alpha_1) YP = da.transpose(YP, (3, 2, 0, 1)) assert_block_shape(YP, 1, n_sample_block, n_variant_block, 1) assert_chunk_shape(YP, n_outcome, sample_chunks[0], 1, n_alpha_1) assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1) def apply(X: Array, YP: Array, BX: Array, BYP: Array) -> Array: # Collapse selected variant blocks and alphas into 
single # new covariate dimension assert YP.shape[2] == BYP.shape[2] n_group_covar = n_covar + BYP.shape[2] * n_alpha_1 BYP = BYP.reshape((n_outcome, n_sample_block, -1)) BG = da.concatenate((BX, BYP), axis=-1) BG = BG.rechunk((-1, None, -1)) assert_block_shape(BG, 1, n_sample_block, 1) assert_chunk_shape(BG, n_outcome, 1, n_group_covar) assert_array_shape(BG, n_outcome, n_sample_block, n_group_covar) YP = YP.reshape((n_outcome, n_sample, -1)) XYP = da.broadcast_to(X, (n_outcome, n_sample, n_covar)) XG = da.concatenate((XYP, YP), axis=-1) XG = XG.rechunk((-1, None, -1)) assert_block_shape(XG, 1, n_sample_block, 1) assert_chunk_shape(XG, n_outcome, sample_chunks[0], n_group_covar) assert_array_shape(XG, n_outcome, n_sample, n_group_covar) YG = da.map_blocks( # Block chunks: # (n_outcome, sample_chunks[0], n_group_covar) @ # (n_outcome, n_group_covar, 1) [after transpose] lambda x, b: x @ b.transpose((0, 2, 1)), XG, BG, chunks=(n_outcome, sample_chunks, 1), ) assert_block_shape(YG, 1, n_sample_block, 1) assert_chunk_shape(YG, n_outcome, sample_chunks[0], 1) assert_array_shape(YG, n_outcome, n_sample, 1) YG = da.squeeze(YG, axis=-1).T assert_block_shape(YG, n_sample_block, 1) assert_chunk_shape(YG, sample_chunks[0], n_outcome) assert_array_shape(YG, n_sample, n_outcome) return YG # For each contig, generate predictions for all sample+outcome # combinations using only betas from stage 2 results that # correspond to *other* contigs (i.e. LOCO) YC = [] for contig in unique_contigs: # Define a variant block mask of size `n_variant_block` # determining which blocks correspond to this contig variant_block_mask = variant_block_contigs == contig if hasattr(variant_block_mask, "compute"): variant_block_mask = variant_block_mask.compute() BYPC = BYP[:, :, ~variant_block_mask, :] YPC = YP[:, :, ~variant_block_mask, :] YGC = apply(X, YPC, BX, BYPC) YC.append(YGC) YC = da.stack(YC, axis=0) assert_array_shape(YC, n_contig, n_sample, n_outcome) return YC
def test_set(self):
    gene_matrix = da.concatenate(
        [G for (bim, fam, G) in self.test_studies.values()], axis=0)
    gene_matrix = da.transpose(gene_matrix).to_dask_dataframe()
    return gene_matrix.fillna(gene_matrix.mean(axis=0), axis=0)
def _daread( img: Path, offsets: List[np.ndarray], read_lengths: np.ndarray, chunk_by_dims: List[str] = [ Dimensions.SpatialZ, Dimensions.SpatialY, Dimensions.SpatialX, ], S: int = 0, ) -> Tuple[da.core.Array, str]: """ Read a LIF image file as a delayed dask array where certain dimensions act as the chunk size. Parameters ---------- img: Path The filepath to read. offsets: List[numpy.ndarray] A List of numpy ndarrays offsets, see _compute_offsets for more details. read_lengths: numpy.ndarray A 1D numpy array of read lengths, the index is the scene index chunk_by_dims: List[str] The dimensions to use as the for mapping the chunks / blocks. Default: [Dimensions.SpatialZ, Dimensions.SpatialY, Dimensions.SpatialX] Note: SpatialY and SpatialX will always be added to the list if not present. S: int If the image has different dimensions on any scene from another, the dask array construction will fail. In that case, use this parameter to specify a specific scene to construct a dask array for. Default: 0 (select the first scene) Returns ------- img: dask.array.core.Array The constructed dask array where certain dimensions are chunked. dims: str The dimension order as a string. """ # Get image dims indicies lif = LifFile(filename=img) image_dim_indices = LifReader._dims_shape(lif=lif) # Catch inconsistent scene dimension sizes if len(image_dim_indices) > 1: # Choose the provided scene try: image_dim_indices = image_dim_indices[S] log.info( f"File contains variable dimensions per scene, " f"selected scene: {S} for data retrieval." ) except IndexError: raise exceptions.InconsistentShapeError( f"The LIF image provided has variable dimensions per scene. " f"Please provide a valid index to the 'S' parameter to create a " f"dask array for the index provided. " f"Provided scene index: {S}. Scene index range: " f"0-{len(image_dim_indices)}." ) else: # If the list is length one that means that all the scenes in the image # have the same dimensions # Just select the first dictionary in the list image_dim_indices = image_dim_indices[0] # Uppercase dimensions provided to chunk by dims chunk_by_dims = [d.upper() for d in chunk_by_dims] # Always add Y and X dims to chunk by dims because that is how LIF files work if Dimensions.SpatialY not in chunk_by_dims: log.info( "Adding the Spatial Y dimension to chunk by dimensions as it was not " "found." ) chunk_by_dims.append(Dimensions.SpatialY) if Dimensions.SpatialX not in chunk_by_dims: log.info( "Adding the Spatial X dimension to chunk by dimensions as it was not " "found." ) chunk_by_dims.append(Dimensions.SpatialX) # Setup read dimensions for an example chunk first_chunk_read_dims = {} for dim, (dim_begin_index, dim_end_index) in image_dim_indices.items(): # Only add the dimension if the dimension isn't a part of the chunk if dim not in chunk_by_dims: # Add to read dims first_chunk_read_dims[dim] = dim_begin_index # Read first chunk for information used by dask.array.from_delayed sample, sample_dims = LifReader._get_array_from_offset( im_path=img, offsets=offsets, read_lengths=read_lengths, meta=lif.xml_root, read_dims=first_chunk_read_dims, ) # Get the shape for the chunk and operating shape for the dask array # We also collect the chunk and non chunk dimension ordering so that we can # swap the dimensions after we block the dask array together. 
sample_chunk_shape = [] operating_shape = [] non_chunk_dimension_ordering = [] chunk_dimension_ordering = [] for i, dim_info in enumerate(sample_dims): # Unpack dim info dim, size = dim_info # If the dim is part of the specified chunk dims then append it to the # sample, and, append the dimension to the chunk dimension ordering if dim in chunk_by_dims: sample_chunk_shape.append(size) chunk_dimension_ordering.append(dim) # Otherwise, append the dimension to the non chunk dimension ordering, and, # append the true size of the image at that dimension else: non_chunk_dimension_ordering.append(dim) operating_shape.append( image_dim_indices[dim][1] - image_dim_indices[dim][0] ) # Convert shapes to tuples and combine the non and chunked dimension orders as # that is the order the data will actually come out of the read data as sample_chunk_shape = tuple(sample_chunk_shape) blocked_dimension_order = ( non_chunk_dimension_ordering + chunk_dimension_ordering ) # Fill out the rest of the operating shape with dimension sizes of 1 to match # the length of the sample chunk. When dask.block happens it fills the # dimensions from inner-most to outer-most with the chunks as long as the # dimension is size 1. Basically, we are adding empty dimensions to the # operating shape that will be filled by the chunks from dask operating_shape = tuple(operating_shape) + (1,) * len(sample_chunk_shape) # Create empty numpy array with the operating shape so that we can iter through # and use the multi_index to create the readers. lazy_arrays = np.ndarray(operating_shape, dtype=object) # We can enumerate over the multi-indexed array and construct read_dims # dictionaries by simply zipping together the ordered dims list and the current # multi-index plus the begin index for that plane. We then set the value of the # array at the same multi-index to the delayed reader using the constructed # read_dims dictionary. 
dims = [d for d in Dimensions.DefaultOrder] begin_indicies = tuple(image_dim_indices[d][0] for d in dims) for i, _ in np.ndenumerate(lazy_arrays): # Add the czi file begin index for each dimension to the array dimension # index this_chunk_read_indicies = ( current_dim_begin_index + curr_dim_index for current_dim_begin_index, curr_dim_index in zip(begin_indicies, i) ) # Zip the dims with the read indices this_chunk_read_dims = dict( zip(blocked_dimension_order, this_chunk_read_indicies) ) # Remove the dimensions that we want to chunk by from the read dims for d in chunk_by_dims: if d in this_chunk_read_dims: this_chunk_read_dims.pop(d) # Add delayed array to lazy arrays at index lazy_arrays[i] = da.from_delayed( delayed(LifReader._imread)( img, offsets, read_lengths, lif.xml_root, this_chunk_read_dims ), shape=sample_chunk_shape, dtype=sample.dtype, ) # Convert the numpy array of lazy readers into a dask array and fill the inner # most empty dimensions with chunks merged = da.block(lazy_arrays.tolist()) # Because we have set certain dimensions to be chunked and others not # we will need to transpose back to original dimension ordering # Example being, if the original dimension ordering was "SZYX" and we want to # chunk by "S", "Y", and "X" we created an array with dimensions ordering "ZSYX" transpose_indices = [] transpose_required = False for i, d in enumerate(Dimensions.DefaultOrder): new_index = blocked_dimension_order.index(d) if new_index != i: transpose_required = True transpose_indices.append(new_index) else: transpose_indices.append(i) # Only run if the transpose is actually required # The default case is "Z", "Y", "X", which _usually_ doesn't need to be # transposed because that is _usually_ # The normal dimension order of the LIF file anyway if transpose_required: merged = da.transpose(merged, tuple(transpose_indices)) # Because dimensions outside of Y and X can be in any order and present or not # we also return the dimension order string. return merged, "".join(dims)
def _stage_2( YP: Array, X: Array, Y: Array, alphas: Optional[NDArray] = None, normalize: bool = True, _glow_adj_alpha: bool = False, _glow_adj_scaling: bool = False, ) -> Tuple[Array, Array]: """Stage 2 - WGR Meta Regression This stage will train separate ridge regression models for each outcome using the predictions from stage 1 for that same outcome as features. These predictions are then evaluated based on R2 score to determine an optimal "meta" estimator (see `_stage_1` for the "base" estimator description). Results then include only predictions and coefficients from this optimal model. For more details, see the level 1 regression model described in step 1 of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2). """ assert YP.ndim == 4 assert X.ndim == 2 assert Y.ndim == 2 # Check that chunking across samples is the same for all arrays assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0] assert YP.chunks[2] == X.chunks[0] == Y.chunks[0] # Assert single chunks for covariates and outcomes assert X.numblocks[1] == Y.numblocks[1] == 1 # Extract shape statistics n_variant_block, n_alpha_1 = YP.shape[:2] n_sample_block = Y.numblocks[0] n_sample, n_outcome = Y.shape n_covar = X.shape[1] n_indvar = n_covar + n_variant_block * n_alpha_1 sample_chunks = Y.chunks[0] if normalize: assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1) assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome) # See: https://github.com/projectglow/glow/issues/260 if _glow_adj_scaling: YP = da.map_blocks( lambda x: (x - x.mean(axis=2, keepdims=True)) / x.std(axis=2, keepdims=True), YP, ) else: YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True) # Tranpose for refit on level 1 predictions YP = YP.transpose((3, 2, 0, 1)) assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1) if alphas is None: # See: https://github.com/projectglow/glow/issues/255 if _glow_adj_alpha: alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome) else: alphas = get_alphas(n_variant_block * n_alpha_1) n_alpha_2 = alphas.size YR = [] BR = [] for i in range(n_outcome): # Slice and reshape to new 2D covariate matrix; # The order of raveling in trailing dimensions is important # and later reshapes will assume variants, alphas order XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1)) # Prepend covariates and chunk along first dim only XPB = da.concatenate((X, XPB), axis=1) XPB = XPB.rechunk(chunks=(None, -1)) assert_array_shape(XPB, n_sample, n_indvar) assert XPB.numblocks == (n_sample_block, 1) # Extract outcome vector YB = Y[:, [i]] assert XPB.ndim == YB.ndim == 2 # Fit and predict folds for each parameter BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:] assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1) assert_array_shape(YPB, n_alpha_2, n_sample, 1) BR.append(BB) YR.append(YPB) # Concatenate predictions along outcome dimension YR = da.concatenate(YR, axis=2) assert_block_shape(YR, 1, n_sample_block, n_outcome) assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1) assert_array_shape(YR, n_alpha_2, n_sample, n_outcome) # Move samples to last dim so all others are batch # dims for R2 calculations YR = da.transpose(YR, (0, 2, 1)) assert_array_shape(YR, n_alpha_2, n_outcome, n_sample) YR = YR.rechunk((-1, -1, None)) assert_block_shape(YR, 1, 1, n_sample_block) assert YR.shape[1:] == Y.T.shape # Concatenate betas along outcome dimension BR = da.concatenate(BR, axis=2) assert_block_shape(BR, 1, n_sample_block, 
n_outcome) assert_chunk_shape(BR, n_alpha_2, n_indvar, 1) assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome) # Compute R2 scores within each sample block for each outcome + alpha R2 = da.stack( [ r2_score(YR.blocks[..., i], Y.T.blocks[..., i]) # Avoid warnings on R2 calculations for blocks with single rows if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan) for i in range(n_sample_block) ] ) assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome) # Coerce to finite or nan before nan-aware mean R2 = da.where(da.isfinite(R2), R2, np.nan) # Find highest mean alpha score for each outcome across blocks R2M = da.nanmean(R2, axis=0) assert_array_shape(R2M, n_alpha_2, n_outcome) # Identify index for the alpha value with the highest mean score R2I = da.argmax(R2M, axis=0) assert_array_shape(R2I, n_outcome) # Choose the predictions corresponding to the model with best score YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1) YRM = YRM.rechunk((None, -1)) assert_block_shape(YRM, n_sample_block, 1) assert_chunk_shape(YRM, sample_chunks[0], n_outcome) assert_array_shape(YRM, n_sample, n_outcome) # Choose the betas corresponding to the model with the best score BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1) BRM = BRM.rechunk((None, -1)) assert_block_shape(BRM, n_sample_block, 1) assert_chunk_shape(BRM, n_indvar, n_outcome) assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome) return BRM, YRM
X.compute(), kernels_mean.compute(), kernels_cov.compute(), batch_size, total_kernels, input_size) times = [] # list for storing execution times cluster = 'localhost:8001' # address of compute cluster with Client(cluster) as client: # Using cluster as client do for n in range(itrs): # itrs runs start = time.time() # save start tikme batch_out = [] # create list for batch output for i in range(batch_size): # for each image kernel_out = [] # create list for kernel outputs mean = da.matmul(kernels_mean, X[i, :, :]) # compute all kernel means for j in range(total_kernels): # for each kernel cov = da.matmul( da.transpose(X[i, :, :]), # compute covariance da.matmul(kernels_cov[j, :, :], X[i, :, :])) z = mvn_random_DASK( mean[j, :], cov, total_samples, input_size**2) # sample from transformed distribution g = relu(z) # pass samples through relu mean_g = da.mean( g, axis=1) # compute ensemble mean from samples kernel_out.append( mean_g) # add ensemble mean to kernel outputs list kernels_out = da.stack(kernel_out, axis=0) # stack all kernel outputs batch_out.append( kernels_out ) # add stacked kernel outputs to batch output list batches_out = da.stack(batch_out,
assert_eq(dm, m) functions = [ lambda x: x, lambda x: da.expm1(x), lambda x: 2 * x, lambda x: x / 2, lambda x: x**2, lambda x: x + x, lambda x: x * x, lambda x: x[0], lambda x: x[:, 1], lambda x: x[:1, None, 1:3], lambda x: x.T, lambda x: da.transpose(x, (1, 2, 0)), lambda x: x.sum(), lambda x: x.dot(np.arange(x.shape[-1])), lambda x: x.dot(np.eye(x.shape[-1])), lambda x: da.tensordot(x, np.ones(x.shape[:2]), axes=[(0, 1), (0, 1)]), lambda x: x.sum(axis=0), lambda x: x.max(axis=0), lambda x: x.sum(axis=(1, 2)), lambda x: x.astype(np.complex128), lambda x: x.map_blocks(lambda x: x * 2), lambda x: x.round(1), lambda x: x.reshape((x.shape[0] * x.shape[1], x.shape[2])), lambda x: abs(x), lambda x: x > 0.5, lambda x: x.rechunk((4, 4, 4)), lambda x: x.rechunk((2, 2, 1)),
def ds(self): if self._ds is None: file_exists = os.path.exists(self._result_file) reprocess = not file_exists or self._reprocess if reprocess: if file_exists: print('Old file exists ' + self._result_file) #print('Removing old file ' + self._result_file) #shutil.rmtree(self._result_file) ds_data = OrderedDict() to_seconds = np.vectorize( lambda x: x.seconds + x.microseconds / 1E6) print('Processing binary data...') xx, yy, zz = self._loadgrid() if xx is None: if self._from_nc: print('Processing existing netcdf...') fn = self._result_file[:-5] + '_QC_raw.nc' if os.path.exists(fn): ds_temp = xr.open_dataset(self._result_file[:-5] + '_QC_raw.nc', chunks={'time': 50}) u = da.transpose(ds_temp['U'].data, axes=[3, 0, 1, 2]) v = da.transpose(ds_temp['V'].data, axes=[3, 0, 1, 2]) w = da.transpose(ds_temp['W'].data, axes=[3, 0, 1, 2]) tt = ds_temp['time'] te = (tt - tt[0]) / np.timedelta64(1, 's') xx = ds_temp['x'].values yy = ds_temp['y'].values zz = ds_temp['z'].values else: print('USING OLD ZARR DATA') ds_temp = xr.open_zarr(self._result_file) u = da.transpose(ds_temp['U'].data, axes=[3, 0, 1, 2]) v = da.transpose(ds_temp['V'].data, axes=[3, 0, 1, 2]) w = da.transpose(ds_temp['W'].data, axes=[3, 0, 1, 2]) tt = ds_temp['time'] te = (tt - tt[0]) / np.timedelta64(1, 's') xx = ds_temp['x'].values yy = ds_temp['y'].values zz = ds_temp['z'].values print('ERROR: No NetCDF data found for ' + self._xml_file) #return None # print(u.shape) else: tt, uvw = self._loaddata(xx, yy, zz) if tt is None: print('ERROR: No binary data found for ' + self._xml_file) return None # calculate the elapsed time from the Timestamp objects and then convert to datetime64 datatype te = to_seconds(tt - tt[0]) tt = pd.to_datetime(tt) uvw = uvw.persist() u = uvw[:, :, :, :, 0] v = uvw[:, :, :, :, 1] w = uvw[:, :, :, :, 2] # u = xr.DataArray(uvw[:,:,:,:,0], coords=[tt, xx, yy, zz], dims=['time','x', 'y', 'z'], # name='U', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'}) # v = xr.DataArray(uvw[:,:,:,:,1], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], # name='V', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'}) # w = xr.DataArray(uvw[:,:,:,:,2], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], # name='W', attrs={'standard_name': 'upward_sea_water_velocity', 'units': 'm s-1'}) if xx is None: print('No data found') return None u = u.persist() v = v.persist() w = w.persist() dx = float(xx[1] - xx[0]) dy = float(yy[1] - yy[0]) dz = float(zz[1] - zz[0]) if self._norm_dims: exp = self._result_root.split('/')[4] runSheet = pd.read_csv('~/RunSheet-%s.csv' % exp) runSheet = runSheet.set_index('RunID') runDetails = runSheet.ix[int(self.run_id[-2:])] T = runDetails['T (s)'] h = runDetails['h (m)'] D = runDetails['D (m)'] ww = te / T om = 2. * np.pi / T d_s = (2. * 1E-6 / om)**0.5 bl = 3. * np.pi / 4. * d_s if exp == 'Exp6': if D == 0.1: dy_c = (188. + 82.) / 2 dx_c = 39.25 cx = dx_c / 1000. cy = dy_c / 1000. else: dy_c = (806. + 287.) / 2. * 0.22 dx_c = 113 * 0.22 cx = dx_c / 1000. cy = dy_c / 1000. elif exp == 'Exp8': dy_c = 624 * 0.22 dx_c = 15 cx = dx_c / 1000. cy = dy_c / 1000. xn = (xx + (D / 2. - cx)) / D yn = (yy - cy) / D zn = zz / h xnm, ynm = np.meshgrid(xn, yn) rr = np.sqrt(xnm**2. + ynm**2) cylMask = rr < 0.5 nanPlane = np.ones(cylMask.shape) nanPlane[cylMask] = np.nan nanPlane = nanPlane.T nanPlane = nanPlane[np.newaxis, :, :, np.newaxis] u = u * nanPlane v = v * nanPlane w = w * nanPlane if D == 0.1: xInds = xn > 3. else: xInds = xn > 2. 
blInd = np.argmax(zn > bl / h) blPlane = int(round(blInd)) Ue = u[:, xInds, :, :] Ue_bar = da.nanmean(Ue, axis=(1, 2, 3)).compute() Ue_bl = da.nanmean(Ue[:, :, :, blPlane], axis=(1, 2)).compute() inds = ~np.isnan(Ue_bl) xv = ww[inds] % 1. xv = xv + np.random.normal(scale=1E-6, size=xv.shape) yv = Ue_bl[inds] xy = np.stack([ np.concatenate([xv - 1., xv, xv + 1.]), np.concatenate([yv, yv, yv]) ]).T xy = xy[xy[:, 0].argsort(), :] xi = np.linspace(-0.5, 1.5, len(xv) / 8) n = np.nanmax(xy[:, 1]) # print(n) # fig,ax = pl.subplots() # ax.scatter(xy[:,0],xy[:,1]/n) # print(xy) spl = si.LSQUnivariateSpline(xy[:, 0], xy[:, 1] / n, t=xi, k=3) roots = spl.roots() der = spl.derivative() slope = der(roots) inds = np.min(np.where(slope > 0)) dt = (roots[inds] % 1.).mean() - 0.5 tpx = np.arange(0, 0.5, 0.001) U0_bl = np.abs(spl(tpx + dt).min() * n) ws = ww - dt Ue_spl = spl((ws - 0.5) % 1.0 + dt) * n * -1.0 #maxima = spl.derivative().roots() #Umax = spl(maxima) #UminIdx = np.argmin(Umax) #U0_bl = np.abs(Umax[UminIdx]*n) #ww_at_min = maxima[UminIdx] #ws = ww - ww_at_min + 0.25 inds = ~np.isnan(Ue_bar) xv = ww[inds] % 1. xv = xv + np.random.normal(scale=1E-6, size=xv.shape) yv = Ue_bar[inds] xy = np.stack([ np.concatenate([xv - 1., xv, xv + 1.]), np.concatenate([yv, yv, yv]) ]).T xy = xy[xy[:, 0].argsort(), :] xi = np.linspace(-0.5, 1.5, len(xv) / 8) n = np.nanmax(xy[:, 1]) spl = si.LSQUnivariateSpline(xy[:, 0], xy[:, 1] / n, t=xi, k=4) maxima = spl.derivative().roots() Umax = spl(maxima) UminIdx = np.argmin(Umax) U0_bar = np.abs(Umax[UminIdx] * n) ww = xr.DataArray(ww, coords=[ tt, ], dims=[ 'time', ]) ws = xr.DataArray(ws - 0.5, coords=[ tt, ], dims=[ 'time', ]) xn = xr.DataArray(xn, coords=[ xx, ], dims=[ 'x', ]) yn = xr.DataArray(yn, coords=[ yy, ], dims=[ 'y', ]) zn = xr.DataArray(zn, coords=[ zz, ], dims=[ 'z', ]) Ue_bar = xr.DataArray(Ue_bar, coords=[ tt, ], dims=[ 'time', ]) Ue_bl = xr.DataArray(Ue_bl, coords=[ tt, ], dims=[ 'time', ]) Ue_spl = xr.DataArray(Ue_spl, coords=[ tt, ], dims=[ 'time', ]) ds_data['ww'] = ww ds_data['ws'] = ws ds_data['xn'] = xn ds_data['yn'] = yn ds_data['zn'] = zn ds_data['Ue_bar'] = Ue_bar ds_data['Ue_bl'] = Ue_bl ds_data['Ue_spl'] = Ue_spl te = xr.DataArray(te, coords=[ tt, ], dims=[ 'time', ]) dims = ['time', 'x', 'y', 'z'] coords = [tt, xx, yy, zz] ds_data['U'] = xr.DataArray(u, coords=coords, dims=dims, name='U', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['V'] = xr.DataArray(v, coords=coords, dims=dims, name='V', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['W'] = xr.DataArray(w, coords=coords, dims=dims, name='W', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['te'] = te # stdV = da.nanstd(v) # stdW = da.nanstd(w) # thres=7. if 'U0_bl' in locals(): condition = (da.fabs(v) / U0_bl > 1.5) | (da.fabs(w) / U0_bl > 0.6) for var in ['U', 'V', 'W']: ds_data[var].data = da.where(condition, np.nan, ds_data[var].data) piv_step_frame = float( self._xml_root.findall('piv/stepFrame')[0].text) print('Calculating tensor') # j = jacobianConv(ds.U, ds.V, ds.W, dx, dy, dz, sigma=1.5) j = jacobianDask(u, v, w, piv_step_frame, dx, dy, dz) print('Done') #j = da.from_array(j,chunks=(20,-1,-1,-1,-1,-1)) # j = jacobianDask(uvw[:,:,:,:,0],uvw[:,:,:,:,1], uvw[:,:,:,:,2], piv_step_frame, dx, dy, dz) jT = da.transpose(j, axes=[0, 1, 2, 3, 5, 4]) # j = j.persist() # jT = jT.persist() jacobianNorm = da.sqrt( da.nansum(da.nansum(j**2., axis=-1), axis=-1)) strainTensor = (j + jT) / 2. 
vorticityTensor = (j - jT) / 2. strainTensorNorm = da.sqrt( da.nansum(da.nansum(strainTensor**2., axis=-1), axis=-1)) vorticityTensorNorm = da.sqrt( da.nansum(da.nansum(vorticityTensor**2., axis=-1), axis=-1)) divergence = j[:, :, :, :, 0, 0] + j[:, :, :, :, 1, 1] + j[:, :, :, :, 2, 2] # print(divergence) omx = vorticityTensor[:, :, :, :, 2, 1] * 2. omy = vorticityTensor[:, :, :, :, 0, 2] * 2. omz = vorticityTensor[:, :, :, :, 1, 0] * 2. divNorm = divergence / jacobianNorm # divNorm = divNorm.persist() # divNorm_mean = da.nanmean(divNorm) # divNorm_std = da.nanstd(divNorm) dims = ['x', 'y', 'z'] comp = ['u', 'v', 'w'] ds_data['jacobian'] = xr.DataArray( j, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='jacobian') ds_data['jacobianNorm'] = xr.DataArray( jacobianNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='jacobianNorm') ds_data['strainTensor'] = xr.DataArray( strainTensor, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='strainTensor') ds_data['vorticityTensor'] = xr.DataArray( vorticityTensor, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='vorticityTensor') ds_data['vorticityNorm'] = xr.DataArray( vorticityTensorNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='vorticityNorm') ds_data['strainNorm'] = xr.DataArray( strainTensorNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='strainNorm') ds_data['divergence'] = xr.DataArray( divergence, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='divergence') ds_data['omx'] = xr.DataArray(omx, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omx') ds_data['omy'] = xr.DataArray(omy, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omy') ds_data['omz'] = xr.DataArray(omz, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omz') ds_data['divNorm'] = xr.DataArray(divNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='divNorm') # ds_data['divNorm_mean'] = xr.DataArray(divNorm_mean) # ds_data['divNorm_std'] = xr.DataArray(divNorm_std) ds = xr.Dataset(ds_data) # if self._from_nc: # for k,v in ds_temp.attrs.items(): # ds.attrs[k]=v #ds = ds.chunk({'time': 20}) self._append_CF_attrs(ds) self._append_attrs(ds) ds.attrs['filename'] = self._result_file if self._norm_dims: KC = U0_bl * T / D delta = (2. * np.pi * d_s) / h S = delta / KC ds.attrs['T'] = T ds.attrs['h'] = h ds.attrs['D'] = D ds.attrs['U0_bl'] = U0_bl ds.attrs['U0_bar'] = U0_bar ds.attrs['KC'] = KC ds.attrs['S'] = S ds.attrs['Delta+'] = ((1E-6 * T)**0.5) / h ds.attrs['Delta_l'] = 2 * np.pi * d_s ds.attrs['Delta_s'] = d_s ds.attrs['Re_D'] = U0_bl * D / 1E-6 ds.attrs['Beta'] = D**2. / (1E-6 * T) delta = (ds.attrs['dx'] * ds.attrs['dy'] * ds.attrs['dz'])**(1. / 3.) dpx = (ds.attrs['pdx'] * ds.attrs['pdy'] * ds.attrs['pdz'])**(1. / 3.) delta_px = delta / dpx dt = ds.attrs['piv_step_ensemble'] # divRMS = da.sqrt(da.nanmean((divergence * dt) ** 2.)) # divRMS = divRMS.persist() # vorticityTensorNorm.persist() # velocityError = divRMS/((3./(2.*delta_px**2.))**0.5) # print(da.percentile(ds_new['vorticityTensorNorm'].data.ravel(),99.)) # print(ds_new['divRMS']) # print(ds_new['divNorm_mean']) # vorticityError = divRMS/dt/da.percentile(vorticityTensorNorm.ravel(),99.) 
# divNorm_mean = da.nanmean(divNorm) # divNorm_std = da.nanstd(divNorm) # print("initial save") #ds.to_zarr(self._result_file,compute=False) #ds = xr.open_zarr(self._result_file) # xstart = np.argmax(xx > 0.05) # ystart = np.argmax(yy > 0.07) divRMS = da.sqrt(da.nanmean( (divergence * dt)**2.)) #.compute() #divNorm = divergence / jacobianNorm #divNorm = divNorm.compute() #divNorm_mean = da.nanmean(divNorm).compute() #divNorm_std = da.nanstd(divNorm).compute() velocityError = divRMS / ((3. / (2. * delta_px**2.))**0.5) vortNorm = vorticityTensorNorm #.compute() vorticityError = divRMS / dt / np.percentile( vortNorm.ravel(), 99.) velocityError, vorticityError = da.compute( velocityError, vorticityError) #ds.attrs['divNorm_mean'] = divNorm_mean #ds.attrs['divNorm_std'] = divNorm_std ds.attrs['velocityError'] = velocityError ds.attrs['vorticityError'] = vorticityError if self._norm_dims: xInds = (xn > 0.5) & (xn < 2.65) yInds = (yn > -0.75) & (yn < 0.75) else: xInds = range(len(ds['x'])) yInds = range(len(ds['y'])) vrms = (ds['V'][:, xInds, yInds, :]**2.).mean( dim=['time', 'x', 'y', 'z'])**0.5 wrms = (ds['W'][:, xInds, yInds, :]**2.).mean( dim=['time', 'x', 'y', 'z'])**0.5 ds.attrs['Vrms'] = float(vrms.compute()) ds.attrs['Wrms'] = float(wrms.compute()) #fig,ax = pl.subplots() #ax.plot(ds.ws,ds.Ue_spl/U0_bl,color='k') #ax.plot(ds.ws,ds.Ue_bl/U0_bl,color='g') #ax.set_xlabel(r'$t/T$') #ax.set_ylabel(r'$U_{bl}/U_0$') #fig.savefig(self._result_file[:-4] + 'png',dpi=125) #pl.close(fig) # print("second save") #ds.to_netcdf(self._result_file) ds.to_zarr(self._result_file, mode='w') print('Cached ' + self._result_file) #ds = xr.open_dataset(self._result_file,chunks={'time':20}) ds = xr.open_zarr(self._result_file) ds.attrs['filename'] = self._result_file else: #ds = xr.open_dataset(self._result_file,chunks={'time':20}) ds = xr.open_zarr(self._result_file) ds.attrs['filename'] = self._result_file self._ds = ds return self._ds
fbinningClip = lambda x, bin2_iStEn, bin1_nAverage: da.mean(da.reshape(x[slice(*bin2_iStEn)], (-1, bin1_nAverage)), 1) fbinning = lambda x, bin1_nAverage: da.mean(da.reshape(x, (-1, bin1_nAverage)), 1) repeat3shift1 = lambda A2: [A2[t:(len(A2) - 2 + t)] for t in range(3)] median3cols = lambda a, b, c: da.where(a < b, da.where(c < a, a, da.where(b < c, b, c)), da.where(a < c, a, da.where(c < b, b, c))) median3 = lambda x: da.hstack((np.NaN, median3cols(*repeat3shift1(x)), np.NaN)) # not convertable to dask easily: fVabs_old = lambda Gxyz, kVabs: np.polyval(kVabs.flat, np.sqrt(np.tan(fInclination(Gxyz)))) rep2mean = lambda x, bOk: np.interp(np.arange(len(x)), np.flatnonzero(bOk), x[bOk], np.NaN, np.NaN) fForce2Vabs_fitted = lambda x: da.where(x > 2, 2, da.where(x < 1, 0.25 * x, 0.25 * x + 0.3 * (x - 1) ** 4)) fIncl2Force = lambda incl: da.sqrt(da.tan(incl)) fVabs = lambda Gxyz, kVabs: fForce2Vabs_fitted(fIncl2Force(fInclination(Gxyz))) f = lambda fun, *args: fun(*args) positiveInd = lambda i, L: np.int32(da.where(i < 0, L - i, i)) minInterval = lambda iLims1, iLims2, L: f( lambda iL1, iL2: da.transpose([max(iL1[:, 0], iL2[:, 0]), min(iL1[:, -1], iL2[:, -1])]), positiveInd(iLims1, L), positiveInd(iLims2, L)) fStEn2bool = lambda iStEn, length: da.hstack( [(da.ones(iEn2iSt, dtype=np.bool8) if b else da.zeros(iEn2iSt, dtype=np.bool8)) for iEn2iSt, b in da.vstack(( da.diff( da.hstack( ( 0, iStEn.flat, length))), da.hstack( ( da.repeat( [ ( False,
def activations(images, labels, batch_size, model, layer_regex, nodaug_params,
                daug_params, include_input=False, class_invariance=False,
                n_daug_rep=0, norms=['fro']):
    """
    Computes metrics from the activations, such as the norm of the feature
    maps, data augmentation invariance, class invariance, etc.

    Parameters
    ----------
    images : h5py Dataset
        The set of images
    labels : h5py Dataset
        The ground truth labels
    batch_size : int
        Batch size
    model : Keras Model
        The model
    nodaug_params : dict
        Dictionary of data augmentation parameters for the baseline
    daug_params : dict
        Dictionary of data augmentation parameters
    include_input : bool
        If True, the input layer is considered for the analysis
    class_invariance : bool
        If True, the class invariance score is computed
    n_daug_rep : int
        If larger than 0, the data augmentation invariance score is computed,
        performing n_daug_rep repetitions of random augmentations
    norms : list
        List of keywords to specify the types of norms to compute on the
        activations

    Returns
    -------
    results_dict : dict
        Dictionary containing some performance metrics
    """
    def _update_stats(mean_norm, std_norm, norm):
        mean_norm_batch = np.mean(norm, axis=0)
        std_norm_batch = np.std(norm, axis=0)
        mean_norm = init / float(end) * mean_norm + \
            batch_size / float(end) * mean_norm_batch
        std_norm = init / float(end) * std_norm ** 2 + \
            batch_size / float(end) * std_norm_batch ** 2 + \
            (init * batch_size) / float(end ** 2) * \
            (mean_norm - mean_norm_batch) ** 2
        std_norm = np.sqrt(std_norm)

        return mean_norm, std_norm

    def _frobenius_norm(activations):
        norm = np.linalg.norm(
            activations, ord='fro',
            axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    def _inf_norm(activations):
        norm = np.max(np.abs(activations),
                      axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    model = del_extra_nodes(model)

    n_images = images.shape[0]
    n_batches_per_epoch = int(np.ceil(float(n_images) / batch_size))

    # Get relevant layers
    if include_input:
        layer_regex = '({}|.*input.*)'.format(layer_regex)
    else:
        layer_regex = layer_regex

    layers = [layer.name for layer in model.layers
              if re.compile(layer_regex).match(layer.name)]

    # Initialize HDF5 to store the activations
    # filename = 'hdf5_aux_{}'.format(time.time())
    # activations_hdf5_aux = h5py.File(filename, 'w')
    # hdf5_aux = [filename]
    #
    # grp_activations = activations_hdf5_aux.create_group('activations')

    if class_invariance:
        # grp_labels = activations_hdf5_aux.create_group('labels')
        labels_true_da = []
        labels_pred_da = []
        predictions_da = []
        # labels_true = grp_labels.create_dataset(
        #     'labels_true', shape=(n_images, ), dtype=np.uint8)
        # labels_pred = grp_labels.create_dataset(
        #     'labels_pred', shape=(n_images, ), dtype=np.uint8)
        # predictions = grp_labels.create_dataset(
        #     'predictions', shape=labels.shape, dtype=K.floatx())
        idx_softmax = model.output_names.index('softmax')
        store_labels = True
    else:
        store_labels = False

    # Initialize results dictionary
    results_dict = {'activations_norm': {}, 'summary': {},
                    'class_invariance': {}, 'daug_invariance': {}}

    # Iterate over the layers
    for layer_name in layers:

        # Create batch generator
        image_gen = get_generator(images, **nodaug_params)
        batch_gen = generate_batches(image_gen, images, labels, batch_size,
                                     aug_per_im=1, shuffle=False)

        layer = model.get_layer(layer_name)
        layer_shape = layer.output_shape[1:]
        n_channels = layer_shape[-1]

        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        print('\nLayer {}\n'.format(layer_name))

        # Create a Dataset for the activations of the layer
        # activations_layer = grp_activations.create_dataset(
        #     layer_name, shape=(n_images, ) + layer_shape,
        #     dtype=K.floatx())

        # Create dask array for the activations of the layer
        activations_layer_da = []

        # Initialize placeholders in the results dict for the layer
        results_dict['activations_norm'].update(
            {layer_name: {n: {'mean': np.zeros(n_channels),
                              'std': np.zeros(n_channels)}
                          for n in norms}})
        layer_dict = results_dict['activations_norm'][layer_name]

        activation_function = K.function([model.input, K.learning_phase()],
                                         [layer.output])

        # Iterate over the data set in batches
        init = 0
        for batch_images, batch_labels in tqdm(
                batch_gen, total=n_batches_per_epoch):

            batch_size = batch_images.shape[0]
            end = init + batch_size

            # Store labels
            if store_labels:
                preds = model.predict_on_batch(batch_images)
                if isinstance(preds, list):
                    preds = preds[idx_softmax]
                labels_pred_da.append(da.from_array(
                    np.argmax(preds, axis=1)))
                labels_true_da.append(da.from_array(
                    np.argmax(batch_labels, axis=1)))
                predictions_da.append(da.from_array(preds))
                # labels_pred[init:end] = np.argmax(preds, axis=1)
                # labels_true[init:end] = np.argmax(batch_labels, axis=1)
                # predictions[init:end, :] = preds

            # Get and store activations
            activations = activation_function([batch_images, 0])[0]
            activations_layer_da.append(da.from_array(
                activations, chunks=activations.shape))
            # activations_layer[init:end] = activations

            # Compute norms
            for norm_key in norms:
                mean_norm = layer_dict[norm_key]['mean']
                std_norm = layer_dict[norm_key]['std']
                if norm_key == 'fro':
                    norm = _frobenius_norm(activations)
                elif norm_key == 'inf':
                    norm = _inf_norm(activations)
                else:
                    raise NotImplementedError('Implemented norms are fro '
                                              'and inf')
                mean_norm, std_norm = _update_stats(mean_norm, std_norm,
                                                    norm)
                layer_dict[norm_key]['mean'] = mean_norm
                layer_dict[norm_key]['std'] = std_norm

            init = end
            if init == n_images:
                store_labels = False
                break

        # Concatenate dask arrays
        activations_layer_da = da.concatenate(activations_layer_da, axis=0)
        activations_layer_da = activations_layer_da.reshape((n_images, -1))
        d_activations = activations_layer_da.shape[-1]

        if class_invariance:
            print('\nComputing class invariance\n')
            labels_pred_da = da.concatenate(labels_pred_da)
            labels_true_da = da.concatenate(labels_true_da)
            predictions_da = da.concatenate(predictions_da)
            n_classes = len(np.unique(labels_true_da))

        # Compute MSE matrix of the activations
        r = da.reshape(da.sum(da.square(activations_layer_da), axis=1),
                       (-1, 1))
        mse_matrix_da = (r - 2 * da.dot(activations_layer_da,
                                        da.transpose(activations_layer_da))
                         + da.transpose(r)) / d_activations
        mse_matrix_da = mse_matrix_da.rechunk((mse_matrix_da.chunksize[0],
                                               mse_matrix_da.shape[-1]))

        # Compute class invariance
        time0 = time()
        results_dict['class_invariance'].update({layer_name: {}})
        class_invariance_scores_da = []
        if class_invariance:
            # mse_matrix_mean = da.mean(mse_matrix_da).compute()
            for cl in tqdm(range(n_classes)):
                labels_cl = labels_pred_da == cl
                labels_cl = labels_cl.compute()
                mse_class = mse_matrix_da[labels_cl, :][:, labels_cl]
                mse_class = mse_class.rechunk((-1, -1))
                # mse_class_mean = da.mean(mse_class).compute()
                # class_invariance_score = 1. - np.divide(
                #     mse_class_mean, mse_matrix_mean)
                # results_dict['class_invariance'][layer_name].update(
                #     {cl: class_invariance_score})
                class_invariance_scores_da.append(
                    1. - da.divide(da.mean(mse_class),
                                   da.mean(mse_matrix_da)))

        # Compute data augmentation invariance
        print('\nComputing data augmentation invariance\n')
        mse_daug_da = []

        results_dict['daug_invariance'].update({layer_name: {}})

        for r in range(n_daug_rep):
            print('Repetition {}'.format(r))

            image_gen_daug = get_generator(images, **daug_params)
            batch_gen_daug = generate_batches(image_gen_daug, images, labels,
                                              batch_size, aug_per_im=1,
                                              shuffle=False)

            activations_layer_daug_da = []

            # Iterate over the augmented data set in batches to compute
            # activations
            init = 0
            for batch_images, batch_labels in tqdm(
                    batch_gen_daug, total=n_batches_per_epoch):

                batch_size = batch_images.shape[0]
                end = init + batch_size

                # Get and store activations
                activations = activation_function([batch_images, 0])[0]
                activations_layer_daug_da.append(da.from_array(
                    activations, chunks=activations.shape))

                init = end
                if init == n_images:
                    break

            activations_layer_daug_da = da.concatenate(
                activations_layer_daug_da, axis=0)
            activations_layer_daug_da = activations_layer_daug_da.reshape(
                (n_images, -1))
            activations_layer_daug_da = activations_layer_daug_da.rechunk(
                (activations_layer_daug_da.chunksize[0],
                 activations_layer_daug_da.shape[-1]))

            # Compute MSE daug
            mse_daug_da.append(da.mean(da.square(activations_layer_da -
                                                 activations_layer_daug_da),
                                       axis=1))

        mse_daug_da = da.stack(mse_daug_da, axis=1)

        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix_da, axis=1),
                                       (n_images, 1)),
                            n_daug_rep, axis=1)

        daug_invariance_score_da = 1 - n_images * da.divide(mse_daug_da,
                                                            mse_sum)

        time1 = time()

        # Compute dask results and update results dict
        results_dask = da.compute(class_invariance_scores_da,
                                  daug_invariance_score_da)

        time2 = time()

        results_dict['class_invariance'][layer_name].update(
            {cl: cl_inv_score
             for cl, cl_inv_score in enumerate(results_dask[0])})
        results_dict['daug_invariance'].update(
            {layer_name: {r: daug_inv_score
                          for r, daug_inv_score in enumerate(
                              results_dask[1].T)}})

    # Compute summary statistics of the norms across the channels
    for layer, layer_dict in results_dict['activations_norm'].items():
        results_dict['summary'].update({layer: {}})
        for norm_key, norm_dict in layer_dict.items():
            results_dict['summary'][layer].update({norm_key: {
                'mean': np.mean(norm_dict['mean']),
                'std': np.mean(norm_dict['std'])}})

    return results_dict
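# The class-invariance step above relies on the standard expansion of pairwise
# squared distances into row norms and a Gram matrix:
# mean_k (a_i[k] - a_j[k])^2 = (r_i - 2 a_i . a_j + r_j) / d, with r the
# row-wise sum of squares. A minimal sketch (synthetic data, not from the
# source) verifying that identity with dask.array:
import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
X_np = rng.normal(size=(8, 5))
X = da.from_array(X_np, chunks=(4, 5))
d = X.shape[-1]

r = da.reshape(da.sum(da.square(X), axis=1), (-1, 1))
mse_matrix = (r - 2 * da.dot(X, da.transpose(X)) + da.transpose(r)) / d

# Reference computed with an explicit broadcast in NumPy
expected = ((X_np[:, None, :] - X_np[None, :, :]) ** 2).mean(axis=-1)
np.testing.assert_allclose(mse_matrix.compute(), expected)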
def _adapt_chunking(self, array, sig_dims):
    n_dimension = array.ndim
    # Handle chunked signal dimensions by merging just in case
    sig_dim_idxs = [*range(n_dimension)[-sig_dims:]]
    if any([len(array.chunks[c]) > 1 for c in sig_dim_idxs]):
        original_n_chunks = [len(c) for c in array.chunks]
        array = array.rechunk({idx: -1 for idx in sig_dim_idxs})
        log.warning('Merging sig dim chunks as LiberTEM does not '
                    'support partitioning along the sig axes. '
                    f'Original n_blocks: {original_n_chunks}. '
                    f'New n_blocks: {[len(c) for c in array.chunks]}.')
    # Warn if there is no nav_dim chunking
    n_nav_chunks = [len(dim_chunking)
                    for dim_chunking in array.chunks[:-sig_dims]]
    if set(n_nav_chunks) == {1}:
        log.warning('Dask array is not chunked in navigation dimensions, '
                    'cannot split into nav-partitions without loading the '
                    'whole dataset on each worker. '
                    f'Array shape: {array.shape}. '
                    f'Chunking: {array.chunks}. '
                    f'array size {array.nbytes / 1e6} MiB.')
        # If we are here there is nothing else to do.
        return array
    # Orient the nav dimensions so that the zeroth dimension is
    # the most chunked, this obviously changes the dataset nav_shape !
    if not self._preserve_dimension:
        n_nav_chunks = [len(dim_chunking)
                        for dim_chunking in array.chunks[:-sig_dims]]
        nav_sort_order = np.argsort(n_nav_chunks)[::-1].tolist()
        sort_order = nav_sort_order + sig_dim_idxs
        if not np.equal(sort_order, np.arange(n_dimension)).all():
            original_shape = array.shape
            original_n_chunks = [len(c) for c in array.chunks]
            array = da.transpose(array, axes=sort_order)
            log.warning('Re-ordered nav_dimensions to improve partitioning, '
                        'create the dataset with preserve_dimensions=True '
                        'to suppress this behaviour. '
                        f'Original shape: {original_shape} with '
                        f'n_blocks: {original_n_chunks}. '
                        f'New shape: {array.shape} with '
                        f'n_blocks: {[len(c) for c in array.chunks]}.')
    # Handle chunked nav_dimensions
    # We can allow nav_dimensions to be fully chunked (one chunk per element)
    # up-to-but-not-including the first non-fully chunked dimension. After this
    # point we must merge/rechunk all subsequent nav dimensions to ensure
    # continuity of frame indexes in a flattened nav dimension. This should be
    # removed when/if we allow non-contiguous flat_idx Partitions
    nav_rechunk_dict = {}
    for dim_idx, dim_chunking in enumerate(array.chunks[:-sig_dims]):
        if set(dim_chunking) == {1}:
            continue
        else:
            merge_dimensions = [*range(dim_idx + 1, n_dimension - sig_dims)]
            for merge_i in merge_dimensions:
                if len(array.chunks[merge_i]) > 1:
                    nav_rechunk_dict[merge_i] = -1
    if nav_rechunk_dict:
        original_n_chunks = [len(c) for c in array.chunks]
        array = array.rechunk(nav_rechunk_dict)
        log.warning('Merging nav dimension chunks according to scheme '
                    f'{nav_rechunk_dict} as we cannot maintain continuity '
                    'of frame indexing in the flattened navigation dimension. '
                    f'Original n_blocks: {original_n_chunks}. '
                    f'New n_blocks: {[len(c) for c in array.chunks]}.')
    # Merge remaining chunks maintaining C-ordering until we reach a target
    # chunk size or a minimum number of partitions corresponding to the
    # number of workers
    new_chunking, min_size, max_size = merge_until_target(array,
                                                          self._min_size)
    if new_chunking != array.chunks:
        original_n_chunks = [len(c) for c in array.chunks]
        chunksizes = get_chunksizes(array)
        orig_min, orig_max = chunksizes.min(), chunksizes.max()
        array = array.rechunk(new_chunking)
        log.warning('Applying re-chunking to increase minimum partition size. '
                    f'n_blocks: {original_n_chunks} => '
                    f'{[len(c) for c in array.chunks]}. '
                    f'Min chunk size {orig_min / 1e6:.1f} => '
                    f'{min_size / 1e6:.1f} MiB, '
                    f'Max chunk size {orig_max / 1e6:.1f} => '
                    f'{max_size / 1e6:.1f} MiB.')
    return array
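# A small illustration (hypothetical shapes, not from the source) of the
# re-ordering step above: da.transpose moves the most-chunked navigation
# dimension to the front, which yields more partitions along axis 0 while
# leaving the signal dimensions untouched.
import numpy as np
import dask.array as da

arr = da.zeros((4, 16, 32, 32), chunks=(4, 2, 32, 32))  # nav: (4, 16), sig: (32, 32)
n_nav_chunks = [len(c) for c in arr.chunks[:-2]]         # [1, 8]
sort_order = np.argsort(n_nav_chunks)[::-1].tolist() + [2, 3]
reordered = da.transpose(arr, axes=sort_order)

print(reordered.shape)                      # (16, 4, 32, 32)
print([len(c) for c in reordered.chunks])   # [8, 1, 1, 1]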
        X.compute(), kernels_mean.compute(), kernels_cov.compute(),
        batch_size, total_kernels, input_size)

times = []
with Client('localhost:8001') as client:
    for n in range(5):
        # client.restart()  # resets cluster
        # Do something using 'client'
        start = time.time()
        batches = []
        for i in range(batch_size):
            kernel_out = []
            for j in range(total_kernels):
                mean = da.matmul(kernels_mean[j, :], X[i, :, :])
                cov = da.matmul(
                    da.transpose(X[i, :, :]),
                    da.matmul(kernels_cov[j, :, :], X[i, :, :]))
                z = mvn_random_DASK(mean, cov, total_samples, input_size**2)
                g = relu(z)
                mean_g = da.mean(g, axis=1)
                kernel_out.append(mean_g)
            kernels_out = da.stack(kernel_out, axis=0)
            batches.append(kernels_out.compute())
        print('task graph complete')
        batches_out_result = np.stack(batches, axis=0)
        print("compute done")
        times.append(time.time() - start)
        if validate:
            print(
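# A minimal sketch (synthetic sizes, not from the source) of the quadratic
# form used for `cov` above: X^T C X built lazily with dask and checked
# against the eager NumPy result.
import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
X_np = rng.normal(size=(6, 4))
C_np = rng.normal(size=(6, 6))

X = da.from_array(X_np, chunks=(3, 4))
C = da.from_array(C_np, chunks=(3, 3))
cov = da.matmul(da.transpose(X), da.matmul(C, X))  # shape (4, 4)

np.testing.assert_allclose(cov.compute(), X_np.T @ C_np @ X_np)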
pytest.importorskip("numba", minversion="0.40.0")

functions = [
    lambda x: x,
    lambda x: da.expm1(x),
    lambda x: 2 * x,
    lambda x: x / 2,
    lambda x: x ** 2,
    lambda x: x + x,
    lambda x: x * x,
    lambda x: x[0],
    lambda x: x[:, 1],
    lambda x: x[:1, None, 1:3],
    lambda x: x.T,
    lambda x: da.transpose(x, (1, 2, 0)),
    lambda x: x.sum(),
    lambda x: x.mean(),
    lambda x: x.moment(order=0),
    pytest.param(
        lambda x: x.std(),
        marks=pytest.mark.xfail(
            reason="fixed in https://github.com/pydata/sparse/pull/243"
        ),
    ),
    pytest.param(
        lambda x: x.var(),
        marks=pytest.mark.xfail(
            reason="fixed in https://github.com/pydata/sparse/pull/243"
        ),
    ),
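# A minimal sketch (not the test suite itself) showing that the transposition
# entries in `functions` behave like their NumPy counterparts when applied to
# a chunked dask array.
import numpy as np
import dask.array as da

x_np = np.arange(24.0).reshape(2, 3, 4)
x = da.from_array(x_np, chunks=(1, 3, 2))

np.testing.assert_array_equal(x.T.compute(), x_np.T)
np.testing.assert_array_equal(
    da.transpose(x, (1, 2, 0)).compute(), np.transpose(x_np, (1, 2, 0)))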
def _read_image(self, image_group, image_sub_group_key):
    """Return a dictionary ready to parse or return to the io module."""
    image_sub_group = image_group[image_sub_group_key]
    original_metadata = _parse_metadata(image_group, image_sub_group_key)
    original_metadata.update(self.original_metadata)

    if 'Detector' in original_metadata['BinaryResult'].keys():
        self.detector_name = _parse_detector_name(original_metadata)

    read_stack = (self.load_SI_image_stack or self.im_type == 'Image')
    h5data = image_sub_group['Data']
    # Get the scanning area shape of the SI from the images
    self.spatial_shape = h5data.shape[:-1]
    # Set the axes in frame, y, x order
    if self.lazy:
        data = da.transpose(
            da.from_array(
                h5data,
                chunks=h5data.chunks),
            axes=[2, 0, 1])
    else:
        # Workaround for a h5py bug https://github.com/h5py/h5py/issues/977
        # Change back to standard API once issue #977 is fixed.
        # Preallocate the numpy array and use read_direct method, which is
        # much faster in case of chunked data.
        data = np.empty(h5data.shape)
        h5data.read_direct(data)
        data = np.rollaxis(data, axis=2)

    pix_scale = original_metadata['BinaryResult'].get(
        'PixelSize', {'height': 1.0, 'width': 1.0})
    offsets = original_metadata['BinaryResult'].get(
        'Offset', {'x': 0.0, 'y': 0.0})
    original_units = original_metadata['BinaryResult'].get(
        'PixelUnitX', '')

    axes = []
    # stack of images
    if not read_stack:
        data = data[0:1, ...]

    if data.shape[0] == 1:
        # Squeeze
        data = data[0, ...]
        i = 0
    else:
        frame_time = original_metadata['Scan']['FrameTime']
        frame_time, time_unit = self._convert_scale_units(
            frame_time, 's', 2 * data.shape[0])
        axes.append({'index_in_array': 0,
                     'name': 'Time',
                     'offset': 0,
                     'scale': frame_time,
                     'size': data.shape[0],
                     'units': time_unit,
                     'navigate': True})
        i = 1

    scale_x = self._convert_scale_units(
        pix_scale['width'], original_units, data.shape[i + 1])
    scale_y = self._convert_scale_units(
        pix_scale['height'], original_units, data.shape[i])
    offset_x = self._convert_scale_units(
        offsets['x'], original_units, data.shape[i + 1])
    offset_y = self._convert_scale_units(
        offsets['y'], original_units, data.shape[i])

    axes.extend([{'index_in_array': i,
                  'name': 'y',
                  'offset': offset_y[0],
                  'scale': scale_y[0],
                  'size': data.shape[i],
                  'units': scale_y[1],
                  'navigate': False},
                 {'index_in_array': i + 1,
                  'name': 'x',
                  'offset': offset_x[0],
                  'scale': scale_x[0],
                  'size': data.shape[i + 1],
                  'units': scale_x[1],
                  'navigate': False},
                 ])

    md = self._get_metadata_dict(original_metadata)
    md['Signal']['signal_type'] = 'image'
    if self.detector_name is not None:
        original_metadata['DetectorMetadata'] = _get_detector_metadata_dict(
            original_metadata, self.detector_name)
    if hasattr(self, 'map_label_dict'):
        if image_sub_group_key in self.map_label_dict:
            md['General']['title'] = self.map_label_dict[image_sub_group_key]

    return {'data': data,
            'axes': axes,
            'metadata': md,
            'original_metadata': original_metadata,
            'mapping': self._get_mapping(
                map_selected_element=False,
                parse_individual_EDS_detector_metadata=False)}
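# A small sketch (synthetic array standing in for the HDF5 dataset) checking
# that the lazy branch above, da.transpose(..., axes=[2, 0, 1]), produces the
# same frame-first ordering as the eager np.rollaxis(data, axis=2) branch.
import numpy as np
import dask.array as da

h5like = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # (y, x, frame) as stored on disk
lazy = da.transpose(da.from_array(h5like, chunks=(2, 3, 1)), axes=[2, 0, 1])
eager = np.rollaxis(h5like, axis=2)

np.testing.assert_array_equal(lazy.compute(), eager)  # both are (frame, y, x)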