def pearson_influence(xarr: da.Array, yarr: da.Array) -> da.Array:
    """Compute the influence of deleting each point on the Pearson correlation."""
    if xarr.shape != yarr.shape:
        raise ValueError(
            f"The shape of xarr and yarr should be the same, got {xarr.shape}, {yarr.shape}"
        )

    # Fast computation of the influence of removing one element on the correlation
    n = xarr.shape[0]
    x2, y2 = da.square(xarr), da.square(yarr)
    xy = xarr * yarr

    # The influence is vectorized over xarr and yarr, so repeat each sum n times
    xsum = da.ones(n) * da.sum(xarr)
    ysum = da.ones(n) * da.sum(yarr)
    xysum = da.ones(n) * da.sum(xy)
    x2sum = da.ones(n) * da.sum(x2)
    y2sum = da.ones(n) * da.sum(y2)

    # Note: we multiply both numerator and denominator by (n-1)^2 to avoid divisions.
    numerator = (n - 1) * (xysum - xy) - (xsum - xarr) * (ysum - yarr)
    varx = (n - 1) * (x2sum - x2) - da.square(xsum - xarr)
    vary = (n - 1) * (y2sum - y2) - da.square(ysum - yarr)
    denominator = da.sqrt(varx * vary)

    return da.map_blocks(itruediv, numerator, denominator, dtype=numerator.dtype)
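
# Usage sketch for pearson_influence with hypothetical data. The function body
# relies on module-level imports of `dask.array as da` and
# `from operator import itruediv`; with the (n-1)^2 scaling above, output[i]
# works out to the Pearson correlation recomputed with point i deleted.
import numpy as np
import dask.array as da
from operator import itruediv  # needed by the map_blocks call above

rng = np.random.default_rng(0)
x = da.from_array(rng.normal(size=100), chunks=25)
y = da.from_array(rng.normal(size=100), chunks=25)
influence = pearson_influence(x, y).compute()   # shape (100,)
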
def average(a, weights, **kwargs):
    """Compute the weighted average."""
    avg = da.sum(a * weights, **kwargs)
    tot = da.sum(weights, **kwargs)
    res = avg / tot
    return res
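
# Usage sketch for average with hypothetical data: a row-wise weighted mean.
import numpy as np
import dask.array as da

vals = da.from_array(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), chunks=1)
wts = da.from_array(np.array([[1.0, 1.0, 2.0], [1.0, 1.0, 2.0]]), chunks=1)
average(vals, wts, axis=1).compute()   # -> [2.25, 5.25]
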
def _joint_log_likelihood(self, X):
    jll = []
    for i in range(np.size(self.classes_)):
        jointi = da.log(self.class_prior_[i])
        n_ij = -0.5 * da.sum(da.log(2.0 * np.pi * self.sigma_[i, :]))
        n_ij -= 0.5 * da.sum(((X - self.theta_[i, :]) ** 2) /
                             (self.sigma_[i, :]), 1)
        jll.append(jointi + n_ij)

    joint_log_likelihood = da.stack(jll).T
    return joint_log_likelihood
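
# For reference, the Gaussian naive Bayes term accumulated above is, for
# class i with prior P(c_i), per-feature mean theta_ij and variance sigma_ij:
#
#   log P(c_i) + sum_j [ -0.5 * log(2*pi*sigma_ij)
#                        - 0.5 * (x_j - theta_ij)**2 / sigma_ij ]
#
# i.e. the log of the class prior times a product of independent Gaussians.
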
def dask_getJtJdiag(self, m, W=None):
    """Return the diagonal of JtJ."""
    if self.gtgdiag is None:
        # Need to check if multiplying weights makes sense
        if W is None:
            self.gtgdiag = da.sum(self.getJ(m) ** 2, axis=0).compute()
        else:
            w = da.from_array(W.diagonal())[:, None]
            self.gtgdiag = da.sum((w * self.getJ(m)) ** 2, axis=0).compute()
    return self.gtgdiag
def get_snp_mask(self):
    # Keep sites that pass the SNP prevalence threshold.
    # Together with self.general_mask, this filter should produce exactly the
    # same sites as parse_midas_data.parse_snps.
    # Need to use the alt and depth arrays again
    alt_arr = da.from_zarr('{}/full_alt.zarr'.format(self.data_dir))
    depth_arr = da.from_zarr('{}/full_depth.zarr'.format(self.data_dir))
    # Increase chunk size to reduce overhead
    rechunked_alt_arr = alt_arr.rechunk((1000000, 10))
    rechunked_depth_arr = depth_arr.rechunk((1000000, 10))
    filtered_depth = rechunked_depth_arr[:, self.sample_mask]
    filtered_alt = rechunked_alt_arr[:, self.sample_mask]

    # Some SNPs need to be polarized according to pop_freqs
    from plos_bio_scripts import calculate_snp_prevalences
    population_freqs = calculate_snp_prevalences.parse_population_freqs(
        self.species_name, polarize_by_consensus=False)
    # Build a numeric array of per-site population frequencies (default 0);
    # a list comprehension is used so np.array gets concrete values.
    all_pop_freqs = np.array(
        [population_freqs.get(site, 0)
         for site in zip(self.chromosomes, self.locations)])
    sites_to_flip = all_pop_freqs > 0.5

    # First handle the sites that do not need polarizing
    round_1_mask = self.general_mask & np.invert(sites_to_flip)
    alt_threshold = da.ceil(
        filtered_depth[round_1_mask, :] * config.parse_snps_min_freq) + 0.5
    passed_snp_mask1 = da.sum(
        filtered_alt[round_1_mask, :] > alt_threshold, axis=1) > 0

    # Then flip alt for the sites that do
    round_2_mask = self.general_mask & sites_to_flip
    alt_threshold2 = da.ceil(
        filtered_depth[round_2_mask, :] * config.parse_snps_min_freq) + 0.5
    polarized_alts = (filtered_depth[round_2_mask, :]
                      - filtered_alt[round_2_mask, :])
    passed_snp_mask2 = da.sum(polarized_alts > alt_threshold2, axis=1) > 0

    # Perform the dask computation
    passed_snp_mask1 = passed_snp_mask1.compute()
    passed_snp_mask2 = passed_snp_mask2.compute()

    final_mask = self.general_mask.copy()
    final_mask[self.general_mask & np.invert(sites_to_flip)] = passed_snp_mask1
    final_mask[self.general_mask & sites_to_flip] = passed_snp_mask2
    print("%d sites left after applying snp filter" % np.sum(final_mask))
    return final_mask[self.general_mask]
def load_data(statistic, axis):
    import dask.array as da
    import numpy as np
    from glue.utils import view_shape
    x = da.from_zarr('/mnt/cephfs/zarr_data_full')
    f = 1500
    scale = 2
    lh = []
    for k in range(scale):
        lc = []
        for i in range(scale):
            lr = []
            for j in range(scale):
                lr.append(x[f % 3500])
                f = f + 1
            lc.append(da.concatenate(lr))
        lh.append(da.concatenate(lc, 1))
    z = da.concatenate(lh, 2)
    if statistic == 'minimum':
        return da.min(z, axis).compute()
    elif statistic == 'maximum':
        return da.max(z, axis).compute()
    elif statistic == 'mean' or statistic == 'median':
        # Median is approximated by the mean here
        return da.mean(z, axis).compute()
    elif statistic == 'percentile':
        # NOTE: `percentile` is not defined in this scope; the caller must
        # supply it for this branch to work.
        return percentile / 100
    elif statistic == 'sum':
        return da.sum(z, axis).compute()
    return 0
def test_make_regression(n_samples, n_features, n_informative, n_targets,
                         bias, effective_rank, tail_strength, noise, shuffle,
                         coef, random_state, n_parts, cluster):
    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_regression

        result = make_regression(n_samples=n_samples, n_features=n_features,
                                 n_informative=n_informative,
                                 n_targets=n_targets, bias=bias,
                                 effective_rank=effective_rank, noise=noise,
                                 shuffle=shuffle, coef=coef,
                                 random_state=random_state, n_parts=n_parts)

        if coef:
            out, values, coefs = result
        else:
            out, values = result

        assert out.shape == (n_samples, n_features), "out shape mismatch"

        if n_targets > 1:
            assert values.shape == (n_samples, n_targets), \
                "values shape mismatch"
        else:
            assert values.shape == (n_samples,), "values shape mismatch"

        assert len(out.chunks[0]) == n_parts
        assert len(out.chunks[1]) == 1

        if coef:
            if n_targets > 1:
                assert coefs.shape == (n_features, n_targets), \
                    "coefs shape mismatch"
                assert len(coefs.chunks[1]) == 1
            else:
                assert coefs.shape == (n_features,), "coefs shape mismatch"
                assert len(coefs.chunks[0]) == 1

            test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)
            std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

            test1, std_test2 = da.compute(test1, std_test2)

            diff = cp.abs(1.0 - std_test2)
            test2 = cp.all(diff < 1.5 * 10**(-1.))

            assert test1, "Unexpected number of informative features"
            assert test2, "Unexpectedly incongruent outputs"
    finally:
        c.close()
def _response(x_data, n_space, n_state):
    # The first step builds triangular "hat" basis functions over the state
    # axis: max(1 - |x - h_k| / dh, 0) for equally spaced knots h_k.
    return pipe(
        np.linspace(0, 1, n_state),
        lambda h: da.maximum(
            1 - abs(x_data[:, :, None] - h) / (h[1] - h[0]), 0),
        dafft(axis=1),
        lambda fx: da.sum(_fcoeff(n_space, n_state)[None] * fx, axis=-1),
        daifft(axis=1)).real
def test_get_bounding_corners_dask(self):
    """Test finding surrounding bounding corners."""
    import dask.array as da
    from pyresample.bilinear.xarr import (_get_input_xy_dask,
                                          _get_bounding_corners_dask)
    from pyresample._spatial_mp import Proj
    from pyresample import CHUNK_SIZE

    proj = Proj(self.target_def.proj_str)
    out_x, out_y = self.target_def.get_proj_coords(chunks=CHUNK_SIZE)
    out_x = da.ravel(out_x)
    out_y = da.ravel(out_y)
    in_x, in_y = _get_input_xy_dask(self.source_def, proj,
                                    da.from_array(self.valid_input_index),
                                    da.from_array(self.index_array))
    pt_1, pt_2, pt_3, pt_4, ia_ = _get_bounding_corners_dask(
        in_x, in_y, out_x, out_y,
        self.neighbours,
        da.from_array(self.index_array))

    self.assertTrue(pt_1.shape == pt_2.shape ==
                    pt_3.shape == pt_4.shape ==
                    (self.target_def.size, 2))
    self.assertTrue(ia_.shape == (self.target_def.size, 4))

    # Check which of the locations has four valid X/Y pairs by
    # finding where there are non-NaN values
    res = da.sum(pt_1 + pt_2 + pt_3 + pt_4, axis=1).compute()
    self.assertEqual(np.sum(~np.isnan(res)), 10)
def calc_dispersion(self, src, dst, axis=2, window=False,
                    save_frequencies=False):
    # t to f
    self.fft_dask(src, 'mag', 'f_mag.hdf5', 'fft_1', -1, window)
    # x to k
    self.fft_dask('f_mag.hdf5', 'fft_1', 'temp2.hdf5', 'fft_2', axis, window)

    with hd.File('temp2.hdf5', 'r', libver='latest') as temp:
        disp_arr = da.from_array(temp['fft_2'], chunks=temp['fft_2'].chunks)
        # Sum |FFT| over every axis except the chosen spatial axis and time
        dispersion = da.sum(
            da.absolute(disp_arr),
            axis=tuple(a for a in range(5) if a not in (axis, 4)))
        # Create/truncate the destination file before writing
        with hd.File(dst, 'w', libver='latest') as d:
            pass
        dispersion.to_hdf5(dst, 'disp')

    # Delete the intermediary values from long-term storage
    if save_frequencies:
        os.remove('temp2.hdf5')
    else:
        os.remove('temp1.hdf5')
        os.remove('temp2.hdf5')
    return 0
def compute(fieldset):
    # Calculate the vertical weighted average
    for f in [fieldset.U, fieldset.V]:
        for tind in f.loaded_time_indices:
            data = da.sum(f.data[tind, :] * DZ, axis=0) / sum(dz)
            data = da.broadcast_to(
                data, (1, f.grid.zdim, f.grid.ydim, f.grid.xdim))
            f.data = f.data_concatenate(f.data, data, tind)
def compute_adjoint_dask(rays, g, dobs, i0, K_ne, m_tci, m_prior, CdCt,
                         sigma_m, Nkernel, size_cell):
    L_m = Nkernel * size_cell
    # #i not eq i0 mask
    # mask = np.ones(rays.shape[0], dtype=np.bool)
    # mask[i0] = False
    # rays = rays[mask, :, :, :, :]
    # g = g[mask, :, :]
    # dobs = dobs[mask, :, :]
    # CdCt = CdCt[mask, :, :]

    # residuals
    # g.shape, dobs.shape [Na, Nt, Nd]
    dd = g - dobs
    # weighted residuals
    # Cd.shape [Na, Nt, Nd] i.e. diagonal
    # CdCt^-1 = 1./CdCt
    dd /= (CdCt + 1e-15)
    # get ray info
    Na, Nt, Nd, _, Ns = rays.shape
    # parallelize over directions
    gradient = da.sum(da.stack([
        da.from_delayed(
            delayed(do_adjoint)(rays[:, :, d, :, :], dd[:, :, d], K_ne,
                                m_tci, sigma_m, Nkernel, size_cell, i0),
            (m_tci.nx, m_tci.ny, m_tci.nz),
            dtype=np.double) for d in range(Nd)
    ], axis=-1), axis=-1)
    gradient = gradient.compute(get=get)
    gradient += m_tci.M
    gradient -= m_prior
    return gradient
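
# The parallelization pattern above in isolation: wrap per-slice work in
# dask.delayed, declare its shape/dtype via da.from_delayed, stack the pieces
# along a new axis, and reduce with da.sum. `work` is a hypothetical stand-in
# for do_adjoint.
import numpy as np
import dask.array as da
from dask import delayed

def work(d):
    return np.full((4, 4), float(d))

parts = [da.from_delayed(delayed(work)(d), (4, 4), dtype=np.double)
         for d in range(3)]
total = da.sum(da.stack(parts, axis=-1), axis=-1).compute()  # 4x4 array of 3.0
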
def predict(fine_image_t0, coarse_image_t0, coarse_image_t1, shape=None):
    spec = spectral_distance(fine_image_t0, coarse_image_t0)
    spec_diff = spec[0]
    spec_dist = spec[1]

    temp = temporal_distance(coarse_image_t0, coarse_image_t1)
    temp_diff = temp[0]
    temp_dist = temp[1]

    spat_dist = spatial_distance(fine_image_t0)

    print("spec_dist.shape: {} temp_dist.shape: {} spat_dist.shape: {}".format(
        spec_dist.shape, temp_dist.shape, spat_dist.shape))

    comb_dist = comb_distance(spec_dist, temp_dist, spat_dist)

    similar_pixels = filtering(fine_image_t0, spec_dist, temp_dist,
                               spec_diff, temp_diff)

    weights = weighting(spec_dist, temp_dist, comb_dist, similar_pixels)

    pred_refl = fine_image_t0 + temp_diff
    weighted_pred_refl = da.sum(pred_refl * weights, axis=1)

    if shape is None:
        prediction = weighted_pred_refl
    else:
        prediction = da.reshape(weighted_pred_refl, shape)

    print("Done prediction!")
    return prediction
def wavg_full(data, flags, weights, axis=0, threshold=0.8):
    """Perform weighted average of data, flags and weights,
    applying flags, over axis.

    Parameters
    ----------
    data : array of complex
    flags : array of uint8 or boolean
    weights : array of floats
    axis : int
    threshold : float
        Fraction of flagged inputs above which the averaged output is flagged

    Returns
    -------
    av_data : weighted average of data
    av_flags : flags of the averaged data (True where the flagged fraction
        reaches `threshold`)
    av_weights : weighted average of weights
    """
    weighted_data, flagged_weights = weight_data(data, flags, weights)

    av_data, av_weights = _wavg_axis(weighted_data, flagged_weights, axis)

    # Update flags to include all invalid data, ie vis = 0j and weights > 1e15
    updated_flags = flagged_weights == 0
    n_flags = da.sum(updated_flags, axis)
    av_flags = n_flags >= flags.shape[axis] * threshold
    return av_data, av_flags, av_weights
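
# The flag threshold in isolation (hypothetical data): an averaged sample is
# flagged once the fraction of zeroed weights along the axis reaches the
# threshold.
import numpy as np
import dask.array as da

flagged_weights = da.from_array(
    np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]), chunks=2)
n_flags = da.sum(flagged_weights == 0, axis=0)
av_flags = (n_flags >= 4 * 0.8).compute()   # -> [True, False]
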
def logsumexp(arr, axis=0):
    """Computes the sum of arr assuming arr is in the log domain.

    Returns log(sum(exp(arr))) while minimizing the possibility of
    over/underflow.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.extmath import logsumexp
    >>> a = np.arange(10)
    >>> np.log(np.sum(np.exp(a)))
    9.4586297444267107
    >>> logsumexp(a)
    9.4586297444267107
    """
    if axis == 0:
        pass
    elif axis == 1:
        arr = arr.T
    else:
        raise NotImplementedError
    # Use the max to normalize, as with the log this is what accumulates
    # the least error
    vmax = arr.max(axis=0)
    out = da.log(da.sum(da.exp(arr - vmax), axis=0))
    out += vmax
    return out
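
# Sanity check of the max-shift identity used above:
#   log(sum(exp(a))) == vmax + log(sum(exp(a - vmax)))  with vmax = max(a).
# Shifting by the max keeps exp() in range even when the naive evaluation
# would overflow.
import numpy as np

a = np.array([1000.0, 1000.0])                     # np.exp(a) overflows to inf
vmax = a.max()
stable = vmax + np.log(np.sum(np.exp(a - vmax)))   # 1000.6931...
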
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)

    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
def compute_gradient_dask(rays, g, dobs, i0, K_ne, m_tci, m_prior, CdCt,
                          sigma_m, Nkernel, size_cell, cov_obj=None):
    L_m = Nkernel * size_cell
    # #i not eq i0 mask
    # mask = np.ones(rays.shape[0], dtype=np.bool)
    # mask[i0] = False
    # rays = rays[mask, :, :, :, :]
    # g = g[mask, :, :]
    # dobs = dobs[mask, :, :]
    # CdCt = CdCt[mask, :, :]

    # residuals
    # g.shape, dobs.shape [Na, Nt, Nd]
    dd = g - dobs
    # weighted residuals
    # Cd.shape [Na, Nt, Nd] i.e. diagonal
    # CdCt^-1 = 1./CdCt
    dd /= (CdCt + 1e-15)
    # get ray info
    Na, Nt, Nd, _, Ns = rays.shape
    # if Na < Nd:
    #     # parallelize over antennas
    #     gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(
    #         rays[i, :, :, :, :], dd[i, :, :], K_ne, m_tci, sigma_m, Nkernel,
    #         size_cell), (m_tci.nx, m_tci.ny, m_tci.nz), dtype=np.double)
    #         for i in range(Na)], axis=-1), axis=-1)
    # else:
    #     # parallelize over directions
    #     gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(
    #         rays[:, :, d, :, :], dd[:, :, d], K_ne, m_tci, sigma_m, Nkernel,
    #         size_cell), (m_tci.nx, m_tci.ny, m_tci.nz), dtype=np.double)
    #         for d in range(Nd)], axis=-1), axis=-1)
    # parallelize over directions
    ne_tci = m_tci.copy()
    np.exp(ne_tci.M, out=ne_tci.M)
    ne_tci.M *= K_ne / TECU
    gradient = da.sum(da.stack([
        da.from_delayed(
            delayed(do_gradient)(rays[:, :, d, :, :], dd[:, :, d], ne_tci,
                                 sigma_m, Nkernel, size_cell, i0),
            (m_tci.nx, m_tci.ny, m_tci.nz),
            dtype=np.double) for d in range(Nd)
    ], axis=-1), axis=-1)
    gradient = gradient.compute(get=get)
    gradient -= gradient[i0, ...]
    if cov_obj is not None:
        dm = m_tci.M - m_prior
        gradient += cov_obj.contract(dm)
    # gradient += m_tci.M
    # gradient -= m_prior
    return gradient
def grid_glm_data(self, flashes):
    """
    Aggregate the point flashes into a grid of flash counts occurring within
    each grid box.

    Args:
        flashes (:class:`pandas.DataFrame`): Contains the longitudes and
            latitudes of each flash

    Returns:
        :class:`numpy.ndarray` [y, x]: The number of flashes occurring at
        each grid point.
    """
    flash_x, flash_y = self.glm_proj(flashes["flash_lon"].values,
                                     flashes["flash_lat"].values)
    flash_x /= 1000
    flash_y /= 1000
    valid_flashes = np.where(
        (flash_x >= self.x_points.min() - self.dx_km / 2)
        & (flash_x <= self.x_points.max() + self.dx_km / 2)
        & (flash_y >= self.y_points.min() - self.dx_km / 2)
        & (flash_y <= self.y_points.max() + self.dx_km / 2))[0]
    if valid_flashes.size > 0:
        if PARALLEL:
            x_grid_flat = da.from_array(
                self.x_grid.reshape((self.x_grid.size, 1)), chunks=512)
            y_grid_flat = da.from_array(
                self.y_grid.reshape((self.x_grid.size, 1)), chunks=512)
            flash_x_flat = da.from_array(
                flash_x[valid_flashes].reshape(1, valid_flashes.size),
                chunks=512)
            flash_y_flat = da.from_array(
                flash_y[valid_flashes].reshape(1, valid_flashes.size),
                chunks=512)
            x_dist = da.fabs(x_grid_flat - flash_x_flat)
            y_dist = da.fabs(y_grid_flat - flash_y_flat)
            flash_grid_counts = da.sum(
                (x_dist <= self.dx_km / 2) & (y_dist <= self.dx_km / 2),
                axis=1)
            flash_grid = flash_grid_counts.reshape(
                self.lon_grid.shape).astype(np.int32).compute()
        else:
            x_grid_flat = self.x_grid.reshape((self.x_grid.size, 1))
            y_grid_flat = self.y_grid.reshape((self.x_grid.size, 1))
            flash_x_flat = flash_x[valid_flashes].reshape(
                1, valid_flashes.size)
            flash_y_flat = flash_y[valid_flashes].reshape(
                1, valid_flashes.size)
            x_dist = np.abs(x_grid_flat - flash_x_flat)
            y_dist = np.abs(y_grid_flat - flash_y_flat)
            flash_grid_counts = np.sum(
                (x_dist <= self.dx_km / 2) & (y_dist <= self.dx_km / 2),
                axis=1)
            flash_grid = flash_grid_counts.reshape(
                self.lon_grid.shape).astype(np.int32)
    else:
        flash_grid = np.zeros(self.lon_grid.shape, dtype=np.int32)
    return flash_grid
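
# The broadcast trick above in isolation (hypothetical data): compare every
# grid point (shape N x 1) against every flash (shape 1 x M) and count the
# flashes within half a cell width of each point.
import numpy as np
import dask.array as da

gx = da.from_array(np.array([0.0, 1.0, 2.0])[:, None], chunks=2)
fx = da.from_array(np.array([[0.1, 0.9, 3.0]]), chunks=2)
counts = da.sum(da.fabs(gx - fx) <= 0.5, axis=1).compute()   # -> [1, 1, 0]
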
def _run_dummy_task_on_dask(*, client):
    """
    Runs a small task on the Dask client.

    Starting from v2021.7.0, Dask Distributed does not always close HDF5
    files that are open in read-only mode for loading raw data. Submitting
    and computing a small unrelated task seems to prompt the client to
    release the resources from the previous task and close the files.
    """
    rfut = da.sum(da.random.random((1000,),
                                   chunks=(10,))).persist(scheduler=client)
    rfut.compute(scheduler=client)
def pis_mVc(x, y, beta):
    '''
    Compute mVc sampling probabilities. mVc and mMSE are structured the same
    way and share the computation of `p` and `dif`.
    '''
    p = logistic_func(beta, x)
    dif = da.absolute(y - p)
    xnorm = da.linalg.norm(x, axis=1)
    pis = dif * xnorm
    pi = pis / da.sum(pis)
    return pi
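
# Usage sketch with hypothetical data, assuming logistic_func(beta, x)
# returns the logistic sigmoid of x @ beta. The result is a probability
# vector over the n rows that sums to 1.
import dask.array as da

n, d = 1000, 5
x = da.random.normal(size=(n, d), chunks=(250, d))
y = da.random.randint(0, 2, size=(n,), chunks=(250,))
beta = da.zeros(d, chunks=d)
pi = pis_mVc(x, y, beta)
float(da.sum(pi).compute())   # -> 1.0
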
def compute_spectrum(samples, freq, N=FFT_SIZE):
    spec = da.sum(
        np.abs(da.fft.fftshift(da.fft.fft(samples.reshape((-1, N))),
                               axes=1)**2),
        axis=0).compute()
    return xr.DataArray(
        spec, dims='freq',
        coords={'freq': freq
                + np.fft.fftshift(np.fft.fftfreq(N, 1 / SAMPRATE))})
def compose_position_fields(fields, spacing, output,
                            blocksize=[256, ] * 3, displacement=None):
    """Compose a list of position fields into one and write it to a zarr store."""

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(fields[0].shape[:-1]) /
                             blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays
        fields_da = da.stack([da.from_array(f, chunks=blocksize + [3, ])
                              for f in fields])

        # accumulate
        composed = da.sum(fields_da, axis=0)

        # modify for multiple position fields
        if displacement is not None:
            raise NotImplementedError(
                "composing displacement fields not implemented yet")
        else:
            grid = position_grid_dask(composed.shape[:3],
                                      blocksize) * spacing.astype(np.float32)
            composed = composed - (len(fields) - 1) * grid

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output, 'w',
            shape=composed.shape, chunks=composed.chunksize,
            dtype=composed.dtype, compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y._optimize(y.dask, y._keys())
    with dask.set_options(fuse_ave_width=0):
        b = y._optimize(y.dask, y._keys())

    assert dask.get(a, y._keys()) == dask.get(b, y._keys())
    assert len(a) < len(b)
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())
    with dask.config.set({"optimization.fuse.ave-width": 0}):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())
    with dask.config.set(fuse_ave_width=0):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
def compute_power(samples, sample_time, bandwidth, avg_win=PWR_AVERAGE):
    samples_filt = filter_signal(samples, bandwidth)
    pwr = da.sum(
        (np.abs(samples_filt[:samples_filt.size // avg_win * avg_win])**2
         ).reshape((-1, avg_win)),
        axis=1).compute()
    return xr.DataArray(
        pwr, dims='time',
        coords={'time': sample_time[:samples_filt.size // avg_win * avg_win]
                [::avg_win].compute()})
def weighting(spec_dist, temp_dist, comb_dist, similar_pixels_filtered):
    # Assign max weight (1) when the temporal or spectral distance is zero
    zero_spec_dist = da.where(spec_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_temp_dist = da.where(temp_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_dist_mid = da.where((zero_spec_dist == 1),
                             zero_spec_dist, zero_temp_dist)
    shape = da.subtract(spec_dist.shape, (0, 1))
    zero_dist = da.zeros(shape, chunks=(spec_dist.shape[0], shape[1]))
    zero_dist = da.insert(zero_dist, [mid_idx], zero_dist_mid, axis=1)
    weights = da.where((da.sum(zero_dist, 1)[:, None] == 1),
                       zero_dist, comb_dist)

    # Calculate weights only for the filtered spectrally similar pixels
    weights_filt = weights * similar_pixels_filtered

    # Normalize weights
    norm_weights = da.rechunk(weights_filt / (da.sum(weights_filt, 1)[:, None]),
                              chunks=spec_dist.chunksize)
    print("Done weighting!", norm_weights)
    return norm_weights
def test_dask_yarn():
    try:
        from dask_yarn import YarnCluster
    except ImportError:
        return

    # Validate dask_yarn configuration
    cluster = YarnCluster()
    client = Client(cluster)

    cluster.scale(4)

    x = da.sum(np.ones(5))
    x.compute()
def agreement(self, estimators):
    """
    Implementation of the Query By Committee strategy, variant: vote entropy.

    The vote entropy approach is used to measure the level of disagreement.

    I. Dagan and S. Engelson. Committee-based sampling for training
    probabilistic classifiers. In Proceedings of the International Conference
    on Machine Learning (ICML), pages 150-157. Morgan Kaufmann, 1995.

    :param estimators:
    :return:
    """
    score = []
    input_shape, committee_size = QueryByCommitteeStategy.check_committee_results(
        estimators)
    if len(input_shape) == 2:
        ele_uni = da.unique(estimators).compute()
        if not (len(ele_uni) == 2 and 0 in ele_uni and 1 in ele_uni):
            raise ValueError(
                "The predicted label matrix must only contain 0 and 1")
        # calculate the score for each instance
        for i in range(input_shape[0]):
            instance_mat = da.from_array(
                np.array([X[i, :] for X in estimators
                          if X is not None])).compute()
            voting = da.sum(instance_mat, axis=0)
            tmp = []
            for vote in voting:
                if vote != 0:
                    tmp.append(
                        delayed(vote / len(estimators) *
                                np.log(vote / len(estimators))))
            score.append(-delayed(sum)(tmp))
    else:
        input_mat = da.from_array(
            np.array([X for X in estimators if X is not None])).compute()
        # for each instance
        for i in range(input_shape[0]):
            count_dict = collections.Counter(input_mat[:, i])
            tmp = []
            for key in count_dict:
                tmp.append(
                    delayed(count_dict[key] / committee_size *
                            np.log(count_dict[key] / committee_size)))
            score.append(-delayed(sum)(tmp))

    return compute(score)[0]
def nearest_neighbour(test_images, train_images, train_labels, k=1):
    pred = np.zeros(test_images.shape[0])
    for i in range(test_images.shape[0]):
        test_image = test_images[i, :]
        # L1 distance from this test image to every training image
        nn = da.sum(np.abs(train_images - test_image), axis=1, keepdims=True)
        if k == 1:
            nn = da.argmin(nn, axis=0)
            pred[i] = train_labels[nn]
        else:
            nn = np.array(nn)
            min_idx = np.argsort(nn, 0)[:k]
            labels = np.array([train_labels[idx] for idx in min_idx])
            labels = np.reshape(labels, [-1])
            lab = Counter(labels).most_common()[0][0]
            pred[i] = lab
    return pred
def calibrate_posterior_predictive(post_pred, qc):
    """Function to calibrate posterior predictive. This allows the calibrated
    model to make predictions. This function is required to compute mean and
    log likelihood of the calibrated model.

    Args:
        post_pred: posterior predictive of shape (num samples, num X values)
        qc: calibration object as defined in class QuantileCalibration

    Returns:
        calibrated posterior predictive of shape (num samples, num X values)
    """
    # Need to convert from jax array to dask array to avoid an out of memory
    # error (on a 32GB machine for 8000 samples) in the next step.
    # This also helps to parallelize the task to all cpu cores.
    post_pred_shape = post_pred.shape
    res_main_post_pred = da.from_array(
        np.array(post_pred),
        chunks=(
            1000,  # reduce this value if out of memory!
            np.ceil(post_pred_shape[1] / dask.system.cpu_count()),
        ),
    )
    # expand to 3D: axis 0: num observations; axis 1: num samples;
    # axis 2: num samples
    uncalibrated_pp_quantiles = (
        da.sum(res_main_post_pred.T[:, :, np.newaxis]
               <= res_main_post_pred.T[:, np.newaxis, :], axis=1).T
        / post_pred_shape[0])

    # calculate inverse R
    inverse_calibrated_pp_quantiles = da.apply_along_axis(
        qc.inverse_transform, 0, uncalibrated_pp_quantiles)

    # inverse CDF by looking up existing samples with np.quantile()
    da_combined = da.vstack(
        [res_main_post_pred, inverse_calibrated_pp_quantiles.compute()])
    calibrated_post_pred = da.apply_along_axis(
        lambda q: np.quantile(
            q[:post_pred_shape[0]], q[post_pred_shape[0]:], axis=0),
        0,
        da_combined,
    ).compute()

    return calibrated_post_pred
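
# da.apply_along_axis as used above, in isolation: apply a 1-D reduction down
# axis 0 of a chunked 2-D array (hypothetical data).
import numpy as np
import dask.array as da

arr = da.from_array(np.arange(12.0).reshape(3, 4), chunks=(3, 2))
col_range = da.apply_along_axis(
    lambda c: c.max() - c.min(), 0, arr).compute()   # -> [8. 8. 8. 8.]
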
def _sum_of_squares(a, axis=0):
    """Squares each element of the input array, and returns the sum(s) of that.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    sum_of_squares : ndarray
        The sum along the given axis for (a**2).

    See also
    --------
    _square_of_sums : The square(s) of the sum(s) (the opposite of
        `_sum_of_squares`).
    """
    return da.sum(a * a, axis)
def _square_of_sums(a, axis=0):
    """Sums elements of the input array, and returns the square(s) of that sum.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    square_of_sums : float or ndarray
        The square of the sum over `axis`.

    See also
    --------
    _sum_of_squares : The sum of squares (the opposite of `square_of_sums`).
    """
    s = da.sum(a, axis)
    return s * s
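
# Quick check relating the two helpers (hypothetical data): for a 1-D array
# of length n, the unnormalized variance is
#   _sum_of_squares(a) - _square_of_sums(a) / n.
import numpy as np
import dask.array as da

a = da.from_array(np.array([1.0, 2.0, 3.0, 4.0]), chunks=2)
n = a.shape[0]
var_unnorm = (_sum_of_squares(a) - _square_of_sums(a) / n).compute()   # 5.0
# Matches ((a - a.mean()) ** 2).sum() == 5.0
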
def get_bil_info(self):
    """Return neighbour info.

    Returns
    -------
    t__ : numpy array
        Vertical fractional distances from corner to the new points
    s__ : numpy array
        Horizontal fractional distances from corner to the new points
    input_idxs : numpy array
        Valid indices in the input data
    idx_arr : numpy array
        Mapping array from valid source points to target points
    """
    if self.source_geo_def.size < self.neighbours:
        warnings.warn('Searching for %s neighbours in %s data points' %
                      (self.neighbours, self.source_geo_def.size))

    # Create kd-tree
    valid_input_idx, resample_kdtree = self._create_resample_kdtree()
    # This is a numpy array
    self.valid_input_index = valid_input_idx

    if resample_kdtree.n == 0:
        # Handle if all input data is reduced away
        bilinear_t, bilinear_s, valid_input_index, index_array = \
            _create_empty_bil_info(self.source_geo_def, self.target_geo_def)
        self.bilinear_t = bilinear_t
        self.bilinear_s = bilinear_s
        self.valid_input_index = valid_input_idx
        self.index_array = index_array

        return bilinear_t, bilinear_s, valid_input_index, index_array

    target_lons, target_lats = self.target_geo_def.get_lonlats()
    valid_output_idx = ((target_lons >= -180) & (target_lons <= 180) &
                        (target_lats <= 90) & (target_lats >= -90))

    index_array, distance_array = self._query_resample_kdtree(
        resample_kdtree, target_lons, target_lats, valid_output_idx)

    # Reduce index reference
    input_size = da.sum(self.valid_input_index)
    index_mask = index_array == input_size
    index_array = da.where(index_mask, 0, index_array)

    # Get output projection as pyproj object
    proj = Proj(self.target_geo_def.proj_str)

    # Get output x/y coordinates
    out_x, out_y = _get_output_xy_dask(self.target_geo_def, proj)

    # Get input x/y coordinates
    in_x, in_y = _get_input_xy_dask(self.source_geo_def, proj,
                                    self.valid_input_index, index_array)

    # Get the four closest corner points around each output location
    pt_1, pt_2, pt_3, pt_4, index_array = \
        _get_bounding_corners_dask(in_x, in_y, out_x, out_y,
                                   self.neighbours, index_array)

    # Calculate vertical and horizontal fractional distances t and s
    t__, s__ = _get_ts_dask(pt_1, pt_2, pt_3, pt_4, out_x, out_y)
    self.bilinear_t, self.bilinear_s = t__, s__

    self.valid_output_index = valid_output_idx
    self.index_array = index_array
    self.distance_array = distance_array

    return (self.bilinear_t, self.bilinear_s, self.valid_input_index,
            self.index_array)