def _calc_length_and_angles(self, vectors):
    """Derive and store the unit cell lengths and angles from box vectors.

    Parameters
    ----------
    vectors : array indexed as ``vectors[:, k, :]`` for ``k`` in 0..2, or None
        The shape of the unit cell in frame ``i`` is given by the three
        vectors ``vectors[i, 0, :]``, ``vectors[i, 1, :]`` and
        ``vectors[i, 2, :]``. ``None`` clears the stored lengths and angles.

    Raises
    ------
    TypeError
        If ``len(vectors)`` differs from ``len(self)``.
    """
    if vectors is None:
        self._unitcell_lengths = self._unitcell_angles = None
        return

    if len(vectors) != len(self):
        raise TypeError("unitcell_vectors must be the same length as "
                        "the trajectory. you provided %s" % str(vectors))

    first, second, third = (vectors[:, row, :] for row in range(3))
    len_a, len_b, len_c, ang_alpha, ang_beta, ang_gamma = \
        box_vectors_to_lengths_and_angles(first, second, third)

    # Stack per-quantity rows, then transpose so each frame becomes a row.
    self._unitcell_lengths = da.vstack((len_a, len_b, len_c)).T
    self._unitcell_angles = da.vstack((ang_alpha, ang_beta, ang_gamma)).T
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "RobustScaler":
    """Compute the per-feature median and quantile range used for scaling.

    Parameters
    ----------
    X : array-like or dask DataFrame
        Training data. A dask DataFrame is converted partition by partition
        into one dask array before the quantiles are computed.
    y : ignored
        Accepted but not used.

    Returns
    -------
    self : RobustScaler
        With ``center_``, ``scale_`` and ``n_features_in_`` set.

    Raises
    ------
    ValueError
        If ``quantile_range`` is not ordered within ``[0, 100]``.
    """
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        # Partition lengths must be known to give every delayed block a shape.
        partition_lengths = X.map_partitions(len).compute()
        # np.find_common_type is deprecated since NumPy 1.25 and removed in
        # 2.0; np.result_type is the documented replacement.
        dtype = np.result_type(*X.dtypes)
        blocks = X.to_delayed()
        X = da.vstack([
            da.from_delayed(block.values, shape=(length, n_columns), dtype=dtype)
            for block, length in zip(blocks, partition_lengths)
        ])

    # One percentile triple (q_min, median, q_max) per feature column.
    quantiles: Any = [
        da.percentile(col, [q_min, 50.0, q_max]) for col in X.T
    ]
    quantiles = da.vstack(quantiles).compute()
    self.center_: List[float] = quantiles[:, 1]
    self.scale_: List[float] = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
    self.n_features_in_: int = X.shape[1]
    return self
def fit(self, X, y=None):
    """Compute the per-feature median and quantile range used for scaling.

    Parameters
    ----------
    X : array-like or dask DataFrame
        Training data. A dask DataFrame is converted partition by partition
        into one dask array before the quantiles are computed.
    y : ignored
        Accepted but not used.

    Returns
    -------
    self
        With ``center_`` and ``scale_`` set.

    Raises
    ------
    ValueError
        If ``quantile_range`` is not ordered within ``[0, 100]``.
    """
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        # Partition lengths must be known to give every delayed block a shape.
        partition_lengths = X.map_partitions(len).compute()
        # np.find_common_type is deprecated since NumPy 1.25 and removed in
        # 2.0; np.result_type is the documented replacement.
        dtype = np.result_type(*X.dtypes)
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(
                    block.values, shape=(length, n_columns), dtype=dtype
                )
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    # One percentile triple (q_min, median, q_max) per feature column.
    quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()
    self.center_ = quantiles[:, 1]
    self.scale_ = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
    return self
def test_vstack():
    """Check da.vstack against np.vstack for 1-D and mixed 1-D/2-D inputs."""
    np_first = np.arange(5)
    np_second = np.ones(5)
    da_first = da.arange(5, chunks=2)
    da_second = da.ones(5, chunks=2)

    # Two 1-D inputs.
    expected_flat = np.vstack((np_first, np_second))
    assert_eq(expected_flat, da.vstack((da_first, da_second)))

    # A 1-D input stacked with an explicit single-row 2-D input.
    expected_mixed = np.vstack((np_first, np_second[None, :]))
    assert_eq(expected_mixed, da.vstack((da_first, da_second[None, :])))
def process_data(X, y=None, test_size=0.2):
    """Prepare ``X``/``y`` and wrap them in ``Dataset`` objects.

    Parameters
    ----------
    X : array
        Input data; reshaped and scaled by ``prepare_dataset``.
    y : array, optional
        Labels, indexed as ``y[:, 0]``. When omitted, labels are taken from
        a 10-cluster KMeans fitted on the flattened values of ``X``.
    test_size : float, optional
        Passed to ``train_test_split``. ``0`` disables the split and the
        sample collection.

    Returns
    -------
    Dataset or tuple of (Dataset, Dataset)
        The training dataset alone when ``test_size == 0``; otherwise the
        training and test datasets, the former carrying ``samples`` and
        ``samples_labels`` attributes.
    """
    if y is None:
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_
    # NOTE(review): ``y[:, 0]`` needs 2-D labels -- confirm that
    # ``km.labels_`` satisfies this when ``y`` is not supplied.
    y_uniqs = np.unique(y[:, 0])

    len_ = X.shape[0]
    X = prepare_dataset(X)
    shape_ = list(X.shape[1:])

    if test_size != 0:
        samples = list()
        samples_labels = list()
        print('Preparing samples ...')
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = list()
                label = list()
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    try:
                        matches = xa[ya == y_uniq]
                        pick = random.randint(0, len(matches) - 1)
                        sample.append([matches[pick]])
                        label.append(y_uniq)
                        if len(sample) >= len(y_uniqs):
                            break
                    except Exception:
                        # Best effort: a chunk without this label (or one
                        # whose length cannot be taken) is skipped. Unlike a
                        # bare ``except``, KeyboardInterrupt and SystemExit
                        # still propagate.
                        pass
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)

    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)
        train_dataset = Dataset(X, y)
        return train_dataset

    # Split on flattened rows, then restore the per-sample shape.
    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=config.seeds)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    train_dataset.samples = samples
    train_dataset.samples_labels = samples_labels
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
def batch_function(self, func, p1):
    """Apply ``func`` to ``p1`` in batches of ``self.batch_size``.

    Parameters
    ----------
    func : callable
        Called as ``func(session, batch)``.
    p1 : array
        Input sliced along its first axis into batches.

    Returns
    -------
    dask array or None
        The stacked per-batch outputs, or ``None`` when ``self.load``
        reports that nothing could be restored.
    """
    def _merge_leading_axes(raw):
        # Collapse the first two axes of a batch output into one.
        arr = np.array(raw)
        return arr.reshape([arr.shape[0] * arr.shape[1]] + list(arr.shape[2:]))

    with tf.Session(graph=self.graph) as session:
        saver = tf.train.Saver()
        if (self.load(session, saver)):
            num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(
                session)
            print('EPOCHS trained: ', num_epochs_trained)
        else:
            return

        output_l = list()
        start = 0
        end = self.batch_size
        with tqdm(range(p1.shape[0] // self.batch_size)) as pbar:
            while end < p1.shape[0]:
                output_l.append(_merge_leading_axes(func(session, p1[start:end])))
                start = end
                end += self.batch_size
                pbar.update(1)

            # Remainder: zero-pad up to a full batch, run it, then keep only
            # the rows that correspond to real inputs. (The original used a
            # ``while ... else`` here; with no ``break`` it always ran.)
            x1 = p1[start:]
            xsize = len(x1)
            p1t = da.zeros([self.batch_size - xsize] + list(x1.shape[1:]))
            output = func(session, np.concatenate((x1, p1t), axis=0))
            output_l.append(_merge_leading_axes(output)[0:xsize])
            pbar.update(1)

        try:
            return da.vstack(output_l)
        except Exception:
            # Fall back to stacking every output as a single column. A bare
            # ``except`` would also have trapped KeyboardInterrupt.
            output_l = [out.reshape(-1, 1) for out in output_l]
            return da.vstack(output_l)
def generate(self):
    """
    Sub-classable method for generating a factorial design of specified
    'levels' in the given domain. The number of generated points is
    levels^d.

    Any existing ``random_idx`` attribute is removed, and the design is also
    stored on ``self.generated``.

    Returns
    -------
    dask array
        One row per grid point, one column per dimension.
    """
    if hasattr(self, 'random_idx'):
        del self.random_idx

    # One evenly spaced axis per dimension.
    axes = []
    for lower, upper in zip(self.xmin, self.xmax):
        axes.append(da.linspace(lower, upper, num=self.levels))

    # Expand to the full grid and flatten every coordinate component.
    mesh = da.meshgrid(*axes)
    flattened = [component.ravel() for component in mesh]
    design = da.vstack(flattened).T
    design = design.rechunk(('auto', design.shape[1]))

    if self.use_logger:
        self.logger.info(
            "Factorial design: generated {0} points in {1} dimensions".
            format(len(design), len(self.xmin)))

    self.generated = design
    return design
def prepare_dataset(X, scaler=None):
    """Reshape ``X`` into square images and min-max scale the values.

    Parameters
    ----------
    X : array
        Input data whose first axis indexes samples.
    scaler : object with ``transform``, optional
        Already-fitted scaler to apply. When omitted, a new ``MinMaxScaler``
        is fitted on the data. (The original read an undefined/mismatched
        ``scalar``/``scaler`` pair, so only the "fit a new scaler" path could
        ever run; that remains the default.)

    Returns
    -------
    array
        The reshaped data, scaled, with the reshaped array's shape.
    """
    shape_ = X.shape
    n_features = X.flatten().reshape(shape_[0], -1).shape[1]
    d = int(da.sqrt(n_features))

    if len(shape_) == 4:
        # NOTE(review): ``d`` is derived from the flattened length including
        # the channel axis -- confirm this reshape is what is intended for
        # 3-channel input.
        X = da.reshape(X, [-1, d, d, 3])
    elif d == shape_[1] and len(shape_) == 3:
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(lambda x: grey2rgb(x), X)), dtype=da.float32)
    else:
        # Pad every flattened sample with zeros up to the next square length.
        # The original floored the root (non-positive pad width) and stacked
        # the padding vertically, so the shapes could never align.
        if d * d < n_features:
            d += 1
        r = d ** 2 - n_features
        X = X.reshape(shape_[0], -1)
        if r:
            train_padding = da.zeros((shape_[0], r))
            X = da.hstack([X, train_padding])
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(lambda x: grey2rgb(x), X)), dtype=da.float32)

    print('Scaling dataset')
    flat_values = X.flatten().reshape(-1, 1).astype(da.float32)
    if scaler is not None:
        scaled = scaler.transform(flat_values)
    else:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(flat_values)
    return scaled.reshape(X.shape)
def pad_hrv_data(self, res):
    """Add empty pixels around the HRV.

    The segment is split at the line separating the lower and upper HRV
    windows; each part is padded horizontally with its own east/west column
    bounds and the parts are stacked back together.
    """
    logger.debug('Padding HRV data to full disk')
    nlines = int(self.mda['number_of_lines'])
    segment_offset = (self.mda['segment_sequence_number']
                      - self.mda['planned_start_segment_number'])
    current_first_line = segment_offset * nlines
    bounds = self.epilogue['ImageProductionStats']['ActualL15CoverageHRV']

    # Row (within this segment) where the lower window ends, clipped to the
    # segment's extent.
    split_line = bounds['LowerNorthLineActual'] - current_first_line - 1
    split_line = min(max(split_line, 0), nlines)

    padded_parts = []
    if split_line > 0:
        # we have some of the lower window
        padded_parts.append(
            pad_data_horizontally(res[:split_line, :].data,
                                  (split_line, HRV_NUM_COLUMNS),
                                  bounds['LowerEastColumnActual'],
                                  bounds['LowerWestColumnActual']))
    if split_line < nlines:
        # we have some of the upper window
        padded_parts.append(
            pad_data_horizontally(res[split_line:, :].data,
                                  (nlines - split_line, HRV_NUM_COLUMNS),
                                  bounds['UpperEastColumnActual'],
                                  bounds['UpperWestColumnActual']))

    stacked = da.vstack(padded_parts)
    return xr.DataArray(stacked, dims=('y', 'x'), attrs=res.attrs.copy())
def _prepare_variable_for_palette(self, variable, info):
    """Parse the palette meanings and reduce the palette to unique entries.

    ``palette_meanings`` is parsed into integers; if it starts at 1, a 0
    entry and the ``fill_value_color`` row are prepended. The variable is
    then indexed down to the first occurrence of each unique meaning, and the
    meanings are rescaled with the scale/offset of the dataset named by
    ``info['scale_offset_dataset']`` (1 and 0 when absent).
    """
    scale, offset = 1, 0
    if 'scale_offset_dataset' in info:
        so_dataset = self.nc[info['scale_offset_dataset']]
        scale = so_dataset.attrs['scale_factor']
        offset = so_dataset.attrs['add_offset']

    meanings = [int(token)
                for token in variable.attrs['palette_meanings'].split()]
    variable.attrs['palette_meanings'] = meanings

    if meanings[0] == 1:
        variable.attrs['palette_meanings'] = [0] + meanings
        fill_row = np.array(variable.attrs['fill_value_color'])
        variable = xr.DataArray(da.vstack((fill_row, variable.data)),
                                coords=variable.coords,
                                dims=variable.dims,
                                attrs=variable.attrs)

    unique_meanings, first_positions = np.unique(
        variable.attrs['palette_meanings'], return_index=True)
    variable.attrs['palette_meanings'] = unique_meanings * scale + offset
    variable = variable[first_positions]
    return variable
def scale_dataset(self, dsid, variable, info):
    """Scale the data set, applying the attributes from the netCDF file.

    Masks fill/out-of-range values (floating-point data only), applies
    ``scale_factor``/``add_offset``, fills in platform, sensor, units and
    standard name attributes, normalises ``ancillary_variables`` and
    ``palette_meanings``, and applies two PPS v2014 workarounds.
    """
    variable = remove_empties(variable)
    scale = variable.attrs.get('scale_factor', np.array(1))
    offset = variable.attrs.get('add_offset', np.array(0))

    is_floating = (np.issubdtype((scale + offset).dtype, np.floating)
                   or np.issubdtype(variable.dtype, np.floating))
    if is_floating:
        if '_FillValue' in variable.attrs:
            fill_value = variable.attrs['_FillValue']
            variable = variable.where(variable != fill_value)
            variable.attrs['_FillValue'] = np.nan
        if 'valid_range' in variable.attrs:
            variable = variable.where(
                variable <= variable.attrs['valid_range'][1])
            variable = variable.where(
                variable >= variable.attrs['valid_range'][0])
        if 'valid_max' in variable.attrs:
            variable = variable.where(
                variable <= variable.attrs['valid_max'])
        if 'valid_min' in variable.attrs:
            variable = variable.where(
                variable >= variable.attrs['valid_min'])

    # Put the attributes back onto the result of the arithmetic.
    saved_attrs = variable.attrs
    variable = variable * scale + offset
    variable.attrs = saved_attrs

    variable.attrs.update({'platform_name': self.platform_name,
                           'sensor': self.sensor})

    if not variable.attrs.get('standard_name', '').endswith('status_flag'):
        # TODO: do we really need to add units to everything ?
        variable.attrs.setdefault('units', '1')

    ancillary_names = variable.attrs.get('ancillary_variables', '')
    try:
        variable.attrs['ancillary_variables'] = ancillary_names.split()
    except AttributeError:
        # Value has no ``split`` (not a string): leave it untouched.
        pass

    if 'palette_meanings' in variable.attrs:
        meanings = [int(token)
                    for token in variable.attrs['palette_meanings'].split()]
        variable.attrs['palette_meanings'] = meanings
        if meanings[0] == 1:
            variable.attrs['palette_meanings'] = [0] + meanings
            fill_row = np.array(variable.attrs['fill_value_color'])
            variable = xr.DataArray(da.vstack((fill_row, variable.data)),
                                    coords=variable.coords,
                                    dims=variable.dims,
                                    attrs=variable.attrs)
        unique_meanings, first_positions = np.unique(
            variable.attrs['palette_meanings'], return_index=True)
        variable.attrs['palette_meanings'] = unique_meanings
        variable = variable[first_positions]

    if 'standard_name' in info:
        variable.attrs.setdefault('standard_name', info['standard_name'])

    if self.sw_version == 'NWC/PPS version v2014':
        if dsid.name == 'ctth_alti':
            # pps 2014 valid range and palette don't match
            variable.attrs['valid_range'] = (0., 9000.)
        if dsid.name == 'ctth_alti_pal':
            # pps 2014 palette has the nodata color (black) first
            variable = variable[1:, :]

    return variable
def __call__(self, projectables, *args, **kwargs):
    """Generate the composite.

    The first projectable supplies the luminance (divided by 100 and capped
    at 1.0); it replaces the Y channel of the enhanced second projectable.
    """
    from trollimage.image import rgb2ycbcr, ycbcr2rgb

    projectables = self.match_data_arrays(projectables)
    luminance = projectables[0].copy()
    luminance /= 100.
    # Limit between min(luminance) ... 1.0
    luminance = da.where(luminance > 1., 1., luminance)

    # Get the enhanced version of the composite to be sharpened
    rgb_img = enhance2dataset(projectables[1])

    # This all will be eventually replaced with trollimage convert() method
    # Replace luminance of the IR composite
    red_in = rgb_img.data[0, :, :]
    green_in = rgb_img.data[1, :, :]
    blue_in = rgb_img.data[2, :, :]
    _, cb_, cr_ = rgb2ycbcr(red_in, green_in, blue_in)

    r__, g__, b__ = ycbcr2rgb(luminance, cb_, cr_)
    y_size, x_size = r__.shape
    band_shape = (1, y_size, x_size)
    # Give every band a leading axis so that vstack concatenates along it.
    bands = tuple(da.reshape(band, band_shape) for band in (r__, g__, b__))
    rgb_img.data = da.vstack(bands)

    return super(LuminanceSharpeningCompositor, self).__call__(rgb_img, *args, **kwargs)
def StackColumns(*cols):
    """
    Stack the input dask arrays vertically, column by column.

    This uses :func:`dask.array.vstack` and transposes the result.

    Parameters
    ----------
    *cols : :class:`dask.array.Array`
        the dask arrays to stack vertically together

    Returns
    -------
    :class:`dask.array.Array` :
        the dask array where columns correspond to the input arrays

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    for col in cols:
        if not isinstance(col, da.Array):
            raise TypeError("all input columns in `vstack` must be dask arrays")

    stacked = da.vstack(cols)
    return stacked.T
def get_centroids_distance(x: np.ndarray, means: np.ndarray) -> np.ndarray:
    """Return the squared Euclidean distance from x to each centroid.

    Parameters
    ----------
    x: ndarray of shape (n_samples, n_features)
        A series of data points.
    means: ndarray of shape (n_clusters, n_features)
        The centroids.

    Returns
    -------
    distances: ndarray of shape (n_clusters, n_samples)
        For each cluster, the squared Euclidean distance (or distances) to x.
    """
    x = np.atleast_2d(x)

    if not isinstance(x, da.Array):
        return scipy.spatial.distance.cdist(means, x, metric="sqeuclidean")

    # Dask input: build one row of distances per centroid.
    per_cluster = [
        np.sum((means[cluster] - x) ** 2, axis=-1)
        for cluster in range(means.shape[0])
    ]
    return da.vstack(per_cluster)
def _sampling_reconst(self, std_scales, random_latent=None):
    """Draw reconstructions from the model and min-max scale them.

    Without ``random_latent`` the model graph is sampled repeatedly and the
    first 100 stacked rows are kept; otherwise ``random_latent`` is run
    through ``batch_function``. The result is scaled by a freshly fitted
    ``MinMaxScaler`` and returned in the samples' original shape.
    """
    def aux_fun(session, rand_samp):
        return self.model_graph._sampling_reconst(session=session, std_scales=std_scales, random_latent=rand_samp)

    with tf.Session(graph=self.graph) as session:
        tf.set_random_seed(self.config.seeds)
        self.session = session
        self.saver = tf.train.Saver()

        restored = self.config.restore and self.load(self.session, self.saver)
        if restored:
            load_config = file_utils.load_args(self.config.model_name, self.config.config_dir, ['latent_mean', 'latent_std', 'samples', 'y_uniqs'])
            self.config.update(load_config)
            num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(self.session)
            print('EPOCHS trained: ', num_epochs_trained)
        else:
            print('Initializing Variables ...')
            tf.global_variables_initializer().run()

        if random_latent is not None:
            samples = self.batch_function(aux_fun, random_latent)
        else:
            # Keep sampling until enough batches were collected to cover 100
            # rows, then truncate.
            batches = list()
            while True:
                batches.append(self.model_graph._sampling_reconst(session=session, std_scales=std_scales)[0])
                if len(batches) < (100 // self.config.batch_size) + 1:
                    continue
                break
            samples = da.vstack(batches)
            samples = samples[:100]

        scaler = MinMaxScaler()
        flat_values = samples.flatten().reshape(-1, 1).astype(np.float32)
        return scaler.fit_transform(flat_values).reshape(samples.shape)
def _transform(self, X, inverse=False):
    """Apply ``_transform_col`` to every column of a copy of ``X``.

    Each column is passed together with the matching column of
    ``self.quantiles_`` and the ``inverse`` flag; the results are stacked and
    transposed so that features are columns again.
    """
    X = X.copy()
    # ...
    columns = []
    for feature_idx in range(X.shape[1]):
        columns.append(
            self._transform_col(X[:, feature_idx],
                                self.quantiles_[:, feature_idx],
                                inverse))
    stacked = da.vstack(columns)
    return stacked.T
def select_points(self, x, n):
    """
    Get 'n' top ranked candidates according to maximin sampling to add to
    current samples x

    Candidates are picked one at a time with ``select_point``; each pick is
    appended to the working design before the next one is chosen.

    Parameters
    ----------
    x : vector or array-like
        existing design to which a new point must be added
    n : integer
        number of new samples to be selected

    Returns
    -------
    list
        For every selected candidate, the first element of its
        ``to_delayed()`` result.
    """
    design = da.from_array(x, chunks='auto')
    picked = []
    for _ in range(n):
        candidate = self.select_point(design)
        design = da.vstack((design, candidate))
        picked.append(candidate.to_delayed()[0])

    if self.use_logger:
        self.logger.info(
            "Maximin sequential design: selected {0} new samples".format(
                n))
    return picked
def polyfit(array, deg=1, dim=None, coord=None):
    """
    Least squares polynomial fit.

    Fit a polynomial ``p(x) = p[deg] * x ** deg + ... + p[0]`` of degree
    `deg`. Returns a vector of coefficients `p` that minimises the squared
    error.

    Parameters
    ----------
    array : xarray.DataArray
        The array to fit
    deg : int, optional
        Degree of the fitting polynomial, Default is 1.
    dim : str, optional
        The dimension along which the data will be fitted. If not given,
        the first dimension will be used
    coord : xarray.Coordinate, optional
        The coordinates on which the fit is based. Defaults to the
        coordinate of `dim`.

    Returns
    -------
    output : xarray.DataArray
        Polynomial coefficients with a new dimension to sort the polynomial
        coefficients by degree
    """
    if dim is None:
        dim = array.dims[0]

    # Re-order the array to place the fitting dimension as the first
    # dimension + stack the other dimensions
    array_stacked = _order_and_stack(array, dim)
    dim_chunk = array.chunks[array.get_axis_num(dim)][0]

    if coord is None:
        coord = array[dim]
    # ``pd.core.common.is_datetime64_dtype`` is not available in current
    # pandas; ``pd.api.types`` is the public location of this check.
    if pd.api.types.is_datetime64_dtype(coord.data):
        # Use the 1e-9 to scale nanoseconds to seconds (by default, xarray
        # uses datetime in nanoseconds)
        t = coord.data.astype('f8') * 1e-9
    else:
        t = coord.data

    # Build coefficient matrix for the fit
    x = da.vstack([t ** d for d in range(deg + 1)]).T
    x = x.rechunk((dim_chunk, deg + 1))

    # Solve the least-square system
    p, _, _, _ = da.linalg.lstsq(x, array_stacked.data)
    # TO DO: Compute and store the errors associated to the fit

    # Store the result in a DataArray object
    new_dims = ('degree', ) + array_stacked.dims[1:]
    # Compare names by value: ``is not`` on strings depends on interning.
    new_coords = {
        co: array_stacked.coords[co]
        for co in array_stacked.coords if co != dim
    }
    ds = xr.DataArray(p,
                      name='polynomial_coefficients',
                      coords=new_coords,
                      dims=new_dims)
    ds = ds.assign_coords(degree=range(deg + 1))
    coeffs = _unstack(ds)
    return coeffs
def dask_array_resolver(obj, resolver, **kw):
    """Assemble a dask array from the partitions listed in ``obj.meta``.

    Each partition is fetched by a task pinned to the dask worker mapped to
    the partition's vineyard instance. When every partition carries a
    partition index they are arranged on an ``nx`` by ``ny`` grid; otherwise
    they are stacked vertically in partition order.

    Reads ``kw['dask_scheduler']`` and ``kw['dask_workers']``.
    """
    def get_partition(obj_id):
        client = vineyard.connect()
        return da.from_array(client.get(obj_id))

    meta = obj.meta
    num = int(meta['partitions_-size'])
    dask_client = Client(kw['dask_scheduler'])

    futures = []
    indices = []
    with_index = True
    for part in range(num):
        ts = meta.get_member('partitions_-%d' % part)
        instance_id = int(ts.meta['instance_id'])
        partition_index = json.loads(ts.meta['partition_index_'])
        if partition_index:
            indices.append((partition_index[0], partition_index[1], part))
        else:
            with_index = False
        # we require the 1-on-1 alignment of vineyard instances and dask
        # workers. vineyard_sockets maps vineyard instance_ids into
        # ipc_sockets, while dask_workers maps vineyard instance_ids into
        # names of dask workers.
        future = dask_client.submit(get_partition, ts.meta.id,
                                    workers={kw['dask_workers'][instance_id]})
        futures.append(future)

    arrays = dask_client.gather(futures)
    if not with_index:
        return da.vstack(arrays)

    # Sorted (row, col, position) triples give row-major order.
    indices.sort()
    nx = indices[-1][0] + 1
    ny = indices[-1][1] + 1
    assert nx * ny == num
    rows = [
        da.hstack([arrays[indices[row * ny + col][2]] for col in range(ny)])
        for row in range(nx)
    ]
    return da.vstack(rows)
def _transform(self, X: Union[ArrayLike, DataFrameType], inverse: bool = False) -> Union[ArrayLike, DataFrameType]:
    """Apply ``_transform_col`` to every column of a copy of ``X``.

    Each column is passed together with the matching column of
    ``self.quantiles_`` and the ``inverse`` flag; the results are stacked
    (unknown chunk sizes allowed) and transposed so features are columns.
    """
    X = X.copy()
    # ...
    columns = []
    for feature_idx in range(X.shape[1]):
        columns.append(
            self._transform_col(X[:, feature_idx],
                                self.quantiles_[:, feature_idx],
                                inverse))
    stacked = da.vstack(columns, allow_unknown_chunksizes=True)
    return stacked.T
def _transform(self, X, inverse=False):
    """Apply ``_transform_col`` to every column of a copy of ``X``.

    Each column is passed together with the matching column of
    ``self.quantiles_`` and the ``inverse`` flag. ``allow_unknown_chunksizes``
    is passed to ``da.vstack`` only when ``DASK_110`` is truthy.
    """
    X = X.copy()
    # ...
    columns = []
    for feature_idx in range(X.shape[1]):
        columns.append(
            self._transform_col(X[:, feature_idx],
                                self.quantiles_[:, feature_idx],
                                inverse))

    stack_options = {"allow_unknown_chunksizes": True} if DASK_110 else {}
    return da.vstack(columns, **stack_options).T
def _assemble_azimuth_noise_blocks(self, chunks):
    """Assemble the azimuth noise blocks into one single array."""
    # The strategy here is a bit convoluted. The job would be trivial if
    # performed on regular numpy arrays, but here we want to keep the data
    # as xarray/dask array as much as possible.
    # Using a pure xarray approach was tested (with `combine_first`,
    # `interpolate_na`, etc), but was found to be memory-hungry at the time
    # of implementation (March 2021). Hence the usage of a custom algorithm,
    # relying mostly on dask arrays.
    block_slices = self._create_dask_slices_from_blocks(chunks)
    stacked = da.vstack(block_slices).rechunk(chunks)

    image_shape = self._image_shape
    pixel_coords = {'x': np.arange(image_shape[1]),
                    'y': np.arange(image_shape[0])}
    return xr.DataArray(stacked, dims=['y', 'x'], coords=pixel_coords)
def calibrate_posterior_predictive(post_pred, qc):
    """
    Function to calibrate posterior predictive.

    This allows the calibrated model to make predictions. This function is
    required to compute mean and log likelihood of the calibrated model.

    Args:
        post_pred: posterior predictive of shape (num samples, num X values)
        qc: calibration object as defined in class QuantileCalibration

    Returns:
        calibrated posterior predictive of shape (num samples, num X values)
    """
    n_samples, n_points = post_pred.shape[0], post_pred.shape[1]

    # Need to convert from jax array to dask array to avoid
    # out of memory error (on a 32GB machine for 8000 samples) in the next
    # step. This also helps to parallelize the task to all cpu cores.
    chunk_layout = (
        1000,  # reduce this value if out of memory!
        np.ceil(n_points / dask.system.cpu_count()),
    )
    res_main_post_pred = da.from_array(np.array(post_pred), chunks=chunk_layout)

    # expand to 3D and, per observation, compare every sample with every
    # other sample; the fraction of "<=" hits is the empirical quantile
    by_observation = res_main_post_pred.T
    pairwise_le = (by_observation[:, :, np.newaxis]
                   <= by_observation[:, np.newaxis, :])
    uncalibrated_pp_quantiles = da.sum(pairwise_le, axis=1).T / n_samples

    # calculate inverse R
    inverse_calibrated_pp_quantiles = da.apply_along_axis(
        qc.inverse_transform, 0, uncalibrated_pp_quantiles)

    def _lookup_quantiles(column):
        # Upper part of the column: the samples; lower part: the quantile
        # levels to look up in them.
        return np.quantile(column[:n_samples], column[n_samples:], axis=0)

    # inverse CDF by looking up existing samples with np.quantile()
    da_combined = da.vstack(
        [res_main_post_pred, inverse_calibrated_pp_quantiles.compute()])
    calibrated_post_pred = da.apply_along_axis(
        _lookup_quantiles, 0, da_combined).compute()
    return calibrated_post_pred
def scatter_with_regression(
    x: da.Array, y: da.Array, sample_size: int, k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array], Optional[da.Array]]:
    """Fit a least-squares line to two arrays and sample points to plot.

    Parameters
    ----------
    x : da.Array
    y : da.Array
    sample_size : int
        Number of points drawn when ``x`` has more rows than this.
    k : Optional[int] = None
        Highlight k points which influence pearson correlation most

    Returns
    -------
    The two fitted coefficients, the (possibly sampled) ``x``/``y`` pair, and
    the influences (``None`` when ``k`` is ``None``).

    Raises
    ------
    ValueError
        If ``k`` is 0.
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    # Design matrix: one column holding x, one column of ones.
    design = da.vstack([x, da.ones_like(x)]).T
    design = design.rechunk((design.chunks[0], -1))
    valid = ~(da.isnan(x) | da.isnan(y))

    # if chunk size in the first dimension is 1, lstsq will use sfqr instead
    # of tsqr, where the former does not support nan in shape.
    if len(design.chunks[0]) == 1:
        design = design.rechunk((2, -1))
        y = y.rechunk((2, -1))
        valid = valid.rechunk((2, -1))

    coefficients, _, _, _ = da.linalg.lstsq(design[valid], y[valid])
    coeffa, coeffb = coefficients

    if sample_size < x.shape[0]:
        picks = da.random.choice(x.shape[0], int(sample_size), chunks=x.chunksize)
        x = x[picks]
        y = y[picks]

    influences = None if k is None else pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
def mask_seed_region(self, parts, mask_img, medoid_coords, num_workers=16):
    """
    Generate encoders such that all voxels included in the cluster of the
    medoid are assigned the one value and zero otherwise

    :param parts: Dask array, shape(n_sliding windows, n_voxels)
            Parcellations across sliding windows
    :param mask_img: Nifti Image, shape (x,y,z)
            Mask of the data
    :param medoid_coords: tuple, shape (x,y,z)
            medoid of a region
    :param num_workers: int
            Number of processes in the worker pool
    :return: Dask array, shape(n_sliding windows, n_voxels)
            Masked parcellations for one medoid, or None when the medoid
            mask holds a single distinct value
    """
    n_sw = parts.shape[0]
    mask_medoid = self.__get_mask_medoid(mask_img, medoid_coords)

    if np.unique(mask_medoid).shape[0] == 1:
        return None

    # One task per sliding window.
    tasks = [(parts, mask_medoid, sw_idx) for sw_idx in range(0, n_sw)]
    with closing(Pool(processes=num_workers)) as pool:
        l_masked_parts = pool.starmap(self.get_onhot_vect, tasks)

    # Drop the references to large intermediates before stacking.
    del tasks
    del mask_medoid
    gc.collect()
    darr_masked_parts = da.vstack(l_masked_parts)
    del l_masked_parts
    gc.collect()
    return darr_masked_parts
def compute_importance_gbt(x, y, x_test, y_test):
    """Compute importance based on gradient boosted trees.

    One ``GradientBoostingClassifier`` is fitted per column of ``y``.

    Returns the stacked absolute feature importances (one row per column of
    ``y``) and the mean fraction of matching predictions on the training and
    on the test data.
    """
    print("Computing importance based on gradient boosted trees ... ")
    importance_rows = []
    train_scores = []
    test_scores = []
    for factor in range(y.shape[1]):
        target = y[:, factor]
        model = GradientBoostingClassifier(verbose=1)
        model.fit(x, target)
        importance_rows.append(np.abs(model.feature_importances_))
        train_scores.append(da.mean(model.predict(x) == target))
        test_scores.append(da.mean(model.predict(x_test) == y_test[:, factor]))

    importance_matrix = da.vstack(importance_rows)
    return importance_matrix, np.mean(train_scores), np.mean(test_scores)
def scatter_with_regression(
    xarr: da.Array, yarr: da.Array, sample_size: int, k: Optional[int] = None
) -> Tuple[Tuple[float, float], dd.DataFrame, Optional[np.ndarray]]:
    """
    Fit a least-squares line to two arrays and sample points to plot.

    Parameters
    ----------
    xarr : da.Array
    yarr : da.Array
    sample_size : int
        Number of points drawn when more valid points than this remain.
    k : Optional[int] = None
        Highlight k points which influence pearson correlation most

    Returns
    -------
    The two fitted coefficients, a dataframe with columns ``x`` and ``y``,
    and the influences (``None`` when ``k`` is ``None``).

    Raises
    ------
    ValueError
        If ``k`` is 0.
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    # Drop pairs where either value is NaN (done on materialised arrays).
    valid = ~(da.isnan(xarr) | da.isnan(yarr))
    xarr = da.from_array(np.array(xarr)[valid])
    yarr = da.from_array(np.array(yarr)[valid])

    # Design matrix: one column holding x, one column of ones.
    design = da.vstack([xarr, da.ones_like(xarr)]).T
    design = design.rechunk((design.chunks[0], -1))
    coefficients, _, _, _ = da.linalg.lstsq(design, yarr)
    coeffa, coeffb = coefficients

    if sample_size < len(xarr):
        picks = np.random.choice(len(xarr), int(sample_size))
        xarr = xarr[picks]
        yarr = yarr[picks]

    frames = [dd.from_dask_array(arr) for arr in [xarr, yarr]]
    df = dd.concat(frames, axis=1)
    df.columns = ["x", "y"]

    influences = None if k is None else pearson_influence(xarr, yarr)
    return (coeffa, coeffb), df, influences
def process_data(X, y=None, test_size=0.20, dummies=False):
    """Prepare ``X``/``y``, collect samples per label and split the data.

    Parameters
    ----------
    X : array
        Input data; reshaped and scaled by ``prepare_dataset``.
    y : array, optional
        Labels. Defaults to an array of ones, one per row of ``X``.
    test_size : float, optional
        Passed to ``train_test_split``.
    dummies : bool, optional
        When true, ``y`` is replaced by ``dd.get_dummies(y)`` after the
        unique labels were determined.

    Returns
    -------
    tuple of (Dataset, Dataset)
        Training and test datasets; the training dataset carries a
        ``samples`` attribute.
    """
    if y is None:
        y = da.ones(X.shape[0])
    y_uniqs = np.unique(y)

    len_ = X.shape[0]
    X = prepare_dataset(X)
    if dummies:
        y = dd.get_dummies(y)
    shape_ = list(X.shape[1:])

    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    matches = xa[ya == y_uniq]
                    pick = random.randint(0, len(matches) - 1)
                    sample.append([matches[pick]])
                    if len(sample) >= 500:
                        break
                except Exception:
                    # Best effort: a chunk without this label (or one whose
                    # length cannot be taken) is skipped. Unlike a bare
                    # ``except``, KeyboardInterrupt and SystemExit still
                    # propagate.
                    pass
            samples += sample
    samples = da.vstack(samples)

    # Split on flattened rows, then restore the per-sample shape.
    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
def SkyToUnitSphere(ra, dec, degrees=True):
    """
    Convert sky coordinates (``ra``, ``dec``) to Cartesian coordinates on
    the unit sphere.

    Parameters
    ----------
    ra : :class:`dask.array.Array`; shape: (N,)
        the right ascension angular coordinate
    dec : :class:`dask.array.Array`; shape: (N,)
        the declination angular coordinate
    degrees : bool, optional
        specifies whether ``ra`` and ``dec`` are in degrees or radians

    Returns
    -------
    pos : :class:`dask.array.Array`; shape: (N,3)
        the cartesian position coordinates, where columns represent
        ``x``, ``y``, and ``z``

    Raises
    ------
    TypeError
        If the input columns are not dask arrays
    """
    for angle in (ra, dec):
        if not isinstance(angle, da.Array):
            raise TypeError("both ``ra`` and ``dec`` must be dask arrays")

    # put into radians from degrees
    if degrees:
        ra, dec = da.deg2rad(ra), da.deg2rad(dec)

    # cartesian coordinates
    cos_dec = da.cos(dec)
    components = [cos_dec * da.cos(ra), cos_dec * da.sin(ra), da.sin(dec)]
    return da.vstack(components).T
def StackColumns(*cols):
    """
    Stack the input arrays vertically, column by column.

    The inputs are first broadcast against each other with
    :func:`dask.array.broadcast_arrays`, then stacked with
    :func:`dask.array.vstack` and transposed.

    Parameters
    ----------
    *cols : :class:`dask.array.Array`
        the dask arrays to stack vertically together

    Returns
    -------
    :class:`dask.array.Array` :
        the dask array where columns correspond to the input arrays
    """
    broadcast = da.broadcast_arrays(*cols)
    stacked = da.vstack(broadcast)
    return stacked.T
def SkyToUnitSphere(ra, dec, degrees=True, frame='icrs'):
    """
    Convert sky coordinates (``ra``, ``dec``) to Cartesian coordinates on
    the unit sphere.

    Parameters
    ----------
    ra : :class:`dask.array.Array`; shape: (N,)
        the right ascension angular coordinate
    dec : :class:`dask.array.Array`; shape: (N,)
        the declination angular coordinate
    degrees : bool, optional
        specifies whether ``ra`` and ``dec`` are in degrees or radians
    frame : string ('icrs' or 'galactic')
        specifies which frame the Cartesian coordinates are in. Useful if
        you know the simulation (usually cartesian) is in galactic units but
        you want to convert to the icrs (ra, dec) usually used in surveys.

    Returns
    -------
    pos : :class:`dask.array.Array`; shape: (N,3)
        the cartesian position coordinates, where columns represent
        ``x``, ``y``, and ``z``
    """
    ra, dec = da.broadcast_arrays(ra, dec)

    # put into radians from degrees (needed by both code paths)
    if degrees:
        ra = da.deg2rad(ra)
        dec = da.deg2rad(dec)

    if frame == 'icrs':
        # no frame transformation: cartesian coordinates directly
        x = da.cos(dec) * da.cos(ra)
        y = da.cos(dec) * da.sin(ra)
        z = da.sin(dec)
        return da.vstack([x, y, z]).T

    from astropy.coordinates import SkyCoord

    def eq_to_cart(ra, dec):
        try:
            sc = SkyCoord(ra, dec, unit='rad', representation_type='unitspherical', frame='icrs')
        except Exception:
            # Retry with the ``representation`` keyword spelling. A bare
            # ``except`` would also have trapped KeyboardInterrupt.
            sc = SkyCoord(ra, dec, unit='rad', representation='unitspherical', frame='icrs')

        scg = sc.transform_to(frame=frame)
        scg = scg.cartesian
        x, y, z = scg.x.value, scg.y.value, scg.z.value
        return numpy.stack([x, y, z], axis=1)

    arr = da.apply_gufunc(eq_to_cart, '(),()->(p)', ra, dec,
                          output_dtypes=[ra.dtype], output_sizes={'p': 3})
    return arr
def vstack(self, *others, **kwargs):
    """Stack ``self`` and ``others`` vertically, keeping ``type(self)``.

    Every entry of ``others`` is passed through ``ensure_dask_array`` first.
    ``**kwargs`` is accepted but not forwarded to ``da.vstack``.
    """
    converted = [ensure_dask_array(other) for other in others]
    stacked = da.vstack((self, *converted))
    return view_subclass(stacked, type(self))