def std_dev_contrast_stretch(arr: np.ndarray, n=2):
    """Perform a contrast stretch mapping +/- n standard deviations around the
    mean onto the range [-1, 1]."""
    sigma = arr.std() * n
    m = arr.mean()
    return np.interp(arr, [m - sigma, m + sigma], [-1, 1])

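# --- Hedged usage sketch (not from the original source) ---------------------
# Illustrates std_dev_contrast_stretch on synthetic data: values within
# +/- n standard deviations of the mean are mapped linearly onto [-1, 1], and
# anything outside that range is clamped by np.interp. `band` is hypothetical
# example data.
import numpy as np

band = np.random.default_rng(0).normal(loc=100.0, scale=15.0, size=(64, 64))
stretched = std_dev_contrast_stretch(band, n=2)
print(stretched.min(), stretched.max())  # bounded by [-1, 1]
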
def _init_weights(self, X: np.ndarray) -> None:
    """Initialize weights from PCA eigenvectors"""
    if not hasattr(self, 'weights'):
        pca = PCA(n_components=self._ndims)
        comp = pca.fit(X).components_[:2]
        coeff = X.mean(0) + 5 * X.std(0) / self._shape[0]

        # Create grid based on PCA eigenvectors and std dev of features
        raw_weights = np.asarray([
            (coeff * (comp[0] * (x - 0.5 / self._shape[0])
                      + comp[1] * (y - 0.5 / self._shape[1])))
            for x, y in zip(np.nditer(self._X.flatten()),
                            np.nditer(self._Y.flatten()))
        ]).reshape(self._shape + (self._ndims,))

        # Scale to (0, 1)
        full_shape = self._shape + (1,)
        self.weights = (
            (raw_weights - raw_weights.min(2).reshape(full_shape))
            / raw_weights.ptp(2).reshape(full_shape)
        )

def mean_dif_std(arr: np.ndarray) -> float:
    return arr.mean() - arr.std()

def tensor_standardize(x: np.ndarray, dim=-1):
    x_mean = np.expand_dims(x.mean(axis=dim), axis=dim)
    x_std = np.expand_dims(x.std(axis=dim), axis=dim)
    return (x - x_mean) / tackle_denominator(x_std)

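# `tackle_denominator` is not defined in this excerpt. The sketch below is an
# assumption about its intent (keep the division above finite when a standard
# deviation is exactly zero), not the original implementation.
def tackle_denominator(x: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    # Replace exact zeros with a small epsilon so (x - mean) / std never
    # produces inf or nan.
    x = x.copy()
    x[x == 0.0] = eps
    return x
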
def _normalize(adv: np.ndarray):
    """#### Normalize advantage function"""
    return (adv - adv.mean()) / (adv.std() + 1e-8)

def normalize(x: np.ndarray):
    mean = x.mean()
    std = x.std()
    x = (x - mean) / (std + 1e-11 + 1j * 1e-11)
    return x, mean, std

def image_std(np_img: np.ndarray):
    """Return standard deviation of each channel."""
    return np_img.std(axis=(0, 1))

def safe_normalize(vector: np.ndarray):
    vector = vector - vector.mean()
    std = vector.std()
    if std > 0:
        vector /= std
    return vector

def zech_aslan(x: np.ndarray, y: np.ndarray, *, dmin: float = 1e-12) -> float:
    r"""
    Compute a modified Zech-Aslan energy distance dissimilarity metric based on
    an analogy with the energy of a cloud of electrical charges.

    This method is scale-invariant.

    Parameters
    ----------
    x : np.ndarray (n,d)
        Reference sample.
    y : np.ndarray (m,d)
        Candidate sample.
    dmin : float
        The cut-off for low distances to avoid singularities on identical points.

    Returns
    -------
    float
        Zech-Aslan dissimilarity metric ranging from -infinity to infinity.

    Notes
    -----
    The energy measure between two variables :math:`X`, :math:`Y` (target and
    candidates) of sizes :math:`n,d` and :math:`m,d` proposed by [AZ03]_ is
    defined by:

    .. math::

        e(X, Y) &= \left[\phi_{xx} + \phi_{yy} - \phi_{xy}\right] \\
        \phi_{xy} &= \frac{1}{n m} \sum_{i = 1}^n \sum_{j = 1}^m R\left[SED(X_i, Y_j)\right] \\
        \phi_{xx} &= \frac{1}{n^2} \sum_{i = 1}^n \sum_{j = i + 1}^n R\left[SED(X_i, X_j)\right] \\
        \phi_{yy} &= \frac{1}{m^2} \sum_{i = 1}^m \sum_{j = i + 1}^m R\left[SED(Y_i, Y_j)\right]

    where :math:`X_i` denotes the i-th observation of :math:`X`. :math:`R` is a
    weight function and :math:`SED(A, B)` denotes the standardized Euclidean
    distance.

    .. math::

        R(r) &= \left\{\begin{array}{r l} -\ln r & \text{for } r > d_{min} \\ -\ln d_{min} & \text{for } r \leq d_{min} \end{array}\right. \\
        SED(X_i, Y_j) &= \sqrt{\sum_{k=1}^d \frac{\left(X_i(k) - Y_j(k)\right)^2}{\sigma_x(k)\sigma_y(k)}}

    where :math:`k` is a counter over dimensions (indices in the case of spatial
    analogs) and :math:`\sigma_x(k)` is the standard deviation of :math:`X` in
    dimension :math:`k`. Finally, :math:`d_{min}` is a cut-off to avoid poles
    when :math:`r \to 0`; it is controllable through the `dmin` parameter.

    This version corresponds to the :math:`D_{ZAE}` test of [Grenier2013]_
    (eq. 7), which is a version of :math:`\phi_{NM}` from [AZ03]_, modified by
    using the standardized Euclidean distance, the log weight function and
    choosing :math:`d_{min} = 10^{-12}`.

    References
    ----------
    .. Zech G. and Aslan B. (2003) A Multivariate two-sample test based on the
       concept of minimum energy. PHYStat2003, SLAC, Stanford, CA, Sep 8-11.
    .. [AZ03] Aslan B. and Zech G. (2003) A new class of binning-free,
       multivariate goodness-of-fit tests: the energy tests.
       arXiv:hep-ex/0203010.
    """
    nx, d = x.shape
    ny, d = y.shape

    v = (x.std(axis=0, ddof=1) * y.std(axis=0, ddof=1)).astype(np.double)

    dx = spatial.distance.pdist(x, "seuclidean", V=v)
    dy = spatial.distance.pdist(y, "seuclidean", V=v)
    dxy = spatial.distance.cdist(x, y, "seuclidean", V=v)

    phix = -np.log(dx.clip(dmin)).sum() / (nx * (nx - 1))
    phiy = -np.log(dy.clip(dmin)).sum() / (ny * (ny - 1))
    phixy = -np.log(dxy.clip(dmin)).sum() / (nx * ny)
    return phix + phiy - phixy

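# --- Hedged usage sketch (not from the original source) ---------------------
# Compares a reference sample against two candidates: one drawn from the same
# distribution and one shifted. Lower values typically indicate more similar
# samples. The import below only re-binds the `spatial` name the function
# expects at module level; `x_ref`, `y_same`, and `y_shifted` are hypothetical
# example data.
import numpy as np
from scipy import spatial

rng = np.random.default_rng(42)
x_ref = rng.normal(0.0, 1.0, size=(200, 3))
y_same = rng.normal(0.0, 1.0, size=(150, 3))
y_shifted = rng.normal(1.0, 1.0, size=(150, 3))
print(zech_aslan(x_ref, y_same))     # same distribution: lower dissimilarity
print(zech_aslan(x_ref, y_shifted))  # shifted candidate: larger dissimilarity
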
def _normalize(adv: np.ndarray):
    return (adv - adv.mean()) / (adv.std() + 1e-8)

def scale(x: np.ndarray, axis: int = 0) -> np.ndarray:
    """Normalize features, assuming 2D array t * n with n features and t observations."""
    return (x - x.mean(axis, keepdims=True)) / x.std(axis, keepdims=True)

def statistics_from_array(x: numpy.ndarray):
    """Return the (mean, std, max, min) of an array."""
    try:
        return x.mean(), x.std(), x.max(), x.min()
    except AttributeError:
        return numpy.nan, numpy.nan, numpy.nan, numpy.nan

def zscore(x: np.ndarray) -> np.ndarray:
    """Replace all array values with their z-scores."""
    return (x - x.mean()) / x.std(ddof=1)

def std(data: np.ndarray):
    """Standardization (in place): zero mean and unit variance."""
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0)
    data -= mean_
    data /= std_

def normalize_std_array(vector: np.ndarray):
    """Apply a zero-mean, unit-variance normalization to an arrayset."""
    return (vector
            if vector.std(axis=0) == 0
            else (vector - vector.mean(axis=0)) / vector.std(axis=0))

def fit(self, X: np.ndarray, y: np.ndarray,
        sample_weight: Optional[np.ndarray] = None) -> 'Explainer':
    """Fit ensemble of feature-wise GBMs.

    @param X Training batch inputs.
    @param y Training batch outputs.
    @param sample_weight Sample weights (are not supported for now).
    """
    check_X_y(X, y)
    n_features = X.shape[1]

    gbm_prototype = RGBMRegressor(n_estimators=self.pretraining_iter,
                                  max_depth=self.tree_max_depth,
                                  learning_rate=self.gbm_lr,
                                  init_est_type=self.init_est_type,
                                  use_deterministic_trees=self.use_deterministic_trees)
    # make estimators with the same prototype
    self.estimators_ = [clone(gbm_prototype) for _ in range(n_features)]
    # initialize weights with ones
    self.weights_ = torch.ones(n_features, dtype=torch.double, requires_grad=True)

    y_norm = 0
    if self.norm_target:
        # normalize target
        self.mean_ = y.mean()
        self.std_ = y.std()
        y_norm = (y - self.mean_) / self.std_
        target = torch.tensor(y_norm).double()
    else:
        target = torch.tensor(y).double()

    # # find center (probably it could be passed as an argument)
    # center = X.mean(axis=0)
    # var = np.mean(X.var(axis=0))
    # # RBF as sample weights
    # sample_weight = np.exp(-((X - center) ** 2.0).sum(axis=1) / (2.0 * var))

    if self.init_type == "target":
        init_target = y if not self.norm_target else y_norm
    elif self.init_type == "ones":
        init_target = np.ones_like(y)
    elif self.init_type == "zeros":
        init_target = np.zeros_like(y)
    elif type(self.init_type) == float:
        init_target = np.random.normal(0.0, self.init_type, size=y.shape)
    else:
        raise ValueError(f"Incorrect init_type: {self.init_type}")

    if self.enable_history:
        self.history_ = []
        self.loss_history_ = []

    # init each gbm
    for i, est in enumerate(self.estimators_):
        est.fit(X[:, i:i + 1], init_target, sample_weight)

    use_opt = (self.optimal_weights is not None)
    outputs = np.zeros_like(X.T)

    # train composition
    for epoch in range(self.n_epochs):
        # compute gbms outputs
        outputs += self._predict_last_residuals(X)

        # check if it is needed to recompute weights
        if use_opt:
            opt_started = (epoch >= self.optimal_weights)
        else:
            opt_started = False
        if self.optimal_period is None:
            opt_period = True
        else:
            opt_period = (epoch % self.optimal_period == 0)

        if use_opt and opt_started and opt_period:
            opt_est = LassoCV(cv=self.optimal_cv_folds)
            opt_est.fit(outputs.T, target.numpy())
            cur_opt_weights = opt_est.coef_.ravel()
            new_weights = torch.tensor(cur_opt_weights, dtype=torch.double,
                                       requires_grad=True)
            # new_intercept = torch.tensor(opt_est.intercept_, dtype=torch.double,
            #                              requires_grad=True)
            self.weights_.data = torch.lerp(self.weights_.data, new_weights,
                                            self.optimal_rate)
            # TODO: check that intercept in regression is close to zero

        cur_outputs = torch.tensor(outputs, dtype=torch.double, requires_grad=True)
        cumulative_pred = (self.weights_.unsqueeze(1) * cur_outputs).sum(dim=0)

        # calculate loss and gradients
        # MSE loss
        loss = ((target - cumulative_pred) ** 2).mean()
        # loss += self.eta * ((cur_outputs.mean(dim=0) - 1) ** 2).sum().sqrt()

        self.weights_.retain_grad()
        cur_outputs.retain_grad()
        loss.backward()

        # update weights
        self.weights_.data -= self.weights_lr * self.weights_.grad.data

        # update gbms
        for i, est in enumerate(self.estimators_[:n_features]):
            est.append(X[:, i:i + 1], -cur_outputs.grad[i].data.numpy(),
                       sample_weight=sample_weight)

        # clear gradients
        self.weights_.grad.data.zero_()

        # update history
        if self.enable_history:
            self.history_.append(self.weights_.data.numpy().copy())
            self.loss_history_.append(loss.item())

    self.coef_ = self.weights_.data.numpy().copy()
    if self.enable_history:
        self.history_ = np.stack(self.history_, axis=0)
    return self

def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
                        estimator: str = 'levina', metric: str = 'vector',
                        trafo: str = 'var', mem_threshold: int = 5000):
    """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.

    Parameters
    ----------
    X : ndarray
        - An ``m x n`` vector data matrix with ``n`` objects in an ``m``
          dimensional feature space
        - An ``n x n`` distance matrix.

        NOTE: The type must be defined via parameter `metric`!

    k1 : int, optional (default: 6)
        Start of neighborhood range to search in.

    k2 : int, optional (default: 12)
        End of neighborhood range to search in.

    estimator : {'levina', 'mackay'}, optional (default: 'levina')
        Determine the summation strategy: see [2]_.

    metric : {'vector', 'distance'}, optional (default: 'vector')
        Determine data type of `X`.

        NOTE: the MLE was derived for euclidean distances. Using
        other dissimilarity measures may lead to undefined results.

    trafo : {None, 'std', 'var'}, optional (default: 'var')
        Transform vector data.

        - None: no transformation
        - 'std': standardization
        - 'var': subtract mean, divide by variance (default behavior of
          Laurens van der Maaten's DR toolbox; most likely for other
          ID/DR techniques).

    mem_threshold : int, optional, default: 5000
        Controls speed-memory usage trade-off: If number of points is
        higher than the given value, don't calculate complete distance
        matrix at once (fast, high memory), but per row (slower, less
        memory).

    Returns
    -------
    d_mle : int
        Intrinsic dimension estimate (rounded to next integer)

    References
    ----------
    .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation
           of intrinsic dimension. Advances in Neural Information …, 17,
           777–784. http://doi.org/10.2307/2335172
    .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
    """
    n = X.shape[0]
    if estimator not in ['levina', 'mackay']:
        raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
    if k1 < 1 or k2 < k1 or k2 >= n:
        raise ValueError("Invalid neighborhood: Please make sure that "
                         "0 < k1 <= k2 < n. (Got k1={} and k2={})."
                         .format(k1, k2))
    X = X.copy().astype(float)

    if metric == 'vector':
        # New array with unique rows
        X = X[np.lexsort(np.fliplr(X).T)]

        if trafo is None:
            pass
        elif trafo == 'var':
            X -= X.mean(axis=0)  # broadcast
            X /= X.var(axis=0) + 1e-7  # broadcast
        elif trafo == 'std':
            # Standardization
            X -= X.mean(axis=0)  # broadcast
            X /= X.std(axis=0) + 1e-7  # broadcast
        else:
            raise ValueError("Transformation must be None, 'std', or 'var'.")

        # Compute matrix of log nearest neighbor distances
        X2 = (X**2).sum(1)

        if n <= mem_threshold:  # speed-memory trade-off
            distance = X2.reshape(-1, 1) + X2 - 2 * np.dot(X, X.T)  # 2x br.cast
            distance.sort(1)
            # Replace invalid values with a small number
            distance[distance < 0] = 1e-7
            knnmatrix = .5 * np.log(distance[:, 1:k2 + 1])
        else:
            knnmatrix = np.zeros((n, k2))
            for i in range(n):
                distance = np.sort(X2[i] + X2 - 2 * np.dot(X, X[i, :]))
                # Replace invalid values with a small number
                distance[distance < 0] = 1e-7
                knnmatrix[i, :] = .5 * np.log(distance[1:k2 + 1])
    elif metric == 'distance':
        raise NotImplementedError("ID currently only supports vector data.")
        # =====================================================================
        # # TODO calculation WRONG
        # X.sort(1)
        # X[X < 0] = 1e-7
        # knnmatrix = np.log(X[:, 1:k2+1])
        # =====================================================================
    elif metric == 'similarity':
        raise NotImplementedError("ID currently only supports vector data.")
        # =====================================================================
        # # TODO calculation WRONG
        # print("WARNING: using similarity data may return "
        #       "undefined results.", file=sys.stderr)
        # X[X < 0] = 0
        # distance = 1 - (X / X.max())
        # knnmatrix = np.log(distance[:, 1:k2+1])
        # =====================================================================
    else:
        raise ValueError("Parameter 'metric' must be 'vector' or 'distance'.")

    # Compute the ML estimate
    S = np.cumsum(knnmatrix, 1)
    indexk = np.arange(k1, k2 + 1)  # broadcasted afterwards
    dhat = -(indexk - 2) / (S[:, k1 - 1:k2] - knnmatrix[:, k1 - 1:k2] * indexk)

    if estimator == 'levina':
        # Average over estimates and over values of k
        no_dims = dhat.mean()
    if estimator == 'mackay':
        # Average over inverses
        dhat **= -1
        dhat_k = dhat.mean(0)
        no_dims = (dhat_k ** -1).mean()

    return int(no_dims.round())

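# --- Hedged usage sketch (not from the original source) ---------------------
# Points generated on a 2-D linear subspace embedded in 5-D should yield an
# intrinsic-dimension estimate close to 2. `latent`, `embedding`, and `cloud`
# are hypothetical example data.
import numpy as np

rng = np.random.default_rng(0)
latent = rng.normal(size=(500, 2))       # 2 latent coordinates per point
embedding = rng.normal(size=(2, 5))      # random linear map into 5-D
cloud = latent @ embedding
print(intrinsic_dimension(cloud, k1=6, k2=12, estimator='levina'))  # expected to be ~2
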
def _find_pareto_points(self, points: np.ndarray) -> np.ndarray:
    """
    Runs the finder on the given points.

    Args:
        points: (n, k) array of n points of k dimensions. The convention is
            that the

    Returns:
        indices into [points] of the approximate pareto set
    """
    n, k = points.shape

    # specialcase some stupid inputs
    if n == 0:
        return np.array([], dtype=np.uint64)
    # specialcased for normalization (see below) to work
    elif n == 1:
        return np.array([0], dtype=np.uint64)
    elif k == 0:
        raise ValueError(
            "No pareto-optimal set of points with no features")

    # we normalize our point sets
    points = (points - points.mean(axis=0)) / points.std(axis=0)

    # for finding qhull faces pointing toward optimality
    test_vector = np.zeros(k + 1)
    test_vector[:-1] = -1

    # vertices in the candidate pareto set
    vertex_mask = np.zeros(len(points), dtype=bool)

    if self.n_peel == 0:
        vertex_mask[:] = True
    else:
        for layer in range(self.n_peel):
            # construct hull of all vertices NOT already in the set
            try:
                qhull = sss.ConvexHull(points[~vertex_mask])
            except (QhullError, ValueError):
                break
            # noinspection PyUnresolvedReferences
            pareto_side = np.where((qhull.equations @ test_vector) > 0)
            # noinspection PyUnresolvedReferences
            pareto_vertices = np.unique(
                qhull.simplices[pareto_side].ravel())
            vertex_mask[pareto_vertices] = True

    # list of indices into points in the candidate set of dominator points
    pareto_vertices = np.where(vertex_mask)[0]

    if self.eliminate_dominated:
        # estimate goodness as the total score
        # NB. this is where it helps to be normalized
        goodness_order = np.argsort(points[pareto_vertices].sum(axis=1))
        # from the most promising dominator
        for dominator_ix in goodness_order:
            # and the most promising dominated
            for dominated_ix in goodness_order[::-1]:
                if np.all(points[dominator_ix] > points[dominated_ix]):
                    # mark that vertex as dominated
                    pareto_vertices[dominated_ix] = -1
                    break

    return pareto_vertices[pareto_vertices >= 0]

def set_emul_error_func(self, x_cv: np.ndarray, y_cv_err: np.ndarray) -> None:
    self.emul_error: Callable[[np.ndarray], np.ndarray] = lambda x: y_cv_err.std()

def szekely_rizzo(x: np.ndarray, y: np.ndarray, *, standardize: bool = True) -> float:
    r"""
    Compute the Székely-Rizzo energy distance dissimilarity metric based on an
    analogy with Newton's gravitational potential energy.

    This method is scale-invariant when `standardize=True` (default),
    scale-dependent otherwise.

    Parameters
    ----------
    x : ndarray (n,d)
        Reference sample.
    y : ndarray (m,d)
        Candidate sample.
    standardize : bool
        If True (default), the standardized euclidean norm is used, instead of
        the conventional one.

    Returns
    -------
    float
        Székely-Rizzo's energy distance dissimilarity metric ranging from 0 to
        infinity.

    Notes
    -----
    The e-distance between two variables :math:`X`, :math:`Y` (target and
    candidates) of sizes :math:`n,d` and :math:`m,d` proposed by [SR2004]_ is
    defined by:

    .. math::

        e(X, Y) = \frac{n m}{n + m} \left[2\phi_{xy} - \phi_{xx} - \phi_{yy} \right]

    where

    .. math::

        \phi_{xy} &= \frac{1}{n m} \sum_{i = 1}^n \sum_{j = 1}^m \left\Vert X_i - Y_j \right\Vert \\
        \phi_{xx} &= \frac{1}{n^2} \sum_{i = 1}^n \sum_{j = 1}^n \left\Vert X_i - X_j \right\Vert \\
        \phi_{yy} &= \frac{1}{m^2} \sum_{i = 1}^m \sum_{j = 1}^m \left\Vert Y_i - Y_j \right\Vert

    and where :math:`\Vert\cdot\Vert` denotes the Euclidean norm and
    :math:`X_i` denotes the i-th observation of :math:`X`. When
    `standardize=False`, this corresponds to the :math:`T` test of [RS2016]_
    (p. 28) and to the ``eqdist.e`` function of the `energy` R package (with
    two samples) and gives results twice as big as
    :py:func:`xclim.sdba.processing.escore`. The standardization was added
    following the logic of [Grenier2013]_ to make the metric scale-invariant.

    References
    ----------
    .. [SR2004] Székely, G. J. and Rizzo, M. L. (2004) Testing for Equal
       Distributions in High Dimension, InterStat, November (5)
    .. [RS2016] Rizzo, M. L., & Székely, G. J. (2016). Energy distance. Wiley
       Interdisciplinary Reviews: Computational Statistics, 8(1), 27–38.
       https://doi.org/10.1002/wics.1375
    """
    n, _ = x.shape
    m, _ = y.shape

    # Mean of the distance pairs.
    # We are not taking "mean" because of the condensed output format of pdist.
    if standardize:
        v = (x.std(axis=0, ddof=1) * y.std(axis=0, ddof=1)).astype(np.double)
        sXY = spatial.distance.cdist(x, y, "seuclidean", V=v).sum() / (n * m)
        sXX = spatial.distance.pdist(x, "seuclidean", V=v).sum() * 2 / n**2
        sYY = spatial.distance.pdist(y, "seuclidean", V=v).sum() * 2 / m**2
    else:
        sXY = spatial.distance.cdist(x, y, "euclidean").sum() / (n * m)
        sXX = spatial.distance.pdist(x, "euclidean").sum() * 2 / n**2
        sYY = spatial.distance.pdist(y, "euclidean").sum() * 2 / m**2
    w = n * m / (n + m)
    return w * (sXY + sXY - sXX - sYY)

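# --- Hedged usage sketch (not from the original source) ---------------------
# The energy distance is close to 0 for two samples from the same distribution
# and grows as the distributions diverge. The import below only re-binds the
# `spatial` name the function expects at module level; `a`, `b`, and `c` are
# hypothetical example data.
import numpy as np
from scipy import spatial

rng = np.random.default_rng(1)
a = rng.normal(0.0, 1.0, size=(300, 2))
b = rng.normal(0.0, 1.0, size=(300, 2))   # same distribution as `a`
c = rng.normal(2.0, 1.0, size=(300, 2))   # shifted by 2 in every dimension
print(szekely_rizzo(a, b))  # small
print(szekely_rizzo(a, c))  # noticeably larger
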
def normalize(x: np.ndarray) -> np.ndarray:
    return (x - x.mean()) / x.std()

def _normalize(arr: np.ndarray):
    return (arr - arr.mean()) / arr.std()

def _fit_binomial(self, data: dict, n_near_species: np.ndarray,
                  n_near: np.ndarray, n_species: np.ndarray,
                  n_all: np.ndarray, n_near_tot: int, n_all_tot: int):
    """
    This function computes the following approximate probability distributions
    and derives statistics accordingly.

    * The number of lipids near the protein is represented as a normal
      distribution.
    * The fraction of lipids near the protein follows a hypergeometric
      distribution.
    * The enrichment is represented as the log-normal distribution derived
      from the ratio of two binomial convolutions of the frame-by-frame
      binomial distributions.

    All these approximations assume that each frame or observation is
    independent. The binomial approximation assumes that:

    * the number of the lipid species near the protein is small compared to
      the total number of that lipid species
    * the total number of all lipids is large
    * the fraction (n_species / n_all) is not close to 0 or 1.

    .. note::

        The enrichment p-value is calculated from the log-normal distribution
        of the null hypothesis: that the average enrichment is representative
        of the ratio of n_species : n_all
    """
    summary = {
        "Total # lipids, all": n_all_tot,
        "Total # lipids, shell": n_near_tot
    }
    p_time = data['Fraction near protein']
    summary['Total # species, shell'] = N = n_near_species.sum()
    summary['Total # species, all'] = N_sp = n_species.sum()

    if n_near_tot:  # catch zeros
        p_shell = N / n_near_tot
    else:
        p_shell = 0
    if n_all_tot:
        p_null = N_sp / n_all_tot
    else:
        p_null = 0

    # n events: assume normal
    summary['Mean # species, shell'] = n_near_species.mean()
    summary['SD # species, shell'] = sd = n_near_species.std()

    # actually hypergeometric, but binomials are easier
    # X ~ B(n_near_tot, p_shell)
    summary['Mean fraction of species, shell'] = p_shell
    summary['SD fraction of species, shell'] = sd_frac = sd / n_near.mean()

    if p_null == 0:
        summary['Mean enrichment'] = 1
        summary['SD enrichment'] = 0
    else:
        summary['Mean enrichment'] = p_shell / p_null
        summary['SD enrichment'] = sd_frac / p_null

    return summary

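# --- Hedged numerical illustration (not from the original source) -----------
# Mirrors the enrichment arithmetic above with made-up counts:
#   p_shell = (# of this species in the shell) / (total lipids in the shell)
#   p_null  = (# of this species overall)      / (total lipids overall)
#   enrichment = p_shell / p_null
# All names below are hypothetical example values.
n_near_tot_demo, n_all_tot_demo = 400, 10_000
n_species_shell_demo, n_species_all_demo = 80, 1_000
p_shell_demo = n_species_shell_demo / n_near_tot_demo   # 0.2
p_null_demo = n_species_all_demo / n_all_tot_demo       # 0.1
print(p_shell_demo / p_null_demo)  # 2.0: the species is over-represented in the shell
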