def test_stable_cumsum():
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
def test_stable_cumsum():
    if np_version < (1, 9):
        raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))
def test_stable_cumsum():
    if np_version < (1, 9):
        raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_raise_message(RuntimeError,
                         'cumsum was found to be unstable: its last element '
                         'does not correspond to sum',
                         stable_cumsum, r, rtol=0, atol=0)
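# None of the snippets in this section define stable_cumsum itself. For
# reference, a minimal sketch modelled on sklearn.utils.extmath.stable_cumsum
# (the exact upstream code varies by version: older releases raised
# RuntimeError where newer ones warn, which is why the tests above differ):
import warnings
import numpy as np

def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Cumulative sum in float64, checked against a direct sum for stability."""
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(np.isclose(out.take(-1, axis=axis), expected,
                             rtol=rtol, atol=atol, equal_nan=True)):
        warnings.warn('cumsum was found to be unstable: its last element '
                      'does not correspond to sum', RuntimeWarning)
    return out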
def _weighted_percentile(array, sample_weight, percentile=50):
    """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``."""
    sorted_idx = np.argsort(array)

    # Find index of median prediction for each sample
    weight_cdf = stable_cumsum(sample_weight[sorted_idx])
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]]
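# Quick sanity check of the inverse-CDF lookup above (a hypothetical toy
# call, not from the source): a heavily weighted element pulls the weighted
# median onto itself.
import numpy as np

a = np.array([1, 2, 3, 4, 5])
w = np.array([1., 1., 1., 1., 10.])
# weight_cdf = [1, 2, 3, 4, 14]; half the total weight is 7, and
# searchsorted lands on the last index, so the weighted median is 5
assert _weighted_percentile(a, w, percentile=50) == 5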
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=np.float64).

    n_clusters : int
        The number of seeds to choose

    x_squared_norms : ndarray of shape (n_samples,)
        Squared Euclidean norm of each data point.

    random_state : RandomState instance
        The generator used to initialize the centers.
        See :term:`Glossary <random_state>`.

    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape

    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(
        centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
        squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)
        # XXX: numerical imprecision can result in a candidate_id out of range
        np.clip(candidate_ids, None, closest_dist_sq.size - 1,
                out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # update closest distances squared and potential for each candidate
        np.minimum(closest_dist_sq, distance_to_candidates,
                   out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]

    return centers
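# A usage sketch of the seeding routine above (this mirrors how scikit-learn's
# k-means driver calls its private helper; row_norms supplies the precomputed
# squared norms the function expects):
import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(42)
X = rng.rand(500, 8)
centers = _k_init(X, n_clusters=5,
                  x_squared_norms=row_norms(X, squared=True),
                  random_state=rng)
assert centers.shape == (5, 8)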
def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
    """Init n_clusters seeds with a method similar to k-means++

    Parameters
    ----------
    D : array, shape (n_samples, n_samples)
        The distance matrix we will use to select medoid indices.

    n_clusters : integer
        The number of seeds to choose

    random_state_ : RandomState
        The generator used to initialize the centers.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-medoids clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, _ = D.shape

    centers = np.empty(n_clusters, dtype=int)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    center_id = random_state_.randint(n_samples)
    centers[0] = center_id

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = D[centers[0], :] ** 2
    current_pot = closest_dist_sq.sum()

    # pick the remaining n_clusters-1 points
    for cluster_index in range(1, n_clusters):
        rand_vals = (random_state_.random_sample(n_local_trials)
                     * current_pot)
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = D[candidate_ids, :] ** 2

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        centers[cluster_index] = best_candidate
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
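# Unlike _k_init, this variant consumes a precomputed distance matrix and
# returns medoid *indices* rather than coordinates. A hedged sketch of
# driving it standalone, passing None for the unused `self` (illustration
# only, not how the library invokes it):
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
D = squareform(pdist(X))                  # (n_samples, n_samples) distances
medoid_ids = _kpp_init(None, D, n_clusters=3, random_state_=rng)
assert medoid_ids.shape == (3,)           # indices into X, not coordinates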
def _fit_full(self, X, n_components):
    self.accountant.check(self.epsilon, 0)

    n_samples, n_features = X.shape

    if self.centered:
        self.mean_ = np.zeros_like(np.mean(X, axis=0))
    else:
        if self.bounds is None:
            warnings.warn(
                "Bounds parameter hasn't been specified, so falling back to "
                "determining range from the data.\n"
                "This will result in additional privacy leakage. To ensure "
                "differential privacy with no additional privacy loss, "
                "specify `bounds` for each value returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_features)
        self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds,
                          axis=0, accountant=BudgetAccountant())

    X -= self.mean_

    if self.data_norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the "
            "data provided. This will result in additional privacy leakage. "
            "To ensure differential privacy and no additional privacy "
            "leakage, specify `data_norm` at initialisation.",
            PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    s, u = covariance_eig(
        X, epsilon=self.epsilon if self.centered else self.epsilon / 2,
        norm=self.data_norm,
        dims=n_components if isinstance(n_components, Integral) else None)
    u, _ = svd_flip(u, np.zeros_like(u).T)
    s = np.sqrt(s)

    components_ = u.T

    # Get variance explained by singular values
    explained_variance_ = np.sort((s ** 2) / (n_samples - 1))[::-1]
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = s.copy()  # Store the singular values.

    # Post-process the number of components required
    if n_components == 'mle':
        # TODO: Update when sklearn requirement changes to >= 0.23,
        # removing try...except
        try:
            n_components = sk_pca._infer_dimension(explained_variance_,
                                                   n_samples)
        except AttributeError:
            n_components = sk_pca._infer_dimension_(explained_variance_,
                                                    n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is greater than the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    self.accountant.spend(self.epsilon, 0)

    return u, s[:n_components], u.T
def fit(self, X, y=None, sample_weight=None):
    """Train whitening spatial filters.

    Parameters
    ----------
    X : ndarray, shape (n_matrices, n_channels, n_channels)
        Set of SPD matrices.
    y : None
        Ignored as unsupervised.
    sample_weight : None | ndarray, shape (n_matrices,), default=None
        Weight of each matrix, to compute the weighted mean covariance
        matrix used for whitening and dimension reduction. If None, it
        uses equal weights.

    Returns
    -------
    self : Whitening instance
        The Whitening instance.
    """
    # weighted mean of input covariance matrices
    Xm = mean_covariance(
        X, metric=self.metric, sample_weight=sample_weight)

    # whitening without dimension reduction
    if self.dim_red is None:
        self.n_components_ = X.shape[-1]
        self.filters_ = invsqrtm(Xm)
        self.inv_filters_ = sqrtm(Xm)

    # whitening with dimension reduction
    elif isinstance(self.dim_red, dict):
        if len(self.dim_red) > 1:
            raise ValueError(
                'Dictionary dim_red must contain only one element (Got %d)'
                % len(self.dim_red))
        dim_red_key = next(iter(self.dim_red))
        dim_red_val = self.dim_red.get(dim_red_key)

        eigvals, eigvecs = eigh(Xm, eigvals_only=False)
        eigvals = eigvals[::-1]       # sort eigvals in descending order
        eigvecs = np.fliplr(eigvecs)  # idem for eigvecs

        if dim_red_key == 'n_components':
            if dim_red_val < 1:
                raise ValueError(
                    'Value n_components must be at least 1 (Got %d)'
                    % dim_red_val)
            if not isinstance(dim_red_val, numbers.Integral):
                raise ValueError(
                    'n_components=%d must be of type int (Got %r)'
                    % (dim_red_val, type(dim_red_val)))
            self.n_components_ = min(dim_red_val, X.shape[-1])

        elif dim_red_key == 'expl_var':
            if not 0 < dim_red_val <= 1:
                raise ValueError(
                    'Value expl_var must be included in (0, 1] (Got %r)'
                    % dim_red_val)
            cum_expl_var = stable_cumsum(eigvals / eigvals.sum())
            if self.verbose:
                print('Cumulative explained variance: \n %r'
                      % cum_expl_var)
            self.n_components_ = np.searchsorted(
                cum_expl_var, dim_red_val, side='right') + 1

        elif dim_red_key == 'max_cond':
            if dim_red_val <= 1:
                raise ValueError(
                    'Value max_cond must be strictly greater than 1 '
                    '(Got %d)' % dim_red_val)
            conds = eigvals[0] / eigvals
            if self.verbose:
                print('Condition numbers: \n %r' % conds)
            self.n_components_ = np.searchsorted(
                conds, dim_red_val, side='left')

        else:
            raise ValueError(
                'Unknown key in parameter dim_red: %r' % dim_red_key)

        # dimension reduction
        if self.verbose:
            print('Dimension reduction of Whitening on %d components'
                  % self.n_components_)
        pca_filters = eigvecs[:, :self.n_components_]
        pca_sqrtvals = np.sqrt(eigvals[:self.n_components_])
        # whitening
        self.filters_ = pca_filters @ np.diag(1. / pca_sqrtvals)
        self.inv_filters_ = np.diag(pca_sqrtvals).T @ pca_filters.T

    else:
        raise ValueError(
            'Unknown type for parameter dim_red: %r' % type(self.dim_red))

    return self
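# A hedged example of the three dim_red modes accepted by fit above (the
# constructor signature is assumed from the attributes the method uses;
# check the library's documentation for the exact API):
import numpy as np

rng = np.random.RandomState(7)
A = rng.randn(50, 10, 10)
covmats = A @ A.transpose(0, 2, 1) + 10 * np.eye(10)  # a stack of SPD matrices

Whitening(dim_red={'n_components': 4}).fit(covmats)   # fixed target dimension
Whitening(dim_red={'expl_var': 0.99}).fit(covmats)    # cumulative variance cut
Whitening(dim_red={'max_cond': 100}).fit(covmats)     # condition-number cap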
def _fit_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    _validate_n_components(n_components, n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)
    centering_algo = daal4py.normalization_zscore(
        fptype=fpType, doScale=False)
    pca_alg = daal4py.pca(
        fptype=fpType,
        method='svdDense',
        normalization=centering_algo,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=daal_n_components
    )
    pca_res = pca_alg.compute(X)

    self.mean_ = pca_res.means.ravel()
    variances_ = pca_res.variances.ravel()
    components_ = pca_res.eigenvectors
    explained_variance_ = pca_res.eigenvalues.ravel()
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is greater than the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
def _boost(self, iboost, X, y, sample_weight, random_state):
    """Implement a single boost for regression

    Perform a single boost according to the AdaBoost.R2 algorithm and
    return the updated sample weights.

    Parameters
    ----------
    iboost : int
        The index of the current boost iteration.

    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrix can be CSC, CSR, COO,
        DOK, or LIL. DOK and LIL are converted to CSR.

    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape = [n_samples]
        The current sample weights.

    random_state : numpy.RandomState
        The current random number generator

    Returns
    -------
    sample_weight : array-like of shape = [n_samples] or None
        The reweighted sample weights.
        If None then boosting has terminated early.

    estimator_weight : float
        The weight for the current boost.
        If None then boosting has terminated early.

    estimator_error : float
        The regression error for the current boost.
        If None then boosting has terminated early.
    """
    estimator = self._make_estimator(random_state=random_state)

    # Weighted sampling of the training set with replacement
    # For NumPy >= 1.7.0 use np.random.choice
    cdf = stable_cumsum(sample_weight)
    cdf /= cdf[-1]
    uniform_samples = random_state.random_sample(X.shape[0])
    bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')
    # searchsorted returns a scalar
    bootstrap_idx = np.array(bootstrap_idx, copy=False)

    # Fit on the bootstrapped sample and obtain a prediction
    # for all samples in the training set
    estimator.fit(X[bootstrap_idx], y[bootstrap_idx])
    y_predict = estimator.predict(X)

    error_vect = np.abs(y_predict - y)
    error_max = error_vect.max()

    if error_max != 0.:
        error_vect /= error_max

    if self.loss == 'square':
        error_vect **= 2
    elif self.loss == 'exponential':
        error_vect = 1. - np.exp(-error_vect)

    # Calculate the average loss
    estimator_error = (sample_weight * error_vect).sum()

    if estimator_error <= 0:
        # Stop if fit is perfect
        return sample_weight, 1., 0.

    elif estimator_error >= 0.5:
        # Discard current estimator only if it isn't the only one
        if len(self.estimators_) > 1:
            self.estimators_.pop(-1)
        return None, None, None

    beta = estimator_error / (1. - estimator_error)

    # Boost weight using AdaBoost.R2 alg
    estimator_weight = self.learning_rate * np.log(1. / beta)

    if not iboost == self.n_estimators - 1:
        sample_weight *= np.power(
            beta, (1. - error_vect) * self.learning_rate)

    return sample_weight, estimator_weight, estimator_error
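# The weighted bootstrap at the top of _boost is inverse-CDF sampling; a
# standalone illustration with toy weights (not from the source):
import numpy as np

rng = np.random.RandomState(0)
sample_weight = np.array([0.1, 0.1, 0.1, 0.7])   # last sample dominates
cdf = np.cumsum(sample_weight)
cdf /= cdf[-1]                                    # [0.1, 0.2, 0.3, 1.0]
bootstrap_idx = cdf.searchsorted(rng.random_sample(10000), side='right')
# index 3 is drawn with probability ~0.7, matching its normalized weight
print(np.bincount(bootstrap_idx) / 10000.)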
def fit(self, X, y=None):
    """Fit.

    Compute and diagonalize cospectra, to estimate forward and backward
    spatial filters.

    Parameters
    ----------
    X : ndarray, shape (n_subjects, n_conditions, n_channels, n_samples) | \
            list of n_subjects of list of n_conditions ndarray of shape \
            (n_channels, n_samples), with same n_conditions and n_channels \
            but different n_samples
        Signal in channel space, acquired for different subjects and under
        different experimental conditions.
    y : None
        Currently not used, here for compatibility with sklearn API.

    Returns
    -------
    self : AJDC instance
        The AJDC instance.
    """
    # definition of params for Welch's method
    cospcov = est.CospCovariances(
        window=self.window, overlap=self.overlap, fmin=self.fmin,
        fmax=self.fmax, fs=self.fs)
    # estimation of cospectra on subjects and conditions
    cosp = []
    for s in range(len(X)):
        cosp_ = cospcov.transform(X[s])
        if s == 0:
            n_conditions = cosp_.shape[0]
            self.n_channels_ = cosp_.shape[1]
            self.freqs_ = cospcov.freqs_
        else:
            if n_conditions != cosp_.shape[0]:
                raise ValueError('Unequal number of conditions')
            if self.n_channels_ != cosp_.shape[1]:
                raise ValueError('Unequal number of channels')
        cosp.append(cosp_)
    cosp = numpy.transpose(numpy.array(cosp), axes=(0, 1, 4, 2, 3))

    # trace-normalization of cospectra, Eq(3) in [2]
    cosp = normalize(cosp, "trace")
    # average of cospectra across subjects, Eq(7) in [2]
    cosp = numpy.mean(cosp, axis=0, keepdims=False)
    # concatenation of cospectra along conditions
    self._cosp_channels = numpy.concatenate(cosp, axis=0)
    # estimation of non-diagonality weights, Eq(B.1) in [1]
    weights = get_nondiag_weight(self._cosp_channels)

    # dimension reduction, computed on the weighted mean of cospectra
    # across frequencies (and conditions)
    cosp_av = numpy.average(self._cosp_channels, axis=0, weights=weights)
    eigvals, eigvecs = eigh(cosp_av, eigvals_only=False)
    eigvals = eigvals[::-1]          # sort eigvals in descending order
    eigvecs = numpy.fliplr(eigvecs)  # idem for eigvecs
    cum_expl_var = stable_cumsum(eigvals / eigvals.sum())
    self.n_sources_ = numpy.searchsorted(
        cum_expl_var, self.expl_var, side='right') + 1
    if self.verbose:
        print("Fitting AJDC to data using {} components ".format(
            self.n_sources_))
    pca_filters = eigvecs[:, :self.n_sources_]
    pca_vals = eigvals[:self.n_sources_]

    # whitening, Eq.(8) in [2]
    whit_filters = pca_filters @ numpy.diag(1. / numpy.sqrt(pca_vals))
    whit_inv_filters = pca_filters @ numpy.diag(numpy.sqrt(pca_vals))

    # apply dimension reduction and whitening on cospectra
    cosp_rw = whit_filters.T @ self._cosp_channels @ whit_filters

    # approximate joint diagonalization, currently by Pham's algorithm [3]
    diag_filters, self._cosp_sources = ajd_pham(
        cosp_rw, n_iter_max=100, sample_weight=weights)

    # computation of forward and backward filters, Eq.(9) and (10) in [2]
    self.forward_filters_ = diag_filters @ whit_filters.T
    self.backward_filters_ = whit_inv_filters @ inv(diag_filters)
    return self
def _fit_full(self, X, n_components): """Fit the model by computing full SVD on X""" n_samples, n_features = X.shape if n_components == 'mle': if n_samples < n_features: raise ValueError("n_components='mle' is only supported " "if n_samples >= n_features") elif n_components == 'latent_root': pass elif not 0 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 0 and " "min(n_samples, n_features)=%r with " "svd_solver='full'" % (n_components, min(n_samples, n_features))) elif n_components >= 1: if not isinstance(n_components, numbers.Integral): raise ValueError("n_components=%r must be of type int " "when greater than or equal to 1, " "was of type=%r" % (n_components, type(n_components))) # Center data self.mean_ = np.mean(X, axis=0) X -= self.mean_ U, S, V = linalg.svd(X, full_matrices=False) # flip eigenvectors' sign to enforce deterministic output U, V = svd_flip(U, V) components_ = V # Get variance explained by singular values explained_variance_ = (S**2) / (n_samples - 1) total_var = explained_variance_.sum() explained_variance_ratio_ = explained_variance_ / total_var singular_values_ = S.copy() # Store the singular values. # Postprocess the number of components required if n_components == 'mle': n_components = \ _infer_dimension_(explained_variance_, n_samples, n_features) elif n_components == 'latent_root': n_components = (explained_variance_ > 1).sum() elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold ratio_cumsum = stable_cumsum(explained_variance_ratio_) n_components = np.searchsorted(ratio_cumsum, n_components) + 1 # Compute noise covariance using Probabilistic PCA model # The sigma2 maximum likelihood (cf. eq. 12.46) if n_components < min(n_features, n_samples): self.noise_variance_ = explained_variance_[n_components:].mean() else: self.noise_variance_ = 0. self.n_samples_, self.n_features_ = n_samples, n_features self.components_ = components_[:n_components] self.n_components_ = n_components self.explained_variance_ = explained_variance_[:n_components] self.explained_variance_ratio_ = \ explained_variance_ratio_[:n_components] self.singular_values_ = singular_values_[:n_components] return U, S, V
def _k_init_metric(X, n_clusters, cdist_metric, random_state,
                   n_local_trials=None):
    """Init n_clusters seeds according to k-means++ with a custom distance
    metric.

    Parameters
    ----------
    X : array, shape (n_samples, n_timestamps, n_features)
        The data to pick seeds for.

    n_clusters : integer
        The number of seeds to choose

    cdist_metric : function
        Function to be called for cross-distance computations

    random_state : RandomState instance
        Generator used to initialize the centers.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version adapted from scikit-learn for use with a custom metric in place
    of Euclidean distance.
    """
    n_samples, n_timestamps, n_features = X.shape

    centers = numpy.empty((n_clusters, n_timestamps, n_features),
                          dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = cdist_metric(centers[0, numpy.newaxis], X) ** 2
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        # XXX: numerical imprecision can result in a candidate_id out of range
        numpy.clip(candidate_ids, None, closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = cdist_metric(X[candidate_ids], X) ** 2

        # update closest distances squared and potential for each candidate
        numpy.minimum(closest_dist_sq, distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]

    return centers
def _k_init(norm, X, n_clusters, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    Parameters
    ----------
    norm : `l1` or `l2`
        manhattan or euclidean distance

    X : array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=numpy.float64).

    n_clusters : integer
        The number of seeds to choose

    random_state : int, RandomState instance
        The generator used to initialize the centers. Use an int to make
        the randomness deterministic.
        See :term:`Glossary <random_state>`.

    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape

    centers = numpy.empty((n_clusters, n_features), dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    if issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    if norm.lower() == 'l2':
        dist_fct = lambda x, y: euclidean_distances(x, y, squared=True)
    elif norm.lower() == 'l1':
        dist_fct = lambda x, y: manhattan_distances(x, y)
    else:
        raise NotImplementedError(
            "norm must be 'l1' or 'l2' not '{}'.".format(norm))

    closest_dist_sq = dist_fct(centers[0, numpy.newaxis], X)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        numpy.clip(candidate_ids, None, closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = dist_fct(X[candidate_ids], X)

        # update closest distances squared and potential for each candidate
        numpy.minimum(closest_dist_sq, distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        if issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers
def uplift_curve(y_true, uplift, treatment):
    """Compute Uplift curve.

    For computing the area under the Uplift Curve, see
    :func:`.uplift_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.uplift_auc_score`: Compute normalized Area Under the Uplift
        curve from prediction scores.

        :func:`.perfect_uplift_curve`: Compute the perfect Uplift curve.

        :func:`.plot_uplift_curve`: Plot Uplift curves from predictions.

        :func:`.qini_curve`: Compute Qini curve.

    References:
        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for
        uplift modeling. ArXiv, abs/2002.05897.
    """
    check_consistent_length(y_true, uplift, treatment)
    check_is_binary(treatment)
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), \
        np.array(treatment)

    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]
    y_true, uplift, treatment = y_true[desc_score_indices], \
        uplift[desc_score_indices], treatment[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = (np.divide(y_trmnt, num_trmnt,
                              out=np.zeros_like(y_trmnt),
                              where=num_trmnt != 0) -
                    np.divide(y_ctrl, num_ctrl,
                              out=np.zeros_like(y_ctrl),
                              where=num_ctrl != 0)) * num_all

    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values
def qini_curve(y_true, uplift, treatment):
    """Compute Qini curve.

    For computing the area under the Qini Curve, see
    :func:`.qini_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.qini_auc_score`: Compute the area under the Qini curve.

        :func:`.perfect_qini_curve`: Compute the perfect Qini curve.

        :func:`.plot_qini_curves`: Plot Qini curves from predictions.

        :func:`.uplift_curve`: Compute Uplift curve.

    References:
        Nicholas J Radcliffe. (2007). Using control groups to target on
        predicted lift: Building and assessing uplift model. Direct
        Marketing Analytics Journal, (3):14-21, 2007.

        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for
        uplift modeling. ArXiv, abs/2002.05897.
    """
    check_consistent_length(y_true, uplift, treatment)
    check_is_binary(treatment)
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), \
        np.array(treatment)

    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]

    y_true = y_true[desc_score_indices]
    treatment = treatment[desc_score_indices]
    uplift = uplift[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = y_trmnt - y_ctrl * np.divide(
        num_trmnt, num_ctrl, out=np.zeros_like(num_trmnt),
        where=num_ctrl != 0)

    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values
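# A hedged toy run of the two curve functions above (hypothetical data:
# y_true is the observed outcome, uplift the model score, treatment the
# group flag):
import numpy as np

y_true    = np.array([1, 1, 0, 0, 1, 0, 0, 1])
uplift    = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2])
treatment = np.array([1, 0, 1, 1, 0, 0, 1, 0])

x_up, y_up = uplift_curve(y_true, uplift, treatment)
x_qi, y_qi = qini_curve(y_true, uplift, treatment)
# both curves start at (0, 0); x counts the samples targeted (by descending
# score) and y the estimated cumulative gain at that targeting depth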
def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
                        min_rate=None, beta=1.):
    """Decision threshold calibration for pairwise binary classification

    Method that calibrates the decision threshold (cutoff point) of the
    metric learner. This threshold will then be used when calling the
    method `predict`. The methods for picking cutoff points make use of
    traditional binary classification evaluation statistics such as the
    true positive and true negative rates and F-scores. The threshold will
    be found to maximize the chosen score on the validation set
    ``(pairs_valid, y_valid)``.

    See more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    strategy : str, optional (default='accuracy')
        The strategy to use for choosing the cutoff threshold.

        'accuracy'
            Selects a decision threshold that maximizes the accuracy.
        'f_beta'
            Selects a decision threshold that maximizes the f_beta score,
            with beta given by the parameter `beta`.
        'max_tpr'
            Selects a decision threshold that yields the highest true
            positive rate with true negative rate at least equal to the
            value of the parameter `min_rate`.
        'max_tnr'
            Selects a decision threshold that yields the highest true
            negative rate with true positive rate at least equal to the
            value of the parameter `min_rate`.

    beta : float in [0, 1], optional (default=1.)
        Beta value to be used in case strategy == 'f_beta'.

    min_rate : float in [0, 1] or None, (default=None)
        In case strategy is 'max_tpr' or 'max_tnr' this parameter must be
        set to specify the minimal value for the true negative rate or
        true positive rate respectively that needs to be achieved.

    pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features)
        The validation set of pairs to use to set the threshold.

    y_valid : array-like, shape=(n_pairs_valid,)
        The labels of the pairs of the validation set to use to set the
        threshold. They must be +1 for positive pairs and -1 for negative
        pairs.

    References
    ----------
    .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
           evaluation tool in clinical medicine, MH Zweig, G Campbell -
           Clinical chemistry, 1993

    .. [2] Most of the code of this function is from scikit-learn's
           PR #10117

    See Also
    --------
    sklearn.calibration : scikit-learn's module for calibrating classifiers
    """
    self._validate_calibration_params(strategy, min_rate, beta)

    pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                type_of_inputs='tuples')

    n_samples = pairs_valid.shape[0]
    if strategy == 'accuracy':
        scores = self.decision_function(pairs_valid)
        scores_sorted_idces = np.argsort(scores)[::-1]
        scores_sorted = scores[scores_sorted_idces]
        # true labels ordered by decision_function value: (higher first)
        y_ordered = y_valid[scores_sorted_idces]
        # we need to add a threshold that will reject all points
        scores_sorted = np.concatenate([[scores_sorted[0] + 1],
                                        scores_sorted])

        # finds the threshold that maximizes the accuracy:
        cum_tp = stable_cumsum(y_ordered == 1)  # cumulative number of
        # true positives
        # we need to add the point where all samples are rejected:
        cum_tp = np.concatenate([[0.], cum_tp])
        cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1)
        cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1]
        cum_accuracy = (cum_tp + cum_tn) / n_samples
        imax = np.argmax(cum_accuracy)
        # we set the threshold to the lowest accepted score
        # note: we are working with negative distances but we want the
        # threshold to be with respect to the actual distances so we take
        # the minus sign
        self.threshold_ = - scores_sorted[imax]
        # note: if the best is to reject all points it's already one of the
        # thresholds (scores_sorted[0])
        return self

    if strategy == 'f_beta':
        precision, recall, thresholds = precision_recall_curve(
            y_valid, self.decision_function(pairs_valid), pos_label=1)
        # here the thresholds are decreasing
        # We ignore the warnings here, in the same taste as
        # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065
        # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284
        with np.errstate(divide='ignore', invalid='ignore'):
            f_beta = ((1 + beta**2) * (precision * recall) /
                      (beta**2 * precision + recall))
        # We need to set nans to zero otherwise they will be considered
        # higher than the others (also discussed in
        # https://github.com/scikit-learn/
        # scikit-learn/pull/10117/files#r262115773)
        f_beta[np.isnan(f_beta)] = 0.
        imax = np.argmax(f_beta)
        # we set the threshold to the lowest accepted score
        # note: we are working with negative distances but we want the
        # threshold to be with respect to the actual distances so we take
        # the minus sign
        self.threshold_ = - thresholds[imax]
        # Note: we don't need to deal with rejecting all points (i.e.
        # threshold = max_scores + 1), since this can never happen to be
        # optimal (see a more detailed discussion in
        # test_calibrate_threshold_extreme)
        return self

    fpr, tpr, thresholds = roc_curve(y_valid,
                                     self.decision_function(pairs_valid),
                                     pos_label=1)
    # here the thresholds are decreasing
    if strategy in ['max_tpr', 'max_tnr']:
        if strategy == 'max_tpr':
            indices = np.where(1 - fpr >= min_rate)[0]
            imax = np.argmax(tpr[indices])

        if strategy == 'max_tnr':
            indices = np.where(tpr >= min_rate)[0]
            imax = np.argmax(1 - fpr[indices])

        imax_valid = indices[imax]
        # note: we are working with negative distances but we want the
        # threshold to be with respect to the actual distances so we take
        # the minus sign
        if indices[imax] == len(thresholds):  # we want to accept everything
            self.threshold_ = - (thresholds[imax_valid] - 1)
        else:
            # thanks to roc_curve, the first point will always be
            # max_scores + 1, see:
            # https://github.com/scikit-learn/scikit-learn/pull/13523
            self.threshold_ = - thresholds[imax_valid]
        return self
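# The 'accuracy' branch above is easiest to see on a tiny toy example
# (hypothetical numbers, not from the source). A minimal sketch of the
# cumulative true-positive/true-negative bookkeeping:
import numpy as np

scores = np.array([3.2, 1.5, 0.4, -0.7])  # decision scores, sorted descending
y_ordered = np.array([1, -1, 1, -1])      # labels in that order

scores_sorted = np.concatenate([[scores[0] + 1], scores])  # reject-all cutoff
cum_tp = np.concatenate([[0.], np.cumsum(y_ordered == 1)])
cum_tn = np.concatenate([[0.], np.cumsum(y_ordered[::-1] == -1)])[::-1]
cum_accuracy = (cum_tp + cum_tn) / len(y_ordered)
# cum_accuracy -> [0.5, 0.75, 0.5, 0.75, 0.5]: the best cutoffs accept either
# only the top score or the top three, both giving 75% accuracy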
def _fit(self, X, base_point=None, point_type='vector'):
    """Fit the model by computing full SVD on X"""
    if point_type == 'matrix':
        raise NotImplementedError(
            'This is currently only implemented for vectors.')
    if base_point is None:
        base_point = self.metric.mean(X)
    tangent_vecs = self.metric.log(X, base_point=base_point)

    # Convert to sklearn format
    X = tangent_vecs

    X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
                    copy=self.copy)

    # Handle n_components==None
    if self.n_components is None:
        n_components = min(X.shape)
    else:
        n_components = self.n_components
    n_samples, n_features = X.shape

    if n_components == 'mle':
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 0 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='full'"
                         % (n_components, min(n_samples, n_features)))
    elif n_components >= 1:
        if not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError("n_components=%r must be of type int "
                             "when greater than or equal to 1, "
                             "was of type=%r"
                             % (n_components, type(n_components)))

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    U, S, V = linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = S.copy()  # Store the singular values.

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = \
            _infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is greater than the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    return U, S, V
def _fit_full_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)

    covariance_algo = daal4py.covariance(
        fptype=fpType, outputMatrixType='covarianceMatrix')
    covariance_res = covariance_algo.compute(X)

    self.mean_ = covariance_res.mean.ravel()
    covariance = covariance_res.covariance
    variances_ = np.array([covariance[i, i] for i in range(n_features)])

    pca_alg = daal4py.pca(
        fptype=fpType,
        method='correlationDense',
        resultsToCompute='eigenvalue',
        isDeterministic=True,
        nComponents=daal_n_components)
    pca_res = pca_alg.compute(X, covariance)

    components_ = pca_res.eigenvectors
    explained_variance_ = np.maximum(pca_res.eigenvalues.ravel(), 0)
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        if sklearn_check_version('0.23'):
            n_components = _infer_dimension(explained_variance_, n_samples)
        else:
            n_components = _infer_dimension_(explained_variance_,
                                             n_samples, n_features)
    elif 0 < n_components < 1.0:
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(
            ratio_cumsum, n_components, side='right') + 1

    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
def _fit(self, X, base_point=None):
    """Fit the model by computing full SVD on X.

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    base_point : array-like, shape=[..., n_features]
        Point at which to perform the tangent PCA.
        Optional, default to Frechet mean if None.

    Returns
    -------
    U, S, V : array-like
        Matrices of the SVD decomposition
    """
    if base_point is None:
        mean = FrechetMean(metric=self.metric, point_type=self.point_type)
        mean.fit(X)
        base_point = mean.estimate_

    tangent_vecs = self.metric.log(X, base_point=base_point)

    if self.point_type == "matrix":
        if Matrices.is_symmetric(tangent_vecs).all():
            X = SymmetricMatrices.to_vector(tangent_vecs)
        else:
            X = gs.reshape(tangent_vecs, (len(X), -1))
    else:
        X = tangent_vecs

    if self.n_components is None:
        n_components = min(X.shape)
    else:
        n_components = self.n_components
    n_samples, n_features = X.shape

    if n_components == "mle":
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 0 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='full'"
                         % (n_components, min(n_samples, n_features)))
    elif n_components >= 1:
        if not isinstance(n_components, numbers.Integral):
            raise ValueError("n_components=%r must be of type int "
                             "when greater than or equal to 1, "
                             "was of type=%r"
                             % (n_components, type(n_components)))

    # Center data - the mean should be 0 if base_point is the Frechet mean
    self.mean_ = gs.mean(X, axis=0)
    X -= self.mean_

    U, S, V = gs.linalg.svd(X, full_matrices=False)
    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = gs.copy(S)  # Store the singular values.

    # Postprocess the number of components required
    if n_components == "mle":
        n_components = _infer_dimension_(explained_variance_,
                                         n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is greater than the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = gs.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.0

    self.base_point_fit = base_point
    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = int(n_components)
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    return U, S, V
def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given
        by fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false
        negatives are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    check_consistent_length(y_true, y_score)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if (pos_label is None and
        not (np.array_equal(classes, [0, 1]) or
             np.array_equal(classes, [-1, 1]) or
             np.array_equal(classes, [0]) or
             np.array_equal(classes, [-1]) or
             np.array_equal(classes, [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        fps = stable_cumsum(weight)[threshold_idxs] - tps
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]
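# A worked example on the four-point toy set scikit-learn uses to document
# roc_curve:
import numpy as np

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])

fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
# thresholds -> [0.8, 0.4, 0.35, 0.1]
# tps        -> [1, 1, 2, 2]
# fps        -> [0, 1, 1, 2]
# dividing by tps[-1] and fps[-1] gives the (fpr, tpr) points of the ROC curve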