def pairwise_distances_no_broadcast(X, Y):
    """Utility function to calculate row-wise euclidean distance of two
    matrices.

    Different from pair-wise calculation, this function does not broadcast.
    For instance, if X and Y are both (4, 3) matrices, the function returns
    a distance vector with shape (4,), instead of (4, 4).

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        First input samples

    Y : array of shape (n_samples, n_features)
        Second input samples

    Returns
    -------
    distance : array of shape (n_samples,)
        Row-wise euclidean distance of X and Y

    Raises
    ------
    ValueError
        If X and Y do not have the same number of rows and columns.
    """
    X = check_array(X)
    Y = check_array(Y)

    if X.shape[0] != Y.shape[0] or X.shape[1] != Y.shape[1]:
        # The trailing space after "receive" keeps the two adjacent string
        # literals from fusing into "receivematrix".
        raise ValueError("pairwise_distances_no_broadcast function receive "
                         "matrix with different shapes {0} and {1}".format(
                             X.shape, Y.shape))
    return _pairwise_distances_no_broadcast_helper(X, Y)
def predict(self, X, categorical=None):
    """Predict the closest cluster each sample in X belongs to.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to predict.
    categorical : int, list or tuple, optional
        Index (or indices) of the columns that contain categorical data.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.

    Raises
    ------
    AssertionError
        If the model has not been fitted, or if `categorical` is given
        with an unsupported type.
    """
    assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."

    if categorical is not None:
        # Adjacent literals (instead of a backslash continuation inside the
        # string) keep source indentation out of the message.
        assert isinstance(categorical, (int, list, tuple)), (
            "The 'categorical' argument needs to be an integer with the "
            "index of the categorical column in your data, or a list or "
            "tuple of several of them, but it is a {}."
            .format(type(categorical)))
        # A bare index is accepted above; wrap it the same way
        # `k_prototypes` does before handing it to `_split_num_cat`.
        if isinstance(categorical, int):
            categorical = [categorical]

    X = pandas_to_numpy(X)
    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
    # Encode with the mapping stored on the instance (self._enc_map).
    Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
    return _labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
                        self.num_dissim, self.cat_dissim, self.gamma)[0]
def check_array_with_weights(X, weights, **kwargs):
    """Validate a data array together with its optional weights.

    Both inputs go through ``check_array`` with the caller's keyword
    arguments, except that ``copy=False`` is forced for `weights` and
    ``force_all_finite=False`` is forced for `X`. Entries of `X` whose
    weight is exactly zero are overwritten with 0 before the finiteness
    check.

    Returns
    -------
    (X, weights) : the validated pair; `weights` is passed through
        untouched when it is None.

    Raises
    ------
    ValueError
        If the shapes differ, or if `X` still contains NaN/inf at a
        position whose weight is non-zero.
    """
    if weights is None:
        return check_array(X, **kwargs), weights

    # Weights are validated first and never copied.
    weights = check_array(weights, **dict(kwargs, copy=False))
    # Non-finite values in X are tolerated here and checked further down.
    X = check_array(X, **dict(kwargs, force_all_finite=False))

    if X.shape != weights.shape:
        raise ValueError("Shape of `X` and `weights` should match")

    # Zero-weight entries are treated as missing and blanked out.
    X[weights == 0] = 0

    if not np.isfinite(X).all():
        raise ValueError("Input contains NaN or infinity without "
                         "a corresponding zero in `weights`.")
    return X, weights
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """Compute the per-sample cross-entropy and aggregate it.

    `y_true` is binarized with ``LabelBinarizer``; `y_pred` is clipped to
    ``[eps, 1 - eps]``, expanded to two columns when it has a single one,
    and renormalized so that each row sums to 1.

    Parameters
    ----------
    y_true : array-like of labels
    y_pred : array-like of floats
        Predicted probabilities, 1-D or 2-D.
    eps : float
        Clipping bound applied to `y_pred`.
    normalize, sample_weight
        Forwarded unchanged to ``_weighted_sum``.

    Returns
    -------
    The value of ``_weighted_sum(loss, sample_weight, normalize)``.

    Raises
    ------
    ValueError
        If the clipped `y_pred` is not an ndarray, or if `y_true` and
        `y_pred` imply a different number of classes.
    """
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        # Single binarized column: add the complementary column.
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
def _process_inputs(self, X, constraints):
    """Validate X, drop degenerate constraints and initialise the metric.

    `constraints` is unpacked into four index arrays ``(a, b, c, d)``.
    Pairs ``(a, b)`` and ``(c, d)`` whose points are closer than 1e-9
    (per ``vector_norm``) are removed. Stores ``self.X_`` and ``self.A_``.

    Returns
    -------
    The filtered ``(a, b, c, d)`` tuple.

    Raises
    ------
    ValueError
        If no ``(a, b)`` or no ``(c, d)`` pair survives the filtering.
    """
    X = check_array(X)
    self.X_ = X

    sim_left, sim_right, dis_left, dis_right = constraints

    # check to make sure that no two constrained vectors are identical
    keep_sim = vector_norm(X[sim_left] - X[sim_right]) > 1e-9
    sim_left, sim_right = sim_left[keep_sim], sim_right[keep_sim]
    keep_dis = vector_norm(X[dis_left] - X[dis_right]) > 1e-9
    dis_left, dis_right = dis_left[keep_dis], dis_right[keep_dis]

    if len(sim_left) == 0:
        raise ValueError('No non-trivial similarity constraints given for MMC.')
    if len(dis_left) == 0:
        raise ValueError('No non-trivial dissimilarity constraints given for MMC.')

    # init metric
    if self.A0 is not None:
        self.A_ = check_array(self.A0)
    else:
        self.A_ = np.identity(X.shape[1])
        if not self.diagonal:
            # Don't know why division by 10... it's in the original code
            # and seems to affect the overall scale of the learned metric.
            self.A_ /= 10.0

    return sim_left, sim_right, dis_left, dis_right
def _impose_f_order(X):
    """Pass X (or its transpose) through ``check_array`` with ``order='F'``.

    Returns
    -------
    (array, transposed) : `transposed` is True when X was C-contiguous and
        its transpose was validated instead of X itself.
    """
    # important to access flags instead of calling np.isfortran,
    # this catches corner cases.
    if not X.flags.c_contiguous:
        return check_array(X, copy=False, order='F'), False
    return check_array(X.T, copy=False, order='F'), True
def _prepare_inputs(self, X, W):
    """Validate X and W, set the prior ``self.M_`` and return ``X.T L X``.

    ``L`` is ``laplacian(W, normed=False)``. The prior is the pseudo-inverse
    (``pinvh``) of the feature covariance of X when ``self.use_cov`` is
    truthy, otherwise the identity. Also stores the validated X as
    ``self.X_``.
    """
    X = check_array(X)
    self.X_ = X
    W = check_array(W, accept_sparse=True)

    # set up prior M
    if not self.use_cov:
        self.M_ = np.identity(X.shape[1])
    else:
        self.M_ = pinvh(np.cov(X, rowvar=False))

    graph_laplacian = laplacian(W, normed=False)
    return X.T.dot(graph_laplacian.dot(X))
def fit(self, X, y=None):
    """Fit the detector: cluster X, then derive outlier scores.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels). Forwarded to
        ``_set_n_classes`` and to the clustering estimator's ``fit``.

    Returns
    -------
    self
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)
    n_samples, n_features = X.shape

    # Fallback estimator handed to _validate_estimator; the number of
    # clusters defaults to 8.
    fallback = MiniBatchKMeans(n_clusters=self.n_clusters,
                               random_state=self.random_state)
    self._validate_estimator(default=fallback)
    self.clustering_estimator_.fit(X=X, y=y)

    # labels_ is consistent across sklearn clustering algorithms
    labels = self.clustering_estimator_.labels_
    self.cluster_labels_ = labels
    self.cluster_sizes_ = np.bincount(labels)

    self._set_cluster_centers(X, n_features)
    self._set_small_large_clusters(n_samples)

    self.decision_scores_ = self._decision_function(X, self.cluster_labels_)
    self._process_decision_scores()
    return self
def predict_proba(self, X):
    """
    Predict the membership probabilities for the data samples
    in X using the trained model.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    Returns
    -------
    proba : array, shape (n_samples, n_clusters)
        For column ``c``, ``1 / sum_j exp(K_j - K_c)`` where ``K`` is half
        the output of ``score_samples``.
    """
    X = check_array(X, copy=False, order='C', dtype=sp.float64)
    K = self.score_samples(X)
    T = sp.empty_like(K)

    # Compute the log-likelihood
    K *= (0.5)

    # Compute the posterior; overflow in exp is deliberately ignored.
    with sp.errstate(over='ignore'):
        # `range` instead of `xrange`: the latter does not exist on
        # Python 3, and `range` iterates identically on Python 2.
        for c in range(self.C):
            T[:, c] = 1 / sp.exp(K-K[:, c][:, sp.newaxis]).sum(axis=1)

    return T
def score_samples(self, X, y=None):
    """Compute the negative weighted log probabilities for each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
    y : ignored

    Returns
    -------
    log_prob : array, shape (n_samples, n_clusters)
        Negated per-class cost of each data point in X.
    """
    X = check_array(X, copy=False, order='C', dtype=sp.float64)
    nt = X.shape[0]
    K = sp.empty((nt, self.C))

    # Start the prediction for each class.
    # `range` instead of `xrange`: the latter does not exist on Python 3.
    for c in range(self.C):
        # Compute the constant term
        K[:, c] = self.logdet[c] - 2*sp.log(self.prop[c]) + self.cst

        # Remove the mean
        Xc = X - self.mean[c]

        # Do the projection
        Px = sp.dot(Xc, sp.dot(self.Q[c], self.Q[c].T))
        temp = sp.dot(Px, self.Q[c]/sp.sqrt(self.a[c]))
        K[:, c] += sp.sum(temp**2, axis=1)
        # Residual outside the projected subspace, scaled by b[c]
        K[:, c] += sp.sum((Xc - Px)**2, axis=1)/self.b[c]

    return -K
def score(self, X, y=None):
    """Compute the total log-likelihood of the given data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_dimensions)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
    y : ignored

    Returns
    -------
    log_likelihood : float
        Sum over samples of the log-sum-exp of half the
        ``score_samples`` output.
    """
    X = check_array(X, copy=False, order='C', dtype=sp.float64)
    n_samples = X.shape[0]

    # Compute the membership function and halve it in place
    half_scores = self.score_samples(X)
    half_scores *= (0.5)

    # Logsumexp trick: shift every row by its maximum before exponentiating
    row_max = half_scores.max(axis=1)
    row_max.shape = (n_samples, 1)
    shifted_sum = sp.exp(half_scores - row_max).sum(axis=1)
    per_sample = sp.log(shifted_sum)[:, sp.newaxis] + row_max

    return per_sample.sum()
def fit_transform(self, X, y=None):
    """Fit the model with X and apply the dimensionality reduction on X.

    Centers X, takes its SVD and stores ``mean_``, ``components_``,
    ``explained_variance_`` and ``explained_variance_ratio_``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        New data, where n_samples is the number of samples
        and n_features is the number of features.
    y : ignored

    Returns
    -------
    X_new : array-like, shape (n_samples, n_components)
        Left singular vectors scaled by the singular values.
    """
    X = check_array(X)

    # Keep every feature direction unless a count was requested.
    n_components = (X.shape[1] if self.n_components is None
                    else self.n_components)

    self.mean_ = X.mean(0)
    U, s, VT = np.linalg.svd(X - self.mean_)
    variances = s ** 2 / X.shape[0]

    self.components_ = VT[:n_components]
    self.explained_variance_ = variances[:n_components]
    self.explained_variance_ratio_ = variances[:n_components] / variances.sum()

    return s[:n_components] * U[:, :n_components]
def predict(self, X):
    """Predict class for X.

    Each entry of ``self.trees_`` is called on X to fill one score column;
    the class with the highest score wins.

    Parameters
    ----------
    X : Array-like of shape [n_samples, n_features]
        The input to classify.

    Returns
    -------
    y : array of shape = [n_samples]
        The predicted classes.

    Raises
    ------
    Exception
        If ``self.trees_`` is None.
    """
    X = check_array(X)

    if self.trees_ is None:
        raise Exception("Pattern trees not initialized. Perform a fit first.")

    n_classes = len(self.classes_)
    scores = np.zeros((X.shape[0], n_classes))
    for idx in range(n_classes):
        scores[:, idx] = self.trees_[idx](X)

    # predict the maximum value
    winners = np.argmax(scores, -1)
    return self.classes_.take(winners)
def fit(self, X, y, random_state=np.random):
    """Create constraints from labels and learn the SDML model.

    Parameters
    ----------
    X : array-like, shape (n, d)
        data matrix, where each row corresponds to a single instance
    y : array-like, shape (n,)
        data labels, one for each instance
    random_state : {numpy.random.RandomState, int}, optional
        Random number generator or random seed. If not given, the
        singleton numpy.random will be used.

    Returns
    -------
    self : object
        Whatever ``SDML.fit`` returns for the generated adjacency matrix.
    """
    labels = check_array(y, ensure_2d=False)

    if self.num_constraints is not None:
        n_constraints = self.num_constraints
    else:
        # Default budget: 20 constraints per ordered pair of classes.
        n_classes = len(np.unique(labels))
        n_constraints = 20 * n_classes**2

    subset = Constraints.random_subset(labels, self.num_labeled,
                                       random_state=random_state)
    adjacency = subset.adjacency_matrix(n_constraints,
                                        random_state=random_state)
    return SDML.fit(self, X, adjacency)
def predict(self, X):
    """Predict multi-output variable using a model
    trained for each target variable.

    Parameters
    ----------
    X : (sparse) array-like, shape (n_samples, n_features)
        Data.

    Returns
    -------
    y : (sparse) array-like, shape (n_samples, n_outputs)
        Multi-output targets predicted across multiple predictors.
        Note: Separate models are generated for each predictor.

    Raises
    ------
    ValueError
        If the base estimator has no ``predict`` attribute.
    """
    check_is_fitted(self, 'estimators_')
    if not hasattr(self.estimator, "predict"):
        raise ValueError("The base estimator should implement a predict method")

    X = check_array(X, accept_sparse=True)

    # One delayed `predict` call per fitted estimator.
    tasks = (delayed(parallel_helper)(fitted, 'predict', X)
             for fitted in self.estimators_)
    per_estimator = Parallel(n_jobs=self.n_jobs)(tasks)

    # Stack per-estimator outputs as columns.
    return np.asarray(per_estimator).T
def predict_learned(self, X, return_std=False):
    """Predict with the stored ``*_learned`` GP posterior quantities.

    Parameters
    ----------
    X : array-like
        Query points, validated with ``check_array``.
    return_std : bool, default False
        When True, also return the predictive standard deviation.

    Returns
    -------
    y_mean, or ``(y_mean, y_std)`` when `return_std` is True. Negative
    variances are clamped to 0 (with a warning) before the square root.
    """
    X = check_array(X)

    # Predict based on GP posterior
    K_trans = self.kernel_(X, self.X_train_learned)
    posterior_mean = K_trans.dot(self.alpha_learned)  # Line 4 (y_mean = f_star)
    y_mean = self.y_train_mean_learned + posterior_mean  # undo normal.

    if not return_std:
        return y_mean

    # compute inverse K_inv of K based on its Cholesky
    # decomposition L and its inverse L_inv
    chol = self.L_learned
    L_inv = solve_triangular(chol.T, np.eye(chol.shape[0]))
    K_inv = L_inv.dot(L_inv.T)

    # Compute variance of predictive distribution
    y_var = self.kernel_.diag(X)
    y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv)

    # Check if any of the variances is negative because of
    # numerical issues. If yes: set the variance to 0.
    negative = y_var < 0
    if np.any(negative):
        warnings.warn("Predicted variances smaller than 0. "
                      "Setting those variances to 0.")
        y_var[negative] = 0.0
    return y_mean, np.sqrt(y_var)
def chi2(X, y):
    """Compute chi-squared statistics between features and (multi-)labels.

    `y` is binarized with ``MultiLabelBinarizer``; observed counts are
    ``Y.T @ X`` and expected counts are the outer product of the class
    frequencies and the feature totals.

    Returns
    -------
    The result of ``_chisquare(observed, expected)``.

    Raises
    ------
    ValueError
        If X contains negative values.
    """
    X = check_array(X, accept_sparse='csr')

    # For sparse input only the stored values need inspecting.
    stored_values = X.data if issparse(X) else X
    if np.any(stored_values < 0):
        raise ValueError("Input X must be non-negative.")

    Y = MultiLabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    feature_count = check_array(X.sum(axis=0))
    class_prob = check_array(Y.mean(axis=0))
    expected = np.dot(class_prob.T, feature_count)

    return _chisquare(observed, expected)
def chi2_contingency_matrix(X_train, y_train):
    """Return the per-class, per-feature chi-squared contributions.

    `X_train` is copied and every stored value is replaced by 1, so only
    the presence of a feature counts. The input must expose a writable
    ``.data`` attribute (presumably a scipy sparse matrix; confirm against
    callers).

    Parameters
    ----------
    X_train : sparse matrix, shape (n_samples, n_features)
    y_train : array-like of labels, binarized with ``LabelBinarizer``

    Returns
    -------
    contingency_matrix : ndarray, shape (n_classes, n_features)
        ``(observed - expected) ** 2 / expected``, with zero expected
        counts replaced by 1 before the division.
    """
    X = X_train.copy()
    X.data = np.ones_like(X.data)
    # No negativity check is needed: every stored value was just set to 1.
    X = check_array(X, accept_sparse='csr')

    Y = LabelBinarizer().fit_transform(y_train)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    expected = np.dot(class_prob.T, feature_count)

    observed = np.asarray(observed, dtype=np.float64)

    # Reuse observed for chi-squared statistics
    contingency_matrix = observed
    contingency_matrix -= expected
    contingency_matrix **= 2

    # Avoid division by zero where nothing is expected.
    expected[expected == 0.0] = 1.0

    contingency_matrix /= expected

    return contingency_matrix
def fit(self, X, y=None):
    """Compute the mean and std to be used for later scaling.

    Warning kept from the original author: don't trust the documentation
    of this module!

    Three paths exist:

    * sparse X with ``self.center_sparse``: mean and variance are taken
      per column over the *stored* entries only;
    * sparse X otherwise: no mean is stored, and the std comes from
      ``mean_variance_axis`` when ``self.with_std`` is set;
    * dense X: delegated to ``_mean_and_std``.

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.
    y : ignored

    Returns
    -------
    self

    Raises
    ------
    ValueError
        For sparse X when ``with_mean`` is set but ``center_sparse`` is not.
    """
    X = check_array(X, copy=self.copy, accept_sparse="csc",
                    ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        X = X.astype(np.float32)
    if sparse.issparse(X):
        if self.center_sparse:
            means = []
            vars = []

            # This only works for csc matrices...
            for i in range(X.shape[1]):
                if X.indptr[i] == X.indptr[i + 1]:
                    # Column without stored entries: neutral mean/variance.
                    means.append(0)
                    vars.append(1)
                else:
                    vars.append(
                        X.data[X.indptr[i]:X.indptr[i + 1]].var())
                    # If the variance is 0, set all occurrences of this
                    # feature to 1
                    means.append(
                        X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                    # (Near-)constant column: shift the mean by one so the
                    # centered stored entries become 1 instead of 0.
                    if 0.0000001 >= vars[-1] >= -0.0000001:
                        means[-1] -= 1

            self.std_ = np.sqrt(np.array(vars))
            # Guard against division by zero when scaling later.
            self.std_[np.array(vars) == 0.0] = 1.0
            self.mean_ = np.array(means)

            return self
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        else:
            self.mean_ = None

            if self.with_std:
                var = mean_variance_axis(X, axis=0)[1]
                self.std_ = np.sqrt(var)
                # Guard against division by zero when scaling later.
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
    else:
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)

        return self
def predict_presence_absence_evidences(self, X):
    """Split the log-ratio evidence of X into four parts.

    The log ratios between rows 1 and 0 of ``self.feature_log_prob_`` (for
    presence) and of ``log(1 - exp(feature_log_prob_))`` (for absence) are
    separated by sign and projected onto X.

    Returns
    -------
    (p_neg_evi, p_pos_evi, a_neg_evi, a_pos_evi)
        Negative/positive presence evidence followed by negative/positive
        absence evidence. The absence terms are the sum over all features
        minus the contribution of the features present in X.
    """
    X = check_array(X, accept_sparse="csr")

    # Sparse matrices multiply with `*`, dense arrays with np.dot.
    if issparse(X):
        def project(weights):
            return X * weights
    else:
        def project(weights):
            return np.dot(X, weights)

    presence_log_prob = self.feature_log_prob_
    absence_log_prob = np.log(1 - np.exp(presence_log_prob))

    presence_ratio = presence_log_prob[1] - presence_log_prob[0]
    absence_ratio = absence_log_prob[1] - absence_log_prob[0]

    # Multiplying by the boolean mask zeroes entries of the other sign.
    presence_neg = presence_ratio * (presence_ratio < 0)
    presence_pos = presence_ratio * (presence_ratio > 0)
    absence_neg = absence_ratio * (absence_ratio < 0)
    absence_pos = absence_ratio * (absence_ratio > 0)

    p_neg_evi = project(presence_neg)
    p_pos_evi = project(presence_pos)

    a_neg_evi = -project(absence_neg) + absence_neg.sum()
    a_pos_evi = -project(absence_pos) + absence_pos.sum()

    return p_neg_evi, p_pos_evi, a_neg_evi, a_pos_evi
def transform(self, X):
    """ A reference implementation of a transform function.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : array of shape = [n_samples, n_features]
        The array containing the element-wise square roots of the values
        in `X`

    Raises
    ------
    ValueError
        If the shape of `X` differs from ``self.input_shape_``.
    """
    # Check if fit had been called
    check_is_fitted(self, ['input_shape_'])

    # Input validation
    X = check_array(X)

    # Check that the input is of the same shape as the one passed
    # during fit.
    # NOTE(review): this compares the full shape, including n_samples, if
    # `input_shape_` holds the training `X.shape`; confirm that is intended.
    if X.shape != self.input_shape_:
        # Trailing space keeps the literals from fusing into "seenin".
        raise ValueError('Shape of input is different from what was seen '
                         'in `fit`')
    return np.sqrt(X)
def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None):
    """Validate the inputs and hand them to ``_dump_svmlight``.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    f : file-like object with a ``write`` attribute, or a path that is
        opened in binary write mode.
    zero_based : bool
        Passed on (negated, as ``one_based``) to ``_dump_svmlight``.
    comment, query_id
        Passed on to ``_dump_svmlight``; `query_id` must have one entry per
        sample.

    Raises
    ------
    ValueError
        If `y` is not 1-D, or if X / query_id disagree with `y` in length.
    """
    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError("expected y of shape (n_samples,), got %r"
                         % (y.shape,))

    Xval = check_array(X, accept_sparse='csr')
    n_samples = y.shape[0]
    if Xval.shape[0] != n_samples:
        raise ValueError("X.shape[0] and y.shape[0] should be the same, got"
                         " %r and %r instead." % (Xval.shape[0], n_samples))

    # We had some issues with CSR matrices with unsorted indices (e.g. #1501),
    # so sort them here, but first make sure we don't modify the user's X.
    # TODO We can do this cheaper; sorted_indices copies the whole matrix.
    validation_made_copy = Xval is not X
    if not validation_made_copy and hasattr(Xval, "sorted_indices"):
        X = Xval.sorted_indices()
    else:
        X = Xval
        if hasattr(X, "sort_indices"):
            X.sort_indices()

    if query_id is not None:
        query_id = np.asarray(query_id)
        if query_id.shape[0] != n_samples:
            raise ValueError("expected query_id of shape (n_samples,), got %r"
                             % (query_id.shape,))

    one_based = not zero_based

    if not hasattr(f, "write"):
        with open(f, "wb") as handle:
            _dump_svmlight(X, y, handle, one_based, comment, query_id)
    else:
        _dump_svmlight(X, y, f, one_based, comment, query_id)
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    y : ignored
    copy : bool or None
        Overrides ``self.copy`` when not None.

    Returns
    -------
    X : the transformed (possibly in-place modified) data.

    Raises
    ------
    ValueError
        For sparse X when ``with_mean`` is set but ``center_sparse`` is not.
    """
    check_is_fitted(self, 'std_')

    copy = copy if copy is not None else self.copy
    X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # `np.float` was only an alias of the builtin float (float64) and
        # no longer exists in NumPy >= 1.24.
        X = X.astype(np.float64)
    if sparse.issparse(X):
        if self.center_sparse:
            # Subtract the column mean from the stored entries only
            # (relies on CSC layout).
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]

        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")

        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def ttest(X, y):
    """Accumulate two-sample t-test statistics per feature over all labels.

    For every feature column and every label, the column is multiplied by
    the label indicator and by its complement, and the two resulting
    vectors are compared with ``scipy.stats.ttest_ind``. The statistics and
    p-values are summed over the labels.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Must be non-negative.
    y : iterable of label collections, binarized with
        ``MultiLabelBinarizer``.

    Returns
    -------
    t : ndarray, shape (n_features,)
        Summed t statistics.
    prob : ndarray, shape (n_features,)
        Summed p-values.

    Raises
    ------
    ValueError
        If X contains negative values.
    """
    X = check_array(X, accept_sparse='csr')
    x_is_sparse = issparse(X)
    if np.any((X.data if x_is_sparse else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = MultiLabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)
    negY = 1 - Y

    n_labels = Y.shape[1]
    n_features = X.shape[1]

    t = []
    prob = []
    for i in range(n_features):
        # Dense input is accepted by check_array but has no `.toarray()`;
        # take the column directly in that case.
        if x_is_sparse:
            values = X[:, i].T.toarray()[0]
        else:
            values = X[:, i]

        ti = 0
        probi = 0
        for j in range(n_labels):
            observed = values * Y[:, j]
            not_observed = values * negY[:, j]
            res0, res1 = scipy.stats.ttest_ind(observed, not_observed)
            ti = ti + res0
            probi = probi + res1
        t.append(ti)
        prob.append(probi)

    return np.asarray(t), np.asarray(prob)
def fit(self, X, y=None):
    """Fit the wrapped ``LocalOutlierFactor`` and derive outlier scores.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels).

    Returns
    -------
    self
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    # Every hyper-parameter is forwarded unchanged from this instance.
    lof_params = dict(
        n_neighbors=self.n_neighbors,
        algorithm=self.algorithm,
        leaf_size=self.leaf_size,
        metric=self.metric,
        p=self.p,
        metric_params=self.metric_params,
        contamination=self.contamination,
        n_jobs=self.n_jobs,
    )
    self.detector_ = LocalOutlierFactor(**lof_params)
    self.detector_.fit(X=X, y=y)

    # Invert decision_scores_. Outliers come with higher outlier scores
    negative_factors = self.detector_.negative_outlier_factor_
    self.decision_scores_ = invert_order(negative_factors)
    self._process_decision_scores()
    return self
def predict_proba(self, X):
    """Predict probability for each possible outcome.

    Compute the probability estimates for each single sample in X
    and each possible outcome seen during training (categorical
    distribution).

    Parameters
    ----------
    X : array_like, shape = [n_samples, n_features]

    Returns
    -------
    probabilities : array, shape = [n_samples, n_classes]
        Normalized probability distributions across
        class labels
    """
    check_is_fitted(self, 'X_')

    accepted_formats = ['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia']
    X_2d = check_array(X, accept_sparse=accepted_formats)
    weight_matrices = self._get_kernel(self.X_, X_2d)

    if self.kernel != 'knn':
        probabilities = np.dot(weight_matrices.T, self.label_distributions_)
    else:
        # Each entry indexes the label distributions to be summed for one
        # sample.
        probabilities = np.array([
            np.sum(self.label_distributions_[neighbours], axis=0)
            for neighbours in weight_matrices
        ])

    # Normalize every row to sum to one (in place).
    row_totals = np.atleast_2d(np.sum(probabilities, axis=1)).T
    probabilities /= row_totals
    return probabilities
def fit(self, X, y=None):
    """Fit the model with ``X``.

    Runs ``rpca`` on X and stores the low-rank part together with the
    right singular vectors belonging to non-zero singular values.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # `np.float` was only an alias of the builtin float (float64) and no
    # longer exists in NumPy >= 1.24.
    X = check_array(X, dtype=np.float64)
    L, S, (U, s, Vt), self.n_iter_ = rpca(X, self.lam, self.mu,
                                          self.max_iter, self.eps_primal,
                                          self.eps_dual, self.rho,
                                          self.initial_sv, self.max_mu,
                                          self.verbose)
    self.low_rank_ = L
    # The number of non-zero singular values determines how many
    # components are kept.
    r = np.count_nonzero(s)
    self.n_components_ = r
    self.components_ = Vt[:r]
    return self
def predict_moments(self, X):
    """
    Full predictive distribution from Bayesian linear regression.

    Parameters
    ----------
    X : ndarray
        (N*,d) array query input dataset (N* samples, d dimensions).

    Returns
    -------
    Ey : ndarray
        The expected value of y* for the query inputs, X* of shape (N*,).
    Vy : ndarray
        The expected variance of y* for the query inputs, X* of shape
        (N*,): the latent-function variance plus ``self.var_``.
    """
    fitted_attributes = ['var_', 'regularizer_', 'weights_', 'covariance_',
                         'hypers_']
    check_is_fitted(self, fitted_attributes)

    X = check_array(X)
    basis_args = atleast_list(self.hypers_)
    Phi = self.basis.transform(X, *basis_args)

    expected = Phi.dot(self.weights_)
    # Row-wise quadratic form phi^T C phi
    latent_variance = (Phi.dot(self.covariance_) * Phi).sum(axis=1)

    return expected, latent_variance + self.var_
def fit(self, X, y):
    """Fit ``self.n_models`` resampled models per class, then their weights.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
        Class labels; NaN labels are rejected.

    Returns
    -------
    self
        With ``classes_``, ``models_`` (dict mapping class value to an array
        of models) and ``weights_`` set.

    Raises
    ------
    ValueError
        If any class label is NaN.
    """
    X = check_array(X)
    random_state = check_random_state(self.random_state)

    self.classes_, y_reverse = np.unique(y, return_inverse=True)

    # `np.nan in array` compares with `==`, which is never true for NaN.
    # NaN is the only value unequal to itself, and this test also works
    # for non-numeric labels.
    if any(label != label for label in self.classes_):
        raise ValueError("NaN class not supported.")

    # build models
    models = {}
    for c_idx, c_value in enumerate(self.classes_):
        # Select via the inverse index so that plain-list `y` works too.
        X_class = X[y_reverse == c_idx]
        a_sample_size = min(len(X_class), self.sample_size)

        c_models = []
        for _ in range(self.n_models):
            # resample
            picked = random_state.choice(len(X_class), a_sample_size)
            X_sample = X_class[picked]
            c_models.append(self.build_for_class(random_state, X_sample))

        models[c_value] = np.array(c_models)

    weights = self.fit_weights(random_state, models, X, y_reverse)

    self.models_ = models
    self.weights_ = weights
    return self
def decision_function(self, X):
    """Predict raw anomaly score of X using the fitted detector.

    The score of a sample is the sum of its ``cdist`` distances to
    ``self.selected_components_``, each divided by the matching entry of
    ``self.selected_w_components_``. For consistency, outliers are assigned
    larger anomaly scores.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    anomaly_scores : numpy array of shape (n_samples,)
        The anomaly score of the input samples.
    """
    check_is_fitted(self, ['components_', 'w_components_'])

    X = check_array(X)
    if self.standardization:
        X = self.scaler_.transform(X)

    weighted_distances = (cdist(X, self.selected_components_)
                          / self.selected_w_components_)
    return np.sum(weighted_distances, axis=1).ravel()
def preprocess_data(self, X):
    """Validate X as a float array and compute its squared row norms.

    Returns
    -------
    (X, X2) : the validated array (float64 or float32, at least one sample)
        and ``row_norms(X, squared=True)``.
    """
    validated = check_array(X, dtype=[np.float64, np.float32],
                            ensure_min_samples=1)
    squared_norms = row_norms(validated, squared=True)
    return validated, squared_norms
def fit(self, X, y=None):
    """Record the shape of the validated X as ``self.X_shape_``.

    `y` is ignored. Returns self.
    """
    validated = check_array(X)
    self.X_shape_ = validated.shape
    return self
def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
                 gamma, init, n_init, verbose):
    """k-prototypes algorithm

    Runs `n_init` initialisations and returns the run with the lowest cost.

    Parameters
    ----------
    X : dense array, shape (n_points, n_attributes)
    categorical : int or list of int
        Indices of the categorical columns; must be non-empty and must not
        cover all columns.
    n_clusters : int
    max_iter : int
    num_dissim, cat_dissim : callables
        Dissimilarity functions for the numerical / categorical parts.
    gamma : float or None
        Weight of the categorical cost; when None it is set to half the
        standard deviation of the numerical data.
    init : {'Huang', 'Cao', 'random'} or list of two arrays
        A list holds the initial numerical and categorical centroids.
    n_init : int
    verbose : truthy to print progress

    Returns
    -------
    (centroids, enc_map, labels, cost, n_iter, gamma) of the best run.

    Raises
    ------
    TypeError
        If X is sparse.
    NotImplementedError
        If no categorical columns are given or `init` is unsupported.
    ValueError
        If initialisation keeps producing empty clusters.
    AssertionError
        On inconsistent column indices, cluster counts or `init` shapes.
    """
    if sparse.issparse(X):
        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn's "
            "KMeans instead."
        )
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    npoints = X.shape[0]
    assert n_clusters <= npoints, "More clusters than data points?"

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    # Convert the categorical values in Xcat to integers for speed.
    # Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    # Are there more n_clusters than unique rows? Then set the unique
    # rows as initial values and skip iteration.
    unique = get_unique_rows(X)
    n_unique = unique.shape[0]
    if n_unique <= n_clusters:
        max_iter = 0
        n_init = 1
        n_clusters = n_unique
        init = list(_split_num_cat(unique, categorical))
        init[1], _ = encode_features(init[1], enc_map)

    # Estimate a good value for gamma, which determines the weighing of
    # categorical values in clusters (see Huang [1997]).
    if gamma is None:
        gamma = 0.5 * Xnum.std()

    all_centroids = []
    all_labels = []
    all_costs = []
    all_n_iters = []
    for init_no in range(n_init):

        # For numerical part of initialization, we don't have a guarantee
        # that there is not an empty cluster, so we need to retry until
        # there is none.
        init_tries = 0
        while True:
            init_tries += 1
            # _____ INIT _____
            if verbose:
                print("Init: initializing centroids")
            if isinstance(init, str) and init == 'Huang':
                centroids = kmodes.init_huang(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'Cao':
                centroids = kmodes.init_cao(Xcat, n_clusters, cat_dissim)
            elif isinstance(init, str) and init == 'random':
                seeds = np.random.choice(range(npoints), n_clusters)
                centroids = Xcat[seeds]
            elif isinstance(init, list):
                # Make sure inits are 2D arrays.
                init = [np.atleast_2d(cur_init).T if len(cur_init.shape) == 1
                        else cur_init
                        for cur_init in init]
                assert init[0].shape[0] == n_clusters, \
                    "Wrong number of initial numerical centroids in init " \
                    "({}, should be {}).".format(init[0].shape[0], n_clusters)
                assert init[0].shape[1] == nnumattrs, \
                    "Wrong number of numerical attributes in init ({}, should be {})."\
                    .format(init[0].shape[1], nnumattrs)
                assert init[1].shape[0] == n_clusters, \
                    "Wrong number of initial categorical centroids in init ({}, " \
                    "should be {}).".format(init[1].shape[0], n_clusters)
                assert init[1].shape[1] == ncatattrs, \
                    "Wrong number of categorical attributes in init ({}, should be {})."\
                    .format(init[1].shape[1], ncatattrs)
                centroids = [np.asarray(init[0], dtype=np.float64),
                             np.asarray(init[1], dtype=np.uint8)]
            else:
                raise NotImplementedError("Initialization method not supported.")

            if not isinstance(init, list):
                # Numerical is initialized by drawing from normal distribution,
                # categorical following the k-modes methods.
                meanx = np.mean(Xnum, axis=0)
                stdx = np.std(Xnum, axis=0)
                centroids = [
                    meanx + np.random.randn(n_clusters, nnumattrs) * stdx,
                    centroids
                ]

            if verbose:
                print("Init: initializing clusters")
            membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
            # Keep track of the sum of attribute values per cluster so that we
            # can do k-means on the numerical attributes.
            cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
            # cl_attr_freq is a list of lists with dictionaries that contain
            # the frequencies of values per cluster and attribute.
            cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                            for _ in range(n_clusters)]
            for ipoint in range(npoints):
                # Initial assignment to clusters
                clust = np.argmin(
                    num_dissim(centroids[0], Xnum[ipoint]) +
                    gamma * cat_dissim(centroids[1], Xcat[ipoint])
                )
                membship[clust, ipoint] = 1
                # Count attribute values per cluster.
                for iattr, curattr in enumerate(Xnum[ipoint]):
                    cl_attr_sum[clust, iattr] += curattr
                for iattr, curattr in enumerate(Xcat[ipoint]):
                    cl_attr_freq[clust][iattr][curattr] += 1

            # If no empty clusters, then consider initialization finalized.
            if membship.sum(axis=1).min() > 0:
                break

            if init_tries == MAX_INIT_TRIES:
                # Could not get rid of empty clusters. Randomly
                # initialize instead.
                init = 'random'
            elif init_tries == RAISE_INIT_TRIES:
                raise ValueError(
                    "Clustering algorithm could not initialize. "
                    "Consider assigning the initial clusters manually."
                )

        # Perform an initial centroid update.
        for ik in range(n_clusters):
            for iattr in range(nnumattrs):
                centroids[0][ik, iattr] = \
                    cl_attr_sum[ik, iattr] / sum(membship[ik, :])
            for iattr in range(ncatattrs):
                centroids[1][ik, iattr] = \
                    get_max_value_key(cl_attr_freq[ik][iattr])

        # _____ ITERATION _____
        if verbose:
            print("Starting iterations...")
        itr = 0
        converged = False
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        cost = np.inf
        while itr <= max_iter and not converged:
            itr += 1
            centroids, moves = _k_prototypes_iter(Xnum, Xcat, centroids,
                                                  cl_attr_sum, cl_attr_freq,
                                                  membship, num_dissim,
                                                  cat_dissim, gamma)

            # All points seen in this iteration
            labels, ncost = _labels_cost(Xnum, Xcat, centroids,
                                         num_dissim, cat_dissim, gamma)
            # Stop when nothing moved or the cost stopped decreasing.
            converged = (moves == 0) or (ncost >= cost)
            cost = ncost
            if verbose:
                print("Run: {}, iteration: {}/{}, moves: {}, ncost: {}"
                      .format(init_no + 1, itr, max_iter, moves, ncost))

        # Store results of current run.
        all_centroids.append(centroids)
        all_labels.append(labels)
        all_costs.append(cost)
        all_n_iters.append(itr)

    best = np.argmin(all_costs)
    if n_init > 1 and verbose:
        print("Best run was number {}".format(best + 1))

    # Note: return gamma in case it was automatically determined.
    return all_centroids[best], enc_map, all_labels[best], \
        all_costs[best], all_n_iters[best], gamma
def predict(self, X, threshold=0.5):
    """Return a boolean mask of where ``predict_proba(X) >= threshold``.

    Parameters
    ----------
    X : array-like, validated with ``check_array``.
    threshold : float, default 0.5
    """
    check_is_fitted(self)
    X = check_array(X)
    probabilities = self.predict_proba(X)
    return probabilities >= threshold
def is_stationary(self, x):
    """Test whether the time series is stationary.

    The statistic is built from a regression of the series on a constant,
    a centred time index and its own first lag, following the R code the
    inline comments refer to.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The time series vector.

    Returns
    -------
    pval : float
        The interpolated p-value (``np.nan`` when ``self._base_case(x)``
        is falsy).

    stationary : bool
        ``pval < self.alpha`` (``False`` in the base case).
    """
    if not self._base_case(x):
        return np.nan, False

    # Coerce the input into a finite 1-d vector.
    series = column_or_1d(
        check_array(x, ensure_2d=False, dtype=DTYPE,
                    force_all_finite=True))  # type: np.ndarray

    # Lagged windows of the series: row 0 is y_t, row 1 is y_{t-1}.
    lagged = self._embed(series, 2)
    current = lagged[0, :]
    previous = lagged[1, :]  # type: np.ndarray

    # Regress y_t on [1, centred time index, y_{t-1}];
    # R equivalent: lm(yt ~ 1 + tt + yt1)
    n = current.shape[0]
    positions = np.arange(n) + 1  # 1..n
    trend = positions - (n / 2.0)
    design = np.array([np.ones(n), trend, previous]).T
    model = LinearRegression().fit(design, current)
    coefficients = model.coef_

    # Residual variance and its lag-corrected counterpart.
    residuals = current - model.predict(design)
    ssqru = (residuals * residuals).sum() / float(n)

    scalar = 4 if self.lshort else 12
    lag = int(np.trunc(scalar * np.power(n / 100.0, 0.25)))
    ssqrtl = C_tseries_pp_sum(residuals, n, lag, ssqru)

    # Terms of the correction denominator.
    n_sq = n * n
    weighted_sum = (previous * positions).sum()  # sum(yt1*(1:n))
    term1 = n_sq * (n_sq - 1) * (previous ** 2).sum() / 12.0
    term2 = n * (weighted_sum ** 2)
    term3 = n * (n + 1) * weighted_sum * previous.sum()
    term4 = (n * (n + 1) * (2 * n + 1) * (previous.sum() ** 2)) / 6.0
    dx = term1 - term2 + term3 - term4

    # The coefficient on the lagged series is the last column.
    slope = coefficients[2]
    STAT = n * (slope - 1) - (n ** 6) / (24.0 * dx) * (ssqrtl - ssqru)

    # Critical values: one column per probability in ``tablep``, one row
    # per sample size in ``tableT``.
    table = -np.array([
        c(22.5, 25.7, 27.4, 28.4, 28.9, 29.5),
        c(19.9, 22.4, 23.6, 24.4, 24.8, 25.1),
        c(17.9, 19.8, 20.7, 21.3, 21.5, 21.8),
        c(15.6, 16.8, 17.5, 18.0, 18.1, 18.3),
        c(3.66, 3.71, 3.74, 3.75, 3.76, 3.77),
        c(2.51, 2.60, 2.62, 2.64, 2.65, 2.66),
        c(1.53, 1.66, 1.73, 1.78, 1.78, 1.79),
        c(0.43, 0.65, 0.75, 0.82, 0.84, 0.87)
    ]).T

    tablen = table.shape[1]
    tableT = c(25, 50, 100, 250, 500, 100000).astype(DTYPE)
    tablep = c(0.01, 0.025, 0.05, 0.10, 0.90, 0.95, 0.975, 0.99)

    # Interpolate each column of critical values at the observed n.
    tableipl = np.zeros(tablen)
    for column in range(tablen):
        _, interpolated = approx(tableT, table[:, column], xout=n, rule=2)
        tableipl[column] = interpolated

    # Interpolate the statistic on the critical values; the p-value is the
    # complement of the interpolated probability.
    _, interpol = approx(tableipl, tablep, xout=STAT, rule=2)
    pval = 1 - interpol[0]

    return pval, pval < self.alpha
def _validate_train_parms(self, train_set, train_lab, classes=None):
    """Validate training data and set up the prototypes.

    On the first call (``self.initial_fit`` truthy) the class list, the
    prototype arrays ``w_`` / ``c_w_`` and the squared-gradient
    accumulators are created. On every call, classes that have not been
    initialised yet and occur in ``train_lab`` get their prototypes placed
    around the class mean.

    Parameters
    ----------
    train_set : array-like, shape (n_samples, n_features)
        Training samples.

    train_lab : array-like
        Labels; flattened with ``ravel`` before validation.

    classes : array-like or None
        Full list of class labels. Only used on the initial fit; when
        ``None`` or empty the classes are taken from ``train_lab``.

    Returns
    -------
    train_set, train_lab : validated arrays
    random_state : the checked random state

    Raises
    ------
    ValueError
        If ``prototypes_per_class`` is not positive or has the wrong
        length, or if ``initial_prototypes`` does not match the expected
        shape or labels.
    """
    random_state = validation.check_random_state(self.random_state)
    train_set, train_lab = validation.check_X_y(train_set,
                                                train_lab.ravel())

    if self.initial_fit:
        # ``classes`` may be an ndarray, whose truth value is ambiguous,
        # so test for None / emptiness explicitly.
        if classes is not None and np.size(classes) > 0:
            self.classes_ = np.asarray(classes)
        else:
            self.classes_ = unique_labels(train_lab)
        self.protos_initialized = np.zeros(self.classes_.size)

    nb_classes = len(self.classes_)
    nb_features = train_set.shape[1]

    # nb_ppc = number of prototypes per class
    if isinstance(self.prototypes_per_class, int):
        # Zero prototypes would only fail later with an IndexError.
        if self.prototypes_per_class <= 0:
            raise ValueError("prototypes_per_class must be a positive int")
        nb_ppc = np.ones([nb_classes],
                         dtype='int') * self.prototypes_per_class
    else:
        nb_ppc = validation.column_or_1d(
            validation.check_array(self.prototypes_per_class,
                                   ensure_2d=False, dtype='int'))
        if nb_ppc.min() <= 0:
            raise ValueError(
                "values in prototypes_per_class must be positive")
        if nb_ppc.size != nb_classes:
            raise ValueError(
                "length of prototypes per class"
                " does not fit the number of classes\n"
                "classes=%d\n"
                "length=%d" % (nb_classes, nb_ppc.size))

    # initialize prototypes
    if self.initial_prototypes is None:
        if self.initial_fit:
            self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                               dtype=np.double)
            self.c_w_ = np.empty([nb_ppc.sum()], dtype=self.classes_.dtype)

        # Labels present in this batch; computed once for the whole loop.
        present_labels = unique_labels(train_lab)
        pos = 0
        for class_idx, act_class in enumerate(self.classes_):
            nb_prot = nb_ppc[class_idx]
            if (self.protos_initialized[class_idx] == 0
                    and act_class in present_labels):
                mean = np.mean(train_set[train_lab == act_class, :], 0)
                # Class mean plus uniform noise in [-1, 1).
                self.w_[pos:pos + nb_prot] = mean + (
                    random_state.rand(nb_prot, nb_features) * 2 - 1)
                if math.isnan(self.w_[pos, 0]):
                    print('Prototype is NaN: ', actClass := act_class) \
                        if False else print('Prototype is NaN: ', act_class)
                    self.protos_initialized[class_idx] = 0
                else:
                    self.protos_initialized[class_idx] = 1
                self.c_w_[pos:pos + nb_prot] = act_class
            pos += nb_prot
    else:
        # Last column of the supplied array holds the prototype labels.
        x = validation.check_array(self.initial_prototypes)
        self.w_ = x[:, :-1]
        self.c_w_ = x[:, -1]
        if self.w_.shape != (np.sum(nb_ppc), nb_features):
            raise ValueError("the initial prototypes have wrong shape\n"
                             "found=(%d,%d)\n"
                             "expected=(%d,%d)" % (
                                 self.w_.shape[0], self.w_.shape[1],
                                 nb_ppc.sum(), nb_features))
        if set(self.c_w_) != set(self.classes_):
            raise ValueError(
                "prototype labels and test data classes do not match\n"
                "classes={}\n"
                "prototype labels={}\n".format(self.classes_, self.c_w_))

    if self.initial_fit:
        # Accumulators used by Adadelta/RMSprop
        self.squared_mean_gradient = np.zeros_like(self.w_)
        self.squared_mean_step = np.zeros_like(self.w_)
        self.initial_fit = False

    return train_set, train_lab, random_state
def kneighbors(
        self, X=None, n_candidates=None, return_distance=True
) -> Union[Tuple[np.array, np.array], np.array]:
    """Retrieve k nearest neighbors.

    Parameters
    ----------
    X: np.array or None, optional, default = None
        Query objects. If None, search among the indexed objects.
    n_candidates: int or None, optional, default = None
        Number of neighbors to retrieve.
        If None, use the value passed during construction.
    return_distance: bool, default = True
        If return_distance, will return distances and indices to neighbors.
        Else, only return the indices.

    Returns
    -------
    ``(neigh_dist, neigh_ind)`` if ``return_distance`` else ``neigh_ind``.
    Both have shape (n_queries, n_candidates). Slots for which the index
    returned no neighbor hold index -1 and distance NaN.
    """
    check_is_fitted(self, 'index_')
    if X is not None:
        X = check_array(X)

    n_test = self.n_samples_fit_ if X is None else X.shape[0]
    dtype = self.X_dtype_ if X is None else X.dtype

    if n_candidates is None:
        n_candidates = self.n_candidates
    n_candidates = check_n_candidates(n_candidates)

    # For compatibility reasons, as each sample is considered as its own
    # neighbor, one extra neighbor will be computed.
    if X is None:
        n_neighbors = n_candidates + 1
        start = 1
    else:
        n_neighbors = n_candidates
        start = 0

    # If fewer candidates than required are found for a query,
    # we save index=-1 and distance=NaN
    neigh_ind = -np.ones((n_test, n_candidates), dtype=np.int32)
    if return_distance:
        neigh_dist = np.empty_like(neigh_ind, dtype=dtype) * np.nan

    # ``index_`` is either a path to an index on disk or the index itself.
    if isinstance(self.index_, str):
        index = ngtpy.Index(self.index_)
    else:
        index = self.index_

    # One query stream for both cases: stored objects or rows of X.
    if X is None:
        queries = (index.get_object(i) for i in range(n_test))
    else:
        queries = iter(X)

    for i, query in tqdm(
            enumerate(queries),
            total=n_test,
            desc='Query NNG',
            disable=not self.verbose,
    ):
        response = index.search(
            query=query,
            size=n_neighbors,
            with_distance=return_distance,
            epsilon=self.epsilon,
        )
        # Nothing found: keep the -1 / NaN padding instead of failing on
        # the tuple unpacking below.
        if len(response) == 0:
            continue
        if return_distance:
            ind, dist = [np.array(arr) for arr in zip(*response)]
        else:
            ind = response
        ind = ind[start:]
        neigh_ind[i, :len(ind)] = ind
        if return_distance:
            dist = dist[start:]
            neigh_dist[i, :len(dist)] = dist

    if return_distance and self.metric == 'sqeuclidean':
        neigh_dist **= 2

    if return_distance:
        return neigh_dist, neigh_ind
    else:
        return neigh_ind
def fit(self, X, y=None):
    """Validate ``X`` with ``check_array`` and return ``self``.

    ``y`` is accepted but not used.
    """
    check_array(X)
    return self
def transform(self, X):
    """Return ``X`` after validation by ``check_array``."""
    return check_array(X)
def predict(self, X):
    """Return a vector of ones with one entry per row of the validated X."""
    n_samples = check_array(X).shape[0]
    return np.ones(n_samples)
def predict(self, X):
    """Check the estimator is fitted, validate X, and predict all ones.

    Returns a float vector of ones with one entry per row of X.
    """
    check_is_fitted(self)
    validated = check_array(X)
    n_samples = validated.shape[0]
    return np.ones(n_samples)
def transform(self, X, y=None):
    """Check the estimator is fitted and return the validated ``X``.

    ``y`` is accepted but not used.
    """
    check_is_fitted(self)
    return check_array(X)
def predict(self, X):
    """Predict ``self._mean`` for every row of the validated X."""
    check_is_fitted(self)
    validated = check_array(X)
    ones = np.ones(shape=(validated.shape[0],))
    return ones * self._mean
def plot_partial_corrcoef(partial_corrcoef, ax=None, cbar=True, figsize=None,
                          filename=None, title='Partial correlation',
                          **kwargs):
    """Draw a partial correlation coefficient matrix as a heatmap.

    Entries equal to zero are masked, so they show the grey axes
    background instead of a colour.

    Parameters
    ----------
    partial_corrcoef : array-like of shape (n_features, n_features)
        Partial correlation coefficient matrix. Must be symmetric.

    ax : matplotlib Axes, default None
        Axes to draw on. A new figure is created when omitted.

    cbar : bool, default True.
        Whether to attach a colorbar to the right of the axes.

    figsize : tuple, default None
        Figure size, used only when a new figure is created.

    filename : str, default None
        If given, the figure is saved to this path.

    title : string, default 'Partial correlation'
        Axes title. Pass None to leave the title unset.

    **kwargs : dict
        Forwarded to ``ax.pcolormesh``. ``cmap``, ``edgecolors``, ``vmin``
        and ``vmax`` get defaults when not supplied.

    Returns
    -------
    ax : matplotlib Axes
        Axes on which the plot was drawn.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from kenchi.plotting import plot_partial_corrcoef
    >>> from sklearn.datasets import make_sparse_spd_matrix
    >>> A = make_sparse_spd_matrix(dim=20, norm_diag=True, random_state=0)
    >>> plot_partial_corrcoef(A) # doctest: +ELLIPSIS
    <matplotlib.axes._subplots.AxesSubplot object at 0x...>
    >>> plt.show() # doctest: +SKIP

    .. figure:: images/plot_partial_corrcoef.png
    """
    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    matrix = check_array(partial_corrcoef)
    matrix = check_symmetric(matrix, raise_exception=True)

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title)

    # Defaults for pcolormesh; caller-supplied values take precedence.
    mesh_defaults = (
        ('cmap', 'RdBu'),
        ('edgecolors', 'white'),
        ('vmin', -1.),
        ('vmax', 1.),
    )
    for option, value in mesh_defaults:
        kwargs.setdefault(option, value)

    # Heatmap with exact zeros masked out
    masked = np.ma.masked_equal(matrix, 0.)
    mesh = ax.pcolormesh(masked, **kwargs)

    ax.set_aspect('equal')
    ax.set_facecolor('grey')

    # Flip the y axis so the plot reads like a matrix
    ax.invert_yaxis()

    if cbar:
        # Colorbar lives in a narrow axes appended on the right
        cax = make_axes_locatable(ax).append_axes('right', '5%', pad=0.1)
        ax.get_figure().colorbar(mesh, cax=cax)

    if filename is not None:
        ax.get_figure().savefig(filename)

    return ax
def predict(self, X):
    """Predict ones for every row of X.

    Raises ``CorrectNotFittedError`` when the instance has no ``coef_``
    attribute.
    """
    fitted = hasattr(self, 'coef_')
    if not fitted:
        raise CorrectNotFittedError("estimator is not fitted yet")
    n_samples = check_array(X).shape[0]
    return np.ones(n_samples)
def fit(self, X, y=None) -> NNG:
    """Build the ngtpy.Index and insert data from X.

    Parameters
    ----------
    X: np.array
        Data to be indexed
    y: any
        Only validated together with X and stored as ``y_train_`` when
        given; not used for indexing.

    Returns
    -------
    self: NNG
        An instance of NNG with a built index. ``index_`` holds the open
        index object when ``index_dir`` is None, otherwise the path of the
        index on disk.

    Raises
    ------
    ValueError
        If the metric is not among ``NNG.valid_metrics``.
    TypeError
        If ``index_dir`` is neither None nor a string.
    """
    if y is None:
        X = check_array(X)
    else:
        X, y = check_X_y(X, y)
        self.y_train_ = y

    self.n_samples_fit_ = X.shape[0]
    self.n_features_ = X.shape[1]
    self.X_dtype_ = X.dtype

    # Map common distance names to names used by ngt
    try:
        self.effective_metric_ = NNG.internal_distance_type[self.metric]
    except KeyError:
        self.effective_metric_ = self.metric
    if self.effective_metric_ not in NNG.valid_metrics:
        raise ValueError(
            f'Unknown distance/similarity measure: {self.effective_metric_}. '
            f'Please use one of: {NNG.valid_metrics}.')

    # Set up a directory to save the index to
    prefix = 'skhubness_'
    suffix = '.anng'
    if self.index_dir in ['auto']:
        index_path = create_tempfile_preferably_in_dir(
            prefix=prefix, suffix=suffix, directory='/dev/shm')
        logging.warning(
            f'The index will be stored in {index_path}. '
            f'It will NOT be deleted automatically, when this instance is destructed.'
        )
    elif isinstance(self.index_dir, str):
        index_path = create_tempfile_preferably_in_dir(
            prefix=prefix, suffix=suffix, directory=self.index_dir)
    elif self.index_dir is None:
        index_path = create_tempfile_preferably_in_dir(prefix=prefix,
                                                       suffix=suffix)
    else:
        raise TypeError(
            f'NNG requires to write an index to the filesystem. '
            f'Please provide a valid path with parameter `index_dir`.')

    # Create the ANNG index, insert data
    ngtpy.create(
        path=index_path,
        dimension=self.n_features_,
        edge_size_for_creation=self.edge_size_for_creation,
        edge_size_for_search=self.edge_size_for_search,
        distance_type=self.effective_metric_,
    )
    index_obj = ngtpy.Index(index_path)
    index_obj.batch_insert(X, num_threads=self.n_jobs)
    index_obj.save()

    # Convert ANNG to ONNG
    if self.optimize:
        optimizer = ngtpy.Optimizer()
        optimizer.set(num_of_outgoings=self.num_outgoing,
                      num_of_incomings=self.num_incoming)
        index_path_onng = str(
            pathlib.Path(index_path).with_suffix('.onng'))
        optimizer.execute(index_path, index_path_onng)
        index_path = index_path_onng
        # The optimiser writes a new index on disk; re-open it so that the
        # in-memory handle below refers to the optimised graph rather than
        # the original ANNG.
        index_obj = ngtpy.Index(index_path)

    # Keep index in memory or store in path
    if self.index_dir is None:
        self.index_ = index_obj
    else:
        self.index_ = index_path

    return self
def fit(self, X, y=None):
    """Select the penalties by K-fold cross-validation and fit on all data.

    Every candidate pair (lam1, lam2) is scored by the mean of
    ``self._predrisk`` over the folds; the scores are stored in
    ``self.res`` (one inner list per lam1 candidate). Penalties that are
    still None afterwards are set to the candidates with the lowest score,
    and the final estimate on the full sample is stored in ``self.omega``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data; at least two features are required.

    y : ignored

    Returns
    -------
    self
    """
    # Covariance does not make sense for a single feature
    x = check_array(X, ensure_min_features=2, estimator=self)

    # Recent scikit-learn versions reject a random_state combined with
    # shuffle=False (it would have no effect), so only forward it when
    # shuffling is requested.
    kf = KFold(n_splits=self.folds,
               random_state=self.random_state if self.shuffle else None,
               shuffle=self.shuffle)
    lam1n, lam2n, lam1type, lam2type = self._candidate()

    # Settings shared by every graphical_concord call.
    solver_options = dict(method=self.method, tol=self.tol,
                          maxit=self.maxit, steptype=self.steptype,
                          assume_scaled=self.assume_scaled)

    self.res = []
    for i in range(lam1n):
        if lam1type < 3:
            # Single candidate: use it directly.
            lam1 = self.lam1s
            self.lam1 = lam1
        else:
            lam1 = self.lam1s[i]
        lam1val = []
        for j in range(lam2n):
            if lam2type < 3:
                lam2 = self.lam2s
                self.lam2 = lam2
            else:
                lam2 = self.lam2s[j]
            lam2val = []
            for train_index, test_index in kf.split(x):
                sam_cov = np.cov(x[train_index], rowvar=False)
                omega = graphical_concord(sam_cov=sam_cov, lam1=lam1,
                                          lam2=lam2, **solver_options)
                lam2val.append(self._predrisk(omega, x[test_index]))
            lam1val.append(np.mean(lam2val))
        self.res.append(lam1val)

    # Locate the best grid cell on an explicit array instead of relying on
    # a list-versus-numpy-scalar comparison.
    scores = np.asarray(self.res)
    idx = np.argwhere(scores == scores.min())
    if self.lam1 is None:
        self.lam1 = self.lam1s[idx[0][0]]
    if self.lam2 is None:
        self.lam2 = self.lam2s[idx[0][1]]

    sam_cov = np.cov(x, rowvar=False)
    self.omega = graphical_concord(sam_cov, lam1=self.lam1, lam2=self.lam2,
                                   **solver_options)
    return self
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Bagged classification and regression trees are grown for every
    configured depth, each tree is turned into rules, the rules are scored
    on the samples the tree did not see (out-of-bag), and the rules passing
    ``precision_min`` / ``recall_min`` are kept in ``self.rules_``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested.
        If not provided, the regression trees are fitted on ``y`` itself.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If fewer than two classes are present, or if
        ``max_depth_duplication`` / ``max_samples`` are invalid.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]

    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("This method needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) \
            and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer"
                         )
    if not set(self.classes_) == set([0, 1]):
        warn("Found labels %s. This method assumes target class to be"
             " labeled as 1 and normal data to be labeled as 0. Any label"
             " different from 0 will be considered as being from the"
             " target class."
             % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]

    if isinstance(self.max_samples, six.string_types):
        raise ValueError('max_samples (%s) is not supported. '
                         'Valid choices are: "auto", int or '
                         'float' % self.max_samples)

    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])

    self.max_samples_ = max_samples

    self.rules_ = {}
    self.estimators_ = []
    self.estimators_samples_ = []
    self.estimators_features_ = []

    # default columns names :
    feature_names_ = [BASE_FEATURE_NAME + x for x in
                      np.arange(X.shape[1]).astype(str)]
    # Map generic column names to the user-facing ones (or to themselves).
    if self.feature_names is not None:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(self.feature_names)}
    else:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(feature_names_)}
    self.feature_names_ = feature_names_

    clfs = []
    regs = []

    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    # One bagged classifier and one bagged regressor per requested depth.
    for max_depth in self._max_depths:
        bagging_clf = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_reg = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        clfs.append(bagging_clf)
        regs.append(bagging_reg)

    # define regression target:
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (
            pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
            pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as an other classification bagging

    for clf in clfs:
        clf.fit(X, y)
        self.estimators_ += clf.estimators_
        self.estimators_samples_ += clf.estimators_samples_
        self.estimators_features_ += clf.estimators_features_

    for reg in regs:
        reg.fit(X, y_reg)
        self.estimators_ += reg.estimators_
        self.estimators_samples_ += reg.estimators_samples_
        self.estimators_features_ += reg.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):

        # ``estimators_samples_`` is a boolean mask in old scikit-learn
        # releases and an array of drawn indices in newer ones; normalise
        # to a boolean in-bag mask so the negation below is meaningful.
        in_bag = np.asarray(samples)
        if in_bag.dtype != bool:
            drawn = in_bag
            in_bag = np.zeros(n_samples, dtype=bool)
            in_bag[drawn] = True

        # Create mask for OOB samples
        mask = ~in_bag
        if not mask.any():
            warn("OOB evaluation not possible: doing it in-bag."
                 " Performance evaluation is likely to be wrong"
                 " (overfitting) and selected rules are likely to"
                 " not perform well! Please use max_samples < 1.")
            mask = in_bag
        rules_from_tree = self._tree_to_rules(
            estimator, np.array(self.feature_names_)[features])

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame((X[mask, :])[:, features],
                                 columns=np.array(
                                     self.feature_names_)[features])

        if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
            y_oob = y[mask]
            y_oob = np.array((y_oob != 0))

            # Add OOB performances to rules:
            rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                               for r in set(rules_from_tree)]
            rules_ += rules_from_tree

    # Factorize rules before semantic tree filtering
    rules_ = [
        tuple(rule)
        for rule in
        [Rule(r, args=args) for r, args in rules_]]

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= self.precision_min and score[1] >= self.recall_min:
            if rule in self.rules_:
                # update the score to the new mean
                c = self.rules_[rule][2] + 1
                b = self.rules_[rule][1] + 1. / c * (
                    score[1] - self.rules_[rule][1])
                a = self.rules_[rule][0] + 1. / c * (
                    score[0] - self.rules_[rule][0])

                self.rules_[rule] = (a, b, c)
            else:
                self.rules_[rule] = (score[0], score[1], 1)

    self.rules_ = sorted(self.rules_.items(),
                         key=lambda x: (x[1][0], x[1][1]), reverse=True)

    # Deduplicate the rule using semantic tree
    if self.max_depth_duplication is not None:
        self.rules_ = self.deduplicate(self.rules_)

    self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]

    return self
def _predict_proba(self, X):
    """Return probability estimates for the test data X.

    While this method runs, the code object of ``check_array`` is replaced
    by that of ``_check_array_ts``. The original code object is restored in
    a ``finally`` clause, so a failure during prediction cannot leave the
    shared ``check_array`` function patched.

    Parameters
    ----------
    X : 3D numpy array dimensions (n,d,m) or
        (n_query, n_indexed) if metric == 'precomputed'
        Test samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes], or a list of n_outputs
        of such arrays if n_outputs > 1.
        The class probabilities of the input samples. Classes are ordered
        by lexicographic order.
    """
    self.check_is_fitted()

    # Patch the wrapped function when ``check_array`` is decorated,
    # otherwise ``check_array`` itself.
    patched = getattr(check_array, "__wrapped__", check_array)
    original_code = patched.__code__
    patched.__code__ = _check_array_ts.__code__
    try:
        X = check_array(X, accept_sparse="csr")

        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            # Treat the single-output case as one output column.
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))

            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]
    finally:
        patched.__code__ = original_code

    return probabilities
def predict(self, X):
    """Return the dot product of the recoded X with ``self.coeffs_``.

    X is validated with ``check_array`` and passed through
    ``self._recode`` first.
    """
    check_is_fitted(self, 'coeffs_')
    validated = check_array(X)
    return np.dot(self._recode(validated), self.coeffs_)
def predict(self, X):
    """Predict ones when X has more than one row, zeros otherwise.

    The returned vector has one entry per row of the validated X.
    """
    n_samples = check_array(X).shape[0]
    make_vector = np.ones if n_samples > 1 else np.zeros
    return make_vector(n_samples)
def kneighbors(self, X, n_neighbors=None, return_distance=True):
    """Find the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : numpy ndarray or None
        Query series. A given array has its last two axes swapped before
        the distances are computed. When None, the fitted data is queried
        against itself and each sample is excluded from its own neighbors.

    n_neighbors : int
        Number of neighbors to get (default is the value
        passed to the constructor).

    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned

    Returns
    -------
    dist : array
        Array representing the lengths to points, only present if
        return_distance=True

    ind : array
        Indices of the nearest points in the population matrix.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is not positive, exceeds the number of fitted
        samples, or the fit method is not "brute".
    TypeError
        If ``n_neighbors`` is not an integer.
    """
    self.check_is_fitted()

    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    elif n_neighbors <= 0:
        raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
    else:
        if not np.issubdtype(type(n_neighbors), np.integer):
            raise TypeError("n_neighbors does not take %s value, "
                            "enter integer value" % type(n_neighbors))

    if X is not None:
        query_is_train = False
        # Transpose to work correctly with distance functions. Done only
        # for a real query so that X=None reaches the branch below.
        X = X.transpose((0, 2, 1))
        X = check_array(X, accept_sparse="csr", allow_nd=True)
    else:
        query_is_train = True
        X = self._fit_X
        # Include an extra neighbor to account for the sample itself being
        # returned, which is removed later
        n_neighbors += 1

    train_size = self._fit_X.shape[0]
    if n_neighbors > train_size:
        raise ValueError("Expected n_neighbors <= n_samples, "
                         " but n_samples = %d, n_neighbors = %d" %
                         (train_size, n_neighbors))
    n_samples = X.shape[0]
    sample_range = np.arange(n_samples)[:, None]

    n_jobs = effective_n_jobs(self.n_jobs)
    if self._fit_method == "brute":
        reduce_func = partial(
            self._kneighbors_reduce_func,
            n_neighbors=n_neighbors,
            return_distance=return_distance,
        )

        # for efficiency, use squared euclidean distances
        kwds = ({"squared": True}
                if self.effective_metric_ == "euclidean"
                else self.effective_metric_params_)

        # Materialise the chunks: np.vstack needs a sequence, not a
        # generator.
        result = list(pairwise_distances_chunked(
            X, self._fit_X, reduce_func=reduce_func,
            metric=self.effective_metric_, n_jobs=n_jobs, **kwds))
    else:
        raise ValueError("internal: _fit_method not recognized")

    if return_distance:
        dist, neigh_ind = zip(*result)
        result = np.vstack(dist), np.vstack(neigh_ind)
    else:
        result = np.vstack(result)

    if not query_is_train:
        return result

    # If the query data is the same as the indexed data, we would like
    # to ignore the first nearest neighbor of every sample, i.e
    # the sample itself.
    if return_distance:
        dist, neigh_ind = result
    else:
        neigh_ind = result

    sample_mask = neigh_ind != sample_range

    # Corner case: When the number of duplicates are more
    # than the number of neighbors, the first NN will not
    # be the sample, but a duplicate.
    # In that case mask the first duplicate.
    dup_gr_nbrs = np.all(sample_mask, axis=1)
    sample_mask[:, 0][dup_gr_nbrs] = False

    neigh_ind = np.reshape(neigh_ind[sample_mask],
                           (n_samples, n_neighbors - 1))

    if return_distance:
        dist = np.reshape(dist[sample_mask],
                          (n_samples, n_neighbors - 1))
        return dist, neigh_ind
    return neigh_ind
def _optimize(self, x, y, random_state):
    """Optimise the prototypes and the relevance matrix with BFGS.

    The prototypes ``w_`` and the matrix ``omega_`` are stacked row-wise
    into one parameter vector and optimised in three consecutive phases:
    prototypes only, matrix only, then both together. Each phase starts
    from the result of the previous one. Afterwards ``omega_`` is scaled
    by the square root of the trace of ``omega_.T @ omega_`` and the
    largest iteration count of the three phases is stored in ``n_iter_``.

    Parameters
    ----------
    x : training data, passed through to ``_optfun`` / ``_optgrad``
    y : passed through as ``label_equals_prototype``
    random_state : numpy RandomState
        Used for a random initial ``omega_`` and passed to ``_optgrad``.

    Raises
    ------
    ValueError
        If ``regularization``, ``initialdim`` or ``initial_matrix`` are
        invalid.
    """
    if not isinstance(self.regularization,
                      float) or self.regularization < 0:
        raise ValueError("regularization must be a positive float ")
    nb_prototypes, nb_features = self.w_.shape
    if self.initialdim is None:
        self.dim_ = nb_features
    elif not isinstance(self.initialdim, int) or self.initialdim <= 0:
        raise ValueError("dim must be an positive int")
    else:
        self.dim_ = self.initialdim

    if self.initial_matrix is None:
        if self.dim_ == nb_features:
            self.omega_ = np.eye(nb_features)
        else:
            self.omega_ = random_state.rand(self.dim_, nb_features) * 2 - 1
    else:
        self.omega_ = validation.check_array(self.initial_matrix)
        if self.omega_.shape[1] != nb_features:
            raise ValueError(
                "initial matrix has wrong number of features\n"
                "found=%d\n"
                "expected=%d" % (self.omega_.shape[1], nb_features))

    label_equals_prototype = y
    method = 'bfgs'
    options = {'disp': self.display, 'gtol': self.gtol,
               'maxiter': self.max_iter}

    def objective(vs):
        return self._optfun(
            vs, x, label_equals_prototype=label_equals_prototype)

    def make_gradient(lr_prototypes, lr_relevances):
        # Bind the learning rates of one phase into a jac callable.
        def gradient(vs):
            return self._optgrad(
                vs, x, label_equals_prototype=label_equals_prototype,
                random_state=random_state,
                lr_prototypes=lr_prototypes,
                lr_relevances=lr_relevances)
        return gradient

    # The optimiser works on a flat vector; current SciPy rejects a 2-d x0.
    current = np.append(self.w_, self.omega_, axis=0).ravel()
    n_iter = 0
    # (lr_prototypes, lr_relevances) for the three phases.
    for lr_prototypes, lr_relevances in ((1, 0), (0, 1), (1, 1)):
        res = minimize(
            fun=objective,
            jac=make_gradient(lr_prototypes, lr_relevances),
            method=method, x0=current, options=options)
        n_iter = max(n_iter, res.nit)
        current = res.x

    out = current.reshape(current.size // nb_features, nb_features)
    self.w_ = out[:nb_prototypes]
    self.omega_ = out[nb_prototypes:]
    # np.math was removed from NumPy; np.sqrt gives the same scalar.
    self.omega_ /= np.sqrt(
        np.sum(np.diag(self.omega_.T.dot(self.omega_))))
    self.n_iter_ = n_iter
def transform(self, X, is_train_set=None):
    """Turn a data set into stacked features (predictions of the estimators).

    For the train set, every estimator contributes its out-of-fold
    predictions (OOF). For any other set:
        variant A: every estimator contributes the mean (mode) of the
            predictions made by its per-fold models
        variant B: every estimator contributes a single prediction

    Parameters
    ----------
    X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
        Input data

    is_train_set : boolean, default None
        Fallback parameter. In general case
        should not be used (should be None).
        Explicitly states whether ``X`` is the train set. When None this
        is decided by ``self._check_identity(X)``.

    Returns
    -------
    X_transformed : 2d numpy array of shape [n_samples, n_estimators] or
        [n_samples, n_estimators * n_classes]
        Out-of-fold predictions (OOF) for train set.
        Regular or bagged predictions for any other set.
        This is stacked features for next level.

    Raises
    ------
    ValueError
        If ``X`` is declared to be the train set but has a different shape,
        or if another set has an inconsistent number of features.
    """
    # Must be fitted first
    check_is_fitted(self, ['models_A_'])

    # ``check_estimator`` does not allow ``force_all_finite=False``
    X = check_array(X, accept_sparse=['csr'], force_all_finite=True)

    # A fitted instance is bound to the train set it was fitted on, so the
    # train set and all other sets are handled differently.
    if is_train_set is None:
        is_train_set = self._check_identity(X)

    if self.verbose > 0:
        if is_train_set:
            print('Train set was detected.')
        print('Transforming...\n')

    def columns_for(counter):
        # Output columns owned by block ``counter``: a run of
        # n_classes_implicit_ columns for probabilities, else one column.
        if 'predict_proba' == self.action_:
            first = counter * self.n_classes_implicit_
            return slice(first, first + self.n_classes_implicit_)
        return counter

    # ---------------------------------------------------------------------
    # Train set: out-of-fold predictions
    # ---------------------------------------------------------------------
    if is_train_set:
        # The caller may have declared a set of a different shape as train
        if self.train_shape_ != X.shape:
            raise ValueError('Train set must have the same shape '
                             'in order to be transformed.')

        S_train = np.zeros(
            (X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

        for est_no, (name, estimator) in enumerate(self.estimators_):
            if self.verbose > 0:
                print('estimator %2d: [%s: %s]'
                      % (est_no, name, estimator.__class__.__name__))

            for fold_no, (tr_index, te_index) in enumerate(
                    self.kf_.split(X, self._y_)):
                held_out = X[te_index]
                # The model of this fold predicts the part it has not seen
                S_train[te_index, columns_for(est_no)] = \
                    self._estimator_action(
                        self.models_A_[est_no][fold_no],
                        None, None, held_out,
                        action=self.action_,
                        transform=self.transform_pred)
                if self.verbose > 1:
                    print(' model from fold %2d: done' % fold_no)

            if self.verbose > 1:
                print(' ----')
            if self.verbose > 0:
                print(' DONE\n')

        # Class labels are returned as integers
        if not self.regression and not self.needs_proba:
            S_train = S_train.astype(int)

        return S_train

    # ---------------------------------------------------------------------
    # Any other set
    # ---------------------------------------------------------------------
    if X.shape[1] != self.n_features_:
        raise ValueError('Inconsistent number of features.')

    S_test = np.zeros(
        (X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

    for est_no, (name, estimator) in enumerate(self.estimators_):
        if self.verbose > 0:
            print('estimator %2d: [%s: %s]'
                  % (est_no, name, estimator.__class__.__name__))

        # Variant B: a single model per estimator predicts directly
        if self.variant not in ('A',):
            S_test[:, columns_for(est_no)] = self._estimator_action(
                self.models_B_[est_no], None, None, X,
                action=self.action_,
                transform=self.transform_pred)
            if self.verbose > 0:
                print(' DONE\n')
            continue

        # Variant A: collect the predictions of every per-fold model ...
        per_fold = np.zeros(
            (X.shape[0], self.n_folds * self.n_classes_implicit_))
        for fold_no, model in enumerate(self.models_A_[est_no]):
            per_fold[:, columns_for(fold_no)] = self._estimator_action(
                model, None, None, X,
                action=self.action_,
                transform=self.transform_pred)
            if self.verbose > 1:
                print(' model from fold %2d: done' % fold_no)

        if self.verbose > 1:
            print(' ----')

        # ... and reduce them to a mean or a mode (majority vote)
        if 'predict_proba' == self.action_:
            # Mean probability of each class across the folds
            for class_id in range(self.n_classes_implicit_):
                S_test[:, est_no * self.n_classes_implicit_ + class_id] = \
                    np.mean(
                        per_fold[:, class_id::self.n_classes_implicit_],
                        axis=1)
        elif self.regression:
            S_test[:, est_no] = np.mean(per_fold, axis=1)
        else:
            S_test[:, est_no] = st.mode(per_fold, axis=1)[0].ravel()

        if self.verbose > 0:
            print(' DONE\n')

    # Class labels are returned as integers
    if not self.regression and not self.needs_proba:
        S_test = S_test.astype(int)

    return S_test
def transform(self, X):
    """Validate ``X`` and hand it back as a CSR sparse matrix.

    Parameters
    ----------
    X : array-like
        Input data; it is passed through ``check_array`` first.

    Returns
    -------
    scipy.sparse.csr_matrix
        The validated input converted with ``sp.csr_matrix``.

    Raises
    ------
    ValueError
        If the second dimension of the validated input differs from
        ``self.X_shape_[1]``.
    """
    validated = check_array(X)
    n_columns = validated.shape[1]
    expected_columns = self.X_shape_[1]
    if n_columns != expected_columns:
        raise ValueError('Bad number of features')
    return sp.csr_matrix(validated)
def check_array(array, *args, **kwargs):
    """Validate inputs

    Parameters
    ----------
    array : object
        The input to validate. Dask arrays, dask dataframes and (when
        ``preserve_pandas_dataframe`` is set) pandas dataframes are handled
        here; everything else is handed to scikit-learn's ``check_array``.
    accept_dask_array : bool, default True
    preserve_pandas_dataframe : bool, default False
        If True, a pandas DataFrame is returned unchanged and unvalidated.
    accept_dask_dataframe : bool, default False
    accept_unknown_chunks : bool, default False
        For dask Arrays, whether to allow the `.chunks` attribute to contain
        any unknown values
    accept_multiple_blocks : bool, default False
        For dask Arrays, whether to allow multiple blocks along the second
        axis.
    *args, **kwargs : tuple, dict
        Passed through to scikit-learn

    Returns
    -------
    array : obj
        For dask arrays, dask dataframes and preserved pandas dataframes the
        input object itself; otherwise the result of scikit-learn's
        ``check_array``.

    Raises
    ------
    TypeError
        If a dask array is given but not accepted, has an unknown length
        along the first axis (unless ``accept_unknown_chunks``), or is
        chunked along the second axis (unless ``accept_multiple_blocks``);
        or if a dask dataframe is given but not accepted.

    Notes
    -----
    For dask.array, a small numpy array emulating ``array`` is created and
    passed to scikit-learn's ``check_array`` with all the additional
    arguments. The emulating array has at most 10 entries along the first
    axis for 1-D and 2-D inputs.
    """
    # Pull out the options that belong to this wrapper so that only
    # scikit-learn's own keyword arguments are forwarded.
    accept_dask_array = kwargs.pop("accept_dask_array", True)
    preserve_pandas_dataframe = kwargs.pop("preserve_pandas_dataframe", False)
    accept_dask_dataframe = kwargs.pop("accept_dask_dataframe", False)
    accept_unknown_chunks = kwargs.pop("accept_unknown_chunks", False)
    accept_multiple_blocks = kwargs.pop("accept_multiple_blocks", False)

    if isinstance(array, da.Array):
        if not accept_dask_array:
            raise TypeError("This estimator does not support dask arrays.")
        if not accept_unknown_chunks:
            if np.isnan(array.shape[0]):
                raise TypeError(
                    "Cannot operate on Dask array with unknown chunk sizes."
                )
        if not accept_multiple_blocks and array.ndim > 1:
            if len(array.chunks[1]) > 1:
                msg = (
                    "Chunking is only allowed on the first axis. "
                    "Use 'array.rechunk({1: array.shape[1]})' to "
                    "rechunk to a single block along the second axis."
                )
                raise TypeError(msg)

        # hmmm, we want to catch things like shape errors.
        # I'd like to make a small sample somehow
        shape = array.shape
        if len(shape) == 2:
            shape = (min(10, shape[0]), shape[1])
        elif len(shape) == 1:
            # The previous test ``shape == 1`` compared a tuple with an int
            # and was never true, so 1-D inputs produced a full-length sample.
            shape = (min(10, shape[0]),)

        sample = np.ones(shape=shape, dtype=array.dtype)
        sk_validation.check_array(sample, *args, **kwargs)
        return array

    elif isinstance(array, dd.DataFrame):
        if not accept_dask_dataframe:
            raise TypeError("This estimator does not support dask dataframes.")
        # TODO: sample?
        return array
    elif isinstance(array, pd.DataFrame) and preserve_pandas_dataframe:
        # TODO: validation?
        return array
    else:
        return sk_validation.check_array(array, *args, **kwargs)
def predict(self, X):
    """Return ``self.value_`` repeated once for every row of ``X``.

    ``X`` is validated with ``check_array``; only its first dimension is
    used to size the output array.
    """
    validated = check_array(X)
    constant = self.value_
    n_rows = validated.shape[0]
    return np.array([constant for _ in range(n_rows)])
def predict(self, X):
    """Validate ``X``, set ``self.key`` to 1000 and return a vector of ones.

    The returned array has one entry per row of the validated input.
    """
    validated = check_array(X)
    # NOTE: this assignment mutates the instance on every predict call.
    self.key = 1000
    n_rows = validated.shape[0]
    return np.ones(n_rows)
def fit(self, X, y=None):
    """Validate ``X`` and flag the instance as fitted.

    Parameters
    ----------
    X : array-like
        Input data; it is only passed through ``check_array``.
    y : object, optional
        Accepted but not used by this method.

    Returns
    -------
    self
    """
    # Validation only: the checked array is deliberately not kept.
    _ = check_array(X)
    self.is_fitted_ = True
    return self
def is_stationary(self, x):
    """Test whether the time series is stationary.

    The statistic is the first coefficient of an OLS regression divided by
    its ``HC0_se`` standard error; a p-value is then obtained by
    interpolating the hard-coded critical-value table below. The inline
    comments indicate the procedure was ported from R code.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The time series vector.

    Returns
    -------
    pval : float
        The interpolated p-value, or ``np.nan`` when ``self._base_case(x)``
        is falsy.
    is_stationary : bool
        ``pval < self.alpha``, or ``False`` when ``self._base_case(x)`` is
        falsy.
    """
    # Bail out early when the base-case check fails; note this happens
    # before any input validation.
    if not self._base_case(x):
        return np.nan, False

    # ensure vector
    x = column_or_1d(
        check_array(x, ensure_2d=False, dtype=DTYPE,
                    force_all_finite=True))  # type: np.ndarray

    # if k is none...
    # ...derive it from the series length: trunc((n_samples - 1) ** (1/3)).
    k = self.k
    if k is None:
        k = np.trunc(np.power(x.shape[0] - 1, 1 / 3.0))
    k = int(k) + 1

    # NOTE(review): `diff` and `_embed` are defined elsewhere; presumably
    # they mirror R's diff()/embed() -- confirm against their definitions.
    y = diff(x)
    n = y.shape[0]
    z = self._embed(y, k)
    yt = z[0, :]  # regression response: first row of the embedding
    tt = np.arange(k - 1, n)

    # R does [k:n].. but that's 1-based indexing and inclusive on the tail
    xt1 = x[tt]

    # make tt inclusive again (it was used as a mask before)
    tt += 1

    # the array that will create the LM:
    # columns are xt1, a column of ones, and tt
    _n = xt1.shape[0]
    X = np.hstack([
        xt1.reshape((_n, 1)),
        np.ones(_n).reshape((_n, 1)),
        tt.reshape((_n, 1))
    ])

    # With k > 1 the remaining embedding rows are appended as extra columns.
    if k > 1:
        yt1 = z[1:k, :]  # R had 2:k
        X = np.hstack([X, yt1.T])

    # fit the linear regression - this one is a bit strange in that we
    # are using OLS from statsmodels rather than LR from sklearn. This is
    # because we need the std errors, and sklearn does not have a way to
    # store them.
    res = sm.OLS(yt, X).fit()
    STAT = res.params[0] / res.HC0_se[0]  # FIXME: is the denom correct?...

    # Negated and transposed table of values. NOTE(review): assuming `c`
    # builds a 1-D array from its arguments (like R's c()), the table has one
    # row per entry of `tableT` and one column per entry of `tablep`.
    table = -np.array([
        c(4.38, 4.15, 4.04, 3.99, 3.98, 3.96),
        c(3.95, 3.80, 3.73, 3.69, 3.68, 3.66),
        c(3.60, 3.50, 3.45, 3.43, 3.42, 3.41),
        c(3.24, 3.18, 3.15, 3.13, 3.13, 3.12),
        c(1.14, 1.19, 1.22, 1.23, 1.24, 1.25),
        c(0.80, 0.87, 0.90, 0.92, 0.93, 0.94),
        c(0.50, 0.58, 0.62, 0.64, 0.65, 0.66),
        c(0.15, 0.24, 0.28, 0.31, 0.32, 0.33)
    ]).T

    tablen = table.shape[1]
    tableT = c(25, 50, 100, 250, 500, 100000)
    tablep = c(0.01, 0.025, 0.05, 0.10, 0.90, 0.95, 0.975, 0.99)
    tableipl = np.zeros(tablen)

    # For each table column, interpolate over `tableT` at xout=n.
    # NOTE: `pval` is only a temporary here; it is reassigned below.
    for i in range(tablen):
        _, pval = approx(tableT, table[:, i], xout=n, rule=2)
        tableipl[i] = pval

    # make sure to do 1 - x...
    _, interpol = approx(tableipl, tablep, xout=STAT, rule=2)
    pval = 1 - interpol[0]

    # in the R code, here is where the P value warning is tested again...
    return pval, pval < self.alpha