def precision_n_scores(y, y_pred, n=None):
    """Compute precision @ rank n for raw outlier scores.

    The scores are first binarised by ``get_label_n`` and the resulting
    labels are then compared against the ground truth with
    ``precision_score``.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        Ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        Raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        Number of outliers; forwarded unchanged to ``get_label_n``.

    Returns
    -------
    precision_at_rank_n : float
        Precision at rank n score.
    """
    # binarise the raw decision scores
    binary_labels = get_label_n(y, y_pred, n)

    # both vectors are forced to 1-d before scoring
    return precision_score(column_or_1d(y), column_or_1d(binary_labels))
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal function to check that input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
        The validated inputs, in the same order as the parameters. The
        ``X``/``y`` pairs are the outputs of ``check_X_y`` and the
        predictions are the outputs of ``column_or_1d``.

    Raises
    ------
    ValueError
        If ``X_train`` and ``X_test`` have a different number of columns.
    """
    # validate each (X, y) pair
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    # predictions must be 1-d and as long as their ground truth
    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)
    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    n_train_features = X_train.shape[1]
    n_test_features = X_test.shape[1]
    if n_train_features != n_test_features:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
def savings_score(y_true, y_pred, cost_mat):
    # TODO: update description
    """Savings score.

    Measures how much cost is saved by using ``y_pred`` on ``y_true`` under
    the cost matrix ``cost_mat``, relative to the cheaper of the two naive
    classifiers (predict all zeros, or predict all ones).

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_pred : array-like or label indicator matrix
        Predicted labels, as returned by a classifier.

    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem, where the columns
        represent the costs of: false positives, false negatives,
        true positives and true negatives, for each example.

    Returns
    -------
    score : float
        ``1 - cost(y_pred) / cost(naive)``. The best performance is 1.

    References
    ----------
    .. [1] A. Correa Bahnsen, A. Stojanovic, D.Aouada, B, Ottersten,
           `"Improving Credit Card Fraud Detection with Calibrated Probabilities" <http://albahnsen.com/files/%20Improving%20Credit%20Card%20Fraud%20Detection%20by%20using%20Calibrated%20Probabilities%20-%20Publish.pdf>`__,
           in Proceedings of the fourteenth SIAM International Conference
           on Data Mining, 677-685, 2014.

    See also
    --------
    cost_loss

    Examples
    --------
    >>> import numpy as np
    >>> from costcla.metrics import savings_score, cost_loss
    >>> y_pred = [0, 1, 0, 0]
    >>> y_true = [0, 1, 1, 0]
    >>> cost_mat = np.array([[4, 1, 0, 0], [1, 3, 0, 0], [2, 3, 0, 0], [2, 1, 0, 0]])
    >>> savings_score(y_true, y_pred, cost_mat)
    0.5
    """
    # TODO: Check consistency of cost_mat
    truth = column_or_1d(y_true)
    predicted = column_or_1d(y_pred)
    n_samples = len(truth)

    # cost of the two naive strategies; the cheaper one is the baseline
    cost_all_negative = cost_loss(truth, np.zeros(n_samples), cost_mat)
    cost_all_positive = cost_loss(truth, np.ones(n_samples), cost_mat)
    baseline_cost = min(cost_all_negative, cost_all_positive)

    model_cost = cost_loss(truth, predicted, cost_mat)
    return 1.0 - model_cost / baseline_cost
def evaluate_print(clf_name, y, y_pred):
    """Evaluate a detector and print the result (used by the examples).

    Prints the ROC AUC and the precision @ rank n, each rounded to four
    decimals.

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.
    """
    truth = column_or_1d(y)
    scores = column_or_1d(y_pred)
    check_consistent_length(truth, scores)

    roc = np.round(roc_auc_score(truth, scores), decimals=4)
    prn = np.round(precision_n_scores(truth, scores), decimals=4)

    report = '{clf_name} ROC:{roc}, precision @ rank n:{prn}'.format(
        clf_name=clf_name, roc=roc, prn=prn)
    print(report)
def _sigmoid_calibration(self, df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)

    Fits the two parameters of ``1 / (1 + exp(a * df + b))`` by minimising
    the (optionally weighted) cross-entropy against Platt's smoothed
    targets with BFGS.

    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.

    y : ndarray, shape (n_samples,)
        The targets. Values ``> 0`` are treated as the positive class,
        values ``<= 0`` as the negative class.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    a : float
        The slope.

    b : float
        The intercept.

    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    df = column_or_1d(df)
    y = column_or_1d(y)

    F = df  # F follows Platt's notations in the Reference Paper
    # The builtin ``float`` is used instead of the ``np.float`` alias, which
    # no longer exists in current NumPy releases; both denote float64.
    tiny = np.finfo(float).tiny  # to avoid division by 0 warning

    # Bayesian priors (see Platt end of section 2.2 in the Reference Paper)
    prior0 = float(np.sum(y <= 0))
    prior1 = y.shape[0] - prior0
    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[y <= 0] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def objective(AB):
        # From Platt (beginning of Section 2.2 in the Reference Paper)
        E = np.exp(AB[0] * F + AB[1])
        P = 1. / (1. + E)
        loss = -(T * np.log(P + tiny) + T1 * np.log(1. - P + tiny))
        if sample_weight is not None:
            return (sample_weight * loss).sum()
        return loss.sum()

    def grad(AB):
        # gradient of the objective function
        E = np.exp(AB[0] * F + AB[1])
        P = 1. / (1. + E)
        TEP_minus_T1P = P * (T * E - T1)
        if sample_weight is not None:
            TEP_minus_T1P *= sample_weight
        dA = np.dot(TEP_minus_T1P, F)
        dB = np.sum(TEP_minus_T1P)
        return np.array([dA, dB])

    # start from slope 0 and the log prior odds as intercept
    AB0 = np.array([0., math.log((prior0 + 1.) / (prior1 + 1.))])
    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
    return (AB_[0], AB_[1])
def _check_targets_hmc(y_true, y_pred):
    """Validate a pair of target vectors and return them as 1-d arrays.

    A binary/multiclass mix is treated as multiclass. Any other combination
    of target types, including binary/binary, is rejected.

    Parameters
    ----------
    y_true : array-like
        Ground truth labels.

    y_pred : array-like
        Predicted labels.

    Returns
    -------
    y_true, y_pred : 1-d arrays as returned by ``column_or_1d``.

    Raises
    ------
    ValueError
        If the combined target type is not exactly ``{"multiclass"}``.
    """
    check_consistent_length(y_true, y_pred)

    target_kinds = {type_of_target(y_true), type_of_target(y_pred)}
    # a binary/multiclass mix is handled as plain multiclass
    if target_kinds == {"binary", "multiclass"}:
        target_kinds = {"multiclass"}

    if target_kinds != {"multiclass"}:
        raise ValueError("{0} is not supported".format(target_kinds))

    return column_or_1d(y_true), column_or_1d(y_pred)
def brier_score_loss(y_true, y_prob):
    """Compute the Brier score

    The Brier score is the mean squared difference between the predicted
    probability and the actual outcome; smaller is better, hence "loss".
    For probabilities in [0, 1] and outcomes in {0, 1} it lies between
    zero and one.

    It is appropriate for binary and categorical outcomes that can be
    structured as true or false, but not for ordinal variables with three
    or more values, because it assumes all outcomes are equally "distant"
    from one another.

    Parameters
    ----------
    y_true : array, shape (n_samples,)
        True targets.

    y_prob : array, shape (n_samples,)
        Probabilities of the positive class.

    Returns
    -------
    score : float
        Brier score

    Examples
    --------
    >>> import numpy as np
    >>> from costcla.metrics import brier_score_loss
    >>> y_true = [0, 1, 1, 0]
    >>> y_prob = [0.1, 0.9, 0.8, 0.3]
    >>> brier_score_loss(y_true, y_prob)  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)
    0.0

    References
    ----------
    http://en.wikipedia.org/wiki/Brier_score
    """
    outcomes = column_or_1d(y_true)
    probabilities = column_or_1d(y_prob)
    squared_error = (outcomes - probabilities) ** 2
    return np.mean(squared_error)
def _check_clf_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task

    Binary and multiclass targets are brought to a common 1-d shape. A
    ValueError is raised when the two targets have different types (other
    than a binary/multiclass mix) or when the common type is not one of the
    supported ones.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multilabel-sequences',
                        'multiclass', 'binary'}
        The common target type, as output by
        ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix or sequence of sequences

    y_pred : array or indicator matrix or sequence of sequences
    """
    y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    kinds = {type_true, type_pred}
    # binary is a special case of multiclass
    if kinds == {"binary", "multiclass"}:
        kinds = {"multiclass"}

    if len(kinds) > 1:
        raise ValueError("Can't handle mix of {0} and {1}"
                         "".format(type_true, type_pred))

    # exactly one element is left at this point
    y_type = kinds.pop()

    # No metrics support "multiclass-multioutput" format
    supported = ("binary", "multiclass", "multilabel-indicator",
                 "multilabel-sequences")
    if y_type not in supported:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ("binary", "multiclass"):
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)

    return y_type, y_true, y_pred
def score_to_label(pred_scores, outliers_fraction=0.1):
    """Turn raw outlier scores into binary labels (0 or 1).

    Scores strictly above the ``100 * (1 - outliers_fraction)`` percentile
    are labelled 1, everything else 0.

    Parameters
    ----------
    pred_scores : list or numpy array of shape (n_samples,)
        Raw outlier scores. Outliers are assumed to have larger values.

    outliers_fraction : float in (0,1)
        Fraction of outliers.

    Returns
    -------
    outlier_labels : numpy array of shape (n_samples,)
        Integer array with 1 for observations considered outliers and 0
        otherwise.
    """
    # check input values
    scores = column_or_1d(pred_scores)
    check_parameter(outliers_fraction, 0, 1)

    cutoff = scoreatpercentile(scores, 100 * (1 - outliers_fraction))
    return (scores > cutoff).astype('int')
def average(scores, estimator_weight=None):
    """Combine the outlier scores of several estimators by averaging.

    Parameters
    ----------
    scores : numpy array of shape (n_samples, n_estimators)
        Score matrix from multiple estimators on the same samples.

    estimator_weight : list of shape (1, n_estimators)
        If specified, a weighted average is computed instead of a plain
        mean.

    Returns
    -------
    combined_scores : numpy array of shape (n_samples, )
        The combined outlier scores.
    """
    scores = check_array(scores)

    if estimator_weight is None:
        return np.mean(scores, axis=1).ravel()

    weights = column_or_1d(estimator_weight).reshape(1, -1)
    assert_equal(scores.shape[1], weights.shape[1])

    # (d1*w1 + d2*w2 + ...+ dn*wn)/(w1+w2+...+wn)
    weighted_total = np.sum(np.multiply(scores, weights), axis=1)
    combined = weighted_total / np.sum(weights)
    return combined.ravel()
def fit(self, T, y, sample_weight=None):
    """Fit a sigmoid calibrator on `T`, `y`.

    Parameters
    ----------
    * `T` [array-like, shape=(n_samples,)]:
        Training data.

    * `y` [array-like, shape=(n_samples,)]:
        Training target.

    * `sample_weight` [array-like, shape=(n_samples,), optional]:
        Weights, forwarded as-is to the underlying calibrator.

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    # the calibrator expects a flat vector of scores
    scores = column_or_1d(T)

    self.calibrator_ = _SigmoidCalibration()
    self.calibrator_.fit(scores, y, sample_weight=sample_weight)
    return self
def fit(self, T, y, sample_weight=None):
    """Fit using `T`, `y` as training data.

    One kernel density estimator is fitted on the scores of the samples
    with ``y == 0`` and another on those with ``y == 1``.

    Parameters
    ----------
    * `T` [array-like, shape=(n_samples,)]:
        Training data.

    * `y` [array-like, shape=(n_samples,)]:
        Training target. Only the values 0 and 1 are used.

    * `sample_weight` [None]:
        Must be `None`; weights are not supported by KernelDensity.

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    # Check input
    T = column_or_1d(T)
    # `y` has to be an array as well: with a plain list, `y == 0` is a single
    # boolean and would select no samples at all.
    y = column_or_1d(y)
    assert sample_weight is None  # not supported by KernelDensity

    # Fit one density per class
    t0 = T[y == 0]
    t1 = T[y == 1]

    self.calibrator0 = KernelDensity(bandwidth=self.bandwidth)
    self.calibrator1 = KernelDensity(bandwidth=self.bandwidth)
    self.calibrator0.fit(t0.reshape(-1, 1))
    self.calibrator1.fit(t1.reshape(-1, 1))

    return self
def fit(self, X, y, sample_weight=None, check_input=True):
    """Grid-search `tau` and `lamda`, then keep the best two-step classifier.

    The targets are binarised to -1/+1, an ``L1L2TwoStepClassifier`` is
    tuned with ``GridSearchCV`` over ``self.taus`` and ``self.lamdas``, and
    the attributes of the best estimator are copied onto ``self``.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values

    sample_weight : float or array-like of shape [n_samples]
        Sample weight, passed to the estimator through ``fit_params``.

    check_input : bool
        Passed to the estimator through ``fit_params``.

    Returns
    -------
    self : Returns self.

    Raises
    ------
    ValueError
        If the targets are of a multilabel type.
    """
    binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    self._label_binarizer = binarizer
    y = binarizer.fit_transform(y)
    if binarizer.y_type_.startswith('multilabel'):
        raise ValueError(
            "%s doesn't support multi-label classification" % (
                self.__class__.__name__))
    y = column_or_1d(y, warn=False)

    search_space = {'tau': self.taus, 'lamda': self.lamdas}
    extra_fit_args = {'sample_weight': sample_weight,
                      'check_input': check_input}

    base_estimator = L1L2TwoStepClassifier(
        mu=self.mu,
        fit_intercept=self.fit_intercept,
        use_gpu=self.use_gpu,
        threshold=self.threshold,
        normalize=self.normalize,
        precompute=self.precompute,
        max_iter=self.max_iter,
        copy_X=self.copy_X,
        tol=self.tol,
        warm_start=self.warm_start,
        positive=self.positive,
        random_state=self.random_state,
        selection=self.selection)

    search = GridSearchCV(
        estimator=base_estimator,
        param_grid=search_space,
        fit_params=extra_fit_args,
        cv=self.cv,
        scoring=self.scoring,
        n_jobs=self.n_jobs,
        iid=self.iid,
        refit=self.refit,
        verbose=self.verbose,
        pre_dispatch=self.pre_dispatch,
        error_score=self.error_score,
        return_train_score=self.return_train_score)
    search.fit(X, y)

    # expose the winning configuration on this object
    best = search.best_estimator_
    self.tau_ = best.tau
    self.lamda_ = best.lamda
    self.coef_ = best.coef_
    self.intercept_ = best.intercept_
    self.best_estimator_ = best  # XXX DEBUG

    # one row of coefficients per class, or a single row for <= 2 classes
    n_classes = self.classes_.shape[0]
    ndim = n_classes if n_classes > 2 else 1
    self.coef_ = self.coef_.reshape(ndim, -1)

    return self
def fit(self, X, y):
    """Fit the model to the data X and target y.

    The targets are binarised with a fresh ``LabelBinarizer`` before being
    handed to the parent class' ``fit``.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data, where n_samples in the number of samples
        and n_features is the number of features.

    y : numpy array of shape (n_samples)

    Returns
    -------
    self
    """
    y = column_or_1d(y, warn=True)

    # needs a better way to check multi-label instances
    first_target = np.reshape(y, (-1, 1))[0][0]
    self.multi_label = isinstance(first_target, list)

    self.classes_ = np.unique(y)

    self._lbin = LabelBinarizer()
    binarized = self._lbin.fit_transform(y)

    super(MultilayerPerceptronClassifier, self).fit(X, binarized)

    return self
def get_color_codes(y):
    """Internal function to build plotting color codes from binary labels.

    Entries equal to 1 (outliers) get ``'r'``; every other entry gets
    ``'b'``.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    Returns
    -------
    c : numpy array of shape (n_samples,)
        Color codes.
    """
    labels = column_or_1d(y)

    # start with everything blue, then paint the outliers red
    colors = np.full(len(labels), 'b', dtype=str)
    outlier_positions = np.flatnonzero(labels == 1)
    colors[outlier_positions] = 'r'
    return colors
def _validate_y(self, y):
    """Validate classification targets and encode them as class indices.

    Stores the sorted unique labels in ``self.classes_`` and their count in
    ``self.n_classes_``.

    Parameters
    ----------
    y : array-like
        Target labels.

    Returns
    -------
    y : numpy array
        For every sample, the index of its label in ``self.classes_``.
    """
    flat_y = column_or_1d(y, warn=True)
    check_classification_targets(flat_y)

    classes, encoded = np.unique(flat_y, return_inverse=True)
    self.classes_ = classes
    self.n_classes_ = len(classes)
    return encoded
def get_label_n(y, y_pred, n=None):
    """Turn raw outlier scores into binary labels by assigning 1 to the
    top n outlier scores.

    A score is labelled 1 when it is strictly greater than the
    ``100 * (1 - n / n_samples)`` percentile of all scores.

    Parameters
    ----------
    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    n : int, optional (default=None)
        The number of outliers. If not given, the number of non-zero
        entries of ``y`` is used.

    Returns
    -------
    labels : numpy array of shape (n_samples,)
        binary labels 0: normal points and 1: outliers

    Examples
    --------
    >>> from pyod.utils.utility import get_label_n
    >>> y = [0, 1, 1, 0, 0]
    >>> y_pred = [0.1, 0.5, 0.3, 0.2, 0.7]
    >>> get_label_n(y, y_pred)
    array([0, 1, 0, 0, 1])
    """
    # enforce formats of inputs
    truth = column_or_1d(y)
    scores = column_or_1d(y_pred)
    check_consistent_length(truth, scores)
    n_samples = len(truth)

    # number of outliers: given explicitly or inferred from the ground truth
    n_outliers = n if n is not None else np.count_nonzero(truth)
    outliers_fraction = n_outliers / n_samples

    cutoff = scoreatpercentile(scores, 100 * (1 - outliers_fraction))
    return (scores > cutoff).astype('int')
def _validate_y(self, y):
    """Validate targets for a binary classifier and encode them.

    Stores the sorted unique labels in ``self.classes_``.

    Parameters
    ----------
    y : array-like
        Target labels.

    Returns
    -------
    y : numpy array
        For every sample, the index of its label in ``self.classes_``.

    Raises
    ------
    ValueError
        If more than two distinct labels are present.
    """
    flat_y = column_or_1d(y, warn=True)
    check_classification_targets(flat_y)

    self.classes_, encoded = np.unique(flat_y, return_inverse=True)
    if len(self.classes_) > 2:
        raise ValueError("It's a binary classification algorithm. Use a dataset with only 2 classes to predict.")
    return encoded
def fit_all(self, X, y, n_shop, last_obs_plan):
    """Fit the gradient boosting model, optionally continuing a previous fit.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data; converted to a dense array of dtype ``DTYPE``.

    y : array-like, shape (n_samples,)
        Target values.

    n_shop : array-like
        Passed to ``get_truncated_shopping_indices`` (or only its length is
        used when ``self.fix_history`` is set) and to ``_fit_stages``.

    last_obs_plan : array-like
        Indexed with the selected shopping indices and fed to
        ``self.init_.predict`` to obtain the initial predictions.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the model is already initialised and ``n_estimators`` is smaller
        than the number of estimators fitted so far.
    """
    # if not warmstart - clear the estimator state
    if not self.warm_start:
        self._clear_state()

    # Check input
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    y = column_or_1d(y, warn=True)
    n_samples, n_features = X.shape
    self.n_features = n_features
    random_state = check_random_state(self.random_state)
    self._check_params()

    if not self._is_initialized():
        if self.verbose:
            # single-argument call: prints the same on Python 2 and 3
            print('Initializing gradient boosting...')
        # init state
        self._init_state()

        # fit initial model
        if not self.fix_history:
            idx = get_truncated_shopping_indices(n_shop)
        else:
            idx = np.arange(len(n_shop))

        # init predictions by averaging over the shopping histories
        y_pred = self.init_.predict(last_obs_plan[idx])
        first_accuracy = accuracy_score(y, y_pred.argmax(axis=1))
        # formatted into one string so the output is identical under the
        # Python 2 print statement and the Python 3 print function
        print('First training accuracy: %s' % (first_accuracy,))
        begin_at_stage = 0
    else:
        # add more estimators to fitted model
        # invariant: warm_start = True
        if self.n_estimators < self.estimators_.shape[0]:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'estimators_.shape[0]=%d when '
                             'warm_start==True'
                             % (self.n_estimators,
                                self.estimators_.shape[0]))
        begin_at_stage = self.estimators_.shape[0]
        y_pred = self.decision_function(X)
        self._resize_state()

    # fit the boosting stages
    n_stages = self._fit_stages(X, y, y_pred, random_state, begin_at_stage,
                                n_shop)

    # change shape of arrays after fit (early-stopping or additional tests)
    if n_stages != self.estimators_.shape[0]:
        self.estimators_ = self.estimators_[:n_stages]
        self.train_score_ = self.train_score_[:n_stages]
        if hasattr(self, 'oob_improvement_'):
            self.oob_improvement_ = self.oob_improvement_[:n_stages]
        if hasattr(self, '_oob_score_'):
            self._oob_score_ = self._oob_score_[:n_stages]

    return self
def partial_fit(self, X, y, classes=None):
    """Incrementally fit the model to the data X and target y.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data, where n_samples in the number of samples
        and n_features is the number of features.

    y : numpy array of shape (n_samples)
        Subset of the target values.

    classes : array, shape (n_classes)
        Classes across all calls to partial_fit.
        Can be obtained by via `np.unique(y_all)`, where y_all is the
        target vector of the entire dataset.
        This argument is required for the first call to partial_fit
        and can be omitted in the subsequent calls.
        Note that y doesn't need to contain all labels in `classes`.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.algorithm`` is not ``'sgd'``, if no classes are known
        yet and none are passed, or if the passed classes differ from the
        ones stored on a previous call.
    """
    if self.algorithm != 'sgd':
        raise ValueError("only SGD algorithm"
                         " supports partial fit")

    if classes is None:
        if self.classes_ is None:
            raise ValueError("classes must be passed on the first call "
                             "to partial_fit.")
    elif self.classes_ is None:
        # first call: remember the full label set
        self.classes_ = classes
    elif np.any(self.classes_ != np.unique(classes)):
        raise ValueError("`classes` is not the same as on last call "
                         "to partial_fit.")

    if not hasattr(self, '_lbin'):
        self._lbin = LabelBinarizer()
        self._lbin._classes = classes

    y = column_or_1d(y, warn=True)

    # needs a better way to check multi-label instances
    first_target = np.reshape(y, (-1, 1))[0][0]
    self.multi_label = isinstance(first_target, list)

    binarized = self._lbin.fit_transform(y)

    super(MultilayerPerceptronClassifier, self).partial_fit(X, binarized)

    return self
def fit(self, T, y, sample_weight=None):
    """Fit using `T`, `y` as training data.

    One histogram is fitted on the scores of the samples with ``y == 0``
    and another on those with ``y == 1``; both share the same bins and
    range.

    Parameters
    ----------
    * `T` [array-like, shape=(n_samples,)]:
        Training data.

    * `y` [array-like, shape=(n_samples,)]:
        Training target. Only the values 0 and 1 are used.

    * `sample_weight` [array-like, shape=(n_samples,), optional]:
        Weights. If `None`, no weights are passed to the histograms.

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    # Check input. `y` and `sample_weight` are coerced to arrays so that the
    # boolean masks below also work when plain lists are passed.
    T = column_or_1d(T)
    y = column_or_1d(y)
    is_class0 = y == 0
    is_class1 = y == 1

    t0 = T[is_class0]
    t1 = T[is_class1]

    sw0 = None
    sw1 = None
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        sw0 = sample_weight[is_class0]
        sw1 = sample_weight[is_class1]

    bins = self.bins
    if self.bins == "auto":
        bins = 10 + int(len(t0) ** (1. / 3.))

    # named `value_range` to avoid shadowing the builtin `range`
    value_range = self.range
    if self.range is None:
        # span of the observed scores, padded by eps and clipped to [0, 1]
        t_min = max(0, min(np.min(t0), np.min(t1)) - self.eps)
        t_max = min(1, max(np.max(t0), np.max(t1)) + self.eps)
        value_range = [(t_min, t_max)]

    # Fit
    self.calibrator0 = Histogram(bins=bins, range=value_range,
                                 interpolation=self.interpolation,
                                 variable_width=self.variable_width)
    self.calibrator1 = Histogram(bins=bins, range=value_range,
                                 interpolation=self.interpolation,
                                 variable_width=self.variable_width)

    self.calibrator0.fit(t0.reshape(-1, 1), sample_weight=sw0)
    self.calibrator1.fit(t1.reshape(-1, 1), sample_weight=sw1)

    return self
def fit(self, X, y):
    """Find the cut points of the continuous features in the input data.

    Parameters
    ----------
    X : The array containing features to be discretized. Continuous
        features should be specified by the `continuous_features`
        attribute if `X` is a 2-D array.

    y : A list or array of class labels corresponding to `X`.

    Returns
    -------
    self
        With ``cut_points_`` set: a dict keyed by column index for 2-D
        input, or the result for the single column for 1-D input.

    Raises
    ------
    ValueError
        If `X` has more than two dimensions, or if `X` is 1-D while
        continuous features were specified.
    """
    self.dimensions_ = len(X.shape)

    if self.dimensions_ > 2:
        raise ValueError("Invalid input dimension for `X`. Input shape is"
                         "{0}".format(X.shape))

    X = check_array(X, force_all_finite=True, ensure_2d=False)
    y = check_array(column_or_1d(y), ensure_2d=False, dtype=int)
    X, y = check_X_y(X, y)

    if self.shuffle:
        # apply one random permutation to samples and labels alike
        rng = check_random_state(self.random_state)
        order = rng.permutation(len(y))
        X, y = X[order], y[order]
    else:
        import warnings
        warnings.warn("Shuffle parameter will be removed in the future.",
                      DeprecationWarning)

    if self.dimensions_ != 2:
        # single column: it is discretized as a whole
        if self.continuous_features_ is not None:
            raise ValueError("Passed in a 1-d column of continuous features, "
                             "but continuous_features is not None")
        self.continuous_features_ = None
        self.cut_points_ = MDLPDiscretize(X, y, self.min_depth)
        return self

    # 2-D input: every column is continuous unless told otherwise
    if self.continuous_features_ is None:
        self.continuous_features_ = np.arange(X.shape[1])

    self.cut_points_ = dict()
    for index, col in enumerate(X.T):
        if index in self.continuous_features_:
            self.cut_points_[index] = MDLPDiscretize(col, y, self.min_depth)

    return self
def fit(self, T, y, sample_weight=None):
    """Fit an isotonic calibrator on `T`, `y`.

    Parameters
    ----------
    * `T` [array-like, shape=(n_samples,)]:
        Training data.

    * `y` [array-like, shape=(n_samples,)]:
        Training target.

    * `sample_weight` [array-like, shape=(n_samples,), optional]:
        Weights, forwarded as-is to the isotonic regression.

    Returns
    -------
    * `self` [object]:
        `self`.

    Notes
    -----
    When `self.interpolation` is set, two interpolators are built from the
    points of `T` where the fitted isotonic curve increases.
    """
    # Check input
    T = column_or_1d(T)

    # Fit isotonic regression
    self.ir_ = IsotonicRegression(y_min=self.y_min,
                                  y_max=self.y_max,
                                  increasing=self.increasing,
                                  out_of_bounds="clip")
    self.ir_.fit(T, y, sample_weight=sample_weight)

    if not self.interpolation:
        return self

    # Interpolators
    fitted = self.ir_.transform(T)

    # positions whose fitted value is larger than that of the predecessor,
    # and the same mask shifted one position to the left
    rises = (fitted - np.roll(fitted, 1)) > 0
    rises_shifted = np.roll(rises, -1)

    # both end points are always kept
    for keep in (rises, rises_shifted):
        keep[0] = True
        keep[-1] = True

    self.interp1_ = interp1d(T[rises], fitted[rises],
                             bounds_error=False,
                             fill_value=(0., 1.))
    self.interp2_ = interp1d(T[rises_shifted], fitted[rises_shifted],
                             bounds_error=False,
                             fill_value=(0., 1.))

    return self
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    """Weighted sum over groups of a two-sample CvM-type statistic.

    Each group's predictions are compared with ``_cvm_2samp_fast`` against
    the distribution of the predictions selected by ``mask``; the
    per-group values are summed using the group weights.

    Parameters
    ----------
    y_pred : array-like, shape (n_samples,)
        Predictions.

    mask : index or boolean array
        Selects the samples that form the global distribution.

    sample_weight : array-like or None
        Sample weights, normalised by ``check_sample_weight``.

    groups_indices : iterable of index arrays
        Indices of the samples belonging to each group.

    Returns
    -------
    result : float
        The weighted sum of the per-group statistics.
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)

    # the reference distribution is prepared once and shared by all groups
    global_data, global_weight, global_F = prepare_distribution(y_pred[mask], weights=sample_weight[mask])

    return sum(
        (weight * _cvm_2samp_fast(global_data, y_pred[members], global_weight,
                                  sample_weight[members], global_F)
         for members, weight in zip(groups_indices, group_weights)),
        0.)
def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None):
    """Per-sample hinge loss.

    Returns one loss value per sample (no averaging is performed here).
    ``sample_weight`` only takes part in the length consistency check.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True targets. With more than two distinct values the multiclass
        variant of the margin is used.

    pred_decision : array-like, shape (n_samples,) or (n_samples, n_classes)
        Predicted decision values.

    labels : array-like, optional
        All labels of the multiclass problem. Defaults to the unique values
        of ``y_true``.

    sample_weight : array-like, optional
        Checked for consistent length only.

    Returns
    -------
    losses : numpy array
        ``max(0, 1 - margin)`` for every sample.

    Raises
    ------
    ValueError
        In the multiclass case, if ``labels`` is omitted and ``y_true``
        does not contain as many labels as ``pred_decision`` has columns.
    TypeError
        In the binary case, if ``pred_decision`` cannot be multiplied with
        the encoded targets.
    """
    check_consistent_length(y_true, pred_decision, sample_weight)
    pred_decision = check_array(pred_decision, ensure_2d=False)
    y_true = column_or_1d(y_true)
    observed_labels = np.unique(y_true)

    if observed_labels.size > 2:
        labels_missing = (
            labels is None
            and pred_decision.ndim > 1
            and np.size(observed_labels) != pred_decision.shape[1])
        if labels_missing:
            raise ValueError("Please include all labels in y_true "
                             "or pass labels as third argument")
        if labels is None:
            labels = observed_labels

        encoder = LabelEncoder()
        encoder.fit(labels)
        y_true = encoder.transform(y_true)

        n_samples = y_true.shape[0]
        # True everywhere except at the column of each sample's true class
        is_other_class = np.ones_like(pred_decision, dtype=bool)
        is_other_class[np.arange(n_samples), y_true] = False

        # score of the true class minus the best score among the others
        margin = pred_decision[~is_other_class]
        margin -= np.max(pred_decision[is_other_class].reshape(n_samples, -1),
                         axis=1)
    else:
        # Handles binary class case
        # this code assumes that positive and negative labels
        # are encoded as +1 and -1 respectively
        pred_decision = np.ravel(column_or_1d(pred_decision))

        binarizer = LabelBinarizer(neg_label=-1)
        y_true = binarizer.fit_transform(y_true)[:, 0]

        try:
            margin = y_true * pred_decision
        except TypeError:
            raise TypeError("pred_decision should be an array of floats.")

    losses = 1 - margin
    # The hinge_loss doesn't penalize good enough predictions.
    losses[losses <= 0] = 0
    return losses
def fit(self, X, y, Q=None, monitor=None):
    """Fit the ranker on samples `X` with relevance `y` and query ids `Q`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. Must support indexing with an integer array when
        `Q` is given.

    y : array-like, shape (n_samples,)
        Relevance targets; transformed with ``self._gain`` before fitting.

    Q : array-like, shape (n_samples,), optional
        Query (group) id of every sample. Samples are reordered so that
        equal ids are contiguous. If `None`, all samples are treated as one
        single query.

    monitor : optional
        Forwarded to the parent class' ``fit``.

    Returns
    -------
    Whatever the parent class' ``fit`` returns.

    Raises
    ------
    ValueError
        If the query ids are not grouped together.
    """
    # Coerce to 1-d arrays up front so list inputs can be reordered below.
    y = column_or_1d(y, warn=True)

    if Q is None:
        # No query ids: every sample belongs to the same query. The original
        # `np.ones(Q)` could never work here because `Q` is None.
        Q = np.ones(len(y))
    else:
        Q = column_or_1d(Q, warn=True)
        order = np.argsort(Q)
        X, y, Q = X[order], y[order], Q[order]

    self.n_classes_ = 1

    # check if sample_group is grouped
    uniq_group = {Q[0]}
    last_group = Q[0]
    for g in Q[1:]:
        if g != last_group:
            # group must be unseen thus far
            if g in uniq_group:
                raise ValueError("queries must be grouped together")
            uniq_group.add(g)
            last_group = g
    self.n_uniq_group = len(uniq_group)

    y = self._gain(y)
    return super(LambdaMART, self).fit(X, y, monitor, sample_group=Q)
def __init__(self, filename='./corpus/train.csv'):
    """Load the training corpus and prepare a train/validation split.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read. It must contain a ``sentiment``
        column, which is used as the target; all other columns are used as
        features.
    """
    if os.path.exists(filename):
        data = pd.read_csv(filename)
        # NOTE(review): the shuffled frame is only stored on self.data; the
        # split below is built from the local `data` -- confirm this is
        # intended.
        self.data = shuffle(data)
        # features: every column except the target
        X_data = pd.DataFrame(data.drop('sentiment', axis=1))
        # target: the 'sentiment' column as a flat array
        Y_data = column_or_1d(data[:]['sentiment'], warn=True)
        # 70/30 split with a fixed seed
        self.X_train, self.X_val,\
            self.y_train, self.y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=1)
        self.model = None
        self.load_model()
        self.preprocessor = Preprocessor.Preprocessor()
    else:
        print('No Source!')
        # NOTE(review): self.preprocessor is not assigned anywhere on this
        # path within this method, so this call presumably raises
        # AttributeError unless it is a class-level attribute -- TODO confirm.
        # The nesting of this statement was reconstructed from a flattened
        # source line; verify against the original file.
        self.preprocessor.process_data()
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight):
    """Average Theil index of the group efficiencies over several cuts.

    For every target efficiency a cut on the predictions is computed; the
    efficiencies of all groups at that cut are then summarised with
    ``theil`` and the results are averaged over the cuts.

    Parameters
    ----------
    y_pred : array-like, shape (n_samples,)
        Predictions.

    mask : array
        Multiplied into the sample weights and passed to
        ``compute_cut_for_efficiency``.

    groups_indices : iterable of index arrays
        Indices of the samples belonging to each group.

    target_efficiencies : array-like
        Efficiencies for which cuts are computed.

    sample_weight : array-like or None
        Sample weights, normalised by ``check_sample_weight``.

    Returns
    -------
    float
        Mean of the per-cut Theil values.
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    groups_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(groups_indices, sample_weight=sample_weight * mask)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask, y_pred=y_pred, sample_weight=sample_weight)

    total = 0.
    for threshold in cuts:
        efficiencies = compute_group_efficiencies_by_indices(
            y_pred, groups_indices=groups_indices, cut=threshold,
            divided_weight=divided_weight)
        total += theil(efficiencies, groups_weights)

    return total / len(cuts)
def compute_sde_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight=None, power=2.):
    """Deviation of the group efficiencies, averaged over several cuts.

    For every target efficiency a cut on the predictions is computed; the
    weighted deviation of the group efficiencies at that cut is
    accumulated, averaged over the cuts, and the ``power``-th root of the
    average is returned.

    Parameters
    ----------
    y_pred : array-like, shape (n_samples,)
        Predictions.

    mask : array
        Multiplied into the sample weights and passed to
        ``compute_cut_for_efficiency``.

    groups_indices : iterable of index arrays
        Indices of the samples belonging to each group.

    target_efficiencies : array-like
        Efficiencies for which cuts are computed.

    sample_weight : array-like or None
        Sample weights, normalised by ``check_sample_weight``.

    power : float
        Exponent passed to ``weighted_deviation`` and inverted on return.

    Returns
    -------
    float
        ``(mean deviation over cuts) ** (1 / power)``.
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    group_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(groups_indices, sample_weight=sample_weight * mask)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask, y_pred=y_pred, sample_weight=sample_weight)

    accumulated = 0.
    for threshold in cuts:
        efficiencies = compute_group_efficiencies_by_indices(
            y_pred, groups_indices=groups_indices, cut=threshold,
            divided_weight=divided_weight)
        accumulated += weighted_deviation(efficiencies, weights=group_weights, power=power)

    mean_deviation = accumulated / len(cuts)
    return mean_deviation ** (1. / power)
def fit(self, X, y, sample_weight=None):
    """Fit Naive Bayes classifier according to X, y

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], optional
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    # np.float64 instead of the `np.float` alias, which no longer exists in
    # current NumPy releases; the resulting dtype is the same.
    X = X.astype(np.float64)
    y = column_or_1d(y, warn=True)
    _, n_features = X.shape

    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        # binary problem: add the complementary column for the other class
        Y = np.concatenate((1 - Y, Y), axis=1)

    # convert to float to support sample weight consistently
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        Y *= array2d(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    n_effective_classes = Y.shape[1]
    self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                   dtype=np.float64)
    self._count(X, Y)
    self._update_feature_log_prob()
    self._update_class_log_prior(class_prior=class_prior)
    return self
def remap_labels(
    y_true: Union[List, np.ndarray, pd.Series],
    y_pred: Union[List, np.ndarray, pd.Series],
    return_map: bool = False,
) -> np.ndarray:
    """
    Relabel one categorical labeling so that it agrees as much as possible
    with another labeling of the same samples.

    Both inputs are :math:`n`-length vectors assigning a category to each of
    :math:`n` samples. The categories of `y_pred` are kept intact, but the
    names used for them are permuted so that the largest possible number of
    samples carries the same label in `y_true` and in the result.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Reference labels (e.g. ground truth) whose label names are the
        target of the remapping.

    y_pred : array-like of shape (n_samples,)
        Labels to be renamed. The grouping of samples is preserved exactly;
        only the names of the groups change.

    return_map : bool, optional
        If True, additionally return the dictionary that sends each original
        `y_pred` label to its new label.

    Returns
    -------
    remapped_y_pred : np.ndarray of shape (n_samples,)
        `y_pred` with its category labels permuted to best match `y_true`.

    label_map : dict
        Original `y_pred` label -> new label. Only returned when
        `return_map` is True.

    Raises
    ------
    ValueError
        If either input is not a binary or multiclass target.
    TypeError
        If `return_map` is not a bool.

    Examples
    --------
    >>> y_true = np.array([0,0,1,1,2,2])
    >>> y_pred = np.array([2,2,1,1,0,0])
    >>> remap_labels(y_true, y_pred)
    array([0, 0, 1, 1, 2, 2])

    Notes
    -----
    The result is meaningful when the two labelings are reasonably similar
    (as measured, for instance, by
    :func:`sklearn.metrics.adjusted_rand_score`). For dissimilar labelings a
    sensible remapping may not exist. For example, if one category of
    `y_true` is split exactly in half into two categories of `y_pred`, there
    is no way to decide which of the two corresponds to the original one.
    """
    check_consistent_length(y_true, y_pred)

    accepted_types = {"binary", "multiclass"}
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)
    if not (type_true in accepted_types and type_pred in accepted_types):
        raise ValueError(
            "Elements of `y_true` and `y_pred` must represent a valid binary or "
            "multiclass labeling, see "
            "https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html"
            " for more information."
        )

    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)

    if not isinstance(return_map, bool):
        raise TypeError("return_map must be of type bool.")

    # Solve the assignment problem on the confusion matrix: the chosen
    # (row, column) pairs maximize the number of agreeing samples.
    all_labels = unique_labels(y_true, y_pred)
    agreement = confusion_matrix(y_true, y_pred, labels=all_labels)
    true_idx, pred_idx = linear_sum_assignment(agreement, maximize=True)

    old_labels = all_labels[pred_idx]
    new_labels = all_labels[true_idx]
    label_map = dict(zip(old_labels, new_labels))
    remapped_y_pred = np.vectorize(label_map.get)(y_pred)

    return (remapped_y_pred, label_map) if return_map else remapped_y_pred
def transform(target, y):
    """Encode ``y`` as insertion positions into ``target``.

    Each value of ``y`` is replaced by the index returned by
    ``np.searchsorted(target, value)``; values that are not contained in
    ``target`` are encoded as ``-1``.

    Parameters
    ----------
    target : array-like
        Reference values (``np.searchsorted`` expects them to be sorted).
    y : array-like
        Values to encode, coerced to 1d with ``column_or_1d``.

    Returns
    -------
    numpy array of integer positions, ``-1`` marking values absent from
    ``target``.
    """
    values = column_or_1d(y, warn=True)
    positions = np.searchsorted(target, values)
    is_known = np.isin(values, target)
    positions[~is_known] = -1
    return positions
def _validate_y(self, y):
    """Default target validation: coerce ``y`` to a 1d array.

    A column vector is accepted and flattened by ``column_or_1d``, which is
    asked to warn (``warn=True``) when it has to do so.
    """
    validated = column_or_1d(y, warn=True)
    return validated
from sklearn.utils import column_or_1d

# Load the score/grade table from disk.
dataset = pd.read_csv('./models/Prediction/dataset/score_and_grade.csv')

# Bin edges every 5 points, from 0 up to and including 100.
bins = list(range(0, 101, 5))
dataset["Grade Bins"] = pd.cut(dataset['Score'], bins=bins)

# First column is the feature matrix, the column at position 2 is the target.
x = dataset.iloc[:, :1].values
y = dataset.iloc[:, 2].values

# Encode the categorical target as integer class ids.
label_encoding_Y = LabelEncoder()
y = label_encoding_Y.fit_transform(column_or_1d(y, warn=True))

# Hold out a quarter of the observations for testing
# (100 of 400 observations -> test_size=0.25).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

# Standardize the features; the scaler is fitted on the training split only
# and then applied to both splits.
sc_X = StandardScaler()
scaler_x = sc_X.fit(x_train)
x_train = scaler_x.transform(x_train)
x_test = scaler_x.transform(x_test)
def fit(self, X, y, sample_weight=None, monitor=None):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    sample_weight : array-like, shape = (n_samples,), optional
        Weights given to each sample. If omitted, all samples have weight 1.

    monitor : callable, optional
        The monitor is called after each iteration with the current
        iteration, a reference to the estimator and the local variables of
        ``_fit_stages`` as keyword arguments ``callable(i, self,
        locals())``. If the callable returns ``True`` the fitting procedure
        is stopped. The monitor can be used for various things such as
        computing held-out estimates, early stopping, model introspect, and
        snapshoting.

    Returns
    -------
    self : object
        Returns self.
    """
    X, event, time = check_arrays_survival(
        X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
    n_samples, self.n_features_ = X.shape

    X = X.astype(DTYPE)
    # Remember whether weights were supplied: the initial estimator is only
    # given weights when the caller actually passed some.
    sample_weight_is_none = sample_weight is None
    if sample_weight_is_none:
        sample_weight = numpy.ones(n_samples, dtype=numpy.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)

    check_consistent_length(X, sample_weight)

    self._check_params()

    # These two losses are fitted on the logarithm of the time.
    if isinstance(self.loss_, (CensoredSquaredLoss, IPCWLeastSquaresError)):
        time = numpy.log(time)

    self._init_state()
    if sample_weight_is_none:
        self.init_.fit(X, (event, time))
    else:
        self.init_.fit(X, (event, time), sample_weight)

    raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)
    begin_at_stage = 0

    # The rng state must be preserved if warm_start is True
    self._rng = check_random_state(self.random_state)

    if self.presort is True and issparse(X):
        raise ValueError(
            "Presorting is not supported for sparse matrices.")

    presort = self.presort
    # Allow presort to be 'auto', which means True if the dataset is dense,
    # otherwise it will be False.
    if presort == 'auto':
        presort = not issparse(X)

    X_idx_sorted = None
    if presort:
        X_idx_sorted = numpy.asfortranarray(numpy.argsort(X, axis=0),
                                            dtype=numpy.int32)

    # fit the boosting stages
    # The builtin ``bool`` is used for the event field: the ``numpy.bool``
    # alias does not exist in NumPy 1.24-1.26, while ``bool`` yields the
    # same dtype on every NumPy version.
    y = numpy.fromiter(zip(event, time),
                       dtype=[('event', bool), ('time', numpy.float64)])
    n_stages = self._fit_stages(X, y, raw_predictions, sample_weight,
                                self._rng, begin_at_stage, monitor,
                                X_idx_sorted)

    # change shape of arrays after fit (early-stopping or additional tests)
    if n_stages != self.estimators_.shape[0]:
        self.estimators_ = self.estimators_[:n_stages]
        self.train_score_ = self.train_score_[:n_stages]
        if hasattr(self, 'oob_improvement_'):
            self.oob_improvement_ = self.oob_improvement_[:n_stages]

    self.n_estimators_ = n_stages
    return self
def _validate_y(self, y):
    """Validate ``y`` and encode it as integer class indices.

    ``y`` is coerced to 1d, its sorted unique values are stored in
    ``self.classes_`` and their number in ``self.n_classes_``.

    Returns
    -------
    numpy array in which every entry of ``y`` is replaced by the index of
    its value within ``self.classes_``.
    """
    flat_y = column_or_1d(y, warn=True)
    classes, encoded = np.unique(flat_y, return_inverse=True)
    self.classes_ = classes
    self.n_classes_ = len(self.classes_)
    return encoded
def fit(self, y):
    """Check that ``y`` is 1d (or a column vector) and return ``self``.

    Nothing is learned from ``y``; ``column_or_1d`` is called only for its
    validation (it raises on inputs of the wrong shape and warns when a
    column vector is passed).
    """
    column_or_1d(y, warn=True)
    return self
def _binary_clf_curve2(y_true, y_score, pos_label=None, sample_weight=None):
    """
    MODIFIED VERSION OF SCIKIT-LEARN API

    Calculate true and false positives per binary classification threshold.

    Unlike the scikit-learn original, NaN scores are accepted: they are
    treated as negative infinity, i.e. as the lowest possible score. The
    caller's ``y_score`` is never modified.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given
        by fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.

    Raises
    ------
    ValueError
        If ``y_true`` is not a supported target type, or if ``pos_label`` is
        not given and ``y_true`` is not in {0, 1} or {-1, 1}.

    Example
    -------
    >>> y_true  = [      1,   1,   1,   1,   1,   1,   0]
    >>> y_score = [ np.nan, 0.2, 0.3, 0.4, 0.5, 0.6, 0.3]
    >>> sample_weight = None
    >>> pos_label = None
    >>> fps, tps, thresholds = _binary_clf_curve2(y_true, y_score)
    """
    import numpy as np
    from sklearn.utils import assert_all_finite
    from sklearn.utils import column_or_1d
    from sklearn.utils import check_consistent_length
    from sklearn.utils.multiclass import type_of_target
    from sklearn.utils.extmath import stable_cumsum

    # Check to make sure y_true is valid
    y_type = type_of_target(y_true)
    if not (y_type == "binary" or
            (y_type == "multiclass" and pos_label is not None)):
        raise ValueError("{0} format is not supported".format(y_type))

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    # y_score is deliberately not checked for finiteness: NaN is allowed.

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = np.unique(y_true)
    if (pos_label is None and (
            classes.dtype.kind in ('O', 'U', 'S') or
            not (np.array_equal(classes, [0, 1]) or
                 np.array_equal(classes, [-1, 1]) or
                 np.array_equal(classes, [0]) or
                 np.array_equal(classes, [-1]) or
                 np.array_equal(classes, [1])))):
        classes_repr = ", ".join(repr(c) for c in classes)
        raise ValueError("y_true takes value in {{{classes_repr}}} and "
                         "pos_label is not specified: either make y_true "
                         "take value in {{0, 1}} or {{-1, 1}} or "
                         "pass pos_label explicitly.".format(
                             classes_repr=classes_repr))
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # Transform nans into negative infinity. column_or_1d may hand back a
    # view of the caller's array, so the replacement is done on a private
    # copy to avoid silently overwriting the caller's data.
    nan_flags = np.isnan(y_score)
    if nan_flags.any():
        y_score = y_score.copy()
        y_score[nan_flags] = -np.inf

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    with np.errstate(invalid="ignore"):
        # (-inf) - (-inf) is NaN and would emit a RuntimeWarning.
        y_diff = np.diff(y_score)
    # Set difference between -inf to zero
    fix_flags = np.isinf(y_score[:-1]) & np.isnan(y_diff)
    y_diff[fix_flags] = 0
    distinct_value_indices = np.where(y_diff)[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        # express fps as a cumsum to ensure fps is increasing even in
        # the presence of floating point errors
        fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs]
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]
def fit(self, X, y):
    """Train the model on ``X``/``y`` through the ``thundersvm`` library.

    After a successful training the results are read back from the library
    handle into numpy attributes: ``support_vectors_``, ``support_``,
    ``dual_coef_``, ``intercept_``, ``n_support_``, ``shape_fit_`` and, for
    the ``'linear'`` kernel only, ``coef_``.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training data; validated by ``check_X_y`` as float64, C-ordered,
        with CSR accepted for sparse input.
    y : array-like
        Targets; converted to a 1d float64 array.

    Returns
    -------
    self on success.

    NOTE(review): on training failure the method prints a message and
    returns ``None`` instead of ``self``; on an unknown kernel it prints a
    message and calls ``exit()``, terminating the interpreter.
    """
    # Release the handle left over from a previous fit before creating a
    # new one.
    if self.model is not None:
        thundersvm.model_free(c_void_p(self.model))
        self.model = None
    sparse = sp.isspmatrix(X)
    # The sparse code path is only taken for sparse input with a
    # non-callable kernel.
    self._sparse = sparse and not callable(self.kernel)
    X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
    y = column_or_1d(y, warn=True).astype(np.float64)

    # Index of the configured implementation within SVM_TYPE.
    solver_type = SVM_TYPE.index(self._impl)

    # 'auto' resolves gamma to 1 / n_features.
    if self.gamma == 'auto':
        self._gamma = 1.0 / X.shape[1]
    else:
        self._gamma = self.gamma
    if self.kernel not in KERNEL_TYPE:
        print(
            "The kernel parameter not recognized, please refer to the document."
        )
        # NOTE(review): ``exit()`` raises SystemExit from library code.
        exit()
    else:
        kernel = KERNEL_TYPE.index(self.kernel)

    fit = self._sparse_fit if self._sparse else self._dense_fit
    # Declare the return type so that ctypes returns the handle as a pointer.
    thundersvm.model_new.restype = c_void_p
    self.model = thundersvm.model_new(solver_type)
    # -1 means "no explicit memory limit configured".
    if self.max_mem_size != -1:
        thundersvm.set_memory_size(c_void_p(self.model), self.max_mem_size)
    fit(X, y, solver_type, kernel)
    if self._train_succeed[0] == -1:
        print("Training failed!")
        return
    # NOTE(review): self.n_features and self.n_classes are not assigned in
    # this method; presumably set by _dense_fit/_sparse_fit - confirm.
    self.n_sv = thundersvm.n_sv(c_void_p(self.model))
    # ctypes output buffers for the support vectors in CSR layout; they are
    # sized for the dense worst case (n_sv * n_features entries).
    csr_row = (c_int * (self.n_sv + 1))()
    csr_col = (c_int * (self.n_sv * self.n_features))()
    csr_data = (c_float * (self.n_sv * self.n_features))()
    data_size = (c_int * 1)()
    sv_indices = (c_int * self.n_sv)()
    thundersvm.get_sv(csr_row, csr_col, csr_data, data_size, sv_indices,
                      c_void_p(self.model))
    self.row = np.frombuffer(csr_row, dtype=np.int32)
    # Only the first data_size[0] entries of the column/data buffers are kept.
    self.col = np.frombuffer(csr_col, dtype=np.int32)[:data_size[0]]
    self.data = np.frombuffer(csr_data, dtype=np.float32)[:data_size[0]]

    self.support_vectors_ = sp.csr_matrix((self.data, self.col, self.row))
    # Dense training data -> expose dense support vectors.
    if not self._sparse:
        self.support_vectors_ = self.support_vectors_.toarray(order='C')
    self.support_ = np.frombuffer(sv_indices, dtype=np.int32).astype(int)

    # Dual coefficients, reshaped to (n_classes - 1, n_sv).
    dual_coef = (c_float * ((self.n_classes - 1) * self.n_sv))()
    thundersvm.get_coef(dual_coef, self.n_classes, self.n_sv,
                        c_void_p(self.model))
    self.dual_coef_ = np.frombuffer(dual_coef, dtype=np.float32)\
        .astype(float)\
        .reshape((self.n_classes - 1, self.n_sv))

    # One intercept per pair of classes: n_classes * (n_classes - 1) / 2.
    rho_size = int(self.n_classes * (self.n_classes - 1) / 2)
    self.n_binary_model = rho_size
    rho = (c_float * rho_size)()
    thundersvm.get_rho(rho, rho_size, c_void_p(self.model))
    self.intercept_ = np.frombuffer(rho, dtype=np.float32).astype(float)

    # Primal coefficients are only read back for the linear kernel.
    if self.kernel == 'linear':
        coef = (c_float * (self.n_binary_model * self.n_features))()
        thundersvm.get_linear_coef(coef, self.n_binary_model,
                                   self.n_features, c_void_p(self.model))
        self.coef_ = np.frombuffer(coef, dtype=np.float32)\
            .astype(float)\
            .reshape((self.n_binary_model, self.n_features))

    # One entry per class.
    n_support_ = (c_int * self.n_classes)()
    thundersvm.get_support_classes(n_support_, self.n_classes,
                                   c_void_p(self.model))
    self.n_support_ = np.frombuffer(n_support_, dtype=np.int32).astype(int)

    # Remember the shape of the training data.
    self.shape_fit_ = X.shape

    return self
def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
    """Compute the Brier score.

    The Brier score is a loss: smaller values are better. Over a set of N
    predictions it is the mean squared difference between the probability
    predicted for an item and the item's actual outcome, so a lower score
    means better calibrated predictions. Because probabilities lie in
    [0, 1] and outcomes are either 0 or 1, the score itself always lies
    between zero and one.

    The score suits binary and categorical outcomes that can be framed as
    true/false. It is not appropriate for ordinal variables with three or
    more values, since it treats all outcomes as equally "distant" from one
    another.

    The label regarded as positive is selected with ``pos_label``.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    y_true : array, shape (n_samples,)
        True targets.

    y_prob : array, shape (n_samples,)
        Probabilities of the positive class.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    pos_label : int (default: None)
        Label of the positive class. If None, the maximum label is used as
        positive class

    Returns
    -------
    score : float
        Brier score

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import brier_score_loss
    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"])
    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
    >>> brier_score_loss(y_true, y_prob)  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, 1-y_prob, pos_label=0)  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true_categorical, y_prob,
    ...                  pos_label="ham")  # doctest: +ELLIPSIS
    0.037...
    >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)
    0.0

    References
    ----------
    http://en.wikipedia.org/wiki/Brier_score
    """
    y_true = column_or_1d(y_true)
    y_prob = column_or_1d(y_prob)

    # Without an explicit positive label, fall back to the largest one.
    positive = y_true.max() if pos_label is None else pos_label

    # Binarize the targets: 1 for the positive class, 0 otherwise.
    is_positive = np.array(y_true == positive, int)
    is_positive = _check_binary_probabilistic_predictions(is_positive, y_prob)

    squared_error = (is_positive - y_prob) ** 2
    return np.average(squared_error, weights=sample_weight)
def fit(self, X, y=None):
    """Check that ``X`` is 1d (or a column vector) and return ``self``.

    Nothing is learned; ``column_or_1d`` is called only to validate the
    shape of ``X``. ``y`` is ignored.
    """
    column_or_1d(X, warn=True)
    return self
def transform(self, X):
    """Slice every element of ``X`` with ``[self.begin:self.end]``.

    ``X`` is coerced to 1d, the slice is applied to each element through
    ``eval_rows`` and the result is passed through ``_col2d`` before being
    returned.
    """
    values = column_or_1d(X, warn=True)

    def take_slice(item):
        # Bounds are read from the instance at call time.
        return item[self.begin:self.end]

    sliced = eval_rows(values, take_slice)
    return _col2d(sliced)
def fit(self, y):
    """Learn the distinct non-missing values of ``y``.

    ``y`` is coerced to 1d; entries that ``pandas`` regards as null are
    dropped, and the sorted unique remaining values are stored in
    ``self.classes_``.

    Returns
    -------
    self
    """
    values = column_or_1d(y, warn=True)
    present = values[pandas.notnull(values)]
    self.classes_ = numpy.unique(present)
    return self
def VOC_prec_recall_curve(y_true, y_score, sample_weight=None):
    '''The unstable version by VOC people.

    Computes a precision-recall curve in which every sample is its own
    threshold (tied scores are NOT merged) and in which the inputs are
    reversed before the stable sort, for consistency with the buggy MATLAB
    version.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Binary targets taking values in {0, 1} or {-1, 1}; 1 is the
        positive class.
    y_score : array-like of shape (n_samples,)
        Scores; must be finite.
    sample_weight : array-like of shape (n_samples,), optional
        Sample weights.

    Returns
    -------
    precision : array
        Precision values, with a final 1 appended.
    recall : array
        Decreasing recall values, with a final 0 appended.
    thresholds : array
        Score thresholds corresponding to the precision/recall values
        (without the appended point).

    Raises
    ------
    ValueError
        If the inputs have inconsistent lengths or ``y_true`` is not binary.

    This function heavily copies from scikit-learn's stable code. Licence:
    # Authors: Alexandre Gramfort <*****@*****.**>
    #          Mathieu Blondel <*****@*****.**>
    #          Olivier Grisel <*****@*****.**>
    #          Arnaud Joly <*****@*****.**>
    #          Jochen Wersdorfer <*****@*****.**>
    #          Lars Buitinck
    #          Joel Nothman <*****@*****.**>
    #          Noel Dawe <*****@*****.**>
    # License: BSD 3 clause
    '''
    from sklearn.utils import assert_all_finite, check_consistent_length, column_or_1d
    from sklearn.utils.extmath import stable_cumsum

    # sample_weight (when given) must be as long as the other inputs,
    # otherwise the fancy indexing below would silently pick wrong weights.
    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if not (np.array_equal(classes, [0, 1]) or
            np.array_equal(classes, [-1, 1]) or
            np.array_equal(classes, [0]) or
            np.array_equal(classes, [-1]) or
            np.array_equal(classes, [1])):
        raise ValueError("Data is not binary and pos_label is not specified")
    else:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    # first flip for consistency with buggy MATLAB version
    y_score = y_score[::-1]
    y_true = y_true[::-1]
    if sample_weight is not None:
        # The weights must be flipped together with the samples; otherwise
        # the sort indices computed on the flipped scores would attach each
        # weight to the wrong sample.
        sample_weight = sample_weight[::-1]
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # scikit-learn merges tied scores into a single threshold here.
    # VOC ignores this: every sample index is a threshold.
    threshold_idxs = np.arange(y_true.size)

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        fps = stable_cumsum(weight)[threshold_idxs] - tps
    else:
        fps = 1 + threshold_idxs - tps
    thresholds = y_score[threshold_idxs]

    # now copying from the caller
    precision = tps / (tps + fps)
    recall = tps / tps[-1]

    # stop when full recall attained
    # and reverse the outputs so recall is decreasing
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]
def transform(x):
    """Look up every entry of ``x`` in ``described_solvents``.

    ``x`` is coerced to 1d; each of its entries is used as a key into
    ``described_solvents`` and the looked-up rows are assembled into a
    ``DataFrame`` whose columns are ``header``.
    """
    keys = column_or_1d(x, warn=True)
    rows = [described_solvents[key] for key in keys]
    return DataFrame(rows, columns=header)
def linear_regr(X_train, y_train, X_test, y_test, poly_degree,
                interaction_only, print_coef, plot, ask_user, model_result):
    """Expand the features polynomially, then fit and evaluate regressors.

    The training and test matrices are expanded with ``PolynomialFeatures``
    (fitted on the training set only), all four data sets are dumped to CSV
    files in the current directory, and every selected model is fitted,
    timed and handed to ``evaluation``.

    Parameters
    ----------
    X_train, y_train : training samples and targets.
    X_test, y_test : test samples and targets.
    poly_degree : int
        Degree of the polynomial feature expansion.
    interaction_only : bool
        Forwarded to ``PolynomialFeatures``.
    print_coef, plot, ask_user :
        Forwarded unchanged to ``evaluation``.
    model_result :
        Accumulated results; threaded through every ``evaluation`` call.

    Returns
    -------
    model_result as returned by the last ``evaluation`` call (or the input
    value if no model was evaluated).

    Raises
    ------
    SystemExit
        If a selected model index is not in 0..4.
    """
    # create more features
    poly = preprocessing.PolynomialFeatures(poly_degree,
                                            interaction_only=interaction_only)
    X_train = poly.fit_transform(X_train)
    # The test set must go through the transformer fitted on the training
    # set; re-fitting on the test set would hide a feature-count mismatch.
    X_test = poly.transform(X_test)
    (s_n, f_n) = X_train.shape
    # Width of every hidden layer of the neural networks below.
    l_n = int(math.ceil(1.2 * f_n))
    print("@@@ s_n = {}, f_n = {}, l_n = {}".format(s_n, f_n, l_n))

    np.savetxt("x_train.csv", X_train, delimiter=",")
    np.savetxt("y_train.csv", y_train, delimiter=",")
    np.savetxt("x_test.csv", X_test, delimiter=",")
    np.savetxt("y_test.csv", y_test, delimiter=",")
    print("### type of X_train = {}".format(type(X_train)))  # debug

    def _fit_and_evaluate(estimator, model_name, alpha, model_result):
        # Time the fit (including the 1d conversion of y_train), then
        # delegate the reporting to evaluation().
        model_rt_start = timeit.default_timer()
        estimator.fit(X_train, column_or_1d(y_train))
        model_rt_stop = timeit.default_timer()
        model_runtime = model_rt_stop - model_rt_start
        return evaluation(X_train, y_train, X_test, y_test, poly_degree,
                          interaction_only, print_coef, plot, ask_user,
                          model_result, model_name, model_runtime, estimator,
                          alpha)

    # Model selection -- linear regr: [0, 1, 2]  NN: [3, 4].
    # Running all of them has a very long runtime.
    for model in [2]:
        if model == 0:
            # test score: 0.84
            # ``normalize`` is not passed: False was its default and the
            # parameter no longer exists in scikit-learn >= 1.2.
            regr = linear_model.LinearRegression(copy_X=True,
                                                 fit_intercept=True,
                                                 n_jobs=1)
            model_result = _fit_and_evaluate(
                regr, "linear_model.LinearRegression", 0, model_result)
        elif model == 1:
            for alpha in [0.0001, 0.001, 0.01, 0.1, 1, 3, 10]:
                # test score: 0.83
                regr_lasso = linear_model.Lasso(alpha=alpha)
                model_result = _fit_and_evaluate(
                    regr_lasso, "linear_model.Lasso", alpha, model_result)
        elif model == 2:
            for alpha in [0.0001, 0.001, 0.01, 0.1, 1, 3, 10]:
                # test score: 0.84
                regr_ridge = linear_model.Ridge(alpha=alpha)
                model_result = _fit_and_evaluate(
                    regr_ridge, "linear_model.Ridge", alpha, model_result)
        elif model == 3:
            if poly_degree <= 2:
                for alpha in [0.0001, 0.01, 1]:
                    for layer_n in [3, 7, 11]:
                        # test score: 0.83, runtime longer
                        model_name = ("neural_network.MLPRegressor, layer = "
                                      + str(layer_n))
                        # layer_n hidden layers of l_n units each
                        regr = neural_network.MLPRegressor(
                            random_state=True,
                            hidden_layer_sizes=(l_n,) * layer_n,
                            alpha=alpha)
                        model_result = _fit_and_evaluate(
                            regr, model_name, alpha, model_result)
        elif model == 4:
            if poly_degree <= 3:
                for alpha in [1, 10, 1000]:
                    for layer_n in [7, 11]:
                        # test score: 0.83, runtime longer
                        model_name = ("neural_network.MLPRegressor, layer = "
                                      + str(layer_n))
                        # Only the 7-layer network uses the 'invscaling'
                        # learning-rate schedule.
                        extra = ({'learning_rate': 'invscaling'}
                                 if layer_n == 7 else {})
                        regr = neural_network.MLPRegressor(
                            random_state=True,
                            hidden_layer_sizes=(l_n,) * layer_n,
                            alpha=alpha,
                            **extra)
                        model_result = _fit_and_evaluate(
                            regr, model_name, alpha, model_result)
        else:
            raise SystemExit("Model selection out of range!!!")

    return model_result
def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None,
                                labels=None, samplewise=False):
    """Compute a confusion matrix for each class or sample

    .. versionadded:: 0.21

    Compute class-wise (default) or sample-wise (samplewise=True) multilabel
    confusion matrix to evaluate the accuracy of a classification, and output
    confusion matrices for each class or sample.

    In multilabel confusion matrix :math:`MCM`, the count of true negatives
    is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,
    true positives is :math:`MCM_{:,1,1}` and false positives is
    :math:`MCM_{:,0,1}`.

    Multiclass data will be treated as if binarized under a one-vs-rest
    transformation. Returned confusion matrices will be in the order of
    sorted unique labels in the union of (y_true, y_pred).

    Read more in the :ref:`User Guide <multilabel_confusion_matrix>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        of shape (n_samples, n_outputs) or (n_samples,)
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        of shape (n_samples, n_outputs) or (n_samples,)
        Estimated targets as returned by a classifier

    sample_weight : array-like of shape = (n_samples,), optional
        Sample weights

    labels : array-like
        A list of classes or column indices to select some (or to force
        inclusion of classes absent from the data)

    samplewise : bool, default=False
        In the multilabel case, this calculates a confusion matrix per sample

    Returns
    -------
    multi_confusion : array, shape (n_outputs, 2, 2)
        A 2x2 confusion matrix corresponding to each output in the input.
        When calculating class-wise multi_confusion (default), then
        n_outputs = n_labels; when calculating sample-wise multi_confusion
        (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,
        the results will be returned in the order specified in ``labels``,
        otherwise the results will be returned in sorted order by default.

    Raises
    ------
    ValueError
        If the target type is not binary, multiclass or
        multilabel-indicator; if ``samplewise`` is requested for 1d targets;
        or if ``labels`` is out of range for multilabel targets.

    See also
    --------
    confusion_matrix

    Notes
    -----
    The multilabel_confusion_matrix calculates class-wise or sample-wise
    multilabel confusion matrices, and in multiclass tasks, labels are
    binarized under a one-vs-rest way; while confusion_matrix calculates
    one confusion matrix for confusion between every two classes.

    Examples
    --------

    Multilabel-indicator case:

    >>> import numpy as np
    >>> from sklearn.metrics import multilabel_confusion_matrix
    >>> y_true = np.array([[1, 0, 1],
    ...                    [0, 1, 0]])
    >>> y_pred = np.array([[1, 0, 0],
    ...                    [0, 1, 1]])
    >>> multilabel_confusion_matrix(y_true, y_pred)
    array([[[1, 0],
            [0, 1]],
    <BLANKLINE>
           [[1, 0],
            [0, 1]],
    <BLANKLINE>
           [[0, 1],
            [1, 0]]])

    Multiclass case:

    >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
    >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
    >>> multilabel_confusion_matrix(y_true, y_pred,
    ...                             labels=["ant", "bird", "cat"])
    array([[[3, 1],
            [0, 2]],
    <BLANKLINE>
           [[5, 0],
            [1, 0]],
    <BLANKLINE>
           [[2, 1],
            [1, 2]]])

    """
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    check_consistent_length(y_true, y_pred, sample_weight)

    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("%s is not supported" % y_type)

    present_labels = unique_labels(y_true, y_pred)
    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        # Remember how many labels the caller asked for, then append the
        # labels that occur in the data but were not requested so that all
        # data values can be encoded; only the first n_labels are reported.
        n_labels = len(labels)
        labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
                                                 assume_unique=True)])

    if y_true.ndim == 1:
        if samplewise:
            raise ValueError("Samplewise metrics are not available outside of "
                             "multilabel classification.")

        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(tp_bins, weights=tp_bins_weights,
                                 minlength=len(labels))
        else:
            # Pathological case
            # No correct prediction at all: start every count at zero; the
            # two bincounts below overwrite pred_sum/true_sum when there is
            # any data.
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = np.bincount(y_pred, weights=sample_weight,
                                   minlength=len(labels))
        if len(y_true):
            true_sum = np.bincount(y_true, weights=sample_weight,
                                   minlength=len(labels))

        # Retain only selected labels
        # (labels[:None] is the full array when no labels were requested)
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]

    else:
        # Count along the label axis for per-sample matrices, along the
        # sample axis for per-class matrices.
        sum_axis = 1 if samplewise else 0

        # All labels are index integers for multilabel.
        # Select labels:
        if not np.array_equal(labels, present_labels):
            if np.max(labels) > np.max(present_labels):
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d > %d' %
                                 (np.max(labels), np.max(present_labels)))
            if np.min(labels) < 0:
                raise ValueError('All labels must be in [0, n labels) for '
                                 'multilabel targets. '
                                 'Got %d < 0' % np.min(labels))

        if n_labels is not None:
            y_true = y_true[:, labels[:n_labels]]
            y_pred = y_pred[:, labels[:n_labels]]

        # calculate weighted counts
        # NOTE(review): ``.multiply`` is a sparse-matrix method; presumably
        # _check_targets returns sparse matrices for multilabel input -
        # confirm.
        true_and_pred = y_true.multiply(y_pred)
        tp_sum = count_nonzero(true_and_pred, axis=sum_axis,
                               sample_weight=sample_weight)
        pred_sum = count_nonzero(y_pred, axis=sum_axis,
                                 sample_weight=sample_weight)
        true_sum = count_nonzero(y_true, axis=sum_axis,
                                 sample_weight=sample_weight)

    fp = pred_sum - tp_sum
    fn = true_sum - tp_sum
    tp = tp_sum

    # True negatives are whatever remains of the total (weighted) count once
    # tp, fp and fn are removed; the total depends on the mode.
    if sample_weight is not None and samplewise:
        sample_weight = np.array(sample_weight)
        tp = np.array(tp)
        fp = np.array(fp)
        fn = np.array(fn)
        tn = sample_weight * y_true.shape[1] - tp - fp - fn
    elif sample_weight is not None:
        tn = sum(sample_weight) - tp - fp - fn
    elif samplewise:
        tn = y_true.shape[1] - tp - fp - fn
    else:
        tn = y_true.shape[0] - tp - fp - fn

    # Stack as [tn, fp, fn, tp] per output and fold into 2x2 matrices.
    return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)
def evaluation(X_train, y_train, X_test, y_test, poly_degree, interaction_only,
               print_coef, plot, ask_user, model_result, model_name,
               model_runtime, regr, alpha):
    """Report and record train/test metrics for an already fitted regressor.

    The configuration, data shapes, (optionally) the model coefficients, the
    mean squared error, ``regr.score`` and the timings are printed to stdout
    and appended to ``logs/log_<log_timestr>.txt``. A summary tuple is stored
    in ``model_result``.

    Parameters
    ----------
    X_train, X_test : array-like with a two-element ``shape``
        Feature matrices passed to ``regr.predict`` / ``regr.score``.
    y_train, y_test : array-like
        Targets; flattened with ``column_or_1d`` before computing the MSE.
    poly_degree, interaction_only
        Feature-expansion settings; only reported here, and ``poly_degree``
        is also part of the result key.
    print_coef : bool
        When truthy, print and log ``regr.coef_`` and/or ``regr.coefs_``.
    plot
        When equal to ``True``, call ``plot_y_test`` on the test set.
    ask_user
        Forwarded unchanged to ``plot_y_test``.
    model_result : dict
        Updated in place and returned.
    model_name, alpha
        Reported, and used as part of the result key.
    model_runtime : float
        Training time in seconds, measured by the caller.
    regr
        Fitted estimator exposing ``predict`` and ``score``.

    Returns
    -------
    model_result : dict
        The same dict, with the entry
        ``(model_name, alpha, n_features, poly_degree)`` set to
        ``(mse_train, score_train, mse_test, score_test, model_runtime,
        predict_train_runtime, predict_test_runtime)``, all rounded to three
        decimals.
    """
    log_path = "logs/log_" + log_timestr + ".txt"

    print("poly_degree = {}, interaction_only = {}".format(
        poly_degree, interaction_only))
    with open(log_path, "a") as logfile:
        logfile.write("====================\n")
        logfile.write("poly_degree = {}, interaction_only = {}\n".format(
            poly_degree, interaction_only))

    print("Model: {} \n".format(model_name))
    print("Alpha (Regularization strength): {} \n".format(alpha))
    print("X_train.shape = {}".format(X_train.shape))
    print("y_train.shape = {}".format(y_train.shape))
    print("X_test.shape = {}".format(X_test.shape))
    print("y_test.shape = {}".format(y_test.shape))

    if print_coef:
        # ``coef_`` is the usual attribute; ``coefs_`` is what
        # neural_network.MLPRegressor exposes instead.
        for attr_name in ('coef_', 'coefs_'):
            if hasattr(regr, attr_name):
                coefficients = getattr(regr, attr_name)
                # The value must go through .format(); passing it as a second
                # print() argument left a literal "{}" in the console output.
                print("Coefficients: {}\n".format(coefficients))
                with open(log_path, "a") as logfile:
                    logfile.write("Coefficients: {}\n".format(coefficients))

    print("For training set:")
    predict_train_start = timeit.default_timer()
    predict_train = regr.predict(X_train)
    model_runtime_predict_train = timeit.default_timer() - predict_train_start
    # column_or_1d keeps the subtraction element-wise when y is a column vector.
    mse_train = float(np.mean((predict_train - column_or_1d(y_train)) ** 2))
    score_train = regr.score(X_train, y_train)
    print("Mean squared error (train): {0:.3f} \n".format(mse_train))
    print("Variance score (train): {0:.3f} \n".format(score_train))
    print("model_runtime (training) = {0:.3f} (seconds) \n".format(
        model_runtime))
    print("model_runtime (predict train set) = {0:.3f} (seconds) \n".format(
        model_runtime_predict_train))

    print("For test set:")
    predict_test_start = timeit.default_timer()
    predict_test = regr.predict(X_test)
    model_runtime_predict_test = timeit.default_timer() - predict_test_start
    mse_test = float(np.mean((predict_test - column_or_1d(y_test)) ** 2))
    score_test = regr.score(X_test, y_test)
    print("Mean squared error (test): {0:.3f} \n".format(mse_test))
    print("Variance score (test): {0:.3f} \n".format(score_test))
    print("model_runtime (predict test set) = {0:.3f} (seconds) \n".format(
        model_runtime_predict_test))

    with open(log_path, "a") as logfile:
        logfile.write("====================\n")
        logfile.write("Features polynomial degree: {} \n".format(poly_degree))
        logfile.write("Model: {} \n".format(model_name))
        logfile.write("Alpha (Regularization strength): {} \n".format(alpha))
        logfile.write("X_train.shape = {} \n".format(X_train.shape))
        logfile.write("y_train.shape = {} \n".format(y_train.shape))
        logfile.write("X_test.shape = {} \n".format(X_test.shape))
        logfile.write("y_test.shape = {} \n".format(y_test.shape))
        logfile.write("For training set: \n")
        logfile.write(
            "Mean squared error (train): {0:.3f} \n".format(mse_train))
        logfile.write("Variance score (train): {0:.3f} \n".format(score_train))
        logfile.write("For test set: \n")
        logfile.write("Mean squared error (test): {0:.3f} \n".format(mse_test))
        logfile.write("Variance score (test): {0:.3f} \n".format(score_test))
        logfile.write("model_runtime (training) = {0:.3f} (seconds) \n".format(
            model_runtime))
        logfile.write(
            "model_runtime (predict train set) = {0:.3f} (seconds) \n".format(
                model_runtime_predict_train))
        logfile.write(
            "model_runtime (predict test set) = {0:.3f} (seconds) \n".format(
                model_runtime_predict_test))
        logfile.write("====================\n")

    # Collect the summary under a key that identifies this configuration.
    _, n_features = X_train.shape
    result_key = (model_name, alpha, int(n_features), poly_degree)
    model_result[result_key] = (round(mse_train, 3),
                                round(score_train, 3),
                                round(mse_test, 3),
                                round(score_test, 3),
                                round(model_runtime, 3),
                                round(model_runtime_predict_train, 3),
                                round(model_runtime_predict_test, 3))

    # Kept as an equality test so that only True (or 1) enables plotting,
    # exactly as before.
    if plot == True:
        plot_y_test(regr, X_test, y_test, ask_user)

    return model_result
clf = KNN()  # initialise the detector
clf.fit(X_train)  # fit the detector on X_train

# Outlier labels and outlier scores on the training data X_train
y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher = more anomalous)
print("On train Data:")
evaluate_print(clf_name, y_train, y_train_scores)

# Use the fitted detector to find outliers in unseen data
y_test_pred = clf.predict(X_test)  # binary labels on the unseen data (0: inlier, 1: outlier)
y_test_scores = clf.decision_function(X_test)  # outlier scores on the unseen data (higher = more anomalous)
print("On Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

y_true = column_or_1d(y_test)
y_pred = column_or_1d(y_test_scores)
check_consistent_length(y_true, y_pred)

# No trailing comma after the first assignment: it used to turn ``roc`` into a
# 1-tuple, so knn_roc collected tuples while knn_prn collected scalars.
roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
knn_roc.append(roc)
knn_prn.append(prn)

clf_name = 'LOF'
clf = LOF()  # initialise the detector
clf.fit(X_train)  # fit the detector on X_train

# Outlier labels and outlier scores on the training data X_train
y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
def _validate_input(self, X, y, incremental):
    """Validate ``X`` and ``y`` and flatten a single-column ``y``.

    ``check_X_y`` is called with CSR/CSC/COO sparse input accepted,
    multi-output targets allowed and a numeric ``y`` required. If the
    validated ``y`` is 2-D with exactly one column it is converted to 1-D
    (``column_or_1d`` with ``warn=True``). ``incremental`` is accepted but
    not used here.

    Returns
    -------
    (X, y) : the validated arrays.
    """
    sparse_formats = ['csr', 'csc', 'coo']
    X, y = check_X_y(X, y, accept_sparse=sparse_formats,
                     multi_output=True, y_numeric=True)

    is_single_column = y.ndim == 2 and y.shape[1] == 1
    if not is_single_column:
        return X, y
    return X, column_or_1d(y, warn=True)
def label_validate(self, y):
    """Return ``y`` flattened to 1-D (``warn=True``) and cast to float64."""
    flat_labels = column_or_1d(y, warn=True)
    as_float = flat_labels.astype(np.float64)
    return as_float
def transform(self, X):
    """Apply the regex substitution to every row of ``X``.

    ``X`` is flattened to 1-D, each element is passed through
    ``engine.sub(self.replacement, element)`` where ``engine`` is built from
    ``self.pattern``, and the result is handed to ``_col2d``.
    """
    X = column_or_1d(X, warn=True)
    engine = _regex_engine(self.pattern)

    def _substitute(value):
        # Replace the pattern matches in one value.
        return engine.sub(self.replacement, value)

    replaced = eval_rows(X, _substitute)
    return _col2d(replaced)
def fit(self, X, y, sample_weight=None, check_input=True):
    """Fit the classifier, grid-searching over ``taus`` and ``lamdas``.

    The labels are binarized to -1/+1, an ``L1L2TwoStepClassifier`` is
    tuned with ``GridSearchCV`` over ``self.taus`` x ``self.lamdas``, and the
    parameters of the best estimator are copied onto this object.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples]
        Target labels

    sample_weight : float or array-like of shape [n_samples]
        Sample weight, forwarded to the inner estimator's ``fit``

    check_input : bool
        Forwarded to the inner estimator's ``fit``

    Returns
    -------
    self : returns self.

    Raises
    ------
    ValueError
        If the binarizer reports a multilabel target type.
    """
    binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    self._label_binarizer = binarizer
    y = binarizer.fit_transform(y)
    if binarizer.y_type_.startswith('multilabel'):
        raise ValueError("%s doesn't support multi-label classification" %
                         (self.__class__.__name__))
    y = column_or_1d(y, warn=False)

    # Every constructor argument below is read from the attribute of the
    # same name on this object.
    estimator_param_names = (
        'mu', 'fit_intercept', 'use_gpu', 'threshold', 'normalize',
        'precompute', 'max_iter', 'copy_X', 'tol', 'warm_start', 'positive',
        'random_state', 'selection',
    )
    search_param_names = (
        'cv', 'scoring', 'n_jobs', 'iid', 'refit', 'verbose', 'pre_dispatch',
        'error_score', 'return_train_score',
    )
    estimator_kwargs = {name: getattr(self, name)
                        for name in estimator_param_names}
    search_kwargs = {name: getattr(self, name)
                     for name in search_param_names}

    base_estimator = L1L2TwoStepClassifier(**estimator_kwargs)
    search = GridSearchCV(
        estimator=base_estimator,
        param_grid={'tau': self.taus, 'lamda': self.lamdas},
        fit_params={'sample_weight': sample_weight,
                    'check_input': check_input},
        **search_kwargs)
    search.fit(X, y)

    best = search.best_estimator_
    self.tau_ = best.tau
    self.lamda_ = best.lamda
    self.coef_ = best.coef_
    self.intercept_ = best.intercept_
    self.best_estimator_ = best  # XXX DEBUG

    # One coefficient row per class, or a single row for up to two classes.
    n_classes = self.classes_.shape[0]
    n_rows = n_classes if n_classes > 2 else 1
    self.coef_ = self.coef_.reshape(n_rows, -1)
    return self
def fit(self, y):
    """Store the distinct values of ``y`` in ``classes_``.

    ``y`` is flattened to 1-D (``warn=True``) and ``pandas.Series.unique``
    is used to collect the distinct values. Returns ``self``.
    """
    flat_labels = column_or_1d(y, warn=True)
    label_series = pd.Series(flat_labels)
    self.classes_ = label_series.unique()
    return self
def fit(self, X, y, weights=None):
    """Scikit-learn required: Computes the feature importance scores from the training data.

    A deep copy of ``self.relief_object`` is fitted on each feature subset
    produced by ``self.make_subsets``; the per-subset scores are then merged
    by keeping, for every feature, the largest score found (or the score
    with the largest magnitude when ``self.rank_absolute`` is set).

    Parameters
    ----------
    X: array-like {n_samples, n_features}
        Training instances to compute the feature importance scores from
    y: array-like {n_samples}
        Training labels
    weights: numpy.ndarray, optional
        Per-feature weights. Only used when it is a ``numpy.ndarray``; the
        entries of each subset are forwarded to the inner ``fit``.

    Returns
    -------
    self
        With ``feature_importances_`` and ``top_features_`` set. Note that
        ``size_feature_subset`` is clamped to the number of features.
    """
    X = check_array(X, force_all_finite=False)
    y = column_or_1d(y)

    # Seed both global RNGs when a seed was supplied.
    if self.random_state is not None:
        np.random.seed(self.random_state)
        random.seed(self.random_state)

    # Make subsets with all the features
    num_features = X.shape[1]
    self.size_feature_subset = min(self.size_feature_subset, num_features)
    subsets = self.make_subsets(list(range(num_features)),
                                self.num_feature_subset,
                                self.size_feature_subset)

    use_weights = isinstance(weights, np.ndarray)
    # Features outside a subset get a neutral placeholder: 0 when ranking by
    # magnitude, -inf otherwise so that they never win the max below.
    # (-np.inf instead of np.NINF, which was removed in NumPy 2.0.)
    placeholder = 0.0 if self.rank_absolute else -np.inf

    # Fit each subset
    scores = []
    for subset in subsets:
        subset_X = self.custom_transform(X, subset)
        relief = copy.deepcopy(self.relief_object)
        if use_weights:
            relief.fit(subset_X, y, weights=weights[subset])
        else:
            relief.fit(subset_X, y)

        raw_score = relief.feature_importances_
        score = np.full(num_features, placeholder)
        # Scatter the subset-local scores back to their original columns.
        for position, feature_index in enumerate(subset):
            score[feature_index] = raw_score[position]
        scores.append(score)
    scores = np.array(scores)

    # Merge results by selecting largest found weight for each feature
    merged_scores = []
    for feature_scores in scores.T:
        if self.rank_absolute:
            largest = np.max(np.absolute(feature_scores))
            # Restore the sign when the largest magnitude came from a
            # negative score.
            if largest in feature_scores:
                merged_scores.append(largest)
            else:
                merged_scores.append(-largest)
        else:
            merged_scores.append(np.max(feature_scores))

    # Save FI as feature_importances_
    self.feature_importances_ = np.array(merged_scores)

    if self.rank_absolute:
        ranking_key = np.absolute(self.feature_importances_)
    else:
        ranking_key = self.feature_importances_
    self.top_features_ = np.argsort(ranking_key)[::-1]

    return self
def fit(self, X, y, sample_weight=None, monitor=None):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Data matrix

    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    sample_weight : array-like, shape = (n_samples,), optional
        Weights given to each sample. If omitted, all samples have weight 1.

    monitor : callable, optional
        The monitor is called after each iteration with the current
        iteration, a reference to the estimator and the local variables of
        ``_fit_stages`` as keyword arguments ``callable(i, self,
        locals())``. If the callable returns ``True`` the fitting procedure
        is stopped. The monitor can be used for various things such as
        computing held-out estimates, early stopping, model introspect, and
        snapshoting.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    X, event, time = check_arrays_survival(
        X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
    n_samples, self.n_features_ = X.shape

    X = X.astype(DTYPE)
    if sample_weight is None:
        sample_weight = numpy.ones(n_samples, dtype=numpy.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)

    check_consistent_length(X, sample_weight)

    self._check_params()

    self.loss_ = LOSS_FUNCTIONS[self.loss](1)
    # These two losses are fitted on log-transformed times.
    if isinstance(self.loss_, (CensoredSquaredLoss, IPCWLeastSquaresError)):
        time = numpy.log(time)

    self._init_state()
    self.init_.fit(X, (event, time), sample_weight)
    y_pred = self.init_.predict(X)
    begin_at_stage = 0

    # fit the boosting stages
    # Builtin ``bool`` is used for the event field: the ``numpy.bool`` alias
    # was removed in NumPy 1.24 and raises AttributeError there.
    y = numpy.fromiter(zip(event, time),
                       dtype=[('event', bool), ('time', numpy.float64)])
    n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
                                begin_at_stage, monitor)
    # change shape of arrays after fit (early-stopping or additional tests)
    if n_stages != self.estimators_.shape[0]:
        self.estimators_ = self.estimators_[:n_stages]
        self.train_score_ = self.train_score_[:n_stages]
        if hasattr(self, 'oob_improvement_'):
            self.oob_improvement_ = self.oob_improvement_[:n_stages]

    return self
def __call__(self, est, X, y_true):
    """Score ``est`` on ``X`` using the second column of ``predict_proba``.

    Parameters
    ----------
    est : estimator exposing ``predict_proba``
    X : data passed to ``est.predict_proba``
    y_true : array-like
        Ground truth; must be 1-D or a single column.

    Returns
    -------
    The value of ``self.score_fct(y_true, y_pred)``.
    """
    # Keep the flattened result: the call used to be made for validation
    # only, so a column-vector y_true reached score_fct with shape (n, 1).
    y_true = column_or_1d(y_true)
    y_pred = est.predict_proba(X)[:, 1]
    return self.score_fct(y_true, y_pred)
def fit(self, X, y=None):
    """Store the sorted unique non-null values of ``X`` in ``classes_``.

    ``X`` is flattened to 1-D (``warn=True``); entries for which
    ``pandas.isnull`` is true are dropped before ``numpy.unique`` is
    applied. ``y`` is ignored. Returns ``self``.
    """
    values = column_or_1d(X, warn=True)
    is_present = ~pandas.isnull(values)
    self.classes_ = numpy.unique(values[is_present])
    return self
def transform(self, y):
    """Map each value of ``y`` to its position in ``classes_``.

    Null values (per ``pandas.isnull``) are mapped to
    ``self.missing_value``. A non-null value that is absent from
    ``classes_`` makes ``list.index`` raise ``ValueError``.
    """
    y = column_or_1d(y, warn = True)
    known_classes = list(self.classes_)

    def _encode(value):
        # Position of one value, or the missing-value marker for nulls.
        if pandas.isnull(value):
            return self.missing_value
        return known_classes.index(value)

    encoded = [_encode(value) for value in y]
    return numpy.array(encoded)
def transform(self, X):
    """Flag, for every row of ``X``, whether the regex pattern matches.

    ``X`` is flattened to 1-D, each element is tested with
    ``engine.search`` (``engine`` is built from ``self.pattern``) and the
    boolean results are handed to ``_col2d``.
    """
    X = column_or_1d(X, warn=True)
    engine = _regex_engine(self.pattern)

    def _matches(value):
        # True when the pattern is found anywhere in the value.
        return bool(engine.search(value))

    flags = eval_rows(X, _matches)
    return _col2d(flags)