def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.

    Printed metrics are ROC AUC, precision @ rank n and average precision,
    each rounded to 4 decimals.

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.
    """
    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    # The ``ap`` field has to reference the ``ap`` keyword; it used to
    # repeat ``{prn}``, so precision @ rank n was printed twice and the
    # average precision never appeared.
    print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, ap:{ap}'.format(
        clf_name=clf_name,
        roc=np.round(roc_auc_score(y, y_pred), decimals=4),
        prn=np.round(precision_n_scores(y, y_pred), decimals=4),
        ap=np.round(average_precision_score(y, y_pred), decimals=4)))
def _check_targets(y_true, y_pred):
    """Check that ``y_true`` and ``y_pred`` belong to the same task type.

    Returns
    -------
    y_type : str
        One of "binary", "multiclass" or "multilabel-indicator".
    y_true, y_pred
        Flattened with ``column_or_1d`` for binary/multiclass targets,
        converted with ``csr_matrix`` for multilabel targets.

    Raises
    ------
    ValueError
        If the two target types cannot be reconciled, or the resulting type
        is not one of the three supported ones.
    """
    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    kinds = {type_true, type_pred}
    # A binary / multiclass mix is handled as multiclass.
    if kinds == {"binary", "multiclass"}:
        kinds = {"multiclass"}

    if len(kinds) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # Exactly one element is left at this point.
    (y_type,) = kinds

    # No metrics support "multiclass-multioutput" format
    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("{0} is not supported".format(y_type))

    if y_type.startswith('multilabel'):
        return 'multilabel-indicator', csr_matrix(y_true), csr_matrix(y_pred)

    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    # "binary" inputs whose union holds more than two labels are multiclass.
    if y_type == "binary" and len(np.union1d(y_true, y_pred)) > 2:
        y_type = "multiclass"

    return y_type, y_true, y_pred
def fit(self, X, y):
    """Validate the inputs, split off the sensitive columns and solve.

    The sensitive columns are taken from ``self.sensitive_cols``; for a
    DataFrame ``X`` they are first translated to positional indices.
    Labels are label-encoded and stored in ``self.classes_``.

    Raises
    ------
    ValueError
        If ``self.penalty`` is neither 'l1' nor 'none', or if the data
        contains more than two classes.

    Returns
    -------
    self
    """
    if self.penalty not in ["l1", "none"]:
        raise ValueError(
            f"penalty should be either 'l1' or 'none', got {self.penalty}")

    if isinstance(X, pd.DataFrame):
        # Column names -> positional indices, usable on the validated array.
        self.sensitive_col_idx_ = [
            position for position, column in enumerate(X.columns)
            if column in self.sensitive_cols
        ]
    else:
        self.sensitive_col_idx_ = self.sensitive_cols

    X, y = check_X_y(X, y, accept_large_sparse=False)
    sensitive = X[:, self.sensitive_col_idx_]
    if self.train_sensitive_cols:
        features = X
    else:
        features = np.delete(X, self.sensitive_col_idx_, axis=1)
    features = self._add_intercept(features)

    # Called for its validation only; the return value is not used.
    column_or_1d(y)
    encoder = LabelEncoder().fit(y)
    y = encoder.transform(y)
    self.classes_ = encoder.classes_

    n_classes = len(self.classes_)
    if n_classes > 2:
        raise ValueError(
            f"This solver needs samples of exactly 2 classes"
            f" in the data, but the data contains {n_classes}: {self.classes_}"
        )

    self._solve(sensitive, features, y)
    return self
def check_metrics_arguments(y_true, y_pred, sample_weight, two_class=True, binary_pred=True):
    """
    Validate the arguments passed to a metric.

    :param y_true: labels of classes
    :param y_pred: predictions
    :param sample_weight: weights of samples
    :param two_class: if True, assert that y_true contains only zeros and ones
    :param binary_pred: if True, assert that y_pred contains only zeros and ones
    :return: tuple (y_true, y_pred, sample_weight) after validation
    """
    # Weights are validated against the raw y_true, before it is flattened.
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)

    n_true = len(y_true)
    n_pred = len(y_pred)
    assert n_true == n_pred, "The lengths of y_true and y_pred are different: %i and %i" % (
        n_true,
        n_pred,
    )

    allowed = [0, 1]
    if two_class:
        true_is_binary = numpy.in1d(y_true, allowed).all()
        assert true_is_binary, (
            "The y_true array should contain only two labels: 0 and 1, "
            "it contains:" + str(numpy.unique(y_true))
        )
    if binary_pred:
        pred_is_binary = numpy.in1d(y_pred, allowed).all()
        assert pred_is_binary, (
            "The y_pred array should contain only two labels: 0 and 1, "
            "it contains:" + str(numpy.unique(y_pred))
        )
    return y_true, y_pred, sample_weight
def check_inputs(X, y, sample_weight=None, ensure_2d=True):
    """Input validation for debiasing algorithms.

    Optionally checks that X is 2D, flattens y and sample_weight, and makes
    sure all three have a consistent length.

    Args:
        X (array-like): Input data.
        y (array-like, shape = (n_samples,)): Target values.
        sample_weight (array-like, optional): Sample weights.
        ensure_2d (bool, optional): Whether to raise a ValueError if X is not
            2D.

    Returns:
        tuple:

            * **X** (`array-like`) -- Validated X. Unchanged.
            * **y** (`array-like`) -- Validated y. Converted to 1D unless it
              is a :class:`pandas.Series`, which is returned as is.
            * **sample_weight** (`array-like`) -- Validated sample_weight. If
              none was provided, an array of ones with ``X.shape[0]``
              entries.
    """
    if ensure_2d and X.ndim != 2:
        raise ValueError("Expected X to be 2D, got ndim == {} instead.".format(
                X.ndim))

    # don't cast Series -> ndarray
    y_is_series = isinstance(y, pd.Series)
    if not y_is_series:
        y = column_or_1d(y)

    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])
    else:
        sample_weight = column_or_1d(sample_weight)

    check_consistent_length(X, y, sample_weight)
    return X, y, sample_weight
def fit(self, x, y):
    """Fit on a 1-D input ``x`` and a binary target ``y``.

    Asserts that ``type_of_target(y)`` is "binary", flattens both inputs and
    then runs ``self.fit_df`` followed by ``self.fit_beta``.

    Returns
    -------
    self
    """
    target_kind = type_of_target(y)
    assert target_kind == "binary"

    x_flat = column_or_1d(x)
    y_flat = column_or_1d(y)

    self.fit_df(x_flat, y_flat)
    self.fit_beta()
    return self
def check_metrics_arguments(y_true, y_pred, sample_weight, two_class=True, binary_pred=True):
    """
    Sanity-check the inputs of a metric and return them in validated form.

    :param y_true: labels of classes
    :param y_pred: predictions
    :param sample_weight: weights of samples
    :param two_class: if True, y_true must contain only zeros and ones
    :param binary_pred: if True, y_pred must contain only zeros and ones
    :return: the validated (y_true, y_pred, sample_weight) tuple
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred = column_or_1d(y_true), column_or_1d(y_pred)

    sizes = (len(y_true), len(y_pred))
    assert sizes[0] == sizes[1], \
        'The lengths of y_true and y_pred are different: %i and %i' % sizes

    if two_class:
        labels_ok = numpy.in1d(y_true, [0, 1])
        assert labels_ok.all(), \
            'The y_true array should contain only two labels: 0 and 1, ' \
            'it contains:' + str(numpy.unique(y_true))

    if binary_pred:
        predictions_ok = numpy.in1d(y_pred, [0, 1])
        assert predictions_ok.all(), \
            'The y_pred array should contain only two labels: 0 and 1, ' \
            'it contains:' + str(numpy.unique(y_pred))

    return y_true, y_pred, sample_weight
def fit(self, X: XSeries, y: XSeries) -> None:
    """Fit one mean encoder per training split of ``self.fold``.

    Args:
        X : Values to encode; a ``cudf.Series`` (when cudf is available) or
            a ``np.ndarray``.
        y : Target values, indexed with the same split indices as ``X``.

    Raises:
        RuntimeError: If ``X`` is neither of the supported container types.
    """
    # TODO(smly): warn to use fit_transform instead of fit().
    # transform() is recommended for encoding test set.

    on_gpu = cudf_is_available() and isinstance(X, cudf.Series)
    if not on_gpu:
        if not isinstance(X, np.ndarray):
            raise RuntimeError
        # Only the NumPy path flattens its inputs.
        X = column_or_1d(X, warn=True)
        y = column_or_1d(y, warn=True)

    self.mean_encoders_ = []

    # Fit and append mean_encoders
    for trn_idx, tst_idx in self.fold.split(X):
        # The held-out slices are evaluated and discarded, as before.
        X_trn, _ = X[trn_idx], X[tst_idx]
        y_trn, _ = y[trn_idx], y[tst_idx]

        if cudf_is_available() and isinstance(X, cudf.Series):
            encoder = _CuPy_MeanEncoder()
        elif isinstance(X, np.ndarray):
            encoder = _MeanEncoder()
        else:
            raise RuntimeError

        encoder.fit(X_trn, y_trn)
        self.mean_encoders_.append(encoder)
def regression_mean_width_score(y_pred_low: ArrayLike, y_pred_up: ArrayLike) -> float:
    """
    Mean absolute width of a set of prediction intervals.

    Parameters
    ----------
    y_pred_low : ArrayLike of shape (n_samples,)
        Lower bound of prediction intervals.
    y_pred_up : ArrayLike of shape (n_samples,)
        Upper bound of prediction intervals.

    Returns
    -------
    float
        Mean of ``|y_pred_up - y_pred_low|`` over all samples.

    Examples
    --------
    >>> from mapie.metrics import regression_mean_width_score
    >>> import numpy as np
    >>> y_pred_low = np.array([4, 6, 9, 8.5, 10.5])
    >>> y_pred_up = np.array([6, 9, 10, 12.5, 12])
    >>> print(regression_mean_width_score(y_pred_low, y_pred_up))
    2.3
    """
    lower = cast(NDArray, column_or_1d(y_pred_low))
    upper = cast(NDArray, column_or_1d(y_pred_up))
    widths = np.abs(upper - lower)
    return float(widths.mean())
def _check_params(self, n_features):
    """Validate the hyper-parameters and build the derived arrays.

    Parameters
    ----------
    n_features : int
        Expected length of ``self.penalty_factor`` when it is given.

    Returns
    -------
    create_path : bool
        True when ``self.alphas`` is None.
    alphas : ndarray of float64
        The user-supplied alphas, or an uninitialised buffer of
        ``n_alphas`` entries when ``create_path`` is True.
    penalty_factor : ndarray of float64
        All ones when ``self.penalty_factor`` is None, otherwise the given
        factors rescaled so that they sum to ``n_features``.

    Raises
    ------
    ValueError
        For an out-of-range ``l1_ratio``, ``tol``, ``n_alphas`` or
        ``max_iter``, or a ``penalty_factor`` of the wrong length.
    """
    if not 0 < self.l1_ratio <= 1:
        raise ValueError("l1_ratio must be in interval ]0;1], but was %f" % self.l1_ratio)

    if self.tol <= 0:
        raise ValueError("tolerance must be positive, but was %f" % self.tol)

    if self.penalty_factor is not None:
        pf = column_or_1d(self.penalty_factor, warn=True)
        n_given = pf.shape[0]
        if n_given != n_features:
            raise ValueError("penalty_factor must be array of length n_features (%d), "
                             "but got %d" % (n_features, n_given))
        assert_all_finite(pf)
        check_non_negative(pf, "penalty_factor")
        penalty_factor = pf * n_features / pf.sum()
        # Catches the non-finite result of an all-zero penalty_factor.
        assert_all_finite(penalty_factor)
    else:
        penalty_factor = numpy.ones(n_features, dtype=numpy.float64)

    create_path = self.alphas is None
    if not create_path:
        alphas = column_or_1d(self.alphas, warn=True)
        assert_all_finite(alphas)
        check_non_negative(alphas, "alphas")
        assert_all_finite(alphas)
    else:
        if self.n_alphas <= 0:
            raise ValueError("n_alphas must be a positive integer")
        # Uninitialised buffer: deliberately not checked for finiteness.
        alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)

    if self.max_iter <= 0:
        raise ValueError("max_iter must be a positive integer")

    return (
        create_path,
        alphas.astype(numpy.float64),
        penalty_factor.astype(numpy.float64),
    )
def fit(self, X, y):
    """Fit a factorization machine regressor.

    X and y are validated, wrapped in a dataset by ``to_tf_dataset`` and
    handed to ``train`` together with a freshly created Adam optimizer.

    :param X: {array-like} of shape (n_samples, n_features)
        Training data.
    :param y: array-like of shape (n_samples,)
        Target values. ``column_or_1d`` is applied to y, so a 1-D vector or
        a single column is expected.
    :return: an instance of self.
    """
    X, y = utils.check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    # Validation only; the flattened copy is not used.
    column_or_1d(y)

    dataset = to_tf_dataset(X, y, batch_size=self.batch_size)

    # The optimizer is rebuilt on every call to fit.
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.eta)

    training_options = dict(
        num_factors=self.n_factors,
        max_iter=self.max_iter,
        optimizer=self.optimizer,
        loss=self.loss,
        C=self.C,
        penalty=self.penalty_function,
        random_state=self.random_state,
    )
    self.w0_, self.W_, self.V_ = train(dataset, **training_options)
    return self
def rf_test(X_train_raw, y_train_raw, X_test_raw, y_test_raw):
    """Select features with a random forest, retrain on them and return the
    accuracy on the test set.

    Accuracy is the trace of the confusion matrix divided by the number of
    test predictions.
    """
    y_train = column_or_1d(y_train_raw, warn=False)

    # Feature selection driven by a default random forest.
    selector = SelectFromModel(RandomForestClassifier())
    selector.fit(X_train_raw, y_train)

    X_train_sel = selector.transform(X_train_raw)
    X_test_sel = selector.transform(X_test_raw)
    y_test = column_or_1d(y_test_raw, warn=False)

    # Final model, trained on the selected features only.
    model = RandomForestClassifier(n_estimators=100, max_depth=32, max_features=2)
    model.fit(X_train_sel, y_train)

    predictions = model.predict(X_test_sel)
    n_correct = np.trace(confusion_matrix(y_test, predictions))
    return n_correct / len(predictions)
def check_inputs(X, y, sample_weight, allow_none_weights=True, allow_multiple_targets=False):
    """Check that X, y and sample_weight have matching lengths.

    :param X: data; only ``len(X)`` is used (and ``X.shape`` in error messages)
    :param y: targets; flattened with ``column_or_1d`` unless
        ``allow_multiple_targets`` is True, in which case it is only
        converted to an array
    :param sample_weight: weights or None
    :param allow_none_weights: if True and sample_weight is None, None is
        returned in place of the weights; otherwise missing weights are
        replaced with ones
    :param allow_multiple_targets: see ``y``
    :return: tuple (X, y, sample_weight)
    :raises ValueError: if the lengths differ
    """
    y = numpy.array(y) if allow_multiple_targets else column_or_1d(y)

    if sample_weight is None:
        if allow_none_weights:
            # checking only X, y
            if len(X) != len(y):
                raise ValueError('Different size of X: {} and y: {}'.format(
                    X.shape, y.shape))
            return X, y, None
        sample_weight = numpy.ones(len(y), dtype=float)

    sample_weight = column_or_1d(sample_weight)
    n_nan = sum(numpy.isnan(sample_weight))
    assert n_nan == 0, "Weight contains nan, this format isn't supported"

    if not (len(X) == len(y) == len(sample_weight)):
        message = 'Different sizes of X: {}, y: {} and sample_weight: {}'
        raise ValueError(message.format(X.shape, y.shape, sample_weight.shape))

    return X, y, sample_weight
def fit(self, X: np.ndarray, y: np.ndarray) -> None:
    """Compute the mean of ``y`` for every distinct value of ``X``.

    Sets ``classes_`` (the distinct values plus one extra class
    ``max + 1``), ``class_means_`` (the per-class means plus
    ``self.default_unseen_`` for the extra class) and ``lut_``, a
    two-column table ``[class, mean]``.

    Args:
        X : Values to encode. Values whose dtype cannot be cast to int64
            are first label-encoded with ``pandas.Series.factorize``; the
            resulting uniques are kept in ``_label_encoding_uniques``.
        y : Target values.
    """
    X = column_or_1d(X, warn=True)
    y = column_or_1d(y, warn=True)

    # Label encoding if necessary
    needs_label_encoding = not np.can_cast(X.dtype, np.int64)
    if needs_label_encoding:
        X, uniques = pd.Series(X).factorize()
        self._label_encoding_uniques = uniques

    self.classes_ = np.unique(X)
    self.class_means_ = np.zeros_like(self.classes_, dtype="float64")
    for position, value in enumerate(self.classes_):
        self.class_means_[position] = np.mean(y[X == value])

    # Extra trailing class that carries the default for unseen values.
    extra_class = np.max(self.classes_) + 1
    self.classes_ = np.append(self.classes_, [extra_class])
    self.class_means_ = np.append(self.class_means_, [self.default_unseen_])

    class_column = self.classes_.reshape(-1, 1)
    mean_column = self.class_means_.reshape(-1, 1)
    self.lut_ = np.hstack([class_column, mean_column])
def fit(self, X, y):
    """Fit the classifier.

    Labels are binarized with a ``LabelBinarizer`` (kept in
    ``self.label_binarizer``); the fitted classes are exposed as
    ``self.classes_``. An Adam optimizer is created only when
    ``self.optimizer`` is not already set.

    :param X: training data, validated by ``check_X_y``
    :param y: target labels
    :return: self
    """
    X, y = utils.check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    # Validation only; the flattened copy is not used.
    column_or_1d(y)

    binarizer = LabelBinarizer().fit(y)
    self.label_binarizer = binarizer
    y = binarizer.transform(y)

    dataset = to_tf_dataset(X, y, batch_size=self.batch_size)
    self.classes_ = self.label_binarizer.classes_

    if not self.optimizer:
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.eta,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
        )

    training_options = dict(
        num_factors=self.num_factors,
        max_iter=self.max_iter,
        optimizer=self.optimizer,
        loss=self.loss,
        penalty=self.penalty_function,
        random_state=self.random_state,
    )
    self.w0_, self.W_, self.V_ = train(dataset, **training_options)
    return self
def _sigmoid_calibration(df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)

    Fits ``P = expit(-(a * df + b))`` by minimising a cross-entropy against
    prior-smoothed targets with BFGS.

    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.

    y : ndarray, shape (n_samples,)
        The targets. Values ``> 0`` are treated as positive, values ``<= 0``
        as negative.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. The
        weights enter the loss and its gradient, not the priors.

    Returns
    -------
    a : float
        The slope.

    b : float
        The intercept.

    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    F = column_or_1d(df)  # F follows Platt's notations
    y = column_or_1d(y)

    # Bayesian priors (see Platt end of section 2.2)
    negatives = y <= 0
    prior0 = float(np.sum(negatives))
    prior1 = y.shape[0] - prior0

    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[negatives] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def predict(AB):
        # From Platt (beginning of Section 2.2)
        return expit(-(AB[0] * F + AB[1]))

    def objective(AB):
        P = predict(AB)
        loss = -(xlogy(T, P) + xlogy(T1, 1. - P))
        if sample_weight is None:
            return loss.sum()
        return (sample_weight * loss).sum()

    def grad(AB):
        # gradient of the objective function
        residual = T - predict(AB)
        if sample_weight is not None:
            residual *= sample_weight
        return np.array([np.dot(residual, F), np.sum(residual)])

    start = np.array([0., log((prior0 + 1.) / (prior1 + 1.))])
    solution = fmin_bfgs(objective, start, fprime=grad, disp=False)
    return solution[0], solution[1]
def apply_restrictions(X, causes, metadata=None, restrictions=None):
    """Mask restricted causes based on demographics.

    Args:
        X (array-like): samples by causes matrix. A copy is made; the input
            is never modified.
        causes (list-like): the cause label of each column of ``X``.
        metadata (mapping, optional): per-sample demographics, looked up
            with ``.get``:

            * ``'age_'``: continuous age of each sample (default: all NaN)
            * ``'sex_'``: sex of each sample coded as 1=male, 2=female
              (default: all 0)
            * ``'region_'``: region of each sample (default: all NaN)

            With the defaults no age, sex or region restriction matches.
        restrictions (dict, optional): which causes to mask:

            * ``'min_age'``: list of ``(threshold, causes)``; masked for
              samples with age below the threshold
            * ``'max_age'``: list of ``(threshold, causes)``; masked for
              samples with age above the threshold
            * ``'males_only'``: causes masked for samples with sex == 2
            * ``'females_only'``: causes masked for samples with sex == 1
            * ``'regions'``: list of ``(region, causes)``; masked for
              samples whose region equals ``region``

    Returns:
        X_valid (np.array): A copy of X with restricted combinations set to
            NaN.
    """
    X = check_array(X, copy=True, force_all_finite=False)

    # Explicit None checks: ``metadata or dict()`` raises for containers
    # with ambiguous truthiness (e.g. a DataFrame).
    if restrictions is None:
        restrictions = {}
    if metadata is None:
        metadata = {}

    n_samples = X.shape[0]
    ages = metadata.get('age_', np.full(n_samples, np.nan))
    sexes = metadata.get('sex_', np.zeros(n_samples))
    regions = metadata.get('region_', np.full(n_samples, np.nan))

    check_consistent_length(X, ages, sexes, regions)
    ages = column_or_1d(ages)
    sexes = column_or_1d(sexes)
    regions = column_or_1d(regions)

    def censor(row_mask, labels):
        # Row mask (n_samples,) x column mask (n_causes,) -> cell mask.
        X[row_mask[:, None] & np.in1d(causes, labels)] = np.nan

    for threshold, labels in restrictions.get('min_age', []):
        censor(ages < threshold, labels)

    # Fixed: ``max_age`` is a restriction and was wrongly read from
    # ``metadata``, so it was never applied.
    for threshold, labels in restrictions.get('max_age', []):
        censor(ages > threshold, labels)

    censor(sexes == 2, restrictions.get('males_only', []))
    censor(sexes == 1, restrictions.get('females_only', []))

    for region, labels in restrictions.get('regions', []):
        censor(regions == region, labels)

    # NOTE(review): the original built ``pd.DataFrame(X, index, causes)``
    # for DataFrame inputs but discarded it. An ndarray is still returned,
    # so the return type callers see is unchanged.
    return X
def fit(self, X, y=None, log_base=False):
    """
    Fit the model by estimating time duration distribution between states.

    For every ordered pair of states the durations between consecutive rows
    are collected and summarised with ``np.histogram`` (``bins=self.bins``).

    Parameters
    ----------
    X : array-like, shape = (n_samples, 2)
        First column corresponds to states, second column to timestamps
    y : array-like, shape (n_samples,)
        Not used, only here for compatibility reasons
    log_base : bool, default=False
        If True, the histogram is computed on ``np.log`` of the durations.

    Returns
    -------
    self : object
        Returns an instance of self.

    Raises
    ------
    ValueError
        If ``X`` does not have exactly two columns.
    """
    if X.shape[1] != 2:
        # ``X.shape`` is a tuple; concatenating it to a str raised
        # TypeError and hid the intended ValueError.
        raise ValueError("Shape must be exactly (n,2) but is " + str(X.shape))

    x = column_or_1d(X[:, 0])
    timestamp = column_or_1d(X[:, 1])

    self.labels_ = np.unique(x)
    label_amount = len(self.labels_)
    self.inv_dict_labels_ = {label: i for i, label in enumerate(self.labels_)}

    self.sda_matrix_ = np.empty([label_amount, label_amount], dtype=object)
    self.timeduration_matrix_ = np.empty([label_amount, label_amount], dtype=object)
    for i in range(label_amount):
        for j in range(label_amount):
            self.timeduration_matrix_[i, j] = []

    # Durations of every transition between consecutive rows.
    for i in range(1, len(x)):
        previous_event_index = self.inv_dict_labels_[x[i - 1]]
        current_event_index = self.inv_dict_labels_[x[i]]
        self.timeduration_matrix_[previous_event_index, current_event_index].append(
            timestamp[i] - timestamp[i - 1])

    # Every cell holds a list (possibly empty), so each one gets a histogram.
    for i in range(label_amount):
        for j in range(label_amount):
            durations = self.timeduration_matrix_[i, j]
            if log_base:
                durations = np.log(durations)
            hist = np.histogram(durations, bins=self.bins)
            self.sda_matrix_[i, j] = HistogramData(hist[1], hist[0])
    return self
def evaluate(self, true_label, guess_label, hardCut=False):
    """
    Compute and log performance statistics of the model.

    Args:
        true_label: sequence of true labels of the test samples
        guess_label: sequence of predicted labels of the test samples
        hardCut: if a float, it is used as the decision cut; otherwise the
            cut is the "soft cut" taken from ``self.cumulation`` at the
            position of the maximal cumulative difference (and 0.5 is only
            reported in the log line)

    returns:
        (aucv, precision, recall, accuracy, fscore, ks, actual_cut)
    """
    def emit(*params):
        # Timestamp followed by the space-joined parameters.
        template = ' '.join(['%s'] * len(params))
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), template % params)

    true_label = column_or_1d(true_label)
    guess_label = column_or_1d(guess_label)

    cumulative_1, _, cumu_delta = self.cumulation(true_label, guess_label)
    ks = np.max(cumu_delta)
    softcut = cumulative_1[1][np.argmax(cumu_delta)]

    if not isinstance(hardCut, float):
        hardCut = 0.5
        actual_cut = softcut
    else:
        actual_cut = hardCut

    fpr, tpr, _ = roc_curve(true_label, guess_label)

    is_pos = true_label == 1
    is_neg = true_label == 0
    flagged = guess_label >= actual_cut
    cleared = guess_label < actual_cut

    # Confusion-matrix cells at the chosen cut.
    A = sum(logical_and(flagged, is_pos))
    B = sum(logical_and(flagged, is_neg))
    C = sum(logical_and(cleared, is_pos))
    D = sum(logical_and(cleared, is_neg))

    accuracy = 1.0 * (A + D) / (A + B + C + D)
    precision = 1.0 * A / (A + B)
    acc_pos = 1.0 * A / (A + C)
    acc_neg = 1.0 * D / (B + D)
    recall = acc_pos
    gmean = sqrt(acc_pos * acc_neg)
    fscore = 2.0 * precision * recall / (precision + recall)
    aucv = auc(fpr, tpr)

    emit(u'实际类别为1的个数: %d, 判定类别为1的个数: %d' %
         (sum(is_pos), sum(flagged)))
    emit(u'实际类别为0的个数: %d, 判定类别为0的个数: %d' %
         (sum(is_neg), sum(cleared)))
    emit(u'A=%d, B=%d, C=%d, D=%d' % (A, B, C, D))
    emit(u'Precision=%.4f, Recall=%.4f, Accuracy=%.4f' % (precision, recall, accuracy))
    emit(u'AUC:%.4f, G-mean=%.4f, F-score=%.4f' % (aucv, gmean, fscore))
    emit('KS=%.4f,' % ks, 'Softcut=%.4f,' % softcut, 'HardCut=%.4f' % hardCut)

    return (aucv, precision, recall, accuracy, fscore, ks, actual_cut)
def final_ds(train_X, test_X, train_Y, test_Y):
    """Convert the train/test splits to arrays and flatten the labels.

    Returns the tuple ``(x, y, x_t, y_t)``: train features, train labels,
    test features, test labels. The resulting shapes are printed.
    """
    print('Converting datasets to correct shape')

    x, x_t = np.array(train_X), np.array(test_X)
    y = column_or_1d(np.array(train_Y), warn=True)
    y_t = column_or_1d(np.array(test_Y), warn=True)

    report = (
        ('Shape of training dataset', x),
        ('Shape of label tensor:', y),
        ('Shape of test dataset', x_t),
        ('Shape of test label tensor:', y_t),
    )
    for caption, array in report:
        print(caption, array.shape)

    return x, y, x_t, y_t
def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Fit on ``(X, y)`` and return the transformation of ``X``.

    Args:
        X : Input values; flattened with ``column_or_1d``.
        y : Target values; flattened with ``column_or_1d``.

    Returns:
        The result of ``self.transform`` on the flattened ``X``.
    """
    flat_X = column_or_1d(X, warn=True)
    flat_y = column_or_1d(y, warn=True)
    self.fit(flat_X, flat_y)
    return self.transform(flat_X)
def fit(self, data: List[Union[Patient]],
        sklearn_clf: Union[ClassifierMixin, None] = None,
        **fit_params):
    """
    Generates and executes the preprocessing and training pipeline.
    For each fhir attribute its respective preprocessor will be used

    Args:
        data (list): A list of fhir objects (e.g. Patient)
        sklearn_clf (BaseEstimator): Instance of a sklearn classifier.
            Defaults to a new ``RandomForestClassifier()`` created for this
            call.
        **fit_params: Extra keyword arguments forwarded to the classifier's
            ``fit`` method.

    Returns:
        (list, list, object): A tuple of complete data matrix, labels and trained clf
    """
    # A default instance in the signature is created once at definition
    # time and shared (and re-fitted) by every call; build it per call.
    if sklearn_clf is None:
        sklearn_clf = RandomForestClassifier()

    # Get list of patients and their fhir attrs represented as list
    logging.info("Extracting attributes from data set")
    data_matrix = self._get_data_matrix(data)

    # Generate feature and label preprocessing pipeline
    pipeline = self._generate_pipeline()
    ct = ColumnTransformer(pipeline)

    logging.info("Preprocessing data")
    # Caution: The pipeline returns preprocessed features AND label
    complete_data_matrix = ct.fit_transform(data_matrix)
    n_feature_columns = len(self.feature_attrs)
    X = complete_data_matrix[:, :n_feature_columns]
    y = column_or_1d(complete_data_matrix[:, n_feature_columns:])

    # Check y suitability for classification
    target_type = type_of_target(y)
    if target_type in ["continuous", "continuous-multioutput", "unknown"]:
        logging.warning(
            "The target label is not suitable for classification (type: {})"
            .format(target_type))

    logging.info("Started training of clf")
    self.clf = sklearn_clf
    # fit_params used to be accepted but silently dropped.
    self.clf.fit(X, y, **fit_params)
    logging.info("Training completed")

    self.train_eval = self.evaluate(X, y)
    logging.info("Accuracy : {}, F1-score : {}".format(
        self.train_eval['accuracy'], self.train_eval['f1_score']))

    return X, y, self.clf
def compute_msee_on_bins(y_pred, mask, bin_indices, target_efficiencies, power=2., sample_weight=None):
    """ An efficient function to compute MSE, the splitting into bins should be given in bin_indices

    :param y_pred: predictions, flattened with ``column_or_1d``
    :param mask: selects the (signal) events the metric is computed on
    :param bin_indices: bin index of every event
    :param target_efficiencies: efficiencies for which cuts are computed
    :param power: exponent of the deviation between bin and mean efficiency
    :param sample_weight: weights of events, or None
    :return: ``10 * (sum / n_efficiencies / sum(mask)) ** (1 / power)``
    """
    assert len(y_pred) == len(bin_indices) == len(mask), "different size of arrays"
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    # needed in case if in some bins there are no signal events
    n_bins = numpy.max(bin_indices[mask]) + 1
    target_efficiencies = numpy.array(target_efficiencies)

    signal_proba = y_pred[mask]
    # ``numpy.int`` was removed in NumPy 1.24; the builtin ``int`` is the
    # same dtype it used to alias.
    signal_answers = numpy.ones(len(signal_proba), dtype=int)
    signal_bins = bin_indices[mask]
    signal_weights = sample_weight[mask]

    # The small constant avoids division by zero for empty bins.
    bin_total = numpy.bincount(signal_bins, weights=signal_weights, minlength=n_bins) + 1e-6
    cuts = compute_cut_for_efficiency(target_efficiencies, signal_answers, y_pred=signal_proba,
                                      sample_weight=signal_weights)

    result = 0.
    for cut, efficiency in zip(cuts, target_efficiencies):
        passed_cut = signal_proba > cut
        mean_efficiency = numpy.average(passed_cut, weights=signal_weights)
        bin_passed_cut = numpy.bincount(signal_bins[passed_cut], weights=signal_weights[passed_cut],
                                        minlength=n_bins)
        bin_efficiency = bin_passed_cut / bin_total
        result += numpy.sum(bin_total * numpy.abs(bin_efficiency - mean_efficiency) ** power)

    # TODO probably we should norm on the weights
    # Minkowski distance trick with powers
    return 10 * (result / len(target_efficiencies) / numpy.sum(mask)) ** (1. / power)
def compute_msee_on_groups(y_pred, mask, groups, target_efficiencies, sample_weight=None, power=2.):
    """ An efficient function to compute MSE, the splitting into groups should be given
    in the format of list, each item is a list of indices inside bin

    :param y_pred: predictions, flattened with ``column_or_1d``
    :param mask: selects the (signal) events; every group index must be selected
    :param groups: list of index lists; empty groups are ignored
    :param target_efficiencies: efficiencies for which cuts are computed
    :param sample_weight: weights of events, or None
    :param power: exponent of the deviation between group and mean efficiency
    :return: ``10 * (sum / n_efficiencies / total group size) ** (1 / power)``
    """
    assert len(y_pred) == len(mask), "different size"
    sample_weight = check_sample_weight(y_pred, sample_weight)
    y_pred = column_or_1d(y_pred)

    cuts = compute_cut_for_efficiency(target_efficiencies, mask, y_pred=y_pred, sample_weight=sample_weight)

    # Drop empty groups up front. They used to be skipped only when the
    # efficiencies were collected, leaving ``groups_weights`` longer than
    # the efficiency lists (shape mismatch below). Empty groups contribute
    # zero to both the size and the weight totals.
    groups = [group for group in groups if len(group) > 0]

    efficiencies = [[] for _ in target_efficiencies]
    groups_sizes = numpy.array([len(group) for group in groups])
    groups_weights = numpy.array([numpy.sum(numpy.take(sample_weight, group)) for group in groups])
    signal_weight = sample_weight[mask]

    for group_indices in groups:
        assert numpy.all(mask[group_indices]), "The provided groups contain bg events"
        group_predictions = numpy.take(y_pred, group_indices)
        group_weights = numpy.take(sample_weight, group_indices)
        for efficiencies_at_cut, cut in zip(efficiencies, cuts):
            efficiencies_at_cut.append(numpy.average(group_predictions > cut, weights=group_weights))

    result = 0.
    for cut, efficiencies_at_cut in zip(cuts, efficiencies):
        mean_efficiency = numpy.average(y_pred[mask] > cut, weights=signal_weight)
        result += numpy.sum(groups_weights * numpy.abs(efficiencies_at_cut - mean_efficiency) ** power)

    # Minkowski distance trick with powers
    return 10 * (result / len(target_efficiencies) / numpy.sum(groups_sizes)) ** (1. / power)
def transform(self, X, y=None):
    """Cuts `X` so it is aligned with `y`.

    The last ``n_steps_future`` entries are dropped; when ``n_steps_future``
    is smaller than ``width``, the first ``width - n_steps_future`` entries
    are dropped as well.

    Parameters
    ----------
    X : ndarray, shape (n_samples,) or (n_samples, 1)
        Time series to build a target for.

    y : None
        There is no need for a target, yet the pipeline API
        requires this parameter.

    Returns
    -------
    Xt : ndarray, shape (n_samples_new, 1)
        The cut input time series.

    """
    # Check if fit had been called
    check_is_fitted(self)
    series = column_or_1d(X)

    horizon = self.n_steps_future
    trimmed = series[:-horizon]

    lead = self.width - horizon
    if lead > 0:
        trimmed = trimmed[lead:]

    return trimmed.reshape(-1, 1)
def fit(self, X, y, sample_weight=None):
    """Train ``self.n_estimators`` clones of ``self.base_estimator``,
    reweighting the samples after each stage.

    After every stage the weight of each sample is multiplied by
    ``exp(-y_signed * mean score of its neighbours)``, where the
    neighbours come from ``computeKnnIndicesOfSameClass``; the weights are
    then renormalised with ``self.normalize_weights``. For classes not
    listed in ``self.uniform_label`` each sample is its own neighbour.

    Only labels 0 and 1 are accepted. The fitted classifiers are stored in
    ``self.estimators``; nothing is returned.
    """
    raw_label = self.uniform_label
    if isinstance(raw_label, numbers.Number):
        self.uniform_label = numpy.array([raw_label])
    else:
        self.uniform_label = numpy.array(raw_label)

    # Copy: the weights are updated in place during boosting.
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    assert numpy.all(numpy.in1d(y, [0, 1])), 'only two-class classification is supported by now'
    y = column_or_1d(y)
    y_signed = 2 * y - 1

    X = pandas.DataFrame(X)
    knn_indices = computeKnnIndicesOfSameClass(self.uniform_variables, X, y, self.n_neighbours)

    # for those events with non-uniform label we repeat its own index several times
    all_positions = numpy.arange(len(y))
    for class_label in [0, 1]:
        if class_label not in self.uniform_label:
            in_class = y == class_label
            knn_indices[in_class, :] = all_positions[in_class][:, numpy.newaxis]

    X = self.get_train_vars(X)
    cumulative_score = numpy.zeros(len(X))
    self.estimators = []

    for _ in range(self.n_estimators):
        classifier = sklearn.clone(self.base_estimator)
        classifier.fit(X, y, sample_weight=sample_weight)
        score = self.learning_rate * self.compute_score(classifier, X=X)
        cumulative_score += score

        neighbour_score = numpy.take(score, knn_indices).mean(axis=1)
        sample_weight *= numpy.exp(- y_signed * neighbour_score)
        sample_weight = self.normalize_weights(y=y, sample_weight=sample_weight)
        self.estimators.append(classifier)
def fit(self, X, y):
    """Fit the model to the data X and target y.

    The targets are binarized with a ``LabelBinarizer`` (kept in
    ``self._lbin``) before the parent class's ``fit`` is called.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : numpy array of shape (n_samples)

    Returns
    -------
    self
    """
    y = column_or_1d(y, warn=True)

    # needs a better way to check multi-label instances
    first_entry = np.reshape(y, (-1, 1))[0][0]
    self.multi_label = isinstance(first_entry, list)

    self.classes_ = np.unique(y)
    self._lbin = LabelBinarizer()
    binarized = self._lbin.fit_transform(y)

    super(MultilayerPerceptronClassifier, self).fit(X, binarized)

    return self
def gbdt_test(X_train_raw, y_train_raw, X_test_raw, y_test_raw):
    """Select features with gradient boosting, retrain on them and return
    the accuracy on the test set.

    Accuracy is the trace of the confusion matrix divided by the number of
    test predictions.
    """
    y_train = column_or_1d(y_train_raw, warn=False)

    # Feature selection driven by a default GBDT.
    selector = SelectFromModel(GradientBoostingClassifier())
    selector.fit(X_train_raw, y_train)

    X_train_sel = selector.transform(X_train_raw)
    X_test_sel = selector.transform(X_test_raw)

    # Final model with the tuned parameters, on the selected features.
    model = GradientBoostingClassifier(
        n_estimators=100,
        max_leaf_nodes=4,
        max_depth=None,
        random_state=2,
        min_samples_split=5,
    )
    model.fit(X_train_sel, y_train)
    predictions = model.predict(X_test_sel)

    n_correct = np.trace(confusion_matrix(y_test_raw, predictions))
    return n_correct / len(predictions)
def resample(self, y, X=None):
    """Resample `y` so that, for any i > 0, the minus i-th entry of the
    resampled vector corresponds in time to the last coordinate of the
    minus i-th embedding vector produced by :meth:`transform`.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
        Target.

    X : None
        There is no need for input data, yet the pipeline API requires
        this parameter.

    Returns
    -------
    yr : ndarray, shape (n_samples_new,)
        The resampled target. ``n_samples_new = (n_samples - time_delay
        * (dimension - 1) - 1) // stride + 1``.

    """
    # Check if fit had been called
    check_is_fitted(self)
    yr = column_or_1d(y)

    # Number of leading samples that no embedding vector ends on.
    n_dropped = self.time_delay_ * (self.dimension_ - 1)
    # With nothing to drop (dimension_ == 1) the stop index would be
    # ``-0 == 0`` and the slice empty; ``None`` keeps the whole series.
    stop = -n_dropped if n_dropped else None

    # Stride backwards from the last sample, then restore time order.
    yr = np.flip(yr)
    yr = np.flip(yr[:stop:self.stride])
    return yr
def _initial_data_check(X, y, sample_weight):
    """Validate training inputs for two-class classification.

    Returns ``(X, y, sample_weight)`` where X is wrapped in a
    ``pandas.DataFrame``, y is a 1-D integer array containing only 0 and 1,
    and sample_weight is the result of ``check_sample_weight``.
    """
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths of X and y'

    frame = pandas.DataFrame(X)
    labels = numpy.array(column_or_1d(y), dtype=int)

    is_binary = numpy.in1d(labels, [0, 1])
    assert numpy.all(is_binary), 'Only two-class classification supported'

    return frame, labels, sample_weight
def fit(self, X, y=None, features_to_keep=None, select_percent=0.05):
    '''
    Rank the numeric features with ``self.fs`` and record which features
    to keep in ``self.selected_features``.

    TODO: Does this "select_percent" include those pre-set to keep?
    features_to_keep includes both features wanted to keep + non-numeric features

    Args:
        X: DataFrame; only the columns listed in
            ``self.features_by_type['numeric_features']`` are ranked.
        y: target with a ``.values`` attribute. Required in practice even
            though it defaults to None.
        features_to_keep: features that are always kept. The list is
            copied, the caller's object is not modified.
        select_percent: fraction of the numeric features to select.

    Returns:
        self
    '''
    # Work on a copy: appending to the argument itself would leak the
    # selection back into the caller's list.
    selected = list(features_to_keep) if features_to_keep else []

    is_numeric = X.columns.isin(self.features_by_type['numeric_features'])
    X_numeric = X[X.columns[is_numeric]]

    self.fs.set_input_matrix(X_numeric.values, column_or_1d(y.values))

    num_features_to_select = int(round(select_percent * len(X_numeric.columns)))
    self.fs.select(k=num_features_to_select)

    feature_ranks = self.fs.compute_ranks()
    for position, rank in enumerate(feature_ranks):
        if rank <= num_features_to_select:
            # If in features_to_keep, pretend it wasn't eliminated.
            selected.append(X_numeric.columns[position])

    self.selected_features = selected

    return self
def check_endog(y, dtype=DTYPE, copy=True, force_all_finite=False):
    """Wrapper for ``check_array`` and ``column_or_1d`` from sklearn

    Parameters
    ----------
    y : array-like, shape=(n_samples,)
        The 1d endogenous array.

    dtype : string, type or None (default=DTYPE)
        Data type of result, passed through to ``check_array``.

    copy : bool, optional (default=True)
        Whether a forced copy will be triggered, passed through to
        ``check_array``.

    force_all_finite : bool, optional (default=False)
        Whether to raise an error on np.inf and np.nan in an array,
        passed through to ``check_array``.

    Returns
    -------
    y : np.ndarray, shape=(n_samples,)
        A 1d numpy ndarray
    """
    validated = check_array(
        y,
        ensure_2d=False,
        force_all_finite=force_all_finite,
        copy=copy,
        dtype=dtype,
    )
    return column_or_1d(validated)  # type: np.ndarray
def decode(self, encodings, one_hot_axis=None):
    """Decodes the given encodings into their respective keys.

    Parameters
    ----------
    encodings : scalar or np.ndarray
    one_hot_axis : int, optional
        If an int, ``encodings`` is first reduced with ``argmax`` along
        this axis (one-hot decoding). Otherwise the elements are mapped to
        their keys directly.

    Returns
    -------
    np.ndarray
        The keys for the given encodings; an empty array for empty input.

    Raises
    ------
    ValueError
        If an encoding is outside ``range(len(self.keys()))``.
    """
    if isinstance(one_hot_axis, int):
        encodings = encodings.argmax(axis=one_hot_axis)

    # TODO check encodings.shape to expected shape
    encodings = validation.column_or_1d(encodings, warn=True)

    # inverse transform of empty array is empty array
    if validation._num_samples(encodings) == 0:
        return np.array([])

    known_encodings = np.arange(len(self.keys()))
    unseen = np.setdiff1d(encodings, known_encodings)
    if len(unseen):
        raise ValueError(
            "encodings contains previously unseen labels: %s" % str(unseen))

    # TODO hard to handle unknowns in the decoding case, but could do
    # update or default as well, I suppose.
    lookup = np.array(self.encoder)
    return lookup[np.array(encodings)]
def transform(self, X: np.ndarray) -> np.ndarray:
    """Replace every value of ``X`` with the mean stored for its class.

    Args:
        X : Values to encode; flattened with ``column_or_1d``.

    Returns:
        float32 array with one entry per input value. Values that are not
        in ``classes_`` (and are not NaN) get ``default_unseen_``; NaN
        positions are left at 0.
    """
    check_is_fitted(self, "class_means_")
    X = column_or_1d(X, warn=True)

    # Label encoding if necessary
    uniques = self._label_encoding_uniques
    if uniques is not None:
        X = uniques.get_indexer(pd.Series(X))

    is_missing = np.isnan(X)
    is_present = ~is_missing
    # Not a known class and not missing either.
    is_unseen = np.isin(X, self.classes_, invert=True) ^ is_missing

    X = X.copy()
    # Point unseen values at the largest class so that the lookup below
    # succeeds; their result is overwritten afterwards.
    X[is_unseen] = np.max(self.classes_)

    lut_keys = self.lut_[:, 0]
    lut_values = self.lut_[:, 1]
    class_positions = _get_index(self.classes_, X[is_present])
    lut_rows = np.searchsorted(lut_keys, self.classes_)

    encoded = np.zeros(X.shape[0], dtype=np.float32)
    encoded[is_present] = np.take(lut_values, np.take(lut_rows, class_positions))
    encoded[is_unseen] = self.default_unseen_
    return encoded
def transform(self, y):
    """Transform labels to normalized encoding.

    If ``self.fill_unseen_labels`` is ``True``, use
    ``self.fill_encoded_label_value`` for unseen values.
    Seen labels are encoded with value between 0 and n_classes-1. Unseen
    labels are encoded with ``self.fill_encoded_label_value``, which falls
    back to n_classes when it is ``None``.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values. An empty array when ``y`` has no samples.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    # transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    if not self.fill_unseen_labels:
        _, y_encoded = _encode(y, uniques=self.classes_, encode=True)
        return y_encoded

    _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
    # searchsorted gives an index for every value, seen or not; the
    # positions of unseen values are overwritten just below.
    y_encoded = np.searchsorted(self.classes_, y)

    # Explicit ``None`` check: ``value or default`` would silently discard
    # a configured fill value of 0 because 0 is falsy.
    fill_encoded_label_value = self.fill_encoded_label_value
    if fill_encoded_label_value is None:
        fill_encoded_label_value = len(self.classes_)
    y_encoded[~mask] = fill_encoded_label_value

    return y_encoded
def ndiffs(x, alpha=0.05, test='kpss', max_d=2, **kwargs):
    """Estimate differencing term.

    Repeatedly differences ``x`` until the selected test no longer asks
    for another difference, the series becomes constant, or ``max_d``
    differences have been applied.

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The array to difference.

    alpha : float, optional (default=0.05)
        Level of the test

    test : str, optional (default='kpss')
        Type of unit root test of stationarity to use; looked up in
        ``VALID_TESTS``.

    max_d : int, optional (default=2)
        Maximum number of non-seasonal differences allowed. Must be a
        positive integer.

    **kwargs
        Forwarded to the constructor of the selected test.

    Returns
    -------
    d : int
        The estimated number of differences. 0 when ``x`` is constant or
        the first test yields a NaN p-value; when a later test yields NaN,
        the last count that produced a non-NaN p-value.

    Raises
    ------
    ValueError
        If ``max_d`` is not positive.
    """
    if max_d <= 0:
        raise ValueError('max_d must be a positive integer')

    # Build the test object once and keep only its bound check.
    tester = get_callable(test, VALID_TESTS)(alpha, **kwargs)
    run_test = tester.is_stationary

    validated = check_array(x, ensure_2d=False, force_all_finite=True,
                            dtype=DTYPE)
    series = column_or_1d(validated)

    # A constant series needs no differencing.
    n_diffs = 0
    if is_constant(series):
        return n_diffs

    p_value, keep_differencing = run_test(series)
    if np.isnan(p_value):
        # Nothing usable from the very first test.
        return 0

    while keep_differencing and n_diffs < max_d:
        n_diffs += 1
        series = diff(series)

        if is_constant(series):
            return n_diffs

        p_value, keep_differencing = run_test(series)
        if np.isnan(p_value):
            # Fall back to the last count with a non-NaN result.
            return n_diffs - 1

    # Either the test is satisfied or max_d has been reached.
    return n_diffs
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """
    The simplest way to compute Cramer-von Mises flatness; slow when it
    has to be computed many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn
    :return: the value returned by ``ut.group_based_cvm`` for the ``label`` column of ``proba``

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    X = pandas.DataFrame(X)

    is_label = (y == label)
    knn_indices = computeSignalKnnIndices(
        uniform_variables=uniform_variables,
        dataframe=X,
        is_signal=is_label,
        n_neighbors=knn,
    )
    # Keep only the neighbourhoods that belong to events of the class.
    label_groups = knn_indices[is_label, :]

    return ut.group_based_cvm(
        proba[:, label],
        mask=is_label,
        groups_indices=label_groups,
        sample_weight=sample_weight,
    )
def compute_ams_on_cuts(answers, predictions, sample_weight):
    """Compute the AMS radicand for every possible cut on the predictions.

    Prediction is probabilities. Events are ordered by decreasing
    prediction; cumulative signal and background weights are rescaled so
    that their totals equal the module-level ``real_s`` and ``real_b``.

    :param answers: true labels (used as 1 for signal, 0 for background)
    :param predictions: predicted probabilities
    :param sample_weight: per-event weights
    :return: tuple (predictions sorted in descending order, radicands);
        the square root is not taken here
    """
    assert len(answers) == len(predictions) == len(sample_weight)
    answers = column_or_1d(answers)
    predictions = column_or_1d(predictions)
    sample_weight = column_or_1d(sample_weight)

    descending = numpy.argsort(predictions)[::-1]
    sorted_answers = answers[descending]
    sorted_weights = sample_weight[descending]

    signal = numpy.cumsum(sorted_answers * sorted_weights)
    background = numpy.cumsum((1 - sorted_answers) * sorted_weights)

    # In-place rescaling so that the last cumulative value equals the
    # reference total.
    background *= real_b / background[-1]
    signal *= real_s / signal[-1]

    regularization = 10.
    log_term = numpy.log(1.0 + signal / (background + regularization))
    radicands = 2 * ((signal + background + regularization) * log_term - signal)
    return predictions[descending], radicands
def check_inputs(X, y, sample_weight, allow_none_weights=True):
    """Check that X, y and (optionally) sample_weight have matching lengths.

    :param X: sized container of samples; returned unchanged
    :param y: target values, flattened with ``column_or_1d``
    :param sample_weight: None or per-sample weights
    :param allow_none_weights: when True and ``sample_weight`` is None,
        the weights stay None; when False, missing weights are replaced by
        an array of ones
    :return: tuple ``(X, y, sample_weight)``
    :raises ValueError: if the lengths differ
    :raises AssertionError: if the weights contain NaN
    """
    y = column_or_1d(y)

    def _x_shape():
        # Lists have no ``.shape``; fall back to the length so building
        # the error message cannot itself fail with AttributeError.
        return getattr(X, 'shape', (len(X),))

    if allow_none_weights and sample_weight is None:
        # checking only X, y
        if len(X) != len(y):
            raise ValueError('Different size of X: {} and y: {}'.format(_x_shape(), y.shape))
        return X, y, None

    if sample_weight is None:
        sample_weight = numpy.ones(len(y), dtype=float)

    sample_weight = column_or_1d(sample_weight)
    # Vectorised NaN check instead of the builtin ``sum`` over the mask.
    assert not numpy.isnan(sample_weight).any(), \
        "Weight contains nan, this format isn't supported"
    if not (len(X) == len(y) == len(sample_weight)):
        message = 'Different sizes of X: {}, y: {} and sample_weight: {}'
        raise ValueError(message.format(_x_shape(), y.shape, sample_weight.shape))

    return X, y, sample_weight
def compute_efficiencies_on_bins(signal_proba, signal_mask, bin_indices, n_total_bins, cut, sample_weight=None):
    """Per-bin fraction of signal weight whose probability exceeds ``cut``.

    The denominator is offset by ``+1e-6`` and the numerator by
    ``-1e-10``, so a bin without signal events yields a small negative
    number instead of a division by zero.

    :param signal_proba: predicted signal probabilities, one per event
    :param signal_mask: boolean mask selecting signal events
    :param bin_indices: bin index of every event
    :param n_total_bins: minimal length of the returned array
    :param cut: threshold; events with ``signal_proba > cut`` pass
    :param sample_weight: None or per-event weights
    :return: array with one efficiency per bin
    """
    assert len(signal_mask) == len(signal_proba) == len(bin_indices), "different size"
    sample_weight = check_sample_weight(signal_mask, sample_weight=sample_weight)

    total_per_bin = 1e-6 + numpy.bincount(
        bin_indices[signal_mask],
        weights=sample_weight[signal_mask],
        minlength=n_total_bins,
    )

    signal_proba = column_or_1d(signal_proba)
    signal_passed = signal_mask & (signal_proba > cut)
    passed_per_bin = numpy.bincount(
        bin_indices[signal_passed],
        weights=sample_weight[signal_passed],
        minlength=n_total_bins,
    ) - 1e-10

    return passed_per_bin / total_per_bin
def check_input(X, y, sample_weight, check_two_classes=True):
    """Normalise the inputs of a classifier.

    :param X: samples; converted to ``pandas.DataFrame``
    :param y: labels; flattened with ``column_or_1d``
    :param sample_weight: passed through ``check_sample_weight``
    :param check_two_classes: when True, assert that every label is 0 or 1
    :return: tuple ``(X, y, sample_weight)`` after conversion
    :raises AssertionError: on length mismatch or unexpected labels
    """
    weights = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths'

    frame = pandas.DataFrame(X)
    labels = column_or_1d(y)

    if check_two_classes:
        only_binary = numpy.all(numpy.in1d(labels, [0, 1]))
        assert only_binary, 'only two-class classification is supported by now'

    return frame, labels, weights
def compute_group_efficiencies(y_score, groups_matrix, cut, divided_weight=None, smoothing=0.0):
    """
    Provided cut, computes efficiencies inside each group.

    :param y_score: score of every event
    :param groups_matrix: object whose ``dot`` maps per-event values to
        per-group totals (presumably a groups x events matrix - confirm
        against the caller)
    :param cut: threshold applied to ``y_score``
    :param divided_weight: weight for each event, divided by the number of its occurrences
    :param smoothing: width passed to ``sigmoid_function``
    :return: passed weight divided by total weight per group, with the
        denominator floored at 1e-10
    """
    scores = column_or_1d(y_score)
    weights = check_sample_weight(scores, sample_weight=divided_weight)

    # with smoothing=0, this is 0 or 1, latter for passed events.
    pass_fraction = sigmoid_function(scores - cut, width=smoothing)

    weight_passed = groups_matrix.dot(weights * pass_fraction)
    weight_total = groups_matrix.dot(weights)
    safe_total = numpy.maximum(weight_total, 1e-10)
    return weight_passed / safe_total
def fit(self, X, y, sample_weight=None):
    """Fit a sequence of regression trees on the negative gradient of ``self.loss``.

    :param X: training samples; converted to ``pandas.DataFrame`` and
        then reduced by ``self.get_train_vars``
    :param y: labels, must contain only 0 and 1
    :param sample_weight: None or per-sample weights
    :return: self

    After fitting, ``self.estimators`` holds the trees, ``self.scores``
    the loss value after each stage, and ``self.n_features`` the number of
    columns used for training.
    """
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths of X and y'
    X = pandas.DataFrame(X)
    y = numpy.array(column_or_1d(y), dtype=int)
    assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
    self.check_params()

    self.estimators = []
    self.scores = []

    n_samples = len(X)
    # number of samples drawn (without replacement) for each tree
    n_inbag = int(self.subsample * len(X))

    # Fit a shallow copy of the loss so the object passed by the user is
    # not rebound; the loss sees the full DataFrame, before column
    # selection.
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)

    # preparing for fitting in trees
    X = self.get_train_vars(X)
    self.n_features = X.shape[1]
    X, y = check_arrays(X, y)
    X = X.astype(DTYPE)
    y_pred = numpy.zeros(len(X), dtype=float)

    if self.init_estimator is not None:
        # the initial estimator is trained on labels mapped to -1 / +1
        y_signed = 2 * y - 1
        self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
        y_pred += numpy.ravel(self.init_estimator.predict(X))

    for stage in range(self.n_estimators):
        # tree creation
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            random_state=self.random_state,
            max_leaf_nodes=self.max_leaf_nodes)

        # tree learning
        residual = self.loss.negative_gradient(y_pred)
        # NOTE(review): relies on self.random_state exposing ``choice``
        # (e.g. a numpy RandomState) - confirm check_params ensures this.
        train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

        tree.fit(X[train_indices], residual[train_indices],
                 sample_weight=sample_weight[train_indices], check_input=False)
        # update tree leaves
        if self.update_tree:
            self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred, sample_weight=sample_weight,
                                  update_mask=numpy.ones(len(X), dtype=bool), residual=residual)

        # the prediction is updated on all samples, not only the in-bag ones
        y_pred += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
        self.scores.append(self.loss(y_pred))
    return self
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight):
    """Average of ``theil`` over the cuts matching ``target_efficiencies``.

    :param y_pred: predictions, one per event
    :param mask: forwarded to ``compute_cut_for_efficiency``
    :param groups_indices: groups of event indices
    :param target_efficiencies: efficiencies for which cuts are computed
    :param sample_weight: None or per-event weights
    :return: mean over cuts of ``theil(group efficiencies, group weights)``
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    groups_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask,
                                      y_pred=y_pred, sample_weight=sample_weight)

    def theil_for(cut):
        efficiencies = compute_group_efficiencies(y_pred, groups_indices, cut,
                                                  sample_weight=sample_weight)
        return theil(efficiencies, groups_weights)

    total = sum((theil_for(cut) for cut in cuts), 0.)
    return total / len(cuts)
def compute_sde_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight=None, power=2.):
    """Deviation of group efficiencies, averaged over cuts.

    :param y_pred: predictions, one per event
    :param mask: forwarded to ``compute_cut_for_efficiency``
    :param groups_indices: groups of event indices
    :param target_efficiencies: efficiencies for which cuts are computed
    :param sample_weight: None or per-event weights
    :param power: exponent forwarded to ``weighted_deviation``; the mean
        deviation is finally raised to ``1 / power``
    :return: ``(mean over cuts of weighted_deviation) ** (1 / power)``
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask,
                                      y_pred=y_pred, sample_weight=sample_weight)

    def deviation_for(cut):
        efficiencies = compute_group_efficiencies(y_pred, groups_indices=groups_indices,
                                                  cut=cut, sample_weight=sample_weight)
        return weighted_deviation(efficiencies, weights=group_weights, power=power)

    total = sum((deviation_for(cut) for cut in cuts), 0.)
    mean_deviation = total / len(cuts)
    return mean_deviation ** (1. / power)
def compute_bin_efficiencies(y_score, bin_indices, cut, sample_weight, minlength=None):
    """Efficiency of bin = total weight of (signal) events that passed the
    cut in the bin / total weight of signal events in the bin.

    The denominator is floored at 1, so empty bins yield 0.
    NOTE(review): the same floor applies to non-empty bins whose total
    weight is below 1 - confirm this is intended.

    :param y_score: score of every event
    :param bin_indices: bin index of every event
    :param cut: threshold; events with ``y_score > cut`` pass
    :param sample_weight: per-event weights
    :param minlength: minimal number of bins; defaults to
        ``max(bin_indices) + 1``
    :return: array with one efficiency per bin
    """
    scores = column_or_1d(y_score)
    assert len(scores) == len(sample_weight) == len(bin_indices), "different size"

    n_bins = minlength
    if n_bins is None:
        n_bins = numpy.max(bin_indices) + 1

    weight_total = numpy.bincount(bin_indices, weights=sample_weight, minlength=n_bins)

    passed = scores > cut
    weight_passed = numpy.bincount(bin_indices[passed], weights=sample_weight[passed],
                                   minlength=n_bins)

    return weight_passed / numpy.maximum(weight_total, 1)
def check_metrics_arguments(y_true, y_pred, sample_weight, two_class=True, binary_pred=True):
    """
    Checks the arguments passed to metrics

    :param y_true: true labels; flattened with ``column_or_1d``
    :param y_pred: predictions; flattened with ``column_or_1d``
    :param sample_weight: passed through ``check_sample_weight``
    :param two_class: when True, assert that ``y_true`` holds only 0 and 1
    :param binary_pred: when True, assert that ``y_pred`` holds only 0 and 1
    :return: tuple ``(y_true, y_pred, sample_weight)`` after conversion
    :raises AssertionError: if lengths differ or labels are unexpected
    """
    weights = check_sample_weight(y_true, sample_weight=sample_weight)
    truth = column_or_1d(y_true)
    predicted = column_or_1d(y_pred)

    n_true, n_pred = len(truth), len(predicted)
    assert n_true == n_pred, \
        'The lengths of y_true and y_pred are different: %i and %i' % (n_true, n_pred)

    allowed = [0, 1]
    if two_class:
        assert numpy.in1d(truth, allowed).all(), \
            'The y_true array should contain only two labels: 0 and 1, it contains:' \
            + str(numpy.unique(truth))
    if binary_pred:
        assert numpy.in1d(predicted, allowed).all(), \
            'The y_pred array should contain only two labels: 0 and 1, it contains:' \
            + str(numpy.unique(predicted))

    return truth, predicted, weights
def check_xyw(X, y, sample_weight=None):
    """
    Checks parameters of classifier / loss / metrics

    :param X: array-like of shape [n_samples, n_features] (numpy.array or pandas.DataFrame)
    :param y: array-like of shape [n_samples]
    :param sample_weight: None or array-like of shape [n_samples]
    :return: tuple ``(X, y, sample_weight)``; ``X`` is converted with
        ``numpy.array`` unless it already is a DataFrame or ndarray
    :raises AssertionError: if ``X`` and ``y`` differ in length
    """
    from sklearn.utils.validation import column_or_1d

    labels = column_or_1d(y)
    weights = check_sample_weight(labels, sample_weight=sample_weight)
    assert len(X) == len(labels), 'Lengths are different'

    if isinstance(X, (pandas.DataFrame, numpy.ndarray)):
        return X, labels, weights
    return numpy.array(X), labels, weights
def compute_group_efficiencies(y_score, groups_indices, cut, sample_weight=None, smoothing=0.0):
    """Weighted average of the pass indicator inside each group.

    :param y_score: score of every event
    :param groups_indices: either a 2-dimensional numpy array of event
        indices (one row per group) or an iterable of index collections
    :param cut: threshold applied to ``y_score``
    :param sample_weight: None or per-event weights
    :param smoothing: width passed to ``sigmoid_function``
    :return: array with one efficiency per group
    """
    scores = column_or_1d(y_score)
    weights = check_sample_weight(scores, sample_weight=sample_weight)

    # value of the (possibly smoothed) pass indicator for every event
    pass_fraction = sigmoid_function(scores - cut, width=smoothing)

    rectangular = isinstance(groups_indices, numpy.ndarray) and numpy.ndim(groups_indices) == 2
    if rectangular:
        # this speedup is specially for knn
        return numpy.average(numpy.take(pass_fraction, groups_indices),
                             weights=numpy.take(weights, groups_indices),
                             axis=1)

    efficiencies = numpy.zeros(len(groups_indices))
    for position, members in enumerate(groups_indices):
        efficiencies[position] = numpy.average(pass_fraction[members], weights=weights[members])
    return efficiencies
def AMS(answers, predictions, sample_weight):
    """Compute the AMS value for class predictions.

    Predictions are classes. Signal and background weights are rescaled
    with the module-level ``real_s`` and ``real_b``.

    :param answers: true labels (values above 0.5 count as signal, below
        0.5 as background)
    :param predictions: predicted classes
    :param sample_weight: per-event weights
    :return: square root of the AMS radicand
    :raises ValueError: if the radicand is negative
    """
    assert len(answers) == len(predictions) == len(sample_weight)
    # Normalise all three inputs, as compute_ams_on_cuts does, so that
    # lists and column vectors work with the boolean masks below and do
    # not broadcast against each other.
    answers = column_or_1d(answers)
    predictions = column_or_1d(predictions)
    sample_weight = column_or_1d(sample_weight)

    total_s = numpy.sum(sample_weight[answers > 0.5])
    total_b = numpy.sum(sample_weight[answers < 0.5])
    # weight of signal / background events that were predicted as signal
    s = numpy.sum(sample_weight[answers * predictions > 0.5])
    b = numpy.sum(sample_weight[(1 - answers) * predictions > 0.5])
    s *= real_s / total_s
    b *= real_b / total_b

    br = 10.
    radicand = 2 * ((s + b + br) * numpy.log(1.0 + s / (b + br)) - s)
    if radicand < 0:
        raise ValueError('Radicand is negative')
    return numpy.sqrt(radicand)
def partial_fit(self, X, y, classes=None):
    """Fit the model to the data X and target y.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data, where n_samples in the number of samples
        and n_features is the number of features.

    y : numpy array of shape (n_samples)
        Subset of the target values.

    classes : array, shape (n_classes)
        Classes across all calls to partial_fit.
        Can be obtained by via `np.unique(y_all)`, where y_all is the
        target vector of the entire dataset.
        This argument is required for the first call to partial_fit
        and can be omitted in the subsequent calls.
        Note that y doesn't need to contain all labels in `classes`.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the algorithm is not "sgd", if ``classes`` is missing on the
        first call, or if it differs from the stored classes.
    """
    if self.algorithm != "sgd":
        raise ValueError("only SGD algorithm supports partial fit")

    known_classes = self.classes_
    if classes is None:
        if known_classes is None:
            raise ValueError("classes must be passed on the first call to partial_fit.")
    elif known_classes is None:
        # first call: remember the classes
        self.classes_ = classes
    elif np.any(known_classes != np.unique(classes)):
        raise ValueError("`classes` is not the same as on last call to partial_fit.")

    if not hasattr(self, "_lbin"):
        self._lbin = LabelBinarizer()
        self._lbin._classes = classes

    y = column_or_1d(y, warn=True)

    # needs a better way to check multi-label instances
    first_target = np.reshape(y, (-1, 1))[0][0]
    self.multi_label = isinstance(first_target, list)

    # NOTE(review): fit_transform refits the binarizer on this batch only;
    # confirm that batches lacking some of ``classes`` are handled.
    y = self._lbin.fit_transform(y)
    super(MultilayerPerceptronClassifier, self).partial_fit(X, y)

    return self
def compute_group_efficiencies_by_indices(y_score, groups_indices, cut, divided_weight=None, smoothing=0.0):
    """
    Provided cut, computes efficiencies inside each group.

    :param y_score: score of every event
    :param groups_indices: either a 2-dimensional numpy array of event
        indices (one row per group) or an iterable of index collections
    :param cut: threshold applied to ``y_score``
    :param divided_weight: weight for each event, divided by the number of its occurrences
    :param smoothing: width passed to ``sigmoid_function``
    :return: array with one weighted-average efficiency per group
    """
    scores = column_or_1d(y_score)
    weights = check_sample_weight(scores, sample_weight=divided_weight)

    # with smoothing=0, this is 0 or 1, latter for passed events.
    pass_fraction = sigmoid_function(scores - cut, width=smoothing)

    rectangular = isinstance(groups_indices, numpy.ndarray) and numpy.ndim(groups_indices) == 2
    if rectangular:
        # this speedup is specially for knn
        return numpy.average(numpy.take(pass_fraction, groups_indices),
                             weights=numpy.take(weights, groups_indices),
                             axis=1)

    efficiencies = numpy.zeros(len(groups_indices))
    for position, members in enumerate(groups_indices):
        efficiencies[position] = numpy.average(pass_fraction[members], weights=weights[members])
    return efficiencies
def compute_theil_on_bins(y_pred, mask, bin_indices, target_efficiencies, sample_weight):
    """Average of ``theil`` over bin efficiencies for several cuts.

    Only the events selected by ``mask`` take part in the computation.

    :param y_pred: predictions, one per event
    :param mask: selects the events of the class of interest
    :param bin_indices: bin index of every event
    :param target_efficiencies: efficiencies for which cuts are computed
    :param sample_weight: None or per-event weights
    :return: mean over cuts of ``theil(bin efficiencies, bin weights)``
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    # ignoring events from other classes
    kept_pred = y_pred[mask]
    kept_bins = bin_indices[mask]
    kept_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=kept_bins, sample_weight=kept_weight)
    # everything left after masking belongs to the class, hence all-True
    everything = numpy.ones(len(kept_pred), dtype=bool)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=everything,
                                      y_pred=kept_pred, sample_weight=kept_weight)

    def theil_for(cut):
        efficiencies = compute_bin_efficiencies(kept_pred, bin_indices=kept_bins,
                                                cut=cut, sample_weight=kept_weight)
        return theil(efficiencies, weights=bin_weights)

    total = sum((theil_for(cut) for cut in cuts), 0.)
    return total / len(cuts)
def plot_classes_distribution(X, y, var_names):
    """Plot how the classes are distributed along one or two variables.

    One variable produces a step histogram per class; two variables
    produce a scatter plot per class whose transparency shrinks as the
    class gets larger.

    :param X: pandas.DataFrame containing the columns in ``var_names``
    :param y: class label of every row, flattened with ``column_or_1d``
    :param var_names: list with one or two column names
    :raises ValueError: if ``var_names`` has neither one nor two entries
    """
    y = column_or_1d(y)
    labels = numpy.unique(y)
    if len(var_names) == 1:
        pylab.figure(figsize=(14, 7))
        pylab.title('Distribution of classes')
        for label in labels:
            # ``.loc`` replaces the ``.ix`` indexer, which no longer
            # exists in current pandas; the two-variable branch already
            # uses ``.loc``.
            pylab.hist(numpy.ravel(X.loc[y == label, var_names]),
                       label='class=%i' % label, histtype='step')
        pylab.xlabel(var_names[0])

    elif len(var_names) == 2:
        pylab.figure(figsize=(12, 10))
        pylab.title('Distribution of classes')
        x_var, y_var = var_names
        for label in labels:
            # fewer points -> more opaque markers, clipped to [0.02, 1]
            alpha = numpy.clip(2000. / numpy.sum(y == label), 0.02, 1)
            pylab.plot(X.loc[y == label, x_var], X.loc[y == label, y_var], '.',
                       alpha=alpha, label='class=' + str(label))
    else:
        raise ValueError("More than two variables are not implemented")
def __init__(self, classifiers_dict, X, y, sample_weight=None, low_memory=None):
    """The main object for different reports and plots.

    Computes predictions of different classifiers on the same test data
    set and stores them so that metrics and quality curves can be
    computed later.

    :param classifiers_dict: OrderedDict mapping a name to a classifier
        with a ``predict_proba`` method
    :param X: test samples, stored as given
    :param y: labels; stored as a 1d integer array
    :param sample_weight: None or per-sample weights, stored as given;
        the checked version goes to ``checked_sample_weight``
    :param low_memory: deprecated; any non-None value triggers a
        DeprecationWarning
    """
    assert isinstance(classifiers_dict, OrderedDict)
    if low_memory is not None:
        warnings.warn("Low memory argument is deprecated", DeprecationWarning)

    self.X = X
    int_labels = numpy.array(y, dtype=int)
    self.y = column_or_1d(int_labels)
    self.sample_weight = sample_weight
    assert len(X) == len(y), 'Different lengths'
    self.n_samples = len(y)
    self.checked_sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    # probabilities are computed once, in the order of the classifiers
    probabilities = OrderedDict()
    for name, classifier in classifiers_dict.items():
        probabilities[name] = classifier.predict_proba(X)
    self.predictions = probabilities

    self.staged_predictions = None
    self.classifiers = classifiers_dict
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """A shorter and simpler version of log_loss which supports sample_weight.

    :param y_true: true labels, one per sample
    :param y_pred: predicted probabilities, one row per sample
    :param eps: probabilities are clipped to ``[eps, 1 - eps]``
    :param sample_weight: None or per-sample weights
    :return: weighted negative log-likelihood divided by the sum of weights
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    indicator = LabelBinarizer().fit_transform(y_true)
    if indicator.shape[1] == 1:
        # a single column was produced: add the complementary column
        indicator = numpy.concatenate((1 - indicator, indicator), axis=1)

    # Clipping
    probabilities = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    indicator, probabilities = check_arrays(indicator, probabilities)

    # Renormalize
    probabilities /= probabilities.sum(axis=1)[:, None]

    weighted_log = indicator * numpy.log(probabilities) * sample_weight[:, None]
    return -weighted_log.sum() / numpy.sum(sample_weight)
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    """Weighted sum over groups of ``_cvm_2samp_fast`` against the masked distribution.

    :param y_pred: predictions, one per event
    :param mask: selects the events that form the global distribution
    :param sample_weight: None or per-event weights
    :param groups_indices: groups of event indices
    :return: sum over groups of ``group weight * _cvm_2samp_fast(...)``
    """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)

    # the global distribution is prepared once and reused for every group
    global_data, global_weight, global_F = prepare_distibution(y_pred[mask],
                                                               weights=sample_weight[mask])

    def weighted_cvm(members, weight_of_group):
        return weight_of_group * _cvm_2samp_fast(global_data, y_pred[members],
                                                 global_weight, sample_weight[members],
                                                 global_F)

    return sum((weighted_cvm(members, weight_of_group)
                for members, weight_of_group in zip(groups_indices, group_weights)), 0.)
# endregion
def r2_score(y_true, y_pred, sample_weight=None, multioutput=None):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0 and it can be negative (because the
    model can be arbitrarily worse). A constant model that always
    predicts the expected value of y, disregarding the input features,
    would get a R^2 score of 0.0.

    Read more in the :ref:`User Guide <r2_score>`.

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape = (n_samples), optional
        Sample weights.

    multioutput : string in ['raw_values', 'uniform_average',
    'variance_weighted'] or None or array-like of shape (n_outputs)
        Defines aggregating of multiple output scores.
        Array-like value defines weights used to average scores.
        Default value corresponds to 'variance_weighted', but
        will be changed to 'uniform_average' in next versions.

        'raw_values' :
            Returns a full set of scores in case of multioutput input.

        'uniform_average' :
            Scores of all outputs are averaged with uniform weight.

        'variance_weighted' :
            Scores of all outputs are averaged, weighted by the variances
            of each individual output.

    Returns
    -------
    z : float or ndarray of floats
        The R^2 score or ndarray of scores if 'multioutput' is
        'raw_values'.

    Notes
    -----
    This is not a symmetric function.

    Unlike most other scores, R^2 score may be negative (it need not actually
    be the square of a quantity R).

    References
    ----------
    .. [1] `Wikipedia entry on the Coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

    Examples
    --------
    >>> from sklearn.metrics import r2_score
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.948...
    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
    >>> r2_score(y_true, y_pred, multioutput='variance_weighted')  # doctest: +ELLIPSIS
    0.938...

    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        # column vector so the weights broadcast over the output columns
        weight = sample_weight[:, np.newaxis]
    else:
        weight = 1.

    # per-output residual sum of squares and total sum of squares
    numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
                                                      dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0,
                                                          dtype=np.float64)
    nonzero_denominator = denominator != 0
    nonzero_numerator = numerator != 0
    valid_score = nonzero_denominator & nonzero_numerator
    # start from 1.0 everywhere; outputs with a zero numerator keep it
    output_scores = np.ones([y_true.shape[1]])
    output_scores[valid_score] = 1 - (numerator[valid_score] /
                                      denominator[valid_score])
    # arbitrary set to zero to avoid -inf scores, having a constant
    # y_true is not interesting for scoring a regression anyway
    output_scores[nonzero_numerator & ~nonzero_denominator] = 0.
    if multioutput is None and y_true.shape[1] != 1:
        # @FIXME change in 0.18
        warnings.warn("Default 'multioutput' behavior now corresponds to "
                      "'variance_weighted' value, it will be changed "
                      "to 'uniform_average' in 0.18.",
                      DeprecationWarning)
        multioutput = 'variance_weighted'
    if multioutput == 'raw_values':
        # return scores individually
        return output_scores
    elif multioutput == 'uniform_average':
        # passing None as weights results is uniform mean
        avg_weights = None
    elif multioutput == 'variance_weighted':
        avg_weights = denominator
        # avoid fail on constant y or one-element arrays
        if not np.any(nonzero_denominator):
            if not np.any(nonzero_numerator):
                return 1.0
            else:
                return 0.0
    else:
        # anything else (including None for a single output) is used
        # directly as the averaging weights
        avg_weights = multioutput

    return np.average(output_scores, weights=avg_weights)
def fit(self, X, y, sample_weight=None):
    """Fit boosted regression trees on periodically regenerated training data.

    Trees are trained on ``iter_X``, the data produced by ``Shuffler``,
    while predictions and learning-rate updates use the original ``X``.

    :param X: training samples
    :param y: target values, flattened with ``column_or_1d``
    :param sample_weight: None or per-sample weights (a copy is used)
    :return: self

    After fitting, ``self.classifiers`` holds the trees,
    ``self.learning_rates`` the rate used for each tree and
    ``self.loss_values`` the loss recorded at the start of each iteration.
    """
    # NOTE(review): the shuffler receives self.random_state before it is
    # normalised by check_random_state below - confirm this is intended.
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    # number of samples selected for each tree
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    # fit a shallow copy of the loss rather than the object passed in
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        if iteration % self.recount_step == 0:
            # regenerate the training data only while the previous
            # smearing was positive
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                # recompute predictions from scratch with all trees so far
                y_pred = numpy.zeros(len(y))
                y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))

        self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
        # the weights are applied either in the loss or in the tree,
        # the other one gets unit weights
        loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
        tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
        residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :],
                 residual[sample_mask],
                 sample_weight=tree_weight[sample_mask], check_input=False)
        # update tree leaves
        if self.update_tree:
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                              sample_mask=update_mask, sample_weight=sample_weight)
        iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
        # predictions are updated on the original X, not on iter_X
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
def __init__(self, data, sample_weight=None):
    """Store reordered data together with its normalised cumulative weights.

    :param data: values, flattened with ``column_or_1d``
    :param sample_weight: None or non-negative per-value weights
    :raises AssertionError: if any weight is negative

    ``self.data`` receives the first output of ``reorder_by_first`` and
    ``self.predictions`` the cumulative sum of the reordered weights
    divided by their total.
    """
    weights = check_sample_weight(data, sample_weight=sample_weight)
    values = column_or_1d(data)
    assert numpy.all(weights >= 0.), 'sample weight must be non-negative'

    ordered_values, ordered_weights = reorder_by_first(values, weights)
    self.data = ordered_values

    cumulative = numpy.cumsum(ordered_weights)
    self.predictions = cumulative / numpy.sum(ordered_weights)