Example #1
def evaluate_print(clf_name, y, y_pred):
    """Utility function for evaluating and printing the results for examples.
    Default metrics include ROC, Precision @ n, and Average Precision.

    Parameters
    ----------
    clf_name : str
        The name of the detector.

    y : list or numpy array of shape (n_samples,)
        The ground truth. Binary (0: inliers, 1: outliers).

    y_pred : list or numpy array of shape (n_samples,)
        The raw outlier scores as returned by a fitted model.

    """

    y = column_or_1d(y)
    y_pred = column_or_1d(y_pred)
    check_consistent_length(y, y_pred)

    print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, ap:{ap}'.format(
        clf_name=clf_name,
        roc=np.round(roc_auc_score(y, y_pred), decimals=4),
        prn=np.round(precision_n_scores(y, y_pred), decimals=4),
        ap=np.round(average_precision_score(y, y_pred), decimals=4)))
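All of the snippets on this page funnel their labels through sklearn's column_or_1d. A minimal sketch of that validation step on its own, using only numpy and scikit-learn:

import numpy as np
from sklearn.utils.validation import column_or_1d, check_consistent_length

y = np.array([[0], [1], [1], [0]])          # column vector, shape (4, 1)
y = column_or_1d(y, warn=True)              # ravelled to shape (4,), emits a DataConversionWarning
scores = np.array([0.1, 0.9, 0.8, 0.2])
check_consistent_length(y, scores)          # passes silently because the lengths match
print(y.shape)                              # (4,)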
Example #2
File: cm.py Project: chnlyi/i2b2
def _check_targets(y_true, y_pred):

    check_consistent_length(y_true, y_pred)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = {type_true, type_pred}
    if y_type == {"binary", "multiclass"}:
        y_type = {"multiclass"}

    if len(y_type) > 1:
        raise ValueError("Classification metrics can't handle a mix of {0} "
                         "and {1} targets".format(type_true, type_pred))

    # y_type now contains exactly one value, so the set is no longer needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if (y_type not in ["binary", "multiclass", "multilabel-indicator"]):
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)
        if y_type == "binary":
            unique_values = np.union1d(y_true, y_pred)
            if len(unique_values) > 2:
                y_type = "multiclass"

    if y_type.startswith('multilabel'):
        y_true = csr_matrix(y_true)
        y_pred = csr_matrix(y_pred)
        y_type = 'multilabel-indicator'

    return y_type, y_true, y_pred
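A quick usage sketch for the helper above, assuming it is in scope together with its imports (numpy, scipy.sparse.csr_matrix, and sklearn's type_of_target, check_consistent_length, column_or_1d):

y_type, y_true, y_pred = _check_targets([0, 1, 1, 0], [0, 1, 0, 0])
print(y_type)           # 'binary'
print(y_true, y_pred)   # both returned as 1-d numpy arrays

# mixing incompatible target types raises, e.g.:
# _check_targets([0, 1, 1, 0], [[0, 1], [1, 0], [0, 1], [1, 0]])  -> ValueError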
Example #3
    def fit(self, X, y):
        if self.penalty not in ["l1", "none"]:
            raise ValueError(
                f"penalty should be either 'l1' or 'none', got {self.penalty}")

        self.sensitive_col_idx_ = self.sensitive_cols
        if isinstance(X, pd.DataFrame):
            self.sensitive_col_idx_ = [
                i for i, name in enumerate(X.columns)
                if name in self.sensitive_cols
            ]
        X, y = check_X_y(X, y, accept_large_sparse=False)

        sensitive = X[:, self.sensitive_col_idx_]
        if not self.train_sensitive_cols:
            X = np.delete(X, self.sensitive_col_idx_, axis=1)
        X = self._add_intercept(X)

        column_or_1d(y)
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)
        self.classes_ = label_encoder.classes_

        if len(self.classes_) > 2:
            raise ValueError(
                f"This solver needs samples of exactly 2 classes"
                f" in the data, but the data contains {len(self.classes_)}: {self.classes_}"
            )

        self._solve(sensitive, X, y)
        return self
Example #4
def check_metrics_arguments(y_true, y_pred, sample_weight, two_class=True, binary_pred=True):
    """
    Checks the arguments passed to metrics
    :param y_true: labels of classes
    :param y_pred: predictions
    :param sample_weight: weights of samples
    :param two_class: if True, will check that y_true contains only zeros and ones
    :param binary_pred: if True, will check that y_pred contains only zeros and ones
    :return: the same arguments as tuple
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    assert len(y_true) == len(y_pred), "The lengths of y_true and y_pred are different: %i and %i" % (
        len(y_true),
        len(y_pred),
    )
    if two_class:
        assert numpy.in1d(
            y_true, [0, 1]
        ).all(), "The y_true array should contain only two labels: 0 and 1, " "it contains:" + str(numpy.unique(y_true))
    if binary_pred:
        assert numpy.in1d(
            y_pred, [0, 1]
        ).all(), "The y_pred array should contain only two labels: 0 and 1, " "it contains:" + str(numpy.unique(y_pred))
    return y_true, y_pred, sample_weight
Example #5
def check_inputs(X, y, sample_weight=None, ensure_2d=True):
    """Input validation for debiasing algorithms.

    Checks all inputs for consistent length, validates shapes (optional for X),
    and returns an array of all ones if sample_weight is ``None``.

    Args:
        X (array-like): Input data.
        y (array-like, shape = (n_samples,)): Target values.
        sample_weight (array-like, optional): Sample weights.
        ensure_2d (bool, optional): Whether to raise a ValueError if X is not
            2D.

    Returns:
        tuple:

            * **X** (`array-like`) -- Validated X. Unchanged.

            * **y** (`array-like`) -- Validated y. Possibly converted to 1D if
              not a :class:`pandas.Series`.
            * **sample_weight** (`array-like`) -- Validated sample_weight. If no
              sample_weight is provided, returns a consistent-length array of
              ones.
    """
    if ensure_2d and X.ndim != 2:
        raise ValueError("Expected X to be 2D, got ndim == {} instead.".format(
                X.ndim))
    if not isinstance(y, pd.Series):  # don't cast Series -> ndarray
        y = column_or_1d(y)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    else:
        sample_weight = np.ones(X.shape[0])
    check_consistent_length(X, y, sample_weight)
    return X, y, sample_weight
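A short usage sketch, assuming the check_inputs helper above is importable along with numpy as np and pandas as pd:

import numpy as np
import pandas as pd

X = np.random.rand(5, 3)
y = pd.Series([0, 1, 0, 1, 1])              # a Series is passed through uncast
X, y, sample_weight = check_inputs(X, y)
print(sample_weight)                        # [1. 1. 1. 1. 1.] -- defaulted to ones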
Example #6
 def fit(self, x, y):
     assert(type_of_target(y) == "binary")
     x = column_or_1d(x)
     y = column_or_1d(y)
     self.fit_df(x, y)
     self.fit_beta()
     return self
Example #7
def check_metrics_arguments(y_true,
                            y_pred,
                            sample_weight,
                            two_class=True,
                            binary_pred=True):
    """
    Checks the arguments passed to metrics
    :param y_true: labels of classes
    :param y_pred: predictions
    :param sample_weight: weights of samples
    :param two_class: if True, will check that y_true contains only zeros and ones
    :param binary_pred: if True, will check that y_pred contains only zeros and ones
    :return: the same arguments as tuple
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    assert len(y_true) == len(y_pred), \
        'The lengths of y_true and y_pred are different: %i and %i' % (len(y_true), len(y_pred))
    if two_class:
        assert numpy.in1d(y_true, [0, 1]).all(), 'The y_true array should contain only two labels: 0 and 1, ' \
                                                 'it contains:' + str(numpy.unique(y_true))
    if binary_pred:
        assert numpy.in1d(y_pred, [0, 1]).all(), 'The y_pred array should contain only two labels: 0 and 1, ' \
                                                 'it contains:' + str(numpy.unique(y_pred))
    return y_true, y_pred, sample_weight
Example #8
    def fit(self, X: XSeries, y: XSeries) -> None:
        """[summary].

        Args:
            X : [description].
            y (optional): [description]. Defaults to None.
        """
        # TODO(smly): warn to use fit_transform instead of fit().
        # transform() is recommended for encoding test set.
        if cudf_is_available() and isinstance(X, cudf.Series):
            pass
        elif isinstance(X, np.ndarray):
            X = column_or_1d(X, warn=True)
            y = column_or_1d(y, warn=True)
        else:
            raise RuntimeError

        # y = column_or_1d(y, warn=True)
        self.mean_encoders_ = []

        # Fit and append mean_encoders
        for trn_idx, tst_idx in self.fold.split(X):
            X_trn, _ = X[trn_idx], X[tst_idx]
            y_trn, _ = y[trn_idx], y[tst_idx]
            if cudf_is_available() and isinstance(X, cudf.Series):
                encoder = _CuPy_MeanEncoder()
                encoder.fit(X_trn, y_trn)
                self.mean_encoders_.append(encoder)
            elif isinstance(X, np.ndarray):
                encoder = _MeanEncoder()
                encoder.fit(X_trn, y_trn)
                self.mean_encoders_.append(encoder)
            else:
                raise RuntimeError
Example #9
def regression_mean_width_score(y_pred_low: ArrayLike,
                                y_pred_up: ArrayLike) -> float:
    """
    Effective mean width score obtained by the prediction intervals.

    Parameters
    ----------
    y_pred_low : ArrayLike of shape (n_samples,)
        Lower bound of prediction intervals.
    y_pred_up : ArrayLike of shape (n_samples,)
        Upper bound of prediction intervals.

    Returns
    -------
    float
        Effective mean width of the prediction intervals.

    Examples
    --------
    >>> from mapie.metrics import regression_mean_width_score
    >>> import numpy as np
    >>> y_pred_low = np.array([4, 6, 9, 8.5, 10.5])
    >>> y_pred_up = np.array([6, 9, 10, 12.5, 12])
    >>> print(regression_mean_width_score(y_pred_low, y_pred_up))
    2.3
    """
    y_pred_low = cast(NDArray, column_or_1d(y_pred_low))
    y_pred_up = cast(NDArray, column_or_1d(y_pred_up))
    mean_width = np.abs(y_pred_up - y_pred_low).mean()
    return float(mean_width)
Example #10
    def _check_params(self, n_features):
        if not 0 < self.l1_ratio <= 1:
            raise ValueError("l1_ratio must be in interval ]0;1], but was %f" % self.l1_ratio)

        if self.tol <= 0:
            raise ValueError("tolerance must be positive, but was %f" % self.tol)

        if self.penalty_factor is None:
            penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
        else:
            pf = column_or_1d(self.penalty_factor, warn=True)
            if pf.shape[0] != n_features:
                raise ValueError("penalty_factor must be array of length n_features (%d), "
                                 "but got %d" % (n_features, pf.shape[0]))
            assert_all_finite(pf)
            check_non_negative(pf, "penalty_factor")
            penalty_factor = pf * n_features / pf.sum()
            assert_all_finite(penalty_factor)

        create_path = self.alphas is None
        if create_path:
            if self.n_alphas <= 0:
                raise ValueError("n_alphas must be a positive integer")

            alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
        else:
            alphas = column_or_1d(self.alphas, warn=True)
            assert_all_finite(alphas)
            check_non_negative(alphas, "alphas")
            assert_all_finite(alphas)

        if self.max_iter <= 0:
            raise ValueError("max_iter must be a positive integer")

        return create_path, alphas.astype(numpy.float64), penalty_factor.astype(numpy.float64)
Example #11
    def fit(self, X, y):
        """Fit a factorization machine regressor

        Internally, X and y are converted to a Tensorflow Dataset with types (float32, float32)

        :param X: {array-like} of shape (n_samples, n_features)
            Training data.
        :param y: array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.
        :return: an instance of self.
        """
        X, y = utils.check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
        column_or_1d(y)

        train_dataset = to_tf_dataset(
            X,
            y,
            batch_size=self.batch_size,
        )
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.eta)
        self.w0_, self.W_, self.V_ = train(
            train_dataset,
            num_factors=self.n_factors,
            max_iter=self.max_iter,
            optimizer=self.optimizer,
            loss=self.loss,
            C=self.C,
            penalty=self.penalty_function,
            random_state=self.random_state,
        )
        return self
Example #12
def rf_test(X_train_raw, y_train_raw, X_test_raw, y_test_raw):
    # select features with RF
    y = column_or_1d(y_train_raw, warn=False)
    clf_rf = RandomForestClassifier()
    # find most fitted features
    sel = SelectFromModel(clf_rf)
    sel.fit(X_train_raw, y)

    # transform data
    X_transform = sel.transform(X_train_raw)
    X_test_transform = sel.transform(X_test_raw)
    y_test_1d = column_or_1d(y_test_raw, warn=False)

    # model
    best_params = {'n_estimators': 100, 'max_depth': 32, 'max_features': 2}
    clf_rf_opt = RandomForestClassifier(**best_params)
    clf_rf_opt.fit(X_transform, y)

    # test
    y_pred_test = clf_rf_opt.predict(X_test_transform)
    # calculate accuracy
    acc_test = np.trace(confusion_matrix(y_test_1d,
                                         y_pred_test)) / len(y_pred_test)

    return acc_test
Example #13
def check_inputs(X,
                 y,
                 sample_weight,
                 allow_none_weights=True,
                 allow_multiple_targets=False):
    if allow_multiple_targets:
        y = numpy.array(y)
    else:
        y = column_or_1d(y)
    if allow_none_weights and sample_weight is None:
        # checking only X, y
        if len(X) != len(y):
            raise ValueError('Different size of X: {} and y: {}'.format(
                X.shape, y.shape))
        return X, y, None

    if sample_weight is None:
        sample_weight = numpy.ones(len(y), dtype=float)

    sample_weight = column_or_1d(sample_weight)
    assert sum(numpy.isnan(sample_weight)
               ) == 0, "Weight contains nan, this format isn't supported"
    if not (len(X) == len(y) == len(sample_weight)):
        message = 'Different sizes of X: {}, y: {} and sample_weight: {}'
        raise ValueError(message.format(X.shape, y.shape, sample_weight.shape))

    return X, y, sample_weight
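A usage sketch for the variant above, assuming the helper and its numpy / sklearn imports are in scope:

import numpy as np

X = np.random.rand(4, 2)
y = [[0], [1], [1], [0]]                    # column-shaped labels get ravelled
X, y, w = check_inputs(X, y, sample_weight=None, allow_none_weights=False)
print(y.shape, w)                           # (4,) [1. 1. 1. 1.]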
Example #14
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """[summary].

        Args:
            X : [description].
            y : [description].
        """
        X = column_or_1d(X, warn=True)
        y = column_or_1d(y, warn=True)

        # Label encoding if necessary
        if not np.can_cast(X.dtype, np.int64):
            X, uniques = pd.Series(X).factorize()
            self._label_encoding_uniques = uniques

        self.classes_, counts = np.unique(X, return_counts=True)
        self.class_means_ = np.zeros_like(self.classes_, dtype="float64")

        for idx, uniq_value in enumerate(self.classes_):
            mean_value = np.mean(y[X == uniq_value])
            self.class_means_[idx] = mean_value

        self.classes_ = np.append(self.classes_, [np.max(self.classes_) + 1])
        self.class_means_ = np.append(self.class_means_,
                                      [self.default_unseen_])

        self.lut_ = np.hstack(
            [self.classes_.reshape(-1, 1),
             self.class_means_.reshape(-1, 1)])
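The core of the fit above is a per-category target mean. A small illustration of that computation with plain numpy and hypothetical data, not the class API:

import numpy as np

X = np.array([0, 0, 1, 1, 1])                       # categorical codes
y = np.array([1.0, 0.0, 1.0, 1.0, 0.0])             # targets
classes = np.unique(X)
class_means = np.array([y[X == c].mean() for c in classes])
print(classes, class_means)                          # [0 1] [0.5        0.66666667]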
Example #15
    def fit(self, X, y):
        """
        :param X:
        :param y:
        """
        X, y = utils.check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
        column_or_1d(y)

        self.label_binarizer = LabelBinarizer().fit(y)
        y = self.label_binarizer.transform(y)
        train_dataset = to_tf_dataset(X, y, batch_size=self.batch_size)
        self.classes_ = self.label_binarizer.classes_
        if not self.optimizer:
            self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.eta,
                                                      beta_1=0.9,
                                                      beta_2=0.999,
                                                      epsilon=1e-07)

        self.w0_, self.W_, self.V_ = train(train_dataset,
                                           num_factors=self.num_factors,
                                           max_iter=self.max_iter,
                                           optimizer=self.optimizer,
                                           loss=self.loss,
                                           penalty=self.penalty_function,
                                           random_state=self.random_state)

        return self
Example #16
def _sigmoid_calibration(df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)
    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.
    y : ndarray, shape (n_samples,)
        The targets.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
    Returns
    -------
    a : float
        The slope.
    b : float
        The intercept.
    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    df = column_or_1d(df)
    y = column_or_1d(y)

    F = df  # F follows Platt's notations

    # Bayesian priors (see Platt end of section 2.2)
    prior0 = float(np.sum(y <= 0))
    prior1 = y.shape[0] - prior0
    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[y <= 0] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def objective(AB):
        # From Platt (beginning of Section 2.2)
        P = expit(-(AB[0] * F + AB[1]))
        loss = -(xlogy(T, P) + xlogy(T1, 1. - P))
        if sample_weight is not None:
            return (sample_weight * loss).sum()
        else:
            return loss.sum()

    def grad(AB):
        # gradient of the objective function
        P = expit(-(AB[0] * F + AB[1]))
        TEP_minus_T1P = T - P
        if sample_weight is not None:
            TEP_minus_T1P *= sample_weight
        dA = np.dot(TEP_minus_T1P, F)
        dB = np.sum(TEP_minus_T1P)
        return np.array([dA, dB])

    AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))])
    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)

    return AB_[0], AB_[1]
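A sketch of calling the Platt-scaling helper above on synthetic scores, assuming the function and its imports (numpy, scipy.special.expit / xlogy, scipy.optimize.fmin_bfgs, math.log) are in scope:

import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
y = rng.randint(0, 2, size=200)
scores = 2.0 * y - 1.0 + rng.normal(scale=0.5, size=200)    # noisy decision values
a, b = _sigmoid_calibration(scores, y)
proba = expit(-(a * scores + b))     # Platt's mapping; a is typically negative here,
print(round(a, 2), round(b, 2))      # so higher scores give higher probabilities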
Example #17
def apply_restrictions(X, causes, metadata=None, restrictions=None):
    """Mask restricted causes based on demographics

    Args:
        X (dataframe): samples-by-causes matrix of tariff ranks
        causes (list-like): cause labels corresponding to the columns of X
        metadata (dict): per-sample demographics; expects 'age_' (continuous
            age), 'sex_' (coded as 1=male, 2=female) and 'region_' entries
        restrictions (dict): masking rules; expects 'min_age' and 'max_age'
            (lists of (threshold, list of causes) tuples), 'males_only' and
            'females_only' (lists of causes), and 'regions' (list of
            (region, list of causes) tuples)

    Returns:
        X_valid (np.array): A copy of X with restricted combinations
            set to the worst possible rank
    """
    input_is_df = isinstance(X, pd.DataFrame)
    df_index = X.index if input_is_df else None

    X = check_array(X, copy=True, force_all_finite=False)
    restrictions = restrictions or dict()
    metadata = metadata or dict()

    ages = metadata.get('age_', np.full(X.shape[0], np.nan))
    sexes = metadata.get('sex_', np.zeros(X.shape[0]))
    regions = metadata.get('region_', np.full(X.shape[0], np.nan))

    check_consistent_length(X, ages, sexes, regions)
    ages = column_or_1d(ages)
    sexes = column_or_1d(sexes)
    regions = column_or_1d(regions)

    for thre, labels in restrictions.get('min_age', []):
        X[(ages < thre)[:, None] & np.in1d(causes, labels)] = np.nan

    for thre, labels in restrictions.get('max_age', []):
        X[(ages > thre)[:, None] & np.in1d(causes, labels)] = np.nan

    males_only = restrictions.get('males_only', [])
    X[(sexes == 2)[:, None] & np.in1d(causes, males_only)] = np.nan

    females_only = restrictions.get('females_only', [])
    X[(sexes == 1)[:, None] & np.in1d(causes, females_only)] = np.nan

    for r, labels in restrictions.get('regions', []):
        X[(regions == r)[:, None] & np.in1d(causes, labels)] = np.nan

    if input_is_df:
        X = pd.DataFrame(X, df_index, causes)

    return X
Example #18
    def fit(self, X, y=None, log_base=False):
        """
        Fit the model by estimating time duration distribution between states. 

        Parameters
        ----------
        X : array-like, shape = (n_samples, 2)
            First column corresponds to states, second column to timestamps
        y : array-like, shape (n_samples,)
            Not used; present only for API compatibility.

        Returns
        -------
        self : object
            Returns an instance of self.
        """

        if X.shape[1] != 2:
            raise ValueError("Shape must be exactly (n,2) but is " + X.shape)

        x = X[:, 0]
        timestamp = X[:, 1]

        x = column_or_1d(x)
        timestamp = column_or_1d(timestamp)

        self.labels_ = np.unique(x)
        label_amount = len(self.labels_)
        self.inv_dict_labels_ = {self.labels_[i]: i for i in range(0, label_amount)}
        self.sda_matrix_ = np.empty([label_amount, label_amount], dtype=object)
        self.timeduration_matrix_ = np.empty([label_amount, label_amount], dtype=object)
        for i in range(0, label_amount):
            for j in range(0, label_amount):
                self.timeduration_matrix_[i, j] = list()

        for i in range(1, len(x)):
            time_duration = timestamp[i] - timestamp[i - 1]
            previous_event_index = self.inv_dict_labels_[x[i - 1]]
            current_event_index = self.inv_dict_labels_[x[i]]
            self.timeduration_matrix_[previous_event_index, current_event_index].append(time_duration)

        for i in range(0, label_amount):
            for j in range(0, label_amount):
                if self.timeduration_matrix_[i, j] is not None:
                    if log_base:
                        hist = np.histogram(np.log(self.timeduration_matrix_[i, j]), bins=self.bins)
                    else:
                        hist = np.histogram(self.timeduration_matrix_[i, j], bins=self.bins)

                    hda = HistogramData(hist[1], hist[0])
                    self.sda_matrix_[i, j] = hda

        return self
Example #19
    def evaluate(self, true_label, guess_label, hardCut=False):
        """
        Statistical analysis of model performance.
        Args:
            true_label: ground-truth labels of the test samples
            guess_label: predicted scores for the test samples
        returns:
            (aucv, precision, recall, accuracy, fscore, ks, actual_cut)
        """
        def logging(*params):
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                  ' '.join(['%s' for _ in params]) % params)

        true_label = column_or_1d(true_label)
        guess_label = column_or_1d(guess_label)

        cumulative_1, _, cumu_delta = self.cumulation(true_label, guess_label)
        ks = np.max(cumu_delta)
        softcut = cumulative_1[1][np.argmax(cumu_delta)]

        if isinstance(hardCut, float):
            actual_cut = hardCut
        else:
            hardCut = 0.5
            actual_cut = softcut

        fpr, tpr, _ = roc_curve(true_label, guess_label)
        A = sum(logical_and(guess_label >= actual_cut, true_label == 1))
        B = sum(logical_and(guess_label >= actual_cut, true_label == 0))
        C = sum(logical_and(guess_label < actual_cut, true_label == 1))
        D = sum(logical_and(guess_label < actual_cut, true_label == 0))

        accuracy = 1.0 * (A + D) / (A + B + C + D)
        precision = 1.0 * A / (A + B)
        acc_pos = 1.0 * A / (A + C)
        acc_neg = 1.0 * D / (B + D)
        recall = acc_pos
        gmean = sqrt(acc_pos * acc_neg)
        fscore = 2.0 * precision * recall / (precision + recall)
        aucv = auc(fpr, tpr)
        logging(u'Actual class-1 count: %d, predicted class-1 count: %d' %
                (sum(true_label == 1), sum(guess_label >= actual_cut)))
        logging(u'Actual class-0 count: %d, predicted class-0 count: %d' %
                (sum(true_label == 0), sum(guess_label < actual_cut)))
        logging(u'A=%d, B=%d, C=%d, D=%d' % (A, B, C, D))
        logging(u'Precision=%.4f, Recall=%.4f, Accuracy=%.4f' %
                (precision, recall, accuracy))
        logging(u'AUC:%.4f, G-mean=%.4f, F-score=%.4f' % (aucv, gmean, fscore))
        logging('KS=%.4f,' % ks, 'Softcut=%.4f,' % softcut,
                'HardCut=%.4f' % hardCut)

        return (aucv, precision, recall, accuracy, fscore, ks, actual_cut)
Example #20
def final_ds(train_X, test_X, train_Y, test_Y):
    print('Converting datasets to correct shape')
    x = np.array(train_X)
    y = np.array(train_Y)
    y = column_or_1d(y, warn=True)
    x_t = np.array(test_X)
    y_t = np.array(test_Y)
    y_t = column_or_1d(y_t, warn=True)
    print('Shape of training dataset', x.shape)
    print('Shape of label tensor:', y.shape)
    print('Shape of test dataset', x_t.shape)
    print('Shape of test label tensor:', y_t.shape)

    return x, y, x_t, y_t
Example #21
    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """[summary].

        Args:
            X : [description].
            y : [description].
        Returns:
            Any : [description].
        """
        X = column_or_1d(X, warn=True)
        y = column_or_1d(y, warn=True)

        self.fit(X, y)
        return self.transform(X)
Example #22
    def fit(self,
            data: List[Patient],
            sklearn_clf: ClassifierMixin = RandomForestClassifier(),
            **fit_params):
        """
        Generates and executes the preprocessing and training pipeline.
        For each fhir attribute its respective preprocessor will be used

        Args:
            data (list):    A list of fhir objects (e.g. Patient)
            sklearn_clf (BaseEstimator): Instance of a sklearn classifier

        Returns:
            (list, list, object): A tuple of complete data matrix, labels and trained clf
        """

        # Get list of patients and their fhir attrs represented as list
        logging.info("Extracting attributes from data set")
        data_matrix = self._get_data_matrix(data)

        # Generate feature and label preprocessing pipeline
        pipeline = self._generate_pipeline()
        ct = ColumnTransformer(pipeline)

        logging.info("Preprocessing data")
        # Caution: The pipeline returns preprocessed features AND label
        complete_data_matrix = ct.fit_transform(data_matrix)
        X = complete_data_matrix[:, :len(self.feature_attrs)]
        y = complete_data_matrix[:, len(self.feature_attrs):]
        y = column_or_1d(y)

        # Check y suitability for classification
        if type_of_target(y) in [
                "continuous", "continuous-multioutput", "unknown"
        ]:
            logging.warning(
                "The target label is not suitable for classification (type: {})"
                .format(type_of_target(y)))

        logging.info("Started training of clf")
        self.clf = sklearn_clf
        self.clf.fit(X, column_or_1d(y))
        logging.info("Training completed")

        self.train_eval = self.evaluate(X, y)
        logging.info("Accuracy : {}, F1-score : {}".format(
            self.train_eval['accuracy'], self.train_eval['f1_score']))
        return X, y, self.clf
Example #23
def compute_msee_on_bins(y_pred, mask, bin_indices, target_efficiencies, power=2., sample_weight=None):
    """ An efficient function to compute MSE, the splitting into bins should be given in bin_indices """
    assert len(y_pred) == len(bin_indices) == len(mask), "different size of arrays"
    # needed in case if in some bins there are no signal events
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    n_bins = numpy.max(bin_indices[mask]) + 1
    target_efficiencies = numpy.array(target_efficiencies)

    signal_proba = y_pred[mask]
    signal_answers = numpy.ones(len(signal_proba), dtype=int)
    signal_bins = bin_indices[mask]
    signal_weights = sample_weight[mask]

    bin_total = numpy.bincount(signal_bins, weights=signal_weights, minlength=n_bins) + 1e-6
    cuts = compute_cut_for_efficiency(target_efficiencies, signal_answers, y_pred=signal_proba,
                                      sample_weight=signal_weights)
    result = 0.
    for cut, efficiency in zip(cuts, target_efficiencies):
        passed_cut = signal_proba > cut
        mean_efficiency = numpy.average(passed_cut, weights=signal_weights)
        bin_passed_cut = numpy.bincount(signal_bins[passed_cut], weights=signal_weights[passed_cut], minlength=n_bins)
        bin_efficiency = bin_passed_cut / bin_total
        result += numpy.sum(bin_total * numpy.abs(bin_efficiency - mean_efficiency) ** power)
    # TODO probably we should norm on the weights
    # Minkowski distance trick with powers
    return 10 * (result / len(target_efficiencies) / numpy.sum(mask)) ** (1. / power)
Example #24
def compute_msee_on_groups(y_pred, mask, groups, target_efficiencies, sample_weight=None, power=2.):
    """ An efficient function to compute MSE, the splitting into groups should be given
     in the format of list, each item is a list of indices inside bin"""
    assert len(y_pred) == len(mask), "different size"
    sample_weight = check_sample_weight(y_pred, sample_weight)
    y_pred = column_or_1d(y_pred)

    cuts = compute_cut_for_efficiency(target_efficiencies, mask, y_pred=y_pred, sample_weight=sample_weight)

    efficiencies = [list() for eff in target_efficiencies]
    groups_sizes = numpy.array([len(x) for x in groups])
    groups_weights = numpy.array([numpy.sum(numpy.take(sample_weight, g)) for g in groups])
    signal_weight = sample_weight[mask]

    for group_indices in groups:
        if len(group_indices) == 0:
            continue
        assert numpy.all(mask[group_indices]), "The provided groups contain bg events"
        group_predictions = numpy.take(y_pred, group_indices)
        group_weights = numpy.take(sample_weight, group_indices)

        for i, (eff, cut) in enumerate(zip(efficiencies, cuts)):
            efficiencies[i].append(numpy.average(group_predictions > cut, weights=group_weights))

    result = 0.
    for cut, efficiencies_at_cut in zip(cuts, efficiencies):
        mean_efficiency = numpy.average(y_pred[mask] > cut, weights=signal_weight)
        result += numpy.sum(groups_weights * numpy.abs(efficiencies_at_cut - mean_efficiency) ** power)

    # Minkowski distance trick with powers
    return 10 * (result / len(target_efficiencies) / numpy.sum(groups_sizes)) ** (1. / power)
Example #25
    def transform(self, X, y=None):
        """Cuts `X` so it is aligned with `y`.

        Parameters
        ----------
        X : ndarray, shape (n_samples,) or (n_samples, 1)
            Time series to build a target for.

        y : None
            There is no need for a target, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray, shape (n_samples_new, 1)
            The cut input time series.

        """
        # Check is fit had been called
        check_is_fitted(self)
        X = column_or_1d(X)

        Xt = X[:-self.n_steps_future]

        if self.n_steps_future < self.width:
            Xt = Xt[self.width - self.n_steps_future:]
        return Xt.reshape(-1, 1)
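The slicing in the transform above is easiest to see on a toy series. A small illustration with hypothetical width=3 and n_steps_future=1 (these attributes would normally be set during fit):

import numpy as np

X = np.arange(10)
width, n_steps_future = 3, 1
Xt = X[:-n_steps_future]                     # drop the last n_steps_future points
Xt = Xt[width - n_steps_future:]             # align with the first full window
print(Xt.reshape(-1, 1).ravel())             # [2 3 4 5 6 7 8]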
Example #26
    def fit(self, X, y, sample_weight=None):
        label = self.uniform_label
        self.uniform_label = numpy.array([label]) if isinstance(label, numbers.Number) else numpy.array(label)

        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        assert numpy.all(numpy.in1d(y, [0, 1])), 'only two-class classification is supported by now'
        y = column_or_1d(y)
        y_signed = 2 * y - 1

        X = pandas.DataFrame(X)
        knn_indices = computeKnnIndicesOfSameClass(self.uniform_variables, X, y, self.n_neighbours)

        # for events whose label is not the uniform label, we repeat their own index several times
        for label in [0, 1]:
            if label not in self.uniform_label:
                knn_indices[y == label, :] = numpy.arange(len(y))[y == label][:, numpy.newaxis]

        X = self.get_train_vars(X)
        cumulative_score = numpy.zeros(len(X))
        self.estimators = []

        for stage in range(self.n_estimators):
            classifier = sklearn.clone(self.base_estimator)
            classifier.fit(X, y, sample_weight=sample_weight)
            score = self.learning_rate * self.compute_score(classifier, X=X)
            cumulative_score += score
            sample_weight *= numpy.exp(- y_signed * numpy.take(score, knn_indices).mean(axis=1))
            sample_weight = self.normalize_weights(y=y, sample_weight=sample_weight)
            self.estimators.append(classifier)
Example #27
    def fit(self, X, y):
        """Fit the model to the data X and target y.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : numpy array of shape (n_samples)
        Returns
        -------
        self
        """
        y = column_or_1d(y, warn=True)

        # needs a better way to check multi-label instances
        if isinstance(np.reshape(y, (-1, 1))[0][0], list):
            self.multi_label = True
        else:
            self.multi_label = False

        self.classes_ = np.unique(y)
        self._lbin = LabelBinarizer()
        y = self._lbin.fit_transform(y)

        super(MultilayerPerceptronClassifier, self).fit(X, y)

        return self
Example #28
def gbdt_test(X_train_raw, y_train_raw, X_test_raw, y_test_raw):
    # select features with GBDT
    y = column_or_1d(y_train_raw, warn=False)
    clf_gbdt = GradientBoostingClassifier()
    sel = SelectFromModel(clf_gbdt)
    sel.fit(X_train_raw, y)

    # transform data
    X_transform = sel.transform(X_train_raw)
    X_test_transform = sel.transform(X_test_raw)

    # optimum params
    params = {
        'n_estimators': 100,
        'max_leaf_nodes': 4,
        'max_depth': None,
        'random_state': 2,
        'min_samples_split': 5
    }
    clf_gbdt = GradientBoostingClassifier(**params)
    # fit
    clf_gbdt.fit(X_transform, y)

    y_pred_test = clf_gbdt.predict(X_test_transform)
    # calculate accuracy
    acc_test = np.trace(confusion_matrix(y_test_raw,
                                         y_pred_test)) / len(y_pred_test)

    return acc_test
Example #29
    def resample(self, y, X=None):
        """Resample `y` so that, for any i > 0, the minus i-th entry of the
        resampled vector corresponds in time to the last coordinate of the
        minus i-th embedding vector produced by :meth:`transform`.

        Parameters
        ----------
        y : ndarray, shape (n_samples,)
            Target.

        X : None
            There is no need for input data, yet the pipeline API requires
            this parameter.

        Returns
        -------
        yr : ndarray, shape (n_samples_new,)
            The resampled target. ``n_samples_new = (n_samples - time_delay *
            (dimension - 1) - 1) // stride + 1``.

        """
        # Check if fit had been called
        check_is_fitted(self)
        yr = column_or_1d(y)

        yr = np.flip(yr)
        final_index = -self.time_delay_ * (self.dimension_ - 1)
        yr = np.flip(yr[:final_index:self.stride])
        return yr
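An illustration of the resampling arithmetic above with hypothetical time_delay_=2, dimension_=3 and stride=1:

import numpy as np

y = np.arange(10)
time_delay, dimension, stride = 2, 3, 1
yr = np.flip(y)
final_index = -time_delay * (dimension - 1)          # -4
yr = np.flip(yr[:final_index:stride])
print(yr)   # [4 5 6 7 8 9]; n_samples_new = (10 - 2*(3-1) - 1)//1 + 1 = 6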
Example #30
 def _initial_data_check(X, y, sample_weight):
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     assert len(X) == len(y), 'Different lengths of X and y'
     X = pandas.DataFrame(X)
     y = numpy.array(column_or_1d(y), dtype=int)
     assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
     return X, y, sample_weight
Example #31
    def fit(self, X, y=None, features_to_keep=None, select_percent=0.05):
        '''
        TODO: Does this "select_percent" include those pre-set to keep?
        features_to_keep includes both features wanted to keep + non-numeric features

        Args:
            X:
            y:
            features_to_keep:
            select_percent:

        Returns:

        '''
        if not features_to_keep:
            features_to_keep = []

        X_numeric = X[X.columns[X.columns.isin(
            self.features_by_type['numeric_features'])]]

        self.fs.set_input_matrix(X_numeric.values, column_or_1d(y.values))

        num_features_to_select = int(
            round(select_percent * len(X_numeric.columns.values)))
        self.fs.select(k=num_features_to_select)

        feature_ranks = self.fs.compute_ranks()

        for i in range(len(feature_ranks)):
            if feature_ranks[i] <= num_features_to_select:
                # If in features_to_keep, pretend it wasn't eliminated.
                features_to_keep.append(X_numeric.columns[i])

        self.selected_features = features_to_keep[:]
        return self
Example #32
def check_endog(y, dtype=DTYPE, copy=True, force_all_finite=False):
    """Wrapper for ``check_array`` and ``column_or_1d`` from sklearn

    Parameters
    ----------
    y : array-like, shape=(n_samples,)
        The 1d endogenous array.

    dtype : string, type or None (default=np.float64)
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    copy : bool, optional (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : bool, optional (default=False)
        Whether to raise an error on np.inf and np.nan in an array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.

    Returns
    -------
    y : np.ndarray, shape=(n_samples,)
        A 1d numpy ndarray
    """
    return column_or_1d(
        check_array(y,
                    ensure_2d=False,
                    force_all_finite=force_all_finite,
                    copy=copy,
                    dtype=dtype))  # type: np.ndarray
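A usage sketch, assuming check_endog above is in scope with sklearn's check_array / column_or_1d and with DTYPE bound to np.float64 (as in pmdarima, where this wrapper lives):

y = check_endog([[1], [2], [3]])    # column input is flattened and cast to float64
print(y, y.dtype)                   # [1. 2. 3.] float64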
Example #33
    def decode(self, encodings, one_hot_axis=None):
        """Decodes the given encodings into their respective keys.

        Parameters
        ----------
        encodings : scalar or np.ndarray
        one_hot_axis : int, optional
            If an int is given, the encodings are expected to be one-hot
            vectors and are first reduced with ``argmax`` along this axis.
            Otherwise, elements are mapped directly to their respective keys.

        Returns
        -------
        scalar or np.ndarray
            Same shape as input encodings, but with elements changed to the
            proper encoding.
        """
        if isinstance(one_hot_axis, int):
            encodings = encodings.argmax(axis=one_hot_axis)
            # TODO check encodings.shape to expected shape

        encodings = validation.column_or_1d(encodings, warn=True)
        # inverse transform of empty array is empty array
        if validation._num_samples(encodings) == 0:
            return np.array([])

        diff = np.setdiff1d(encodings, np.arange(len(self.keys())))
        if len(diff):
            raise ValueError(
                "encodings contains previously unseen labels: %s" % str(diff))
            # TODO hard to handle unknowns in the decoding case, but could do
            # update or default as well, I suppose.

        return np.array(self.encoder)[np.array(encodings)]
Example #34
    def transform(self, X: np.ndarray) -> np.ndarray:
        """[summary].

        Args:
            X : [description].
        Returns:
            Any : [description].
        """
        check_is_fitted(self, "class_means_")
        X = column_or_1d(X, warn=True)

        # Label encoding if necessary
        if self._label_encoding_uniques is not None:
            X = self._label_encoding_uniques.get_indexer(pd.Series(X))

        missing_mask = np.isnan(X)
        encode_mask = np.invert(missing_mask)
        unseen_mask = np.bitwise_xor(np.isin(X, self.classes_, invert=True),
                                     missing_mask)

        X = X.copy()
        X[unseen_mask] = np.max(self.classes_)
        indices = _get_index(self.classes_, X[encode_mask])

        _classes_index_list = np.searchsorted(self.lut_[:, 0], self.classes_)
        encoded_values = np.zeros(X.shape[0], dtype=np.float32)
        encoded_values[encode_mask] = np.take(
            self.lut_[:, 1], np.take(_classes_index_list, indices))

        encoded_values[unseen_mask] = self.default_unseen_
        return encoded_values
Example #35
    def transform(self, y):
        """Transform labels to normalized encoding.

        If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_encoded_label_value`` for unseen values.
        Seen labels are encoded with value between 0 and n_classes-1.  Unseen labels are encoded with
        ``self.fill_encoded_label_value`` with a default value of n_classes.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Label values.

        Returns
        -------
        y_encoded : array-like of shape [n_samples]
                    Encoded label values.
        """
        check_is_fitted(self, "classes_")
        y = column_or_1d(y, warn=True)

        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        if self.fill_unseen_labels:
            _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
            y_encoded = np.searchsorted(self.classes_, y)
            fill_encoded_label_value = self.fill_encoded_label_value or len(
                self.classes_)
            y_encoded[~mask] = fill_encoded_label_value
        else:
            _, y_encoded = _encode(y, uniques=self.classes_, encode=True)

        return y_encoded
Example #36
def ndiffs(x, alpha=0.05, test='kpss', max_d=2, **kwargs):
    """Estimate differencing term.

    Function to estimate the number of differences required to
    make a given time series stationary.

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The array to difference.

    alpha : float, optional (default=0.05)
        Level of the test

    test : str, optional (default='kpss')
        Type of unit root test of stationarity to use in order to
        test the stationarity of the time-series.

    max_d : int, optional (default=2)
        Maximum number of non-seasonal differences allowed. Must
        be a positive integer.
    """
    if max_d <= 0:
        raise ValueError('max_d must be a positive integer')

    # get the test
    testfunc = get_callable(test, VALID_TESTS)(alpha, **kwargs).is_stationary
    x = column_or_1d(
        check_array(x, ensure_2d=False, force_all_finite=True, dtype=DTYPE))

    # base case, if constant return 0
    d = 0
    if is_constant(x):
        return d

    # get initial diff
    pval, dodiff = testfunc(x)

    # if initially NaN, return 0
    if np.isnan(pval):
        return 0  # (d is zero, but this is more explicit to the reader)

    # Begin loop.
    while dodiff and d < max_d:
        d += 1

        # do differencing
        x = diff(x)
        if is_constant(x):
            return d

        # get new result
        pval, dodiff = testfunc(x)

        # if it's NaN now, take the last non-null one
        if np.isnan(pval):
            return d - 1

    # when d >= max_d
    return d
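A sketch of calling ndiffs as shipped in pmdarima (an assumption about where this snippet comes from; the import path may differ in your version):

import numpy as np
from pmdarima.arima import ndiffs

rng = np.random.RandomState(0)
random_walk = np.cumsum(rng.normal(size=200))     # non-stationary series
print(ndiffs(random_walk, test='kpss', max_d=2))  # typically 1: one difference suffices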
Example #37
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ The most simple way to compute Cramer-von Mises flatness, this is however very slow
    if you need to compute it many times
    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    signal_mask = y == label
    groups_indices = computeSignalKnnIndices(uniform_variables=uniform_variables, dataframe=X,
                                             is_signal=signal_mask, n_neighbors=knn)
    groups_indices = groups_indices[signal_mask, :]

    return ut.group_based_cvm(proba[:, label], mask=signal_mask, groups_indices=groups_indices,
                              sample_weight=sample_weight)
Example #38
def compute_ams_on_cuts(answers, predictions, sample_weight):
    """ Prediction is probabilities"""
    assert len(answers) == len(predictions) == len(sample_weight)
    answers = column_or_1d(answers)
    predictions = column_or_1d(predictions)
    sample_weight = column_or_1d(sample_weight)
    order = numpy.argsort(predictions)[::-1]
    reordered_answers = answers[order]
    reordered_weights = sample_weight[order]
    s_cumulative = numpy.cumsum(reordered_answers * reordered_weights)
    b_cumulative = numpy.cumsum((1 - reordered_answers) * reordered_weights)
    b_cumulative *= real_b / b_cumulative[-1]
    s_cumulative *= real_s / s_cumulative[-1]
    br = 10.
    s = s_cumulative
    b = b_cumulative
    radicands = 2 * ((s + b + br) * numpy.log(1.0 + s/(b + br)) - s)
    return predictions[order], radicands
Example #39
def check_inputs(X, y, sample_weight, allow_none_weights=True):
    y = column_or_1d(y)
    if allow_none_weights and sample_weight is None:
        # checking only X, y
        if len(X) != len(y):
            raise ValueError('Different size of X: {} and y: {}'.format(X.shape, y.shape))
        return X, y, None

    if sample_weight is None:
        sample_weight = numpy.ones(len(y), dtype=float)

    sample_weight = column_or_1d(sample_weight)
    assert sum(numpy.isnan(sample_weight)) == 0, "Weight contains nan, this format isn't supported"
    if not (len(X) == len(y) == len(sample_weight)):
        message = 'Different sizes of X: {}, y: {} and sample_weight: {}'
        raise ValueError(message.format(X.shape, y.shape, sample_weight.shape))

    return X, y, sample_weight
Example #40
def compute_efficiencies_on_bins(signal_proba, signal_mask, bin_indices, n_total_bins, cut, sample_weight=None):
    assert len(signal_mask) == len(signal_proba) == len(bin_indices), "different size"
    sample_weight = check_sample_weight(signal_mask, sample_weight=sample_weight)
    bin_total = numpy.bincount(bin_indices[signal_mask], weights=sample_weight[signal_mask], minlength=n_total_bins) + 1e-6
    signal_proba = column_or_1d(signal_proba)

    passed_cut = signal_proba > cut
    bin_passed_cut = numpy.bincount(bin_indices[signal_mask & passed_cut],
                                    weights=sample_weight[signal_mask & passed_cut], minlength=n_total_bins) - 1e-10
    return bin_passed_cut / bin_total
Example #41
    def check_input(X, y, sample_weight, check_two_classes=True):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths'
        X = pandas.DataFrame(X)
        y = column_or_1d(y)

        if check_two_classes:
            assert numpy.all(numpy.in1d(y, [0, 1])), \
                'only two-class classification is supported by now'
        return X, y, sample_weight
Example #42
def compute_group_efficiencies(y_score, groups_matrix, cut, divided_weight=None, smoothing=0.0):
    """ Provided cut, computes efficiencies inside each bin.
    :param divided_weight: weight for each event, divided by the number of its occurrences """
    y_score = column_or_1d(y_score)
    divided_weight = check_sample_weight(y_score, sample_weight=divided_weight)
    # with smoothing=0, this is 0 or 1, latter for passed events.
    passed_cut = sigmoid_function(y_score - cut, width=smoothing)
    passed_weight = groups_matrix.dot(divided_weight * passed_cut)
    total_weight = groups_matrix.dot(divided_weight)
    return passed_weight / numpy.maximum(total_weight, 1e-10)
Example #43
    def fit(self, X, y, sample_weight=None):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
        self.check_params()

        self.estimators = []
        self.scores = []

        n_samples = len(X)
        n_inbag = int(self.subsample * len(X))
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)

        # preparing for fitting in trees
        X = self.get_train_vars(X)
        self.n_features = X.shape[1]
        X, y = check_arrays(X, y)
        X = X.astype(DTYPE)
        y_pred = numpy.zeros(len(X), dtype=float)

        if self.init_estimator is not None:
            y_signed = 2 * y - 1
            self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
            y_pred += numpy.ravel(self.init_estimator.predict(X))

        for stage in range(self.n_estimators):
            # tree creation
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                random_state=self.random_state,
                max_leaf_nodes=self.max_leaf_nodes)

            # tree learning
            residual = self.loss.negative_gradient(y_pred)
            train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

            tree.fit(X[train_indices], residual[train_indices],
                     sample_weight=sample_weight[train_indices], check_input=False)
            # update tree leaves
            if self.update_tree:
                self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred, sample_weight=sample_weight,
                                      update_mask=numpy.ones(len(X), dtype=bool), residual=residual)

            y_pred += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)
            self.scores.append(self.loss(y_pred))
        return self
Example #44
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    groups_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask,
                                      y_pred=y_pred, sample_weight=sample_weight)
    result = 0.
    for cut in cuts:
        groups_efficiencies = compute_group_efficiencies(y_pred, groups_indices, cut, sample_weight=sample_weight)
        result += theil(groups_efficiencies, groups_weights)
    return result / len(cuts)
Example #45
def compute_sde_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight=None, power=2.):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask, y_pred=y_pred, sample_weight=sample_weight)
    sde = 0.
    for cut in cuts:
        group_efficiencies = compute_group_efficiencies(y_pred, groups_indices=groups_indices,
                                                        cut=cut, sample_weight=sample_weight)
        sde += weighted_deviation(group_efficiencies, weights=group_weights, power=power)
    return (sde / len(cuts)) ** (1. / power)
Example #46
def compute_bin_efficiencies(y_score, bin_indices, cut, sample_weight, minlength=None):
    """Efficiency of bin = total weight of (signal) events that passed the cut
    in the bin / total weight of signal events in the bin.
    Returns small negative number for empty bins"""
    y_score = column_or_1d(y_score)
    assert len(y_score) == len(sample_weight) == len(bin_indices), "different size"
    if minlength is None:
        minlength = numpy.max(bin_indices) + 1

    bin_total = numpy.bincount(bin_indices, weights=sample_weight, minlength=minlength)
    passed_cut = y_score > cut
    bin_passed_cut = numpy.bincount(bin_indices[passed_cut], weights=sample_weight[passed_cut], minlength=minlength)
    return bin_passed_cut / numpy.maximum(bin_total, 1)
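A tiny check of the helper above (needs only numpy and column_or_1d in scope):

import numpy

y_score = numpy.array([0.1, 0.9, 0.8, 0.3, 0.7, 0.2])
bin_indices = numpy.array([0, 0, 0, 1, 1, 1])
weights = numpy.ones(6)
print(compute_bin_efficiencies(y_score, bin_indices, cut=0.5, sample_weight=weights))
# [0.66666667 0.33333333] -- 2 of 3 events pass the cut in bin 0, 1 of 3 in bin 1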
Example #47
def check_metrics_arguments(y_true, y_pred, sample_weight, two_class=True, binary_pred=True):
    """
    Checks the arguments passed to metrics
    :param y_true:
    :param y_pred:
    :param sample_weight:
    :param two_class:
    :param binary_pred:
    :return:
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    assert len(y_true) == len(y_pred), \
        'The lengths of y_true and y_pred are different: %i and %i' % (len(y_true), len(y_pred))
    if two_class:
        assert numpy.in1d(y_true, [0, 1]).all(), 'The y_true array should contain only two labels: 0 and 1, ' \
                                                 'it contains:' + str(numpy.unique(y_true))
    if binary_pred:
        assert numpy.in1d(y_pred, [0, 1]).all(), 'The y_pred array should contain only two labels: 0 and 1, ' \
                                                 'it contains:' + str(numpy.unique(y_pred))
    return y_true, y_pred, sample_weight
Example #48
def check_xyw(X, y, sample_weight=None):
    """
    Checks parameters of classifier / loss / metrics
    :param X: array-like of shape [n_samples, n_features] (numpy.array or pandas.DataFrame)
    :param y: array-like of shape [n_samples]
    :param sample_weight: None or array-like of shape [n_samples]
    :return:
    """
    from sklearn.utils.validation import column_or_1d
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Lengths are different'
    if not (isinstance(X, pandas.DataFrame) or (isinstance(X, numpy.ndarray))):
        X = numpy.array(X)
    return X, y, sample_weight
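A usage sketch, assuming check_xyw above plus the check_sample_weight helper it relies on (which, as in the other snippets on this page, defaults missing weights to an array of ones) are importable:

import numpy
import pandas

X = pandas.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
X, y, w = check_xyw(X, [[0], [1], [1]])
print(type(X).__name__, y, w)    # DataFrame [0 1 1] [1. 1. 1.]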
Example #49
def compute_group_efficiencies(y_score, groups_indices, cut, sample_weight=None, smoothing=0.0):
    y_score = column_or_1d(y_score)
    sample_weight = check_sample_weight(y_score, sample_weight=sample_weight)
    # with smoothing=0, this is 0 or 1, the latter for events that passed the cut.
    passed_cut = sigmoid_function(y_score - cut, width=smoothing)

    if isinstance(groups_indices, numpy.ndarray) and numpy.ndim(groups_indices) == 2:
        # this speedup is specially for knn
        result = numpy.average(numpy.take(passed_cut, groups_indices),
                               weights=numpy.take(sample_weight, groups_indices),
                               axis=1)
    else:
        result = numpy.zeros(len(groups_indices))
        for i, group in enumerate(groups_indices):
            result[i] = numpy.average(passed_cut[group], weights=sample_weight[group])
    return result
Example #50
def AMS(answers, predictions, sample_weight):
    """ Predictions are classes """
    assert len(answers) == len(predictions) == len(sample_weight)
    predictions = column_or_1d(predictions)
    total_s = numpy.sum(sample_weight[answers > 0.5])
    total_b = numpy.sum(sample_weight[answers < 0.5])
    s = numpy.sum(sample_weight[answers * predictions > 0.5])
    b = numpy.sum(sample_weight[(1 - answers) * predictions > 0.5])
    s *= real_s / total_s
    b *= real_b / total_b
    br = 10.
    radicand = 2 * ( (s+b+br) * numpy.log(1.0 + s/(b+br)) - s)
    if radicand < 0:
        raise ValueError('Radicand is negative')
    else:
        return numpy.sqrt(radicand)
Example #51
    def partial_fit(self, X, y, classes=None):
        """Fit the model to the data X and target y.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        classes : array, shape (n_classes)
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.
        y : numpy array of shape (n_samples)
            Subset of the target values.
        Returns
        -------
        self
        """
        if self.algorithm != "sgd":
            raise ValueError("only SGD algorithm" " supports partial fit")

        if self.classes_ is None and classes is None:
            raise ValueError("classes must be passed on the first call " "to partial_fit.")
        elif self.classes_ is not None and classes is not None:
            if np.any(self.classes_ != np.unique(classes)):
                raise ValueError("`classes` is not the same as on last call " "to partial_fit.")
        elif classes is not None:
            self.classes_ = classes

        if not hasattr(self, "_lbin"):
            self._lbin = LabelBinarizer()
            self._lbin._classes = classes

        y = column_or_1d(y, warn=True)

        # needs a better way to check multi-label instances
        if isinstance(np.reshape(y, (-1, 1))[0][0], list):
            self.multi_label = True
        else:
            self.multi_label = False

        y = self._lbin.fit_transform(y)
        super(MultilayerPerceptronClassifier, self).partial_fit(X, y)

        return self
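A sketch of how partial_fit is typically driven over mini-batches (the constructor arguments and the batch iterable are placeholders, following the sklearn-style API documented above):

import numpy as np

clf = MultilayerPerceptronClassifier(algorithm='sgd')   # hypothetical constructor arguments
classes = np.unique(y_all)                              # y_all: full target vector, assumed available
for X_batch, y_batch in batches:                        # batches: any iterable of (X, y) mini-batches
    clf.partial_fit(X_batch, y_batch, classes=classes)  # classes may be omitted after the first call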
def compute_group_efficiencies_by_indices(y_score, groups_indices, cut, divided_weight=None, smoothing=0.0):
    """ Provided cut, computes efficiencies inside each bin.
    :param divided_weight: weight for each event, divided by the number of it's occurences """
    y_score = column_or_1d(y_score)
    divided_weight = check_sample_weight(y_score, sample_weight=divided_weight)
    # with smoothing=0, this is 0 or 1, latter for passed events.
    passed_cut = sigmoid_function(y_score - cut, width=smoothing)

    if isinstance(groups_indices, numpy.ndarray) and numpy.ndim(groups_indices) == 2:
        # this speedup is specially for knn
        result = numpy.average(numpy.take(passed_cut, groups_indices),
                               weights=numpy.take(divided_weight, groups_indices),
                               axis=1)
    else:
        result = numpy.zeros(len(groups_indices))
        for i, group in enumerate(groups_indices):
            result[i] = numpy.average(passed_cut[group], weights=divided_weight[group])
    return result
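A small sketch of the expected inputs: groups are given as a list of index arrays (a 2-d ndarray of equal-size groups takes the faster vectorised branch), and with smoothing=0 the result is the weighted fraction of each group passing the cut:

import numpy

y_score = numpy.array([0.1, 0.4, 0.35, 0.8, 0.7])
groups = [numpy.array([0, 1]), numpy.array([2, 3, 4])]   # e.g. knn neighbourhoods
efficiencies = compute_group_efficiencies_by_indices(y_score, groups, cut=0.5)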
def compute_theil_on_bins(y_pred, mask, bin_indices, target_efficiencies, sample_weight):
    """ Computes the Theil index of bin efficiencies, averaged over the cuts for the target efficiencies. """
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    # ignoring events from other classes
    y_pred = y_pred[mask]
    bin_indices = bin_indices[mask]
    sample_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=numpy.ones(len(y_pred), dtype=bool),
                                      y_pred=y_pred, sample_weight=sample_weight)
    result = 0.
    for cut in cuts:
        bin_efficiencies = compute_bin_efficiencies(y_pred, bin_indices=bin_indices,
                                                    cut=cut, sample_weight=sample_weight)
        result += theil(bin_efficiencies, weights=bin_weights)
    return result / len(cuts)
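For reference, theil here is expected to be the weighted Theil inequality index of the bin efficiencies; assuming the usual definition with bin weights w_j summing to one and mean efficiency \bar{e} = \sum_j w_j e_j,

T = \sum_j w_j \, \frac{e_j}{\bar{e}} \ln\frac{e_j}{\bar{e}},

and the function returns this index averaged over the cuts corresponding to the requested target efficiencies.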
Beispiel #54
0
def plot_classes_distribution(X, y, var_names):
    y = column_or_1d(y)
    labels = numpy.unique(y)
    if len(var_names) == 1:
        pylab.figure(figsize=(14, 7))
        pylab.title('Distribution of classes')
        for label in labels:
            pylab.hist(numpy.ravel(X.loc[y == label, var_names]), label='class=%i' % label, histtype='step')
            pylab.xlabel(var_names[0])

    elif len(var_names) == 2:
        pylab.figure(figsize=(12, 10))
        pylab.title('Distribution of classes')
        x_var, y_var = var_names
        for label in labels:
            alpha = numpy.clip(2000. / numpy.sum(y == label), 0.02, 1)
            pylab.plot(X.loc[y == label, x_var], X.loc[y == label, y_var], '.',
                       alpha=alpha, label='class=' + str(label))
    else:
        raise ValueError("More than two variables are not supported")
Beispiel #55
0
    def __init__(self, classifiers_dict, X, y, sample_weight=None, low_memory=None):
        """The main object for different reports and plots,
        computes predictions of different classifiers on the same test data sets
        and makes it possible to compute different metrics,
        plot some quality curves and so on
        """
        assert isinstance(classifiers_dict, OrderedDict)
        if low_memory is not None:
            warnings.warn("Low memory argument is deprecated", DeprecationWarning)

        self.X = X
        self.y = column_or_1d(numpy.array(y, dtype=int))
        self.sample_weight = sample_weight
        assert len(X) == len(y), 'Different lengths'
        self.n_samples = len(y)
        self.checked_sample_weight = check_sample_weight(y, sample_weight=sample_weight)

        self.predictions = OrderedDict([(name, classifier.predict_proba(X))
                                        for name, classifier in classifiers_dict.items()])
        self.staged_predictions = None
        self.classifiers = classifiers_dict
Beispiel #56
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """ This is shorter ans simpler version og log_loss, which supports sample_weight """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = numpy.append(1 - T, T, axis=1)

    # Clipping
    Y = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    T, Y = check_arrays(T, Y)

    # Renormalize
    Y /= Y.sum(axis=1)[:, numpy.newaxis]
    loss = -(T * numpy.log(Y) * sample_weight[:, numpy.newaxis]).sum() / numpy.sum(sample_weight)
    return loss
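A quick sanity check against sklearn's log_loss on unweighted toy data (check_arrays and check_sample_weight are assumed available as in the snippet above):

import numpy
from sklearn.metrics import log_loss

y_true = numpy.array([0, 1, 1, 0])
y_prob = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])
assert numpy.isclose(_log_loss(y_true, y_prob), log_loss(y_true, y_prob))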
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights(groups_indices, sample_weight=sample_weight)

    result = 0.
    global_data, global_weight, global_F = prepare_distibution(y_pred[mask], weights=sample_weight[mask])
    for group, group_weight in zip(groups_indices, group_weights):
        local_distribution = y_pred[group]
        local_weights = sample_weight[group]
        result += group_weight * _cvm_2samp_fast(global_data, local_distribution,
                                                 global_weight, local_weights, global_F)
    return result
Beispiel #58
0
def r2_score(y_true, y_pred,
             sample_weight=None,
             multioutput=None):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0 and it can be negative (because the
    model can be arbitrarily worse). A constant model that always
    predicts the expected value of y, disregarding the input features,
    would get a R^2 score of 0.0.

    Read more in the :ref:`User Guide <r2_score>`.

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape = (n_samples), optional
        Sample weights.

    multioutput : string in ['raw_values', 'uniform_average',
                'variance_weighted'] or None or array-like of shape (n_outputs)
        Defines aggregating of multiple output scores.
        Array-like value defines weights used to average scores.
        Default value corresponds to 'variance_weighted', but
        will be changed to 'uniform_average' in next versions.

        'raw_values' :
            Returns a full set of scores in case of multioutput input.

        'uniform_average' :
            Scores of all outputs are averaged with uniform weight.

        'variance_weighted' :
            Scores of all outputs are averaged, weighted by the variances
            of each individual output.

    Returns
    -------
    z : float or ndarray of floats
        The R^2 score or ndarray of scores if 'multioutput' is
        'raw_values'.

    Notes
    -----
    This is not a symmetric function.

    Unlike most other scores, R^2 score may be negative (it need not actually
    be the square of a quantity R).

    References
    ----------
    .. [1] `Wikipedia entry on the Coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

    Examples
    --------
    >>> from sklearn.metrics import r2_score
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.948...
    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
    >>> r2_score(y_true, y_pred, multioutput='variance_weighted')  # doctest: +ELLIPSIS
    0.938...

    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        weight = sample_weight[:, np.newaxis]
    else:
        weight = 1.

    numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
                                                      dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0,
                                                          dtype=np.float64)
    nonzero_denominator = denominator != 0
    nonzero_numerator = numerator != 0
    valid_score = nonzero_denominator & nonzero_numerator
    output_scores = np.ones([y_true.shape[1]])
    output_scores[valid_score] = 1 - (numerator[valid_score] /
                                      denominator[valid_score])
    # arbitrarily set to zero to avoid -inf scores; having a constant
    # y_true is not interesting for scoring a regression anyway
    output_scores[nonzero_numerator & ~nonzero_denominator] = 0.
    if multioutput is None and y_true.shape[1] != 1:
        # @FIXME change in 0.18
        warnings.warn("Default 'multioutput' behavior now corresponds to "
                      "'variance_weighted' value, it will be changed "
                      "to 'uniform_average' in 0.18.",
                      DeprecationWarning)
        multioutput = 'variance_weighted'
    if multioutput == 'raw_values':
        # return scores individually
        return output_scores
    elif multioutput == 'uniform_average':
        # passing None as weights results in uniform mean
        avg_weights = None
    elif multioutput == 'variance_weighted':
        avg_weights = denominator
        # avoid fail on constant y or one-element arrays
        if not np.any(nonzero_denominator):
            if not np.any(nonzero_numerator):
                return 1.0
            else:
                return 0.0
    else:
        avg_weights = multioutput

    return np.average(output_scores, weights=avg_weights)
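For a single output with sample weights w_i, the score computed above is, in LaTeX,

R^2 = 1 - \frac{\sum_i w_i \,(y_i - \hat{y}_i)^2}{\sum_i w_i \,(y_i - \bar{y}_w)^2},

where \bar{y}_w is the weighted mean of y_true; outputs with a zero denominator are handled separately as in the code.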
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))


            self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
            residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask], check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
                self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                                  sample_mask=update_mask, sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
Beispiel #60
0
    def __init__(self, data, sample_weight=None):
        sample_weight = check_sample_weight(data, sample_weight=sample_weight)
        data = column_or_1d(data)
        assert numpy.all(sample_weight >= 0.), 'sample weight must be non-negative'
        self.data, sample_weight = reorder_by_first(data, sample_weight)
        self.predictions = numpy.cumsum(sample_weight) / numpy.sum(sample_weight)
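A small sketch of what this constructor appears to build: self.data holds the values sorted ascending and self.predictions the weighted empirical CDF evaluated at those points (the class name below is a placeholder, since it is not visible in this excerpt, and unit weights are assumed when sample_weight is None):

import numpy

values = numpy.array([3.0, 1.0, 2.0])
dist = Distribution(values)          # placeholder name for the class defined above
# dist.data        -> array([1., 2., 3.])
# dist.predictions -> array([1/3, 2/3, 1.])  (cumulative weight fractions)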