def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = np.append(1 - T, T, axis=1)

    # Clipping
    Y = np.clip(y_pred, eps, 1 - eps)

    # This happens in cases when elements in y_pred have type "str".
    if not isinstance(Y, np.ndarray):
        raise ValueError("y_pred should be an array of floats.")

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if Y.ndim == 1:
        Y = Y[:, np.newaxis]
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Check if dimensions are consistent.
    val.check_consistent_length(T, Y)
    T = val.check_array(T)
    Y = val.check_array(Y)
    if T.shape[1] != Y.shape[1]:
        raise ValueError("y_true and y_pred have different number of classes "
                         "%d, %d" % (T.shape[1], Y.shape[1]))

    # Renormalize
    Y /= Y.sum(axis=1)[:, np.newaxis]
    loss = -(T * np.log(Y)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize)
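This snippet mirrors scikit-learn's public log_loss metric. A minimal usage sketch against the public API (an illustration, not part of the original example):

import numpy as np
from sklearn.metrics import log_loss

y_true = np.array([0, 1, 1, 0])
y_prob = np.array([0.1, 0.9, 0.8, 0.3])  # predicted probability of the positive class
# Mean negative log-likelihood; about 0.20 for this toy data
print(log_loss(y_true, y_prob))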
Example #2
    def pinball_loss(y_true, y_pred, probs):
        """Compute the pinball loss.

        Parameters
        ----------
        y_true : {array-like}, shape = [n_samples]
            Targets.
        y_pred : {array-like}, shape = [n_quantiles, n_samples] or [n_samples]
            Predictions.
        probs : {array-like}, shape = [n_quantiles]
            Quantile levels.

        Returns
        -------
        l : {array}, shape = [n_quantiles]
            Average loss for each quantile level.
        """
        probs = asarray(probs).reshape(-1)
        check_consistent_length(y_true, y_pred.T)
        y_true = check_array(y_true.reshape((-1, 1)),
                             ensure_2d=True)
        y_pred = check_array(y_pred.T.reshape((y_true.shape[0], -1)),
                             ensure_2d=True)
        residual = y_true - y_pred
        loss = npsum([fmax(prob * res, (prob - 1) * res) for (res, prob) in
                      zip(residual.T, probs)], axis=1)
        return loss / y_true.size
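The helper above depends on asarray/npsum/fmax aliases and scikit-learn validators from its enclosing module. A self-contained NumPy sketch of the same per-quantile pinball loss (a rewrite for illustration; pinball_loss_np is a hypothetical name):

import numpy as np

def pinball_loss_np(y_true, y_pred, probs):
    # y_pred has shape (n_quantiles, n_samples); probs holds the quantile levels
    y_true = np.asarray(y_true, dtype=float).reshape(1, -1)
    y_pred = np.asarray(y_pred, dtype=float)
    probs = np.asarray(probs, dtype=float).reshape(-1, 1)
    residual = y_true - y_pred
    # max(q * r, (q - 1) * r) is the pinball loss; average over samples per quantile
    return np.mean(np.maximum(probs * residual, (probs - 1) * residual), axis=1)

# Two quantile levels, three samples -> array([0.05, 0.05])
print(pinball_loss_np([1.0, 2.0, 3.0], [[0.5, 1.5, 2.5], [1.5, 2.5, 3.5]], [0.1, 0.9]))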
Example #3
def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    check_consistent_length(*a)
    check_consistent_length(*b)
    checks = lambda x: check_array(x, ensure_2d=False)
    a_rows, a_cols = map(checks, a)
    b_rows, b_cols = map(checks, b)
    return a_rows, a_cols, b_rows, b_cols
def test_check_dataframe_fit_attribute():
    # check pandas dataframe with 'fit' column does not raise error
    # https://github.com/scikit-learn/scikit-learn/issues/8415
    try:
        import pandas as pd
        X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        X_df = pd.DataFrame(X, columns=['a', 'b', 'fit'])
        check_consistent_length(X_df)
    except ImportError:
        raise SkipTest("Pandas not found")
Example #5
    def fit(self, X, y, sample_weight=None):
        """
        Build a classifier from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        self._validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True)
        if sp.isspmatrix(X):
            self._is_sparse_train_X = True
        else:
            self._is_sparse_train_X = False
        self._n_samples, self._n_features = X.shape
        sample_weight = self._get_sample_weight(sample_weight)
        check_consistent_length(X, y, sample_weight)
        check_classification_targets(y)
        self._classes = sorted(np.unique(y))
        self._n_classes = len(self._classes)
        self._classes_map = {}

        self._set_params_with_dependencies()
        params = self._get_params()

        if self._n_classes == 2:
            self._classes_map[0] = self._classes[0]
            self._classes_map[1] = self._classes[1]
            self._estimators = [None]
            y = (y == self._classes[0]).astype(int)
            self._fit_binary_task(X, y, sample_weight, params)
        elif self._n_classes > 2:
            if sp.isspmatrix_dok(X):
                X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
            self._estimators = [None] * self._n_classes
            self._fit_multiclass_task(X, y, sample_weight, params)
        else:
            raise ValueError("Classifier can't predict when only one class is present.")

        self._fitted = True

        return self
Example #6
def _indexable(X, y):
    """Make arrays indexable for cross-validation. Checks consistent 
    length, passes through None, and ensures that everything can be indexed.

    Parameters
    ----------

    X : array-like or pandas DataFrame, shape = [n_samples, n_features]
        Input data, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    """
    result = [_validate_X(X), _validate_y(y)]
    check_consistent_length(*result)
    return result
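_indexable delegates the length check to check_consistent_length. A minimal sketch of that behaviour with the public scikit-learn helper:

import numpy as np
from sklearn.utils import check_consistent_length

X = np.zeros((5, 3))
y = np.arange(5)
check_consistent_length(X, y)        # same first dimension: passes silently
try:
    check_consistent_length(X, y[:4])
except ValueError as exc:
    print(exc)                       # reports inconsistent numbers of samples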
Example #7
    def fit(self, X, y, sample_weight=None):
        """
        Build a regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (real numbers in regression).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        self._validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True)
        if sp.isspmatrix(X):
            self._is_sparse_train_X = True
        else:
            self._is_sparse_train_X = False
        self._n_samples, self._n_features = X.shape
        sample_weight = self._get_sample_weight(sample_weight)
        check_consistent_length(X, y, sample_weight)

        self._set_params_with_dependencies()
        params = self._get_params()

        self._estimators = [None]
        self._fit_regression_task(X, y, sample_weight, params)

        self._fitted = True

        return self
Example #8
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    check_consistent_length(y_true, y_score)
    y_true = check_array(y_true)
    y_score = check_array(y_score)
    n_samples, n_labels = y_true.shape
    score = np.empty((n_samples, ))
    for i in range(n_samples):
        # The best rank corresponds to 1. Ranks higher than 1 are worse.
        # The best inverse ranking corresponds to n_labels.
        unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
        n_ranks = unique_rank.size
        rank = n_ranks - inv_rank

        # Ranks need to be corrected to account for ties,
        # e.g. two labels tied at rank 1 both get rank 2.
        corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
        rank = corr_rank[rank]

        relevant = y_true[i].nonzero()[0]
        if relevant.size == 0 or relevant.size == n_labels:
            score[i] = 1
            continue

        score[i] = 0.
        for label in relevant:
            # Count the number of relevant labels with a better
            # (smaller) rank.
            n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)

            # Weight by the rank of the actual label
            score[i] += n_ranked_above / rank[label]

        score[i] /= relevant.size

    return score.mean()
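Since this is described as a simple reference implementation of label ranking average precision, one way to sanity-check it is against scikit-learn's public metric (a sketch, assuming _my_lrap from above is in scope):

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
print(label_ranking_average_precision_score(y_true, y_score))
# _my_lrap(y_true, y_score) is expected to agree with the value above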
Example #9
def test_check_consistent_length():
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
    assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
                         check_consistent_length, [1, 2], [1])
    assert_raises_regexp(TypeError, r"got <\w+ 'int'>",
                         check_consistent_length, [1, 2], 1)
    assert_raises_regexp(TypeError, r"got <\w+ 'object'>",
                         check_consistent_length, [1, 2], object())

    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
                         [1, 2], RandomForestRegressor())
Example #10
def test_check_consistent_length():
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2, ), np.array([3]), sp.csr_matrix((1, 2)))
    assert_raises_regex(ValueError, 'inconsistent numbers of samples',
                        check_consistent_length, [1, 2], [1])
    assert_raises_regex(TypeError, r"got <\w+ 'int'>", check_consistent_length,
                        [1, 2], 1)
    assert_raises_regex(TypeError, r"got <\w+ 'object'>",
                        check_consistent_length, [1, 2], object())

    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regex(TypeError, 'Expected sequence or array-like',
                        check_consistent_length, [1, 2],
                        RandomForestRegressor())
def check_consistent_length(u, i, r):
    skval.check_consistent_length(u, i, r)
    return np.asarray(u), np.asarray(i), np.asarray(r, dtype=DTYPE)
Example #12
def wpearsonr(x, y, w=None):
    """Utility function to calculate the weighted Pearson correlation of two
    samples.

    See https://stats.stackexchange.com/questions/221246/such-thing-as-a-weighted-correlation
    for more information

    Parameters
    ----------
    x : array, shape (n,)
        Input x.

    y : array, shape (n,)
        Input y.

    w : array, shape (n,)
        Weights w.

    Returns
    -------
    scores : float in range of [-1,1]
        Weighted Pearson Correlation between x and y.

    """

    # unweighted version
    # note the return is different
    # TODO: fix output differences
    if w is None:
        return pearsonr(x, y)

    x = np.asarray(x)
    y = np.asarray(y)
    w = np.asarray(w)

    check_consistent_length(x, y, w)
    # n = len(x)

    w_sum = w.sum()
    mx = np.sum(x * w) / w_sum
    my = np.sum(y * w) / w_sum

    xm, ym = (x - mx), (y - my)

    r_num = np.sum(xm * ym * w) / w_sum

    xm2 = np.sum(xm * xm * w) / w_sum
    ym2 = np.sum(ym * ym * w) / w_sum

    r_den = np.sqrt(xm2 * ym2)
    r = r_num / r_den

    r = max(min(r, 1.0), -1.0)

    # TODO: disable p value calculation due to python 2.7 break
    #    df = n_train_ - 2
    #
    #    if abs(r) == 1.0:
    #        prob = 0.0
    #    else:
    #        t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r)))
    #        prob = _betai(0.5 * df, 0.5, df / (df + t_squared))
    return r  # , prob
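A quick sanity check for wpearsonr: with uniform weights the weighted estimate should agree with the unweighted Pearson correlation (a sketch, assuming wpearsonr from above and SciPy are available):

import numpy as np
from scipy.stats import pearsonr

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([1.1, 1.9, 3.2, 3.9])
w = np.ones_like(x)

r_unweighted, _ = pearsonr(x, y)
print(r_unweighted)
# wpearsonr(x, y, w) should return approximately the same value,
# though it returns only r (no p-value) when weights are given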
Example #13
    def _daal_fit(self, X, y):
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.single, np.double]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                 " expected. Please change the shape of y to "
                 "(n_samples,), for example using ravel().",
                 DataConversionWarning, stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if self.n_outputs_ != 1:
            _class_name = self.__class__.__name__
            raise ValueError(_class_name + " does not currently support multi-output data. Consider using OneHotEncoder")

        y = check_array(y, ensure_2d=False, dtype=None)
        y, _ = self._validate_y_class_weight(y)
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

        self.n_features_ = X.shape[1]

        rs_ = check_random_state(self.random_state)
        seed_ = rs_.randint(0, np.iinfo('i').max)

        if self.n_classes_ < 2:
            raise ValueError("Training data only contain information about one class.")

        # create algorithm
        X_fptype = getFPType(X)
        daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
        _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1], is_classification=True)

        dfc_algorithm = daal4py.decision_forest_classification_training(
            nClasses=int(self.n_classes_),
            fptype=X_fptype,
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=int(self.min_samples_leaf),
            engine=daal_engine_,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap)
        )
        self._cached_estimators_ = None
        # compute
        dfc_trainingResult = dfc_algorithm.compute(X, y)

        # get resulting model
        model = dfc_trainingResult.model
        self.daal_model_ = model

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Example #14
    def fit(self,
            data,
            sites,
            discrete_covariates=None,
            continuous_covariates=None):
        """Compute the parameters to perform the harmonization/normalization

        Parameters
        ----------
        data : array-like, shape [n_samples, n_features]
            The data used to compute the per-feature statistics
            used for later harmonization along the acquisition sites.
        sites : array-like, shape [n_samples, 1]
            The target variable for harmonization problems (e.g. acquisition sites or batches).
        discrete_covariates : array-like, shape [n_samples, n_discrete_covariates]
            The covariates which are categorical
            (e.g. schizophrenia patient or healthy control).
        continuous_covariates : array-like, shape [n_samples, n_continuous_covariates]
            The covariates which are continuous
            (e.g. age and clinical scores)
        """

        # Reset internal state before fitting
        self._reset()

        data = check_array(data,
                           copy=self.copy,
                           estimator=self,
                           dtype=FLOAT_DTYPES)
        sites = check_array(sites, copy=self.copy, estimator=self)

        check_consistent_length(data, sites)

        if discrete_covariates is not None:
            self.discrete_covariates_used = True
            discrete_covariates = check_array(discrete_covariates,
                                              copy=self.copy,
                                              dtype=None,
                                              estimator=self)

        if continuous_covariates is not None:
            self.continuous_covariates_used = True
            continuous_covariates = check_array(continuous_covariates,
                                                copy=self.copy,
                                                estimator=self,
                                                dtype=FLOAT_DTYPES)

        # To have a similar code to neuroCombat and Combat original scripts
        data = data.T

        sites_names, n_samples_per_site = np.unique(sites, return_counts=True)

        self.sites_names = sites_names
        self.n_sites = len(sites_names)

        n_samples = sites.shape[0]
        idx_per_site = [list(np.where(sites == idx)[0]) for idx in sites_names]

        design = self._make_design_matrix(sites,
                                          discrete_covariates,
                                          continuous_covariates,
                                          fitting=True)

        standardized_data, _ = self._standardize_across_features(
            data, design, n_samples, n_samples_per_site, fitting=True)

        gamma_hat, delta_hat = self._fit_ls_model(standardized_data, design,
                                                  idx_per_site)

        gamma_bar, tau_2, a_prior, b_prior = self._find_priors(
            gamma_hat, delta_hat)

        self.gamma_star, self.delta_star = self._find_parametric_adjustments(
            standardized_data, idx_per_site, gamma_hat, delta_hat, gamma_bar,
            tau_2, a_prior, b_prior)

        return self
def test_check_consistent_length():
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
    with pytest.raises(ValueError, match="inconsistent numbers of samples"):
        check_consistent_length([1, 2], [1])
    with pytest.raises(TypeError, match=r"got <\w+ 'int'>"):
        check_consistent_length([1, 2], 1)
    with pytest.raises(TypeError, match=r"got <\w+ 'object'>"):
        check_consistent_length([1, 2], object())

    with pytest.raises(TypeError):
        check_consistent_length([1, 2], np.array(1))

    # Despite ensembles having __len__ they must raise TypeError
    with pytest.raises(TypeError, match="Expected sequence or array-like"):
        check_consistent_length([1, 2], RandomForestRegressor())
Example #16
    def fit(self, X, Y):
        """Fit model to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of predictors.

        Y : array-like, shape = [n_samples, 1]
            Target vector, where n_samples is the number of samples.
            This implementation only supports a single response (target) variable.

        """

        # copy since these will contain the residual (deflated) matrices
        check_consistent_length(X, Y)
        X = check_array(X, dtype=np.float64, copy=True, ensure_min_samples=2)
        Y = check_array(Y, dtype=np.float64, copy=True, ensure_2d=False)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = _center_scale_xy(
            X, Y, self.scale)

        Z = X.copy()
        w = np.dot(X.T, Y)  # calculate weight vector
        w /= np.linalg.norm(w)  # normalize weight vector

        T = []
        P = []
        Q = []
        W_ortho = []
        T_ortho = []
        P_ortho = []
        Q_ortho = []

        for i in range(self.n_components):
            t = np.dot(Z, w)  # scores vector
            q = np.dot(Y.T, t) / np.dot(t.T, t).item()  # loadings of y
            u = np.dot(Y, q)  # scores of y
            p = np.dot(Z.T, t) / np.dot(t.T, t).item()  # loadings of X
            w_ortho = p - np.dot(w.T, p).item() / np.dot(
                w.T, w).item() * w  # orthogonal weight
            w_ortho = w_ortho / np.linalg.norm(
                w_ortho)  # normalize orthogonal weight
            t_ortho = np.dot(Z, w_ortho)  # orthogonal components
            p_ortho = np.dot(Z.T, t_ortho) / np.dot(t_ortho.T, t_ortho).item()
            # not sure if q_ortho is OK, but it follows q
            q_ortho = np.dot(Y.T, t_ortho) / np.dot(t_ortho.T, t_ortho).item()
            Z -= np.dot(t_ortho, p_ortho.T)
            T.append(t)
            P.append(p)
            Q.append(q)
            W_ortho.append(w_ortho)
            T_ortho.append(t_ortho)
            P_ortho.append(p_ortho)
            Q_ortho.append(q_ortho)

        self.T = np.hstack(T)
        self.P = np.hstack(P)
        self.Q = np.hstack(Q)
        self.W_ortho_ = np.hstack(W_ortho)
        self.T_ortho_ = np.hstack(T_ortho)
        self.P_ortho_ = np.hstack(P_ortho)
        self.Q_ortho_ = np.hstack(Q_ortho)

        self._vipscore()

        return self
    def from_arrays(cls,
                    x,
                    y,
                    d,
                    z=None,
                    use_other_treat_as_covariate=True,
                    force_all_x_finite=True):
        """
        Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

        Parameters
        ----------
        x : :class:`numpy.ndarray`
            Array of covariates.

        y : :class:`numpy.ndarray`
            Array of the outcome variable.

        d : :class:`numpy.ndarray`
            Array of treatment variables.

        z : None or :class:`numpy.ndarray`
            Array of instrumental variables.
            Default is ``None``.

        use_other_treat_as_covariate : bool
            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
            Default is ``True``.

        force_all_x_finite : bool or str
            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
            Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
            for the nuisance functions are capable of providing valid predictions with missing and / or infinite values
            in the covariates ``x``.
            Default is ``True``.

        Examples
        --------
        >>> from doubleml import DoubleMLData
        >>> from doubleml.datasets import make_plr_CCDDHNR2018
        >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array')
        >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d)
        """
        if isinstance(force_all_x_finite, str):
            if force_all_x_finite != 'allow-nan':
                raise ValueError(
                    "Invalid force_all_x_finite " + force_all_x_finite + ". " +
                    "force_all_x_finite must be True, False or 'allow-nan'.")
        elif not isinstance(force_all_x_finite, bool):
            raise TypeError(
                "Invalid force_all_x_finite. " +
                "force_all_x_finite must be True, False or 'allow-nan'.")

        x = check_array(x,
                        ensure_2d=False,
                        allow_nd=False,
                        force_all_finite=force_all_x_finite)
        d = check_array(d, ensure_2d=False, allow_nd=False)
        y = column_or_1d(y, warn=True)

        x = _assure_2d_array(x)
        d = _assure_2d_array(d)

        y_col = 'y'
        if z is None:
            check_consistent_length(x, y, d)
            z_cols = None
        else:
            z = check_array(z, ensure_2d=False, allow_nd=False)
            z = _assure_2d_array(z)
            check_consistent_length(x, y, d, z)
            if z.shape[1] == 1:
                z_cols = ['z']
            else:
                z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])]

        if d.shape[1] == 1:
            d_cols = ['d']
        else:
            d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])]

        x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])]

        if z is None:
            data = pd.DataFrame(np.column_stack((x, y, d)),
                                columns=x_cols + [y_col] + d_cols)
        else:
            data = pd.DataFrame(np.column_stack((x, y, d, z)),
                                columns=x_cols + [y_col] + d_cols + z_cols)

        return cls(data, y_col, d_cols, x_cols, z_cols,
                   use_other_treat_as_covariate, force_all_x_finite)
Example #18
    def fit(self,
            X,
            y,
            treatment,
            estimator_trmnt_fit_params=None,
            estimator_ctrl_fit_params=None):
        """Fit the model according to the given training data.

        At prediction time, two estimates are produced for each example: one by the
        treatment model and one by the control model; the uplift is the delta
        between these two predictions.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number
                of samples and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method
                of the treatment estimator.
            estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method
                of the control estimator.

        Returns:
            object: self
        """
        # TODO: check the treatment is binary
        check_consistent_length(X, y, treatment)
        self._type_of_target = type_of_target(y)

        X_ctrl, y_ctrl = X[treatment == 0], y[treatment == 0]
        X_trmnt, y_trmnt = X[treatment == 1], y[treatment == 1]

        if estimator_trmnt_fit_params is None:
            estimator_trmnt_fit_params = {}
        if estimator_ctrl_fit_params is None:
            estimator_ctrl_fit_params = {}

        if self.method == 'vanilla':
            self.estimator_ctrl.fit(X_ctrl, y_ctrl,
                                    **estimator_ctrl_fit_params)
            self.estimator_trmnt.fit(X_trmnt, y_trmnt,
                                     **estimator_trmnt_fit_params)

        if self.method == 'ddr_control':
            self.estimator_ctrl.fit(X_ctrl, y_ctrl,
                                    **estimator_ctrl_fit_params)
            if self._type_of_target == 'binary':
                ddr_control = self.estimator_ctrl.predict_proba(X_trmnt)[:, 1]
            else:
                ddr_control = self.estimator_ctrl.predict(X_trmnt)

            if isinstance(X_trmnt, np.ndarray):
                X_trmnt_mod = np.column_stack((X_trmnt, ddr_control))
            elif isinstance(X_trmnt, pd.DataFrame):
                X_trmnt_mod = X_trmnt.assign(ddr_control=ddr_control)
            else:
                raise TypeError(
                    "Expected numpy.ndarray or pandas.DataFrame, got %s" %
                    type(X_trmnt))

            self.estimator_trmnt.fit(X_trmnt_mod, y_trmnt,
                                     **estimator_trmnt_fit_params)

        if self.method == 'ddr_treatment':
            self.estimator_trmnt.fit(X_trmnt, y_trmnt,
                                     **estimator_trmnt_fit_params)
            if self._type_of_target == 'binary':
                ddr_treatment = self.estimator_trmnt.predict_proba(X_ctrl)[:,
                                                                           1]
            else:
                ddr_treatment = self.estimator_trmnt.predict(X_ctrl)

            if isinstance(X_ctrl, np.ndarray):
                X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment))
            elif isinstance(X_ctrl, pd.DataFrame):
                X_ctrl_mod = X_ctrl.assign(ddr_treatment=ddr_treatment)
            else:
                raise TypeError(
                    "Expected numpy.ndarray or pandas.DataFrame, got %s" %
                    type(X_ctrl))

            self.estimator_ctrl.fit(X_ctrl_mod, y_ctrl,
                                    **estimator_ctrl_fit_params)

        return self
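For context, this fit method implements the two-model uplift approach in the sklift style. A hedged usage sketch, assuming the class is sklift.models.TwoModels with keyword arguments estimator_trmnt, estimator_ctrl and method (check the installed sklift version before relying on this signature):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklift.models import TwoModels  # assumption: sklift is installed and exposes TwoModels

rng = np.random.default_rng(0)
X = rng.random((100, 4))
y = rng.integers(0, 2, size=100)
treatment = rng.integers(0, 2, size=100)

model = TwoModels(estimator_trmnt=RandomForestClassifier(),
                  estimator_ctrl=RandomForestClassifier(),
                  method='vanilla')
model.fit(X, y, treatment)
uplift = model.predict(X)  # delta between treatment and control predictions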
Example #19
    def _binary_clf_curve(self, y_true, y_score, pos_label=None, sample_weight=None):
        """Calculate true and false positives per binary classification threshold.
        Parameters
        ----------
        y_true : array, shape = [n_samples]
            True targets of binary classification
        y_score : array, shape = [n_samples]
            Estimated probabilities or decision function
        pos_label : int or str, default=None
            The label of the positive class
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.
        Returns
        -------
        fps : array, shape = [n_thresholds]
            A count of false positives, at index i being the number of negative
            samples assigned a score >= thresholds[i]. The total number of
            negative samples is equal to fps[-1] (thus true negatives are given by
            fps[-1] - fps).
        tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
            An increasing count of true positives, at index i being the number
            of positive samples assigned a score >= thresholds[i]. The total
            number of positive samples is equal to tps[-1] (thus false negatives
            are given by tps[-1] - tps).
        thresholds : array, shape = [n_thresholds]
            Decreasing score values.
        """
        # Check to make sure y_true is valid
        y_type = type_of_target(y_true)
        if not (y_type == "binary" or
                (y_type == "multiclass" and pos_label is not None)):
            raise ValueError("{0} format is not supported".format(y_type))

        check_consistent_length(y_true, y_score, sample_weight)
        y_true = column_or_1d(y_true)
        y_score = column_or_1d(y_score)
        assert_all_finite(y_true)
        assert_all_finite(y_score)

        if sample_weight is not None:
            sample_weight = column_or_1d(sample_weight)

        # ensure binary classification if pos_label is not specified
        # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
        # triggering a FutureWarning by calling np.array_equal(a, b)
        # when elements in the two arrays are not comparable.
        classes = np.unique(y_true)
        if (pos_label is None and (
                classes.dtype.kind in ('O', 'U', 'S') or
                not (np.array_equal(classes, [0, 1]) or
                    np.array_equal(classes, [-1, 1]) or
                    np.array_equal(classes, [0]) or
                    np.array_equal(classes, [-1]) or
                    np.array_equal(classes, [1])))):
            classes_repr = ", ".join(repr(c) for c in classes)
            raise ValueError("y_true takes value in {{{classes_repr}}} and "
                            "pos_label is not specified: either make y_true "
                            "take value in {{0, 1}} or {{-1, 1}} or "
                            "pass pos_label explicitly.".format(
                                classes_repr=classes_repr))
        elif pos_label is None:
            pos_label = 1.

        # make y_true a boolean vector
        y_true = (y_true == pos_label)

        # sort scores and corresponding truth values
        desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
        y_score = y_score[desc_score_indices]
        y_true = y_true[desc_score_indices]
        if sample_weight is not None:
            weight = sample_weight[desc_score_indices]
        else:
            weight = 1.

        # y_score typically has many tied values. Here we extract
        # the indices associated with the distinct values. We also
        # concatenate a value for the end of the curve.
        distinct_value_indices = np.where(np.diff(y_score))[0]
        threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

        # accumulate the true positives with decreasing threshold
        tps = stable_cumsum(y_true * weight)[threshold_idxs]
        # Note: the raw count of positives is accumulated without sample weights
        positives = stable_cumsum(y_true)[threshold_idxs]
        if sample_weight is not None:
            # express fps as a cumsum to ensure fps is increasing even in
            # the presence of floating point errors
            fps = stable_cumsum((1 - y_true))[threshold_idxs]
        else:
            fps = 1 + threshold_idxs - tps
        return fps, tps, y_score[threshold_idxs], positives
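The fps/tps invariants described in the docstring can be illustrated with a tiny NumPy sketch of the same counting logic (a toy case without tied scores, so every score is its own threshold):

import numpy as np

y_true = np.array([1, 1, 0, 1, 0])
y_score = np.array([0.9, 0.8, 0.7, 0.4, 0.2])

order = np.argsort(y_score)[::-1]     # sort by decreasing score
y_sorted = y_true[order]
tps = np.cumsum(y_sorted)             # [1, 2, 2, 3, 3]
fps = np.cumsum(1 - y_sorted)         # [0, 0, 1, 1, 2]
# tps[-1] is the total number of positives, fps[-1] the number of negatives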
Example #20
def _validate_and_reformat_input(X,
                                 y=None,
                                 expect_y=True,
                                 enforce_binary_labels=False,
                                 **kwargs):
    """Validate input data and return the data in an appropriate format.

    The :code:`**kwargs` can contain :code:`sensitive_features=` and :code:`control_features=`
    parameters.

    Parameters
    ----------
    X : numpy.ndarray, pandas.DataFrame
        The feature matrix
    y : numpy.ndarray, pandas.DataFrame, pandas.Series, or list
        The label vector
    expect_y : bool
        If True, y needs to be provided; otherwise the argument is ignored. Default is True.
    enforce_binary_labels : bool
        If True raise exception if there are more than two distinct
        values in the `y` data; default False

    Returns
    -------
    Tuple(pandas.DataFrame, pandas.Series, pandas.Series, pandas.Series)
        The validated and reformatted X, y, sensitive_features and control_features; note
        that certain estimators rely on metadata encoded in X which may be stripped during
        the reformatting process, so mitigation methods should ideally use the input X instead
        of the returned X for training estimators and leave potential reformatting of X to the
        estimator.

    """
    if y is not None:
        # calling check_X_y with a 2-dimensional y causes a warning, so ensure it is 1-dimensional
        if isinstance(y, np.ndarray) and len(y.shape) == 2 and y.shape[1] == 1:
            y = y.reshape(-1)
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            y = y.to_numpy().reshape(-1)

        X, y = check_X_y(X, y, dtype=None, force_all_finite=False)
        y = check_array(y, ensure_2d=False, dtype='numeric')
        if enforce_binary_labels and not set(np.unique(y)).issubset({0, 1}):
            raise ValueError(_LABELS_NOT_0_1_ERROR_MESSAGE)
    elif expect_y:
        raise ValueError(_MESSAGE_Y_NONE)
    else:
        X = check_array(X)

    sensitive_features = kwargs.get(_KW_SENSITIVE_FEATURES)
    if sensitive_features is None:
        raise ValueError(_MESSAGE_SENSITIVE_FEATURES_NONE)

    check_consistent_length(X, sensitive_features)
    sensitive_features = check_array(sensitive_features,
                                     ensure_2d=False,
                                     dtype=None)

    # compress multiple sensitive features into a single column
    if len(sensitive_features.shape) > 1 and sensitive_features.shape[1] > 1:
        sensitive_features = _merge_columns(sensitive_features)

    # Handle the control features
    control_features = kwargs.get(_KW_CONTROL_FEATURES)
    if control_features is not None:
        check_consistent_length(X, control_features)
        control_features = check_array(control_features,
                                       ensure_2d=False,
                                       dtype=None)

        # compress multiple control features into a single column
        if len(control_features.shape) > 1 and control_features.shape[1] > 1:
            control_features = _merge_columns(control_features)

        control_features = pd.Series(control_features.squeeze())

    # If we don't have a y, then need to fiddle with return type to
    # avoid a warning from pandas
    if y is not None:
        result_y = pd.Series(y)
    else:
        result_y = pd.Series(dtype="float64")

    return pd.DataFrame(X), result_y, pd.Series(
        sensitive_features.squeeze()), control_features
Example #21
def cv_split(cv, X, y, groups):
    check_consistent_length(X, y, groups)
    return list(cv.split(X, y, groups))
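A minimal usage sketch for this helper, assuming a scikit-learn splitter such as KFold:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
# check_consistent_length ignores None, so groups=None is fine here
splits = cv_split(KFold(n_splits=5), X, y, groups=None)
# splits is a list of five (train_indices, test_indices) pairs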
Example #22
def _fit_regressor(self, X, y, sample_weight=None):
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    if sklearn_check_version('1.0') and self.criterion == "mse":
        warnings.warn(
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent.",
            FutureWarning
        )

    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestRegressor.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion in ["mse", "squared_error"],
            f"'{self.criterion}' criterion is not supported. "
            "Only 'mse' and 'squared_error' criteria are supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float64, np.float32])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()
    if _dal_ready:
        _daal_fit_regressor(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_
        return self
    return super(RandomForestRegressor, self).fit(
        X, y, sample_weight=sample_weight)
Example #23
    def from_arrays(cls, x, y, d, z=None, use_other_treat_as_covariate=True):
        """
        Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

        Parameters
        ----------
        x : :class:`numpy.ndarray`
            Array of covariates.

        y : :class:`numpy.ndarray`
            Array of the outcome variable.

        d : :class:`numpy.ndarray`
            Array of treatment variables.

        z : None or :class:`numpy.ndarray`
            Array of instrumental variables.
            Default is ``None``.

        use_other_treat_as_covariate : bool
            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
            Default is ``True``.

        Examples
        --------
        >>> from doubleml import DoubleMLData
        >>> from doubleml.datasets import make_plr_CCDDHNR2018
        >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array')
        >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d)
        """
        x = check_array(x, ensure_2d=False, allow_nd=False)
        d = check_array(d, ensure_2d=False, allow_nd=False)
        y = column_or_1d(y, warn=True)

        x = _assure_2d_array(x)
        d = _assure_2d_array(d)

        y_col = 'y'
        if z is None:
            check_consistent_length(x, y, d)
            z_cols = None
        else:
            z = check_array(z, ensure_2d=False, allow_nd=False)
            z = _assure_2d_array(z)
            check_consistent_length(x, y, d, z)
            if z.shape[1] == 1:
                z_cols = ['z']
            else:
                z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])]

        if d.shape[1] == 1:
            d_cols = ['d']
        else:
            d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])]

        x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])]

        if z is None:
            data = pd.DataFrame(np.column_stack((x, y, d)),
                                columns=x_cols + [y_col] + d_cols)
        else:
            data = pd.DataFrame(np.column_stack((x, y, d, z)),
                                columns=x_cols + [y_col] + d_cols + z_cols)

        return cls(data, y_col, d_cols, x_cols, z_cols,
                   use_other_treat_as_covariate)
Example #24
    def fit(self, X, y, sample_weight=None):
        """
        Build a RGF Classifier from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        _validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True)
        n_samples, self._n_features = X.shape

        if self.sl2 is None:
            self._sl2 = self.l2
        else:
            self._sl2 = self.sl2

        if isinstance(self.min_samples_leaf, _FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_iter is None:
            if self.loss == "LS":
                self._n_iter = 10
            else:
                self._n_iter = 5
        else:
            self._n_iter = self.n_iter

        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
            if (sample_weight <= 0).any():
                raise ValueError("Sample weights must be positive.")
        check_consistent_length(X, y, sample_weight)
        check_classification_targets(y)

        self._classes = sorted(np.unique(y))
        self._n_classes = len(self._classes)
        self._classes_map = {}

        params = dict(max_leaf=self.max_leaf,
                      test_interval=self.test_interval,
                      algorithm=self.algorithm,
                      loss=self.loss,
                      reg_depth=self.reg_depth,
                      l2=self.l2,
                      sl2=self._sl2,
                      normalize=self.normalize,
                      min_samples_leaf=self._min_samples_leaf,
                      n_iter=self._n_iter,
                      n_tree_search=self.n_tree_search,
                      opt_interval=self.opt_interval,
                      learning_rate=self.learning_rate,
                      memory_policy=self.memory_policy,
                      verbose=self.verbose)
        if self._n_classes == 2:
            self._classes_map[0] = self._classes[0]
            self._classes_map[1] = self._classes[1]
            self._estimators = [None]
            y = (y == self._classes[0]).astype(int)
            self._estimators[0] = _RGFBinaryClassifier(**params)
            self._estimators[0].fit(X, y, sample_weight)
        elif self._n_classes > 2:
            if sp.isspmatrix_dok(X):
                X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
            self._estimators = [None] * self._n_classes
            ovr_list = [None] * self._n_classes
            for i, cls_num in enumerate(self._classes):
                self._classes_map[i] = cls_num
                ovr_list[i] = (y == cls_num).astype(int)
                self._estimators[i] = _RGFBinaryClassifier(**params)
            self._estimators = Parallel(n_jobs=self.n_jobs)(delayed(_fit_ovr_binary)(self._estimators[i],
                                                                                     X,
                                                                                     ovr_list[i],
                                                                                     sample_weight)
                                                            for i in range(self._n_classes))
        else:
            raise ValueError("Classifier can't predict when only one class is present.")

        self._fitted = True
        return self
Example #25
    def fit(self, X, y, sample_weight=None):
        """
        Build a RGF Regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (real numbers in regression).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        _validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True, multi_output=False, y_numeric=True)
        n_samples, self._n_features = X.shape

        if self.sl2 is None:
            self._sl2 = self.l2
        else:
            self._sl2 = self.sl2

        if isinstance(self.min_samples_leaf, _FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_iter is None:
            if self.loss == "LS":
                self._n_iter = 10
            else:
                self._n_iter = 5
        else:
            self._n_iter = self.n_iter

        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
            if (sample_weight <= 0).any():
                raise ValueError("Sample weights must be positive.")
        check_consistent_length(X, y, sample_weight)

        train_x_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.x")
        train_y_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.y")
        train_weight_loc = os.path.join(_TEMP_PATH, self._file_prefix + ".train.data.weight")
        if sp.isspmatrix(X):
            _sparse_savetxt(train_x_loc, X)
        else:
            np.savetxt(train_x_loc, X, delimiter=' ', fmt="%s")
        np.savetxt(train_y_loc, y, delimiter=' ', fmt="%s")
        np.savetxt(train_weight_loc, sample_weight, delimiter=' ', fmt="%s")

        # Format train command
        params = []
        if self.verbose > 0:
            params.append("Verbose")
        if self.verbose > 5:
            params.append("Verbose_opt")  # Add some info on weight optimization
        if self.normalize:
            params.append("NormalizeTarget")
        params.append("train_x_fn=%s" % train_x_loc)
        params.append("train_y_fn=%s" % train_y_loc)
        params.append("algorithm=%s" % self.algorithm)
        params.append("loss=%s" % self.loss)
        params.append("max_leaf_forest=%s" % self.max_leaf)
        params.append("test_interval=%s" % self.test_interval)
        params.append("reg_L2=%s" % self.l2)
        params.append("reg_sL2=%s" % self._sl2)
        params.append("reg_depth=%s" % self.reg_depth)
        params.append("min_pop=%s" % self._min_samples_leaf)
        params.append("num_iteration_opt=%s" % self._n_iter)
        params.append("num_tree_search=%s" % self.n_tree_search)
        params.append("opt_interval=%s" % self.opt_interval)
        params.append("opt_stepsize=%s" % self.learning_rate)
        params.append("memory_policy=%s" % self.memory_policy.title())
        params.append("model_fn_prefix=%s" % os.path.join(_TEMP_PATH, self._file_prefix + ".model"))
        params.append("train_w_fn=%s" % train_weight_loc)

        cmd = (_EXE_PATH, "train", ",".join(params))

        # Train
        output = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  universal_newlines=True).communicate()

        if self.verbose:
            for k in output:
                print(k)

        self._fitted = True

        # Find latest model location
        model_glob = os.path.join(_TEMP_PATH, self._file_prefix + ".model*")
        model_files = glob(model_glob)
        if not model_files:
            raise Exception('Model learning result is not found in {0}. '
                            'Training is abnormally finished.'.format(_TEMP_PATH))
        self._latest_model_loc = sorted(model_files, reverse=True)[0]
        return self
Example #26
def cv_split(cv, X, y, groups, is_pairwise, cache):
    check_consistent_length(X, y, groups)
    return CVCache(list(cv.split(X, y, groups)), is_pairwise, cache, _num_samples(X))
Example #27
def _check_reg_targets(y_true, y_pred, multioutput):
    """Check that y_true and y_pred belong to the same regression task

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like,

    multioutput : array-like or string in ['raw_values', 'uniform_average',
        'variance_weighted'] or None
        None is accepted due to backward compatibility of r2_score().

    Returns
    -------
    type_true : one of {'continuous', 'continuous-multioutput'}
        The type of the true target data, as output by
        'utils.multiclass.type_of_target'

    y_true : array-like of shape = (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples, n_outputs)
        Estimated target values.

    multioutput : array-like of shape = (n_outputs) or string in ['raw_values',
        'uniform_average', 'variance_weighted'] or None
        Custom output weights if ``multioutput`` is array-like or
        just the corresponding argument if ``multioutput`` is a
        correct keyword.

    """
    check_consistent_length(y_true, y_pred)
    y_true = check_array(y_true, ensure_2d=False)
    y_pred = check_array(y_pred, ensure_2d=False)

    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((-1, 1))

    if y_true.shape[1] != y_pred.shape[1]:
        raise ValueError("y_true and y_pred have different number of output "
                         "({0}!={1})".format(y_true.shape[1], y_pred.shape[1]))

    n_outputs = y_true.shape[1]
    multioutput_options = (None, 'raw_values', 'uniform_average',
                           'variance_weighted')
    if multioutput not in multioutput_options:
        multioutput = check_array(multioutput, ensure_2d=False)
        if n_outputs == 1:
            raise ValueError("Custom weights are useful only in "
                             "multi-output cases.")
        elif n_outputs != len(multioutput):
            raise ValueError(("There must be equally many custom weights "
                              "(%d) as outputs (%d).") %
                             (len(multioutput), n_outputs))
    y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput'

    return y_type, y_true, y_pred, multioutput
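A minimal usage sketch (assuming the sklearn.utils.validation imports used above are in scope):

y_type, y_t, y_p, multi = _check_reg_targets([3.0, -0.5, 2.0],
                                             [2.5, 0.0, 2.0],
                                             'uniform_average')
# y_type == 'continuous'; y_t and y_p are reshaped to (3, 1);
# multi is passed through unchanged as 'uniform_average'
print(y_type, y_t.shape, y_p.shape, multi)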
Example #28
    def fit(self, X, y, sample_weight=None):
        """
        Build a RGF Regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (real numbers in regression).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        _validate_params(**self.get_params())

        X, y = check_X_y(X,
                         y,
                         accept_sparse=True,
                         multi_output=False,
                         y_numeric=True)
        n_samples, self._n_features = X.shape

        if self.sl2 is None:
            self._sl2 = self.l2
        else:
            self._sl2 = self.sl2

        if isinstance(self.min_samples_leaf, _FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_iter is None:
            if self.loss == "LS":
                self._n_iter = 10
            else:
                self._n_iter = 5
        else:
            self._n_iter = self.n_iter

        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
            if (sample_weight <= 0).any():
                raise ValueError("Sample weights must be positive.")
        check_consistent_length(X, y, sample_weight)

        train_x_loc = os.path.join(_TEMP_PATH,
                                   self._file_prefix + ".train.data.x")
        train_y_loc = os.path.join(_TEMP_PATH,
                                   self._file_prefix + ".train.data.y")
        train_weight_loc = os.path.join(
            _TEMP_PATH, self._file_prefix + ".train.data.weight")
        if sp.isspmatrix(X):
            _sparse_savetxt(train_x_loc, X)
        else:
            np.savetxt(train_x_loc, X, delimiter=' ', fmt="%s")
        np.savetxt(train_y_loc, y, delimiter=' ', fmt="%s")
        np.savetxt(train_weight_loc, sample_weight, delimiter=' ', fmt="%s")

        # Format train command
        params = []
        if self.verbose > 0:
            params.append("Verbose")
        if self.verbose > 5:
            params.append(
                "Verbose_opt")  # Add some info on weight optimization
        if self.normalize:
            params.append("NormalizeTarget")
        params.append("train_x_fn=%s" % train_x_loc)
        params.append("train_y_fn=%s" % train_y_loc)
        params.append("algorithm=%s" % self.algorithm)
        params.append("loss=%s" % self.loss)
        params.append("max_leaf_forest=%s" % self.max_leaf)
        params.append("test_interval=%s" % self.test_interval)
        params.append("reg_L2=%s" % self.l2)
        params.append("reg_sL2=%s" % self._sl2)
        params.append("reg_depth=%s" % self.reg_depth)
        params.append("min_pop=%s" % self._min_samples_leaf)
        params.append("num_iteration_opt=%s" % self._n_iter)
        params.append("num_tree_search=%s" % self.n_tree_search)
        params.append("opt_interval=%s" % self.opt_interval)
        params.append("opt_stepsize=%s" % self.learning_rate)
        params.append("memory_policy=%s" % self.memory_policy.title())
        params.append("model_fn_prefix=%s" %
                      os.path.join(_TEMP_PATH, self._file_prefix + ".model"))
        params.append("train_w_fn=%s" % train_weight_loc)

        cmd = (_EXE_PATH, "train", ",".join(params))

        # Train
        output = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  universal_newlines=True).communicate()

        if self.verbose:
            for k in output:
                print(k)

        self._fitted = True

        # Find latest model location
        model_glob = os.path.join(_TEMP_PATH, self._file_prefix + ".model*")
        model_files = glob(model_glob)
        if not model_files:
            raise Exception(
                'Model learning result not found in {0}. '
                'Training did not finish normally.'.format(_TEMP_PATH))
        self._latest_model_loc = sorted(model_files, reverse=True)[0]
        return self
Example #29
    def plot_auc_test(self,
                      X,
                      y,
                      cv=1,
                      groups=None,
                      title=None,
                      ax=None,
                      save_fig=False):
        '''Plot the ROC-AUC curve for the fitted estimator on test data.

        The estimator must provide continuous predictions (decision_function
        or predict_proba) so that it can be evaluated with the ROC-AUC metric.
        Iterables of X, y can be passed directly, or X, y can be split
        internally when cv > 1, to assess model fit performance.

        X
            - 2D array or list of 2D ndarrays
        y
            - binary labels or list of class labels
        cv
            - int, cross-validation generator or an iterable
            - if cv > 1, splits are generated with a stratified k-fold method
        title
            - title added to the plot header to indicate (X, y)

        Returns
        --------
        ax, mean AUC, std AUC, data_splits

        data_splits:
            list of test data sets in the form of DataFrames (combined X & y)
        '''
        L = locals().copy()
        L.pop('self')
        estimator = self.estimator
        # split test set by cv
        if cv > 1:
            xs = []
            ys = []
            data_splits = tuple(
                _split_cv(X, y=y, cv=cv, groups=groups,
                          random_state=self.seed))
            for x_set, y_set in data_splits:
                xs.append(x_set[1])
                ys.append(y_set[1])
            L.update({'X': xs, 'y': ys, 'cv': 1})
            return self.plot_auc_test(**L)

        self._check_fitted(estimator)
        X = get_flat_list(X)
        y = get_flat_list(y)
        validation.check_consistent_length(X, y)
        fprs = []
        tprs = []
        aucs = []
        n_sample = 0
        for i in range(len(X)):
            x0 = X[i]
            y0 = y[i]
            y_pre = self._pre_continueous(estimator, x0)
            fpr, tpr, threshold = roc_curve(y0, y_pre, drop_intermediate=True)
            fprs.append(fpr)
            tprs.append(tpr)
            aucs.append(auc(fpr, tpr))
            n_sample += len(x0)
        # -- plot
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        ax = plotter_auc(fprs, tprs, ax=ax)

        header = '-'.join([
            _get_estimator_name(estimator), 'testCV',
            '{} samples'.format(n_sample)
        ])
        if isinstance(title, str):
            header = '-'.join([title, header])
        ax.set_title(header)

        data_splits = [
            pd.concat((pd.DataFrame(i) for i in item), axis=1)
            for item in zip(X, y)
        ]

        if save_fig is True:
            if isinstance(title, str):
                plot_name = 'plots/roc_test_' + title + '.pdf'
            else:
                plot_name = 'plots/roc_test.pdf'
            self.folder.write(plt.gcf(), plot_name)
            plt.close()
        return ax, np.mean(aucs), np.std(aucs), data_splits
Example #30
    def fit(self, X, y, sample_weight=None):
        """
        Build an RGF Classifier from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        _validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True)
        n_samples, self._n_features = X.shape

        if self.sl2 is None:
            self._sl2 = self.l2
        else:
            self._sl2 = self.sl2

        if isinstance(self.min_samples_leaf, _FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_iter is None:
            if self.loss == "LS":
                self._n_iter = 10
            else:
                self._n_iter = 5
        else:
            self._n_iter = self.n_iter

        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float32)
        else:
            sample_weight = column_or_1d(sample_weight, warn=True)
            if (sample_weight <= 0).any():
                raise ValueError("Sample weights must be positive.")
        check_consistent_length(X, y, sample_weight)
        check_classification_targets(y)

        self._classes = sorted(np.unique(y))
        self._n_classes = len(self._classes)
        self._classes_map = {}

        params = dict(max_leaf=self.max_leaf,
                      test_interval=self.test_interval,
                      algorithm=self.algorithm,
                      loss=self.loss,
                      reg_depth=self.reg_depth,
                      l2=self.l2,
                      sl2=self._sl2,
                      normalize=self.normalize,
                      min_samples_leaf=self._min_samples_leaf,
                      n_iter=self._n_iter,
                      n_tree_search=self.n_tree_search,
                      opt_interval=self.opt_interval,
                      learning_rate=self.learning_rate,
                      memory_policy=self.memory_policy,
                      verbose=self.verbose)
        if self._n_classes == 2:
            self._classes_map[0] = self._classes[0]
            self._classes_map[1] = self._classes[1]
            self._estimators = [None]
            y = (y == self._classes[0]).astype(int)
            self._estimators[0] = _RGFBinaryClassifier(**params)
            self._estimators[0].fit(X, y, sample_weight)
        elif self._n_classes > 2:
            if sp.isspmatrix_dok(X):
                X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
            self._estimators = [None] * self._n_classes
            ovr_list = [None] * self._n_classes
            for i, cls_num in enumerate(self._classes):
                self._classes_map[i] = cls_num
                ovr_list[i] = (y == cls_num).astype(int)
                self._estimators[i] = _RGFBinaryClassifier(**params)
            self._estimators = Parallel(n_jobs=self.n_jobs)(
                delayed(_fit_ovr_binary)(self._estimators[i], X, ovr_list[i],
                                         sample_weight)
                for i in range(self._n_classes))
        else:
            raise ValueError(
                "Classifier can't predict when only one class is present.")

        self._fitted = True
        return self
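A hedged usage sketch (not part of the original example): the fit method above matches the RGFClassifier estimator shipped in the rgf_python package, which is assumed to be installed here; adjust the import if the class lives elsewhere.

from sklearn.datasets import make_classification
from rgf.sklearn import RGFClassifier

# Synthetic 3-class problem to exercise the one-vs-rest branch of fit() above
X_demo, y_demo = make_classification(n_samples=200, n_features=10, n_classes=3,
                                     n_informative=5, random_state=0)
clf = RGFClassifier(max_leaf=400, algorithm="RGF_Sib", test_interval=100)
clf.fit(X_demo, y_demo)
print(clf.predict_proba(X_demo[:3]).shape)  # (3, 3): one probability column per class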
Example #31
    def transform(self,
                  data,
                  sites,
                  discrete_covariates=None,
                  continuous_covariates=None):
        """Transform data to harmonized space

        Parameters
        ----------
        data : array-like
            Input data that will be transformed.
        sites : array-like
            Site information for the input data.
        discrete_covariates : array-like
            The covariates which are categorical.
        continuous_covariates : array-like
            The covariates which are continuous.
        """

        check_is_fitted(self, 'n_sites')

        data = check_array(data,
                           copy=self.copy,
                           estimator=self,
                           dtype=FLOAT_DTYPES)
        sites = check_array(sites, copy=self.copy, estimator=self)

        check_consistent_length(data, sites)

        if hasattr(self, 'discrete_covariates_used'):
            discrete_covariates = check_array(discrete_covariates,
                                              copy=self.copy,
                                              dtype=None,
                                              estimator=self)

        if hasattr(self, 'continuous_covariates_used'):
            continuous_covariates = check_array(continuous_covariates,
                                                copy=self.copy,
                                                estimator=self,
                                                dtype=FLOAT_DTYPES)

        # To have a similar code to neuroCombat and Combat original scripts
        data = data.T

        new_data_sites_name = np.unique(sites)

        # Check all sites from new_data were seen
        if not all(site_name in self.sites_names
                   for site_name in new_data_sites_name):
            raise ValueError(
                'The data contains a site that was not seen during the fit method.')

        n_samples = sites.shape[0]
        n_samples_per_site = np.array(
            [np.sum(sites == site_name) for site_name in self.sites_names])
        idx_per_site = [
            list(np.where(sites == site_name)[0])
            for site_name in self.sites_names
        ]

        design = self._make_design_matrix(sites,
                                          discrete_covariates,
                                          continuous_covariates,
                                          fitting=False)

        standardized_data, standardized_mean = self._standardize_across_features(
            data, design, n_samples, n_samples_per_site, fitting=False)

        bayes_data = self._adjust_data_final(standardized_data, design,
                                             standardized_mean,
                                             n_samples_per_site, n_samples,
                                             idx_per_site)

        return bayes_data.T
Example #32
def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3):
    """Compute uplift at first k observations by uplift of the total sample.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        strategy (string, ['overall', 'by_group']): Determines the calculating strategy.

            * ``'overall'``:
                The first step is taking the first k observations of all test data ordered by uplift prediction
                (overall both groups - control and treatment) and conversions in treatment and control groups
                calculated only on them. Then the difference between these conversions is calculated.

            * ``'by_group'``:
                Separately calculates conversions in top k observations in each group (control and treatment)
                sorted by uplift predictions. Then the difference between these conversions is calculated.

        k (float or int): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the computation of uplift. If int, represents the absolute number of samples.

    .. versionchanged:: 0.1.0

        * Add supporting absolute values for ``k`` parameter
        * Add parameter ``strategy``

    Returns:
        float: Uplift score at first k observations of the total sample.

    See also:
        :func:`.uplift_auc_score`: Compute normalized Area Under the Uplift curve from prediction scores.

        :func:`.qini_auc_score`: Compute normalized Area Under the Qini Curve from prediction scores.
    """
    # TODO: check that treatment is binary and that no group is empty
    check_consistent_length(y_true, uplift, treatment)

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    strategy_methods = ['overall', 'by_group']
    if strategy not in strategy_methods:
        raise ValueError(
            f'Uplift score supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    n_samples = len(y_true)
    order = np.argsort(uplift, kind='mergesort')[::-1]
    _, treatment_counts = np.unique(treatment, return_counts=True)
    n_samples_ctrl = treatment_counts[0]
    n_samples_trmnt = treatment_counts[1]

    k_type = np.asarray(k).dtype.kind

    if (k_type == 'i' and (k >= n_samples or k <= 0)
            or k_type == 'f' and (k <= 0 or k >= 1)):
        raise ValueError(
            f'k={k} should be either positive and smaller'
            f' than the number of samples {n_samples} or a float in the '
            f'(0, 1) range')

    if k_type not in ('i', 'f'):
        raise ValueError(f'Invalid value for k: {k_type}')

    if strategy == 'overall':
        if k_type == 'f':
            n_size = int(n_samples * k)
        else:
            n_size = k

        # TODO: check that both groups are represented among the first k observations
        score_ctrl = y_true[order][:n_size][treatment[order][:n_size] ==
                                            0].mean()
        score_trmnt = y_true[order][:n_size][treatment[order][:n_size] ==
                                             1].mean()

    else:  # strategy == 'by_group':
        if k_type == 'f':
            n_ctrl = int((treatment == 0).sum() * k)
            n_trmnt = int((treatment == 1).sum() * k)

        else:
            n_ctrl = k
            n_trmnt = k

        if n_ctrl > n_samples_ctrl:
            raise ValueError(
                f'With k={k}, the number of the first k observations is '
                'bigger than the number of samples '
                f'in the control group: {n_samples_ctrl}')
        if n_trmnt > n_samples_trmnt:
            raise ValueError(
                f'With k={k}, the number of the first k observations is '
                'bigger than the number of samples '
                f'in the treatment group: {n_samples_trmnt}')

        score_ctrl = y_true[order][treatment[order] == 0][:n_ctrl].mean()
        score_trmnt = y_true[order][treatment[order] == 1][:n_trmnt].mean()

    return score_trmnt - score_ctrl
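A hedged usage sketch (not from the original source) on synthetic data, illustrating how uplift_at_k scores the difference between treatment and control conversion rates among the top-k observations ranked by predicted uplift; the variable names are illustrative only.

import numpy as np

rng = np.random.RandomState(42)
y_demo = rng.binomial(1, 0.3, size=1000)          # observed binary conversions
uplift_demo = rng.rand(1000)                      # model's predicted uplift scores
treatment_demo = rng.binomial(1, 0.5, size=1000)  # 1 = treatment, 0 = control

# Treatment minus control conversion rate within the top 30% by predicted uplift
score = uplift_at_k(y_demo, uplift_demo, treatment_demo, strategy='overall', k=0.3)
print(score)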
Example #33
def _check_reg_targets(y_true, y_pred, multioutput):
    """Check that y_true and y_pred belong to the same regression task

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like,

    multioutput : array-like or string in ['raw_values', 'uniform_average',
        'variance_weighted'] or None
        None is accepted due to backward compatibility of r2_score().

    Returns
    -------
    type_true : one of {'continuous', 'continuous-multioutput'}
        The type of the true target data, as output by
        'utils.multiclass.type_of_target'

    y_true : array-like of shape = (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples, n_outputs)
        Estimated target values.

    multioutput : array-like of shape = (n_outputs) or string in ['raw_values',
        'uniform_average', 'variance_weighted'] or None
        Custom output weights if ``multioutput`` is array-like or
        just the corresponding argument if ``multioutput`` is a
        correct keyword.

    """
    check_consistent_length(y_true, y_pred)
    y_true = check_array(y_true, ensure_2d=False)
    y_pred = check_array(y_pred, ensure_2d=False)

    if y_true.ndim == 1:
        y_true = y_true.reshape((-1, 1))

    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((-1, 1))

    if y_true.shape[1] != y_pred.shape[1]:
        raise ValueError("y_true and y_pred have different number of output "
                         "({0}!={1})".format(y_true.shape[1], y_pred.shape[1]))

    n_outputs = y_true.shape[1]
    allowed_multioutput_str = ('raw_values', 'uniform_average',
                               'variance_weighted')
    if isinstance(multioutput, string_types):
        if multioutput not in allowed_multioutput_str:
            raise ValueError("Allowed 'multioutput' string values are {}. "
                             "You provided multioutput={!r}".format(
                                 allowed_multioutput_str, multioutput))
    elif multioutput is not None:
        multioutput = check_array(multioutput, ensure_2d=False)
        if n_outputs == 1:
            raise ValueError("Custom weights are useful only in "
                             "multi-output cases.")
        elif n_outputs != len(multioutput):
            raise ValueError(
                ("There must be equally many custom weights "
                 "(%d) as outputs (%d).") % (len(multioutput), n_outputs))
    y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput'

    return y_type, y_true, y_pred, multioutput
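A small hedged illustration (assuming the validation helpers imported by this module are available): 1-D y_true and y_pred are promoted to shape (n_samples, 1), the task is typed as 'continuous', and the multioutput string is validated against the allowed keywords.

y_type, yt, yp, mo = _check_reg_targets([3.0, 2.5, 4.0],
                                        [2.9, 2.6, 3.8],
                                        'uniform_average')
print(y_type)              # 'continuous'
print(yt.shape, yp.shape)  # (3, 1) (3, 1)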
Example #34
def response_rate_by_percentile(y_true,
                                uplift,
                                treatment,
                                group,
                                strategy='overall',
                                bins=10):
    """Compute response rate (target mean in the control or treatment group) at each percentile.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        group (string, ['treatment', 'control']): Group type for computing response rate: treatment or control.

            * ``'treatment'``:
                Values equal 1 in the treatment column.
            * ``'control'``:
                Values equal 0 in the treatment column.

        strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Default is 'overall'.

            * ``'overall'``:
                All observations are ordered by uplift prediction (over both the control and treatment groups)
                and split into bins; the response rate of the selected group is computed within each bin.
            * ``'by_group'``:
                Only the observations of the selected group are ordered by uplift prediction and split into
                bins; the response rate is computed within each bin.

        bins (int): Determines the number of bins (and relative percentile) in the data. Default is 10.
        
    Returns:
        array (shape = [>2]), array (shape = [>2]), array (shape = [>2]):
        response rate at each percentile for control or treatment group,
        variance of the response rate at each percentile,
        group size at each percentile.
    """

    group_types = ['treatment', 'control']
    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if group not in group_types:
        raise ValueError(
            f'Response rate supports only group types in {group_types},'
            f' got {group}.')

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(
            f'Bins should be positive integer. Invalid value bins: {bins}')

    if bins >= n_samples:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)
    order = np.argsort(uplift, kind='mergesort')[::-1]

    trmnt_flag = 1 if group == 'treatment' else 0

    if strategy == 'overall':
        y_true_bin = np.array_split(y_true[order], bins)
        trmnt_bin = np.array_split(treatment[order], bins)

        group_size = np.array([
            len(y[trmnt == trmnt_flag])
            for y, trmnt in zip(y_true_bin, trmnt_bin)
        ])
        response_rate = np.array([
            np.mean(y[trmnt == trmnt_flag])
            for y, trmnt in zip(y_true_bin, trmnt_bin)
        ])

    else:  # strategy == 'by_group'
        y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag],
                               bins)

        group_size = np.array([len(y) for y in y_bin])
        response_rate = np.array([np.mean(y) for y in y_bin])

    variance = np.multiply(response_rate,
                           np.divide((1 - response_rate), group_size))

    return response_rate, variance, group_size
Example #35
    def _daal_fit(self, X, y):
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.double, np.single]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                 " expected. Please change the shape of y to "
                 "(n_samples,), for example using ravel().",
                 DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # unlike [:, np.newaxis] which does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        self.n_features_ = X.shape[1]
        rs_ = check_random_state(self.random_state)

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        X_fptype = getFPType(X)
        seed_ = rs_.randint(0, np.iinfo('i').max)
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

        _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1], is_classification=False)

        # create algorithm
        dfr_algorithm = daal4py.decision_forest_regression_training(
            fptype=getFPType(X),
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap)
        )

        self._cached_estimators_ = None
        dfr_trainingResult = dfr_algorithm.compute(X, y)

        # get resulting model
        model = dfr_trainingResult.model
        self.daal_model_ = model

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Example #36
def uplift_by_percentile(y_true,
                         uplift,
                         treatment,
                         strategy='overall',
                         bins=10,
                         std=False,
                         total=False):
    """Compute metrics: uplift, group size, group response rate, standard deviation at each percentile.

    Metrics in columns and percentiles in rows of pandas DataFrame:

        - ``n_treatment``, ``n_control`` - group sizes.
        - ``response_rate_treatment``, ``response_rate_control`` - group response rates.
        - ``uplift`` - treatment response rate minus control response rate.
        - ``std_treatment``, ``std_control`` - (optional) response rates standard deviation.
        - ``std_uplift`` - (optional) uplift standard deviation.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Default is 'overall'.

            * ``'overall'``:
                All observations are ordered by uplift prediction (over both the control and treatment groups)
                and split into bins; conversions in the treatment and control groups are calculated within each
                bin, and their difference gives the uplift for that bin.
            * ``'by_group'``:
                Conversions are calculated in bins built separately for each group (control and treatment),
                each ordered by its own uplift predictions, and the per-bin difference gives the uplift.

        std (bool): If True, add columns with the uplift standard deviation and the response rate standard deviation.
            Default is False.
        total (bool): If True, add the last row with the total values. Default is False.
            The total uplift is a weighted average uplift. See :func:`.weighted_average_uplift`.
            The total response rate is a response rate on the full data amount.
        bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10.

    Returns:
        pandas.DataFrame: DataFrame where metrics are by columns and percentiles are by rows.
    """

    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    if not isinstance(total, bool):
        raise ValueError(f'Flag total should be bool: True or False.'
                         f' Invalid value total: {total}')

    if not isinstance(std, bool):
        raise ValueError(f'Flag std should be bool: True or False.'
                         f' Invalid value std: {std}')

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(f'Bins should be positive integer.'
                         f' Invalid value bins: {bins}')

    if bins >= n_samples:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(
        treatment)

    response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='treatment',
        strategy=strategy,
        bins=bins)

    response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='control',
        strategy=strategy,
        bins=bins)

    uplift_scores = response_rate_trmnt - response_rate_ctrl
    uplift_variance = variance_trmnt + variance_ctrl

    percentiles = [round(p * 100 / bins, 1) for p in range(1, bins + 1)]

    df = pd.DataFrame({
        'percentile': percentiles,
        'n_treatment': n_trmnt,
        'n_control': n_ctrl,
        'response_rate_treatment': response_rate_trmnt,
        'response_rate_control': response_rate_ctrl,
        'uplift': uplift_scores
    })

    if total:
        response_rate_trmnt_total, variance_trmnt_total, n_trmnt_total = response_rate_by_percentile(
            y_true,
            uplift,
            treatment,
            strategy=strategy,
            group='treatment',
            bins=1)

        response_rate_ctrl_total, variance_ctrl_total, n_ctrl_total = response_rate_by_percentile(
            y_true,
            uplift,
            treatment,
            strategy=strategy,
            group='control',
            bins=1)

        weighted_avg_uplift = 1 / n_trmnt_total * np.dot(
            n_trmnt, uplift_scores)

        df.loc[-1, :] = [
            'total', n_trmnt_total, n_ctrl_total, response_rate_trmnt_total,
            response_rate_ctrl_total, weighted_avg_uplift
        ]

    if std:
        std_treatment = np.sqrt(variance_trmnt)
        std_control = np.sqrt(variance_ctrl)
        std_uplift = np.sqrt(uplift_variance)

        if total:
            std_treatment = np.append(std_treatment, np.sum(std_treatment))
            std_control = np.append(std_control, np.sum(std_control))
            std_uplift = np.append(std_uplift, np.sum(std_uplift))

        df.loc[:, 'std_treatment'] = std_treatment
        df.loc[:, 'std_control'] = std_control
        df.loc[:, 'std_uplift'] = std_uplift

    df = df \
        .set_index('percentile', drop=True, inplace=False) \
        .astype({'n_treatment': 'int32', 'n_control': 'int32'})

    return df
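A hedged usage sketch on synthetic data (variable names are illustrative): the result is a DataFrame indexed by percentile with group sizes, group response rates, uplift and, optionally, their standard deviations per bin.

import numpy as np

rng = np.random.RandomState(0)
y_demo = rng.binomial(1, 0.25, size=500)         # observed binary conversions
uplift_demo = rng.rand(500)                      # model's predicted uplift scores
treatment_demo = rng.binomial(1, 0.5, size=500)  # 1 = treatment, 0 = control

table = uplift_by_percentile(y_demo, uplift_demo, treatment_demo,
                             strategy='overall', bins=5, std=True)
print(table)  # 5 rows (percentiles 20.0 ... 100.0) with uplift and std columns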
Example #37
def cv_split(cv, X, y, groups, is_pairwise, cache):
    check_consistent_length(X, y, groups)
    return CVCache(list(cv.split(X, y, groups)), is_pairwise, cache)
Example #38
def _check_sample_weight(sample_weight, X, dtype=None):
    check_consistent_length(sample_weight, X)
    return sample_weight
Example #39
def kaplan_meier_estimator(event, time_exit, time_enter=None, time_min=None):
    """Kaplan-Meier estimator of survival function.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        Contains binary event indicators.

    time_exit : array-like, shape = (n_samples,)
        Contains event/censoring times.

    time_enter : array-like, shape = (n_samples,), optional
        Contains time when each individual entered the study for
        left truncated survival data.

    time_min : float, optional
        Compute estimator conditional on survival at least up to
        the specified time.

    Returns
    -------
    time : array, shape = (n_times,)
        Unique times.

    prob_survival : array, shape = (n_times,)
        Survival probability at each unique time point.
        If `time_enter` is provided, estimates are conditional probabilities.

    Examples
    --------
    Creating a Kaplan-Meier curve:

    >>> x, y = kaplan_meier_estimator(event, time)
    >>> plt.step(x, y, where="post")
    >>> plt.ylim(0, 1)
    >>> plt.show()

    References
    ----------
    .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations",
           Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958.
    """
    event, time_enter, time_exit = check_y_survival(event, time_enter,
                                                    time_exit)
    check_consistent_length(event, time_enter, time_exit)

    if time_enter is None:
        uniq_times, n_events, n_at_risk = _compute_counts(event, time_exit)
    else:
        uniq_times, n_events, n_at_risk = _compute_counts_truncated(
            event, time_enter, time_exit)

    values = 1 - n_events / n_at_risk

    if time_min is not None:
        mask = uniq_times >= time_min
        uniq_times = numpy.compress(mask, uniq_times)
        values = numpy.compress(mask, values)

    y = numpy.cumprod(values)
    return uniq_times, y
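A minimal hedged usage sketch with made-up survival data, assuming the helpers used above (check_y_survival, _compute_counts, _compute_counts_truncated) are importable from the same module, as in scikit-survival where this estimator is defined.

import numpy

event_demo = numpy.array([True, False, True, True, False, True])  # event indicators
time_demo = numpy.array([3.0, 5.0, 6.0, 2.0, 8.0, 4.0])           # event/censoring times

x, y = kaplan_meier_estimator(event_demo, time_demo)
print(x)  # sorted unique event/censoring times
print(y)  # non-increasing survival probabilities at those times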
Example #40
def group_predict(train, test, labels, *, K=20, mu=0.4, t=20):
    """
    Propagates `labels` from `train` data to `test` data via SNF

    Parameters
    ----------
    train : `m`-list of (S1, F) array_like
        Input subject x feature training data. Subjects in these data sets
        should have been previously labelled (see: `labels`).
    test : `m`-list of (S2, F) array_like
        Input subject x feature testing data. These should be similar to the
        data in `train` (though the first dimension can differ). Labels will be
        propagated to these subjects.
    labels : (S1,) array_like
        Cluster labels for `S1` subjects in `train` data sets. These could have
        been obtained from some ground-truth labelling or via a previous
        iteration of SNF with only the `train` data (e.g., the output of
        :py:func:`sklearn.cluster.spectral_clustering` would be appropriate).
    K : (0, N) int, optional
        Hyperparameter normalization factor for scaling. See `Notes` of
        `snf.affinity_matrix` for more details. Default: 20
    mu : (0, 1) float, optional
        Hyperparameter normalization factor for scaling. See `Notes` of
        `snf.affinity_matrix` for more details. Default: 0.4
    t : int, optional
        Number of iterations to perform information swapping during SNF.
        Default: 20

    Returns
    -------
    predicted_labels : (S2,) np.ndarray
        Cluster labels for subjects in `test` assigning to groups in `labels`
    """

    # check inputs are legit
    try:
        check_consistent_length(train, test)
    except ValueError:
        raise ValueError('Training and testing set must have same number of '
                         'data types.')
    if not all([len(labels) == len(t) for t in train]):
        raise ValueError('Training data must have the same number of subjects '
                         'as provided labels.')

    # generate affinity matrices for stacked train/test data sets
    affinities = []
    for (tr, te) in zip(train, test):
        try:
            check_consistent_length(tr.T, te.T)
        except ValueError:
            raise ValueError('Train and test data must have same number of '
                             'features for each data type. Make sure to '
                             'supply data types in the same order.')
        affinities += [make_affinity(np.row_stack([tr, te]), K=K, mu=mu)]

    # fuse with SNF
    fused_aff = snf(*affinities, K=K, t=t)

    # get unique groups in training data and generate array to hold all labels
    groups = np.unique(labels)
    all_labels = np.zeros((len(fused_aff), groups.size))
    # reassign training labels to all_labels array
    for i in range(groups.size):
        all_labels[np.where(labels == groups[i])[0], i] = 1

    # propagate labels from train data to test data using SNF fused array
    propagated_labels = _label_prop(fused_aff, all_labels, t=1000)
    predicted_labels = groups[propagated_labels[len(train[0]):].argmax(axis=1)]

    return predicted_labels
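A hedged usage sketch with random data (illustrative only), assuming the SNF helpers referenced above (make_affinity, snf, _label_prop) are available, e.g. from the snfpy package where this function is defined.

import numpy as np

rng = np.random.RandomState(1234)
train_demo = [rng.rand(40, 10), rng.rand(40, 8)]  # two data types, 40 labelled subjects
test_demo = [rng.rand(15, 10), rng.rand(15, 8)]   # 15 unlabelled subjects, same features
labels_demo = rng.randint(0, 2, size=40)          # cluster labels for the training subjects

pred = group_predict(train_demo, test_demo, labels_demo, K=10, mu=0.5, t=20)
print(pred.shape)  # (15,): one predicted group per test subject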
Example #41
def resample(*arrays, **options):

    """Resample arrays or sparse matrices in a consistent way

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    Other Parameters
    ----------------
    replace : boolean, True by default
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data.  If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.

    stratify : array-like or None (default=None)
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    resampled_arrays : sequence of indexable data-structures
        Sequence of resampled copies of the collections. The original arrays
        are not impacted.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])


    See also
    --------
    :func:`sklearn.utils.shuffle`
    """

    random_state = check_random_state(options.pop('random_state', None))
    replace = options.pop('replace', True)
    max_n_samples = options.pop('n_samples', None)
    stratify = options.pop('stratify', None)
    if options:
        raise ValueError("Unexpected kw arguments: %r" % options.keys())

    if len(arrays) == 0:
        return None

    first = arrays[0]
    n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        raise ValueError("Cannot sample %d out of arrays with dim %d "
                         "when replace is False" % (max_n_samples,
                                                    n_samples))

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            indices = random_state.randint(0, n_samples, size=(max_n_samples,))
        else:
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Code adapted from StratifiedShuffleSplit()
        y = stratify
        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        # if there are ties in the class-counts, we want
        # to make sure to break them anew in each iteration
        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i],
                                            replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)

    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays
Example #42
def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True,
                    dtype="numeric", order=None, copy=False, force_all_finite=True,
                    ensure_2d=True, allow_nd=False, multi_output=False,
                    ensure_min_samples=1, ensure_min_features=1, y_numeric=False,
                    estimator=None):
    """Input validation for standard estimators.

    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
    default, X is checked to be non-empty and containing only finite values.
    Standard input checks are also applied to y, such as checking that y
    does not have np.nan or np.inf targets. For multi-label y, set
    multi_output=True to allow 2D and sparse y. If the dtype of X is
    object, attempt converting to float, raising on failure.

    Parameters
    ----------
    X : nd-array, list or sparse matrix
        Input data.

    y : nd-array, list or sparse matrix
        Labels.

    accept_sparse : string, boolean or list of string (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : boolean (default=True)
        Whether to raise a value error if X is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    multi_output : boolean (default=False)
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int (default=1)
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : boolean (default=False)
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.
    """
    if y is None:
        raise ValueError("y cannot be None")

    X = _daal_check_array(
        X, accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype, order=order, copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d, allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator
    )
    if multi_output:
        y = _daal_check_array(y, accept_sparse='csr', force_all_finite=True,
                              ensure_2d=False, dtype=None)
    else:
        y = column_or_1d(y, warn=True)
        _daal_assert_all_finite(y)
    if y_numeric and hasattr(y, 'dtype') and y.dtype.kind == 'O':
        y = y.astype(np.float64)

    check_consistent_length(X, y)

    return X, y
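A brief hedged illustration (assuming the daal-aware helpers used above are importable): the function behaves like sklearn's check_X_y, returning a validated 2-D X and 1-D y.

import numpy as np

X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y_demo = [0, 1, 0]

X_val, y_val = _daal_check_X_y(X_demo, y_demo)
print(X_val.shape, y_val.shape)  # (3, 2) (3,)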
Example #43
def weighted_average_uplift(y_true,
                            uplift,
                            treatment,
                            strategy='overall',
                            bins=10):
    """Weighted average uplift.

    It is an average of uplift by percentile.
    Weights are sizes of the treatment group by percentile.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Default is 'overall'.

            * ``'overall'``:
                All observations are ordered by uplift prediction (over both the control and treatment groups)
                and split into bins; conversions in the treatment and control groups are calculated within each
                bin, and their difference gives the uplift for that bin.
            * ``'by_group'``:
                Conversions are calculated in bins built separately for each group (control and treatment),
                each ordered by its own uplift predictions, and the per-bin difference gives the uplift.

        bins (int): Determines the number of bins (and the relative percentile) in the data. Default is 10.

    Returns:
        float: Weighted average uplift.
    """

    strategy_methods = ['overall', 'by_group']

    n_samples = len(y_true)
    check_consistent_length(y_true, uplift, treatment)

    if strategy not in strategy_methods:
        raise ValueError(
            f'Response rate supports only calculating methods in {strategy_methods},'
            f' got {strategy}.')

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(f'Bins should be positive integer.'
                         f' Invalid value bins: {bins}')

    if bins >= n_samples:
        raise ValueError(
            f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}'
        )

    response_rate_trmnt, variance_trmnt, n_trmnt = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='treatment',
        strategy=strategy,
        bins=bins)

    response_rate_ctrl, variance_ctrl, n_ctrl = response_rate_by_percentile(
        y_true,
        uplift,
        treatment,
        group='control',
        strategy=strategy,
        bins=bins)

    uplift_scores = response_rate_trmnt - response_rate_ctrl

    weighted_avg_uplift = np.dot(n_trmnt, uplift_scores) / np.sum(n_trmnt)

    return weighted_avg_uplift
Example #44
def gap_train_test_split(*arrays, **options):
    """Split arrays or matrices into random train and test subsets (with a gap)

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    gap_size : float or int, default=0
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset between the training and the test set. If int,
        represents the absolute number of the dropped samples.

    test_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        test / (train + test). If int, represents the absolute number of
        test samples. If None, the value is set to the complement of the
        train size and the gap. If `train_size` is also None,
        it will be set to 0.25.

    train_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        train / (train + test). If int, represents the absolute number of
        train samples. If None, the value is automatically set to
        the complement of the test size and the gap size.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Examples
    --------
    >>> import numpy as np
    >>> from tscv import gap_train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]
    >>> X_train, X_test, y_train, y_test = gap_train_test_split(
    ...     X, y, test_size=0.33, gap_size=1)
    ...
    >>> X_train
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> y_train
    [0, 1, 2]
    >>> X_test
    array([[8, 9]])
    >>> y_test
    [4]
    >>> gap_train_test_split(list(range(10)), gap_size=0.1)
    [[0, 1, 2, 3, 4, 5, 6], [8, 9]]
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    check_consistent_length(*arrays)
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    gap_size = options.pop('gap_size', 0)
    if not isinstance(gap_size, numbers.Real):
        raise TypeError("The gap size should be a real number.")

    if options:
        raise TypeError("Invalid parameters passed: %s. \n"
                        "Check the spelling of keyword parameters." %
                        str(options))

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])

    def size_to_number(size, n):
        b, a = modf(size)
        return int(max(a, round(b * n)))

    n_gap = size_to_number(gap_size, n_samples)
    n_remain = n_samples - n_gap
    if test_size is None and train_size is None:
        test_size = 0.25
    if train_size is None:
        n_test = size_to_number(test_size, n_remain)
        n_train = n_remain - n_test
    elif test_size is None:
        n_train = size_to_number(train_size, n_remain)
        n_test = n_remain - n_train
    else:
        warnings.warn(
            "The train_size argument is overridden by test_size; "
            "in case of nonzero gap_size, "
            "an explicit value should be provided "
            "and cannot be implied by 1 - train_size - test_size.", Warning)
        n_test = size_to_number(test_size, n_remain)
        n_train = n_remain - n_test

    train = np.arange(n_train)
    test = np.arange(n_train + n_gap, n_samples)

    return list(
        chain.from_iterable((_safe_indexing(a, train), _safe_indexing(a, test))
                            for a in arrays))