Example #1
def test_type_of_target():
    for group, group_examples in EXAMPLES.items():
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (example, group, type_of_target(example))))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        assert_raises_regex(ValueError, msg, type_of_target, example)

    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    y = SparseSeries([1, 0, 0, 1, 0])
    msg = "y cannot be class 'SparseSeries'."
    assert_raises_regex(ValueError, msg, type_of_target, y)
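For reference, a minimal sketch (assuming a recent scikit-learn) of the label kinds that type_of_target distinguishes in the tests above:

import numpy as np
from sklearn.utils.multiclass import type_of_target

print(type_of_target([0, 1, 1, 0]))                        # 'binary'
print(type_of_target([1, 2, 3]))                           # 'multiclass'
print(type_of_target(np.array([[1, 0], [0, 1]])))          # 'multilabel-indicator'
print(type_of_target([0.5, 1.2, 3.4]))                     # 'continuous'
print(type_of_target(np.array([[1.5, 2.0], [3.1, 0.2]])))  # 'continuous-multioutput'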
Example #2
    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        super(BaseMulticlassSampler, self).fit(X, y)

        # Check that the target type is either binary or multiclass
        if not (type_of_target(y) == 'binary' or
                type_of_target(y) == 'multiclass'):
            warnings.simplefilter('always', UserWarning)
            warnings.warn('The target type should be binary or multiclass.')

        return self
Example #3
def test_type_of_target():
    for group, group_examples in iteritems(EXAMPLES):
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg='type_of_target(%r) should be %r, got %r'
                         % (example, group, type_of_target(example)))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        assert_raises(ValueError, type_of_target, example)
Example #4
def _check_targets_hmc(y_true, y_pred):
    check_consistent_length(y_true, y_pred)
    y_type = set([type_of_target(y_true), type_of_target(y_pred)])
    if y_type == set(["binary", "multiclass"]):
        y_type = set(["multiclass"])
    if y_type != set(["multiclass"]):
        raise ValueError("{0} is not supported".format(y_type))
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
    return y_true, y_pred
Example #5
def _check_clf_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task

    This converts multiclass or binary types to a common shape, and raises a
    ValueError for a mix of multilabel and multiclass targets, a mix of
    multilabel formats, for the presence of continuous-valued or multioutput
    targets, or for targets of different lengths.

    Column vectors are squeezed to 1d.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multilabel-sequences', \
        'multiclass', 'binary'}
        The type of the true target data, as output by
        ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix or sequence of sequences

    y_pred : array or indicator matrix or sequence of sequences
    """

    y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = set([type_true, type_pred])
    if y_type == set(["binary", "multiclass"]):
        y_type = set(["multiclass"])

    if len(y_type) > 1:
        raise ValueError("Can't handle mix of {0} and {1}" "".format(type_true, type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ["binary", "multiclass", "multilabel-indicator", "multilabel-sequences"]:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)

    return y_type, y_true, y_pred
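A quick illustration (hypothetical labels, assuming scikit-learn) of why the binary/multiclass collapse above is needed:

from sklearn.utils.multiclass import type_of_target

y_true = [0, 1, 2, 2]   # three classes  -> 'multiclass'
y_pred = [0, 1, 1, 0]   # two classes    -> 'binary'
print(type_of_target(y_true), type_of_target(y_pred))
# The mixed {'binary', 'multiclass'} pair is treated as plain 'multiclass',
# so the metrics code does not reject predictions that happen to miss a class.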
Example #6
    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        super(BaseBinarySampler, self).fit(X, y)

        # Check that the target type is binary
        if not type_of_target(y) == 'binary':
            warnings.warn('The target type should be binary.')

        return self
Example #7
def check_target_type(y, indicate_one_vs_all=False):
    """Check the target types to be conform to the current samplers.

    The current samplers should be compatible with ``'binary'``,
    ``'multilabel-indicator'`` and ``'multiclass'`` targets only.

    Parameters
    ----------
    y : ndarray,
        The array containing the target.

    indicate_one_vs_all : bool, optional
        Whether to indicate if the targets are encoded in a one-vs-all fashion.

    Returns
    -------
    y : ndarray,
        The returned target.

    is_one_vs_all : bool, optional
        Indicate if the target was originally encoded in a one-vs-all fashion.
        Only returned if ``indicate_one_vs_all=True``.

    """
    type_y = type_of_target(y)
    if type_y == 'multilabel-indicator':
        if np.any(y.sum(axis=1) > 1):
            raise ValueError(
                "When 'y' corresponds to '{}', 'y' should encode the "
                "multiclass (a single 1 by row).".format(type_y))
        y = y.argmax(axis=1)

    return (y, type_y == 'multilabel-indicator') if indicate_one_vs_all else y
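A hedged usage sketch of the helper above with a one-vs-all encoded target (hypothetical values):

import numpy as np

y_ova = np.array([[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1]])   # exactly one 1 per row -> 'multilabel-indicator'
y, is_ova = check_target_type(y_ova, indicate_one_vs_all=True)
print(y)        # [0 1 2] -- collapsed back to class indices via argmax
print(is_ova)   # True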
Example #8
def test_type_of_target():
    for group, group_examples in iteritems(EXAMPLES):
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (example, group, type_of_target(example))))

    for example in NON_ARRAY_LIKE_EXAMPLES:
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        assert_raises_regex(ValueError, msg, type_of_target, example)
Example #9
 def _posibility(self, x, tag, event=1):
     """计算触发概率
     Parameters:
     ----------
         x (Sequence): - 离散特征序列
         tag (Sequence): - 用于训练的标签序列
         event (any): - True指代的触发事件
     Returns:
     ----------
         Dict[str,Tuple[rate_T, rate_F]]: - 训练好后的好坏触发概率
     """
     if type_of_target(tag) not in ['binary']:
         raise AttributeError("tag must be a binary array")
     #if type_of_target(x) in ['continuous']:
     #    raise AttributeError("input array must not continuous")
     tag = np.array(tag)
     x = np.array(x)
     event_total = (tag == event).sum()
     non_event_total = tag.shape[-1] - event_total
     x_labels = pd.unique(x[pd.notnull(x)])
     pos_dic = {}
     for x1 in x_labels:
         # when x1 is NaN, y1 is also empty
         y1 = tag[np.where(x == x1)[0]]
         event_count = (y1 == event).sum()
         non_event_count = y1.shape[-1] - event_count
         rate_event = 1.0 * event_count / event_total
         rate_non_event = 1.0 * non_event_count / non_event_total
         pos_dic[x1] = (rate_event, rate_non_event)
     return pos_dic
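A small worked example of the rates computed above (hypothetical data): with tag = [1, 0, 0, 1, 0], x = ['a', 'a', 'b', 'b', 'b'] and event = 1, we get event_total = 2 and non_event_total = 3; for x1 = 'a' the slice y1 = [1, 0] gives rate_event = 1/2 = 0.5 and rate_non_event = 1/3 ≈ 0.33, so pos_dic['a'] = (0.5, 0.33).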
Example #10
def _sampling_strategy_float(sampling_strategy, y, sampling_type):
    """Take a proportion of the majority (over-sampling) or minority
    (under-sampling) class in binary classification."""
    type_y = type_of_target(y)
    if type_y != 'binary':
        raise ValueError(
            '"sampling_strategy" can be a float only when the type '
            'of target is binary. For multi-class, use a dict.')
    target_stats = Counter(y)
    if sampling_type == 'over-sampling':
        n_sample_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_majority * sampling_strategy - value)
            for (key, value) in target_stats.items() if key != class_majority
        }
    elif (sampling_type == 'under-sampling'):
        n_sample_minority = min(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_minority / sampling_strategy)
            for (key, value) in target_stats.items() if key != class_minority
        }
    else:
        raise ValueError("'clean-sampling' methods do let the user "
                         "specify the sampling ratio.")
    return sampling_strategy_
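A small worked example of the arithmetic above (hypothetical class counts): with target_stats = {0: 900, 1: 100} and sampling_strategy = 0.5, over-sampling requests int(900 * 0.5 - 100) = 350 extra samples of class 1, while under-sampling keeps int(100 / 0.5) = 200 samples of class 0.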
Example #11
    def cross_val_score_one_vs_all_per_class(estimator, X, y=None, *args, **kargs):
        y_type = type_of_target(y)
        positive_example_amount = y.sum(axis=0)
        error = ""
        if (positive_example_amount < kargs["cv"]).any():
            error = (
                str((positive_example_amount < kargs["cv"]).sum())
                + " : too little examples for "
                + str(np.where(positive_example_amount < kargs["cv"]))
                + str(positive_example_amount[np.where(positive_example_amount < kargs["cv"])])
            )
        if (positive_example_amount > y.shape[0] - kargs["cv"]).any():
            error += (
                str((positive_example_amount > y.shape[0] - kargs["cv"]).sum())
                + " : too many examples for "
                + str(np.where(positive_example_amount > y.shape[0] - kargs["cv"]))
                + str(positive_example_amount[np.where(positive_example_amount > y.shape[0] - kargs["cv"])])
            )
        #        if error:
        #            raise Exception(error)
        if y_type.startswith("multilabel") and isinstance(estimator, OneVsRestClassifier):
            res = []
            for yy in y.transpose():
                res.append(_cross_val_score(deepcopy(estimator.estimator), X, yy, *args, **kargs))
            import pdb

            pdb.set_trace()
        else:
            res = _cross_val_score(estimator, X, y, *args, **kargs)
        return np.array(list(res))
Example #12
    def fit(self, X, y):
        """Fit MLP Classifier according to X, y

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_classes]
        Target values. It determines the problem type.

        *binary*
        If y is a vector of integers with two unique values.

        *multiclass*
        If y is a vector of integers with three or more values
        or if y is a two-dimensional array of integers and there exists only
        one non-zero element per row.

        *multiclass-multioutput*
        If y is a two-dimensional array of integers with two unique values
        and there exists more than one non-zero element per row.

        *continuous*
        If y is a vector of floats.

        *continuous-multioutput*
        If y is a two-dimensional array of floats.

        Returns
        -------
        self : object
        Returns self.
        """
        X, = check_arrays(X, sparse_format='dense')

        n_samples, self.input_size_ = X.shape

        y = np.atleast_1d(y)

        self.type_of_target_ = type_of_target(y)
        if self.verbose > 0:
            print("The inferred type of y is %s" % self.type_of_target_)
        if self.type_of_y is not None:
            if self.type_of_y != self.type_of_target_:
                print("Passed type of y is %s, inferred type is %s"
                      % (self.type_of_y, self.type_of_target_))
                raise ValueError("Passed and inferred types of y do not match.")

        self.check_type_implemented()
        y = self._get_output(y)
        X, y = self._scale(X, y)
        self._inst_mlp()
        self._fit_mlp(X, y)
        if self.dropout and self.type_of_target_ in ['continuous', 'continuous-multioutput']:
            self._lineregress(X, y)
Example #13
 def check_target_binary(self, y):
     '''
     check if the target variable is binary, raise error if not.
     :param y:
     :return:
     '''
     y_type = type_of_target(y)
     if y_type not in ['binary']:
         raise ValueError('Label type must be binary')
Example #14
def check_samplers_multiclass_ova(name, Sampler):
    # Check that a multiclass target leads to the same results as OVA encoding
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    y_ova = label_binarize(y, np.unique(y))
    sampler = Sampler()
    # FIXME: in 0.6 set the random_state for all
    if name not in DONT_HAVE_RANDOM_STATE:
        set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    if issubclass(Sampler, BaseEnsembleSampler):
        for batch_y, batch_y_ova in zip(y_res, y_res_ova):
            assert type_of_target(batch_y_ova) == type_of_target(y_ova)
            assert_allclose(batch_y, batch_y_ova.argmax(axis=1))
    else:
        assert type_of_target(y_res_ova) == type_of_target(y_ova)
        assert_allclose(y_res, y_res_ova.argmax(axis=1))
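For context, a short sketch (assuming scikit-learn's label_binarize) of the OVA encoding compared above:

import numpy as np
from sklearn.preprocessing import label_binarize

y = np.array([0, 2, 1, 2])
y_ova = label_binarize(y, classes=np.unique(y))
print(y_ova)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]
# type_of_target(y) is 'multiclass', type_of_target(y_ova) is 'multilabel-indicator',
# and y_ova.argmax(axis=1) recovers y.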
Example #15
def _check_cv(cv=3, y=None, classifier=False, **kwargs):
    """Input checker utility for building a cross-validator.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    kwargs : dict
        Other parameters for StratifiedShuffleSplit or ShuffleSplit.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = kwargs.pop('n_splits', 0) or 10

    if isinstance(cv, numbers.Integral):
        if (classifier and (y is not None) and
                (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedShuffleSplit(cv, **kwargs)
        else:
            return ShuffleSplit(cv, **kwargs)

    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
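A hypothetical call to the checker above (the extra kwargs such as test_size are assumptions passed through to the splitter):

import numpy as np

y = np.array([0, 1, 0, 1, 0, 1])
cv = _check_cv(cv=3, y=y, classifier=True, test_size=0.2, random_state=0)
print(type(cv).__name__)   # 'StratifiedShuffleSplit', because y is binary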
Example #16
def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score):
    is_multilabel = type_of_target(y_true).startswith("multilabel")

    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel)
    elif name in THRESHOLDED_METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_score, y_true_binarize, y_score, is_multilabel)
    else:
        raise ValueError("Metric is not recorded as having an average option")
Example #17
    def _validate_target(self, y):
        """
        Raises a value error if the target is not a classification target.
        """
        # Ignore None values
        if y is None:
            return

        y_type = type_of_target(y)
        if y_type not in ("binary", "multiclass"):
            raise YellowbrickValueError((
                "'{}' target type not supported, only binary and multiclass"
            ).format(y_type))
Example #18
def woe(X,y,event=1):
    res_woe = [] 
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # check whether x is a continuous variable; if so, discretize it
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        woe_dict,iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
        res_woe.append(woe_dict)
     
    return iv_dict
Example #19
def check_target_type(y):
    """Check the target types to be conform to the current samplers.

    The current samplers should be compatible with ``'binary'`` and
    ``'multiclass'`` targets only.

    Parameters
    ----------
    y : ndarray,
        The array containing the target

    Returns
    -------
    y : ndarray,
        The returned target.

    """
    if type_of_target(y) not in TARGET_KIND:
        # FIXME: ideally we should raise an error, but the sklearn API does
        # not allow for it
        warnings.warn("'y' should be of types {} only. Got {} instead.".format(
            TARGET_KIND, type_of_target(y)))
    return y
Example #20
def check_target_type(y, indicate_one_vs_all=False):
    """Check the target types to be conform to the current samplers.

    The current samplers should be compatible with ``'binary'``,
    ``'multilabel-indicator'`` and ``'multiclass'`` targets only.

    Parameters
    ----------
    y : ndarray,
        The array containing the target.

    indicate_one_vs_all : bool, optional
        Whether to indicate if the targets are encoded in a one-vs-all fashion.

    Returns
    -------
    y : ndarray,
        The returned target.

    is_one_vs_all : bool, optional
        Indicate if the target was originally encoded in a one-vs-all fashion.
        Only returned if ``indicate_one_vs_all=True``.

    """
    type_y = type_of_target(y)
    if type_y not in TARGET_KIND:
        # FIXME: ideally we should raise an error, but the sklearn API does
        # not allow for it
        warnings.warn("'y' should be of types {} only. Got {} instead.".format(
            TARGET_KIND, type_of_target(y)))

    if indicate_one_vs_all:
        return (y.argmax(axis=1) if type_y == 'multilabel-indicator' else y,
                type_y == 'multilabel-indicator')
    else:
        return y.argmax(axis=1) if type_y == 'multilabel-indicator' else y
Example #21
 def feature_discretion(self, X):
     '''
     Discretize the continuous features of the input data X, and keep other features unchanged.
     :param X : numpy array
     :return: the numpy array in which all continuous features are discretized
     '''
     temp = []
     for i in range(0, X.shape[-1]):
         x = X[:, i]
         x_type = type_of_target(x)
         if x_type == 'continuous':
             x1 = self.discrete(x)
             temp.append(x1)
         else:
             temp.append(x)
     return np.array(temp).T
Example #22
def is_cat(s: pd.Series, consider_ordinal_as_cat):
    for elem in s:
        if isinstance(elem, (float, int)):
            continue
        else:
            return True
    if consider_ordinal_as_cat:
        if isinstance(s, np.ndarray):
            s = pd.Series(s)
        s = s.dropna()
        if s.dtype == object:
            s = s.astype('float32')
        tp = type_of_target(s)
        if tp in ("multiclass",):
            return True
    return False
Example #23
 def test_regression_conversion(self):
     """
     Makes sure that a regression input
     properly retains the continuous target type
     """
     for input_object in [
         [1.0, 76.9, 123, 4.0, 81.1],
         np.array([1.0, 76.9, 123, 4.0, 81.1]),
         pd.DataFrame([1.0, 76.9, 123, 4.0, 81.1]),
     ]:
         validator = InputValidator()
         y_train = validator.validate_target(
             input_object,
             is_classification=False,
         )
         self.assertEqual('continuous', type_of_target(y_train))
Example #24
    def _encode_class_labels(self, y):
        """
        Fit the internal label encoder and return encoded labels.
        """
        self.type_of_target_ = type_of_target(y)
        if self.type_of_target_ in ("binary", "multiclass"):
            self.labels_are_encoded = True
            self.label_encoder_ = LabelEncoder()
            encoded_y = self.label_encoder_.fit_transform(y)
        else:
            msg = ("CascadeForestClassifier is used for binary and multiclass"
                   " classification, wheras the training labels seem not to"
                   " be any one of them.")
            raise ValueError(msg)

        return encoded_y
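An illustrative run of the label-encoding branch above (hypothetical string labels, assuming scikit-learn's LabelEncoder):

from sklearn.preprocessing import LabelEncoder

y = ['cat', 'dog', 'dog', 'bird']        # type_of_target(y) == 'multiclass'
le = LabelEncoder()
print(le.fit_transform(y))               # [1 2 2 0]
print(le.classes_)                       # ['bird' 'cat' 'dog']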
Example #25
def check_cv(
    cv: Union[int, Iterable, BaseCrossValidator] = 5,
    y: Optional[Union[pd.Series, np.ndarray]] = None,
    stratified: bool = False,
    random_state: int = 0,
):
    if cv is None:
        cv = 5
    if isinstance(cv, numbers.Integral):
        if (stratified and (y is not None)
                and (type_of_target(y) in ("binary", "multiclass"))):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        else:
            return KFold(cv, shuffle=True, random_state=random_state)

    return model_selection.check_cv(cv, y, stratified)
Example #26
 def feature_discretion(self, X):
     '''
     Discretize the continuous features of the input data X, and keep other features unchanged.
     :param X : numpy array
     :return: the numpy array in which all continuous features are discretized
     '''
     temp = []
     for i in range(0, X.shape[-1]):
         x = X[:, i]
         x_type = type_of_target(x)
         if x_type == 'continuous':
             x1 = self.discrete(x)
             temp.append(x1)
         else:
             temp.append(x)
     return np.array(temp).T
Example #27
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if ((pos_label == 0 or neg_label != 0) and sparse_output):
            with pytest.raises(ValueError):
                label_binarize(y,
                               classes=classes,
                               neg_label=neg_label,
                               pos_label=pos_label,
                               sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y,
                                   classes=classes,
                                   neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        binarized = binarized.fetch()
        if hasattr(binarized, 'raw'):
            binarized = binarized.raw
        assert_array_equal(toarray(binarized), expected)
        assert sp.issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label,
                            pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert binarized.issparse() == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert inverse_output.issparse() == sp.issparse(y)
Example #28
    def _check_X_y(self, X, y, accept_sparse=True):
        is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2
        if is_2d or type_of_target(y) != 'binary':
            raise TypeError("Only binary targets supported. For training "
                            "multiclass or multilabel models, you may use the "
                            "OneVsRest or OneVsAll metaestimators in "
                            "scikit-learn.")

        X, Y = check_X_y(X,
                         y,
                         dtype=np.double,
                         accept_sparse=accept_sparse,
                         multi_output=False)

        self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
        y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
        return X, y
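A hedged illustration of the binarisation step above: with neg_label=-1, a binary target becomes the {-1, +1} column expected by margin-based solvers.

import numpy as np
from sklearn.preprocessing import LabelBinarizer

y = np.array([0, 1, 1, 0])
lb = LabelBinarizer(pos_label=1, neg_label=-1)
print(lb.fit_transform(y).ravel().astype(np.double))   # [-1.  1.  1. -1.]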
Example #29
 def fit(self, x, y):
     """
     Parameters
     ----------
     x: np.ndarray
     y: np.ndarray, 1D
     """
     x, y = check_X_y(x, y, "csc")
     self.x0 = x
     self.y0 = y
     cv = self.cv
     if isinstance(cv, numbers.Integral):
         if (type_of_target(y) in ('binary', 'multiclass')):
             cv = StratifiedKFold(cv)
         else:
             cv = KFold(cv)
     self.kf = list(cv.split(x))
Example #30
def _make_1st_stage_preds(X, y, X_test):
    if type_of_target(y) == 'continuous':
        models = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0)
        ]
    else:
        models = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0)
        ]

    results = [cross_validate(m, X, y, X_test, cv=5) for m in models]

    return [r.oof_prediction for r in results], [r.test_prediction for r in results]
Example #31
 def _predict_and_score(self, X_test, y_test):
     #XXX: Implement type_of_target(y)
     
     if(self.predict_proba):
         y_type = type_of_target(y_test)
         if y_type == 'binary':
             pred = self.model.predict_proba(X_test)[:,1]
         else:
             pred = self.model.predict_proba(X_test)
             
     else:
         pred = self.model.predict(X_test)
     
     if(self.multiclass_average == 'binary'):
         return self.metric(y_test, pred), pred
     else:
         return self.metric(y_test, pred, average=self.multiclass_average), pred
Example #32
def dichotomize_vector(y, n_bins, ordered=False):
    y = np.squeeze(y)

    if type_of_target(y) == 'multiclass':
        print('target could be multiclass!')
    splitter = MaxentropyMedianDichotomizationTransformer(n_bins)
    y_unique = np.unique(y)

    if n_bins < y_unique.shape[0]:
        splitter.fit(y.reshape(-1, 1))
    else:
        return np.array(map_continuous_names(y))

    if ordered:
        return np.squeeze(splitter.transform_ordered(y.reshape(-1, 1)))
    else:
        return np.squeeze(splitter.transform(y.reshape(-1, 1)))
Example #33
    def _is_multilabel(self, y):
        """
        Return whether the given target array corresponds to a multilabel
        problem.
        """
        temp_y = y.copy()
        temp_y[np.zeros_like(temp_y, dtype=bool) | (temp_y == -1)] = 1
        target_type = type_of_target(temp_y)

        if target_type in ['binary', 'multiclass']:
            return False
        elif target_type == 'multilabel-indicator':
            return True
        else:
            # Raise an error, as in
            # sklearn.utils.multiclass.check_classification_targets.
            raise ValueError("Unknown label type: %r" % y)
Example #34
 def _check_data(self, obj_dml_data):
     if obj_dml_data.z_cols is not None:
         raise ValueError(
             'Incompatible data. ' + ' and '.join(obj_dml_data.z_cols) +
             ' have been set as instrumental variable(s). '
             'To fit an interactive IV regression model use DoubleMLIIVM instead of DoubleMLIRM.'
         )
     one_treat = (obj_dml_data.n_treat == 1)
     binary_treat = (type_of_target(obj_dml_data.d) == 'binary')
     zero_one_treat = np.all((np.power(obj_dml_data.d, 2) -
                              obj_dml_data.d) == 0)
     if not (one_treat & binary_treat & zero_one_treat):
         raise ValueError('Incompatible data. '
                          'To fit an IRM model with DML '
                          'exactly one binary variable with values 0 and 1 '
                          'needs to be specified as treatment variable.')
     return
Example #35
 def test_multiclass_conversion(self):
     """
     Makes sure that an encoded target for classification
     properly retains the multiclass target type
     """
     # Multiclass conversion for different datatype
     for input_object in [
         [1.0, 2.0, 2.0, 4.0, 3],
         np.array([1.0, 2.0, 2.0, 4.0, 3], dtype=np.float64),
         pd.DataFrame([1.0, 2.0, 2.0, 4.0, 3], dtype='category'),
     ]:
         validator = InputValidator()
         y_train = validator.validate_target(
             input_object,
             is_classification=True,
         )
         self.assertEqual('multiclass', type_of_target(y_train))
Example #36
    def _is_multilabel(self, y):
        """
        Return whether the given target array corresponds to a multilabel
        problem.
        """
        temp_y = y.copy()
        temp_y[np.zeros_like(temp_y, dtype=bool) | (temp_y == -1)] = 1
        target_type = type_of_target(temp_y)

        if target_type in ['binary', 'multiclass']:
            return False
        elif target_type == 'multilabel-indicator':
            return True
        else:
            # Raise an error, as in
            # sklearn.utils.multiclass.check_classification_targets.
            raise ValueError("Unknown label type: %r" % y)
Example #37
 def test_multilabel_conversion(self):
     """
     Makes sure that an encoded target for classification
     properly retains the multilabel target type
     """
     # Multi-label conversion for different datatype
     for input_object in [
         [[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]],
         np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
         pd.DataFrame([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]], dtype='category'),
     ]:
         validator = InputValidator()
         y_train = validator.validate_target(
             input_object,
             is_classification=True,
         )
         self.assertEqual('multilabel-indicator', type_of_target(y_train))
Example #38
    def _validate_targets(self, y):
        """
        Validates labels for training and testing classifier
        """

        y_ = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y_, return_inverse=True)

        # Make sure that labels are binary
        if type_of_target(y) == 'binary':

            return y

        else:

            print("Labels must be binary. That is, +1 or -1")
Example #39
    def validate(self, test_data, test_target):

        best_model = self.sklearn_class(**self.best_params)
        best_model.fit(self.data, self.target)
        prediction = best_model.predict(test_data)
        score = best_model.score(test_data, test_target)

        if 'classification' in str(self.primitive_class):
            type_target = type_of_target(test_target)

            if type_target == "binary":
                series_target = pd.Series(test_target)
                positive_label = series_target.value_counts().index[1]
                scores_dict = self._classification_scoring(
                    test_target,
                    prediction,
                    average_type='binary',
                    positive_label=positive_label)
                roc_auc = roc_auc_score(test_target, prediction)
                scores_dict['roc_auc'] = roc_auc
                scores_dict['score'] = score

            elif type_target == "multiclass":
                scores_dict = self._classification_scoring(
                    test_target, prediction, average_type='macro')
                scores_dict['score'] = score

        elif 'regression' in str(self.primitive_class):
            r2 = r2_score(test_target, prediction)
            mse = mean_squared_error(test_target, prediction)
            explained_variance = explained_variance_score(
                test_target, prediction)
            scores_dict = {
                'optimization_technique': 'hb',
                'estimator': str(self.primitive_class),
                'dataset': self.dataset_name,
                'r2': r2,
                'explained_variance_score': explained_variance,
                'mean_squared_error': mse,
                'max_evals': self.MAX_EVALS,
                'total_time': self.run_time,
                'best_params': self.best_params,
                'score': score
            }

        return scores_dict
Example #40
 def test_continuous_multioutput_conversion(self):
     """
     Makes sure that an input for regression
     properly retains the multioutput continuous target type
     """
     # Regression multi out conversion for different datatype
     for input_object in [
         [[31.4, 94], [40.5, 109], [25.0, 30]],
         np.array([[31.4, 94], [40.5, 109], [25.0, 30]]),
         pd.DataFrame([[31.4, 94], [40.5, 109], [25.0, 30]]),
     ]:
         validator = InputValidator()
         y_train = validator.validate_target(
             input_object,
             is_classification=False,
         )
         self.assertEqual('continuous-multioutput', type_of_target(y_train))
Example #41
def check_holdout(holdout, X, y, classifier=True):
    is_sparse = sp.issparse(X)
    if holdout is None:
        holdout = 0.8
    if isinstance(holdout, numbers.Integral):
        if classifier:
            if type_of_target(y) in ['binary', 'multiclass']:
                holdout = StratifiedShuffleSplit(y, train_size=holdout)
            else:
                holdout = ShuffleSplit(_num_samples(y), train_size=holdout)
        else:
            if not is_sparse:
                n_samples = len(X)
            else:
                n_samples = X.shape[0]
            holdout = ShuffleSplit(n_samples, train_size=holdout)
    return holdout
Example #42
def get_ml_task_from_y(y):
    from autoflow.constants import binary_classification_task, multiclass_classification_task, \
        multilabel_classification_task, regression_task
    y_type = type_of_target(y)
    if y_type == "binary":
        ml_task = binary_classification_task
    elif y_type == "multiclass":
        ml_task = multiclass_classification_task
    elif y_type == "multilabel-indicator":
        ml_task = multilabel_classification_task
    elif y_type == "multiclass-multioutput":
        raise NotImplementedError()
    elif y_type == "continuous":
        ml_task = regression_task
    else:
        raise NotImplementedError()
    return ml_task
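Hypothetical outputs of the dispatcher above (the task constants come from the autoflow import and are assumptions here):

print(get_ml_task_from_y([0, 1, 1, 0]))        # binary_classification_task
print(get_ml_task_from_y([1, 2, 3]))           # multiclass_classification_task
print(get_ml_task_from_y([[1, 0], [0, 1]]))    # multilabel_classification_task
print(get_ml_task_from_y([0.2, 1.7, 3.3]))     # regression_task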
Example #43
    def compute_metrics(self,
                        targets,
                        predictions,
                        scores=None,
                        target_field=None,
                        prediction_field=None,
                        score_field=None):
        """
        Compute and track metrics for confusion_matrix

        Parameters
        ----------
        targets : List
            targets (or actuals) for validation
        predictions : List
            predictions (or inferred values)
        scores : List, optional
            associated scores for each prediction
        target_field : str, optional
        prediction_field : str, optional
        score_field : str, optional


        Raises
        ------
        NotImplementedError

        """
        tgt_type = type_of_target(targets)
        if tgt_type not in ("binary", "multiclass"):
            raise NotImplementedError("target type not supported yet")
        # if score are not present set them to 1.
        if scores is None:
            scores = np.ones(len(targets))

        scores = np.array(scores)

        # compute confusion_matrix
        self.metrics.compute_confusion_matrix(
            predictions=predictions,
            targets=targets,
            scores=scores,
            target_field=target_field,
            prediction_field=prediction_field,
            score_field=score_field)
Example #44
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if ((pos_label == 0 or neg_label != 0) and sparse_output):
            assert_raises(ValueError,
                          label_binarize,
                          y,
                          classes,
                          neg_label=neg_label,
                          pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y,
                                   classes,
                                   neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label,
                            pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
Example #45
    def fit(self, data, **kwargs):
        """
        Fit the regressor to given training data.
        :param data: DataNode
        :return: self
        """
        self.metric = 'mse' if self.metric is None else self.metric

        # Check the task type: {continuous}
        task_type = type_of_target(data.data[1])
        if task_type in type_dict:
            task_type = type_dict[task_type]
        else:
            raise ValueError("Invalid Task Type: %s!" % task_type)
        self.task_type = task_type
        super().fit(data)

        return self
Example #46
    def fit(self, data: DataNode):
        """
        Fit the classifier to given training data.
        :param data: instance of DataNode
        :return: self
        """
        self.metric = 'acc' if self.metric is None else self.metric

        # Check the task type: {binary, multiclass}
        task_type = type_of_target(data.data[1])
        if task_type in type_dict:
            task_type = type_dict[task_type]
        else:
            raise ValueError("Invalid Task Type: %s!" % task_type)
        self.task_type = task_type
        super().fit(data)

        return self
Example #47
    def fit(self, X, y=None):
        """
        Fit the classification model.
        """
        # The target determines what kind of estimator is fit
        ttype = type_of_target(y)
        if ttype.startswith(MULTICLASS):
            self.target_type_ = MULTICLASS
        elif ttype.startswith(BINARY):
            self.target_type_ = BINARY
        else:
            raise YellowbrickValueError(
                ("{} does not support target type '{}', "
                 "please provide a binary or multiclass single-output target"
                 ).format(self.__class__.__name__, ttype))

        # Fit the model and return self
        return super(ROCAUC, self).fit(X, y)
Example #48
    def __call__(self, clf, X, y_true, sample_weight=None, lamb=None):
        """Evaluate decision function output for X relative to y_true.

        Parameters
        ----------
        clf : object
            Trained classifier to use for scoring. Must have either a
            decision_function method or a predict_proba method; the output of
            that is used to compute the score.

        X : array-like or sparse matrix
            Test data that will be fed to clf.decision_function or
            clf.predict_proba.

        y_true : array-like
            Gold standard target values for X. These must be class labels,
            not decision function values.

        sample_weight : array-like, optional (default=None)
            Sample weights.

        lamb : array, shape (n_lambda,)
            Values of lambda from lambda_path_ from which to score predictions.

        Returns
        -------
        score : array, shape (n_lambda,)
            Score function applied to prediction of estimator on X.
        """
        y_type = type_of_target(y_true)
        if y_type not in ("binary", "multilabel-indicator"):
            raise ValueError("{0} format is not supported".format(y_type))

        y_pred = clf.decision_function(X, lamb=lamb)
        if sample_weight is not None:
            scores = np.apply_along_axis(
                lambda y_hat: self._score_func(
                    y_true, y_hat, sample_weight=sample_weight, **self._kwargs
                ), 0, y_pred)
        else:
            scores = np.apply_along_axis(
                lambda y_hat: self._score_func(y_true, y_hat, **self._kwargs),
                0, y_pred)
        return self._sign * scores
Example #49
    def partial_fit(self, X, y=None, forget=False, update_classes=False, compute_output_weights=True) -> ELMClassifier:
        """Update classifier with a new batch of data.

        |method_partial_fit|

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape=[n_samples, n_features]
            Training input samples

        y : array-like, shape=[n_samples, n_targets]
            Training targets

        forget : boolean, default False
            |param_forget|

        update_classes : boolean, default False
            Include new classes from `y` into the model, assuming they were 0 in all previous samples.

        compute_output_weights : boolean, optional, default True
            |param_compute_output_weights|
        """

        #todo: Warning on strongly non-normalized data

        X, y = check_X_y(X, y, accept_sparse=True, multi_output=True)

        # init label binarizer if needed
        if not hasattr(self, 'label_binarizer_'):
            self.label_binarizer_ = LabelBinarizer()
            if type_of_target(y).endswith("-multioutput"):
                self.label_binarizer_ = MultiLabelBinarizer()
            self.label_binarizer_.fit(self.classes if self.classes is not None else y)

        if update_classes:
            self._update_classes(y)

        y_numeric = self.label_binarizer_.transform(y)
        if len(y_numeric.shape) > 1 and y_numeric.shape[1] == 1:
            y_numeric = y_numeric[:, 0]

        super().partial_fit(X, y_numeric, forget=forget, compute_output_weights=compute_output_weights)
        return self
Example #50
    def __call__(self, y_true, y_pred, sample_weight=None):
        """Evaluate predicted target values for X relative to y_true.

        Parameters
        ----------
        y_true : array-like
            Gold standard target values for X.

        y_pred : array-like, [n_samples x n_classes]
            Model predictions

        sample_weight : array-like, optional (default=None)
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of estimator on X.
        """

        if isinstance(y_true, list):
            y_true = np.array(y_true)
        if isinstance(y_pred, list):
            y_pred = np.array(y_pred)
        type_true = type_of_target(y_true)

        if (len(y_pred.shape) == 1 or y_pred.shape[1] == 1
                or type_true == 'continuous'):
            pass  # must be regression, all other task types would return at least two probabilities
        elif type_true in ['binary', 'multiclass']:
            y_pred = np.argmax(y_pred, axis=1)
        elif type_true == 'multilabel-indicator':
            y_pred[y_pred > 0.5] = 1.0
            y_pred[y_pred <= 0.5] = 0.0
        else:
            raise ValueError(type_true)

        if sample_weight is not None:
            return self._sign * self._score_func(
                y_true, y_pred, sample_weight=sample_weight, **self._kwargs)
        else:
            return self._sign * self._score_func(y_true, y_pred, **
                                                 self._kwargs)
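A short sketch of the prediction post-processing above (hypothetical probabilities): single-label targets are collapsed with argmax, multilabel-indicator targets are thresholded at 0.5.

import numpy as np

proba = np.array([[0.2, 0.8],
                  [0.9, 0.1]])
print(np.argmax(proba, axis=1))        # [1 0] -> hard class labels

multi = np.array([[0.7, 0.4, 0.9]])
print((multi > 0.5).astype(float))     # [[1. 0. 1.]]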
Example #51
def is_cat(s: Union[pd.Series, np.ndarray], consider_ordinal_as_cat):
    if not isinstance(s, pd.Series):
        s = pd.Series(s)
    if s.dtype == object:
        for elem in s:
            if isinstance(elem, (float, int)):
                continue
            else:
                return True
        s = s.astype('float32')
    if consider_ordinal_as_cat:
        valid_types = ["multiclass"]
        if consider_ordinal_as_cat in (2, "binary"):
            valid_types += ["binary"]
        s = s.dropna()
        tp = type_of_target(s)
        if tp in valid_types:
            return True
    return False
Example #52
 def feature_discretion(self, X, y):
     """
     Discretize the continuous features of the input data X, and keep other features unchanged.
     :param X : numpy array
     :return: the numpy array in which all continuous features are discretized
     """
     temp, X_interval = [], []
     if self._DISCRETION == "percentile_discrete":
         for i in range(0, X.shape[-1]):
             x = X[:, i]
             x_type = type_of_target(x)
             # logging.info("before: "+" ".join([str(i), str(set(X[:, i])), str(x_type)]))
             if 0:
                 if x_type == 'continuous':
                     x1, interval = self.percentile_discrete(x, self._WOE_N)
                     X_interval.append(interval)
                     temp.append(x1)
                     # logging.info("continue_after: " + " ".join([str(i), str(set(x1)), str(x1)]))
                 else:
                     temp.append(x)
                     # logging.info("after: " + " ".join([str(i), str(set(x)), str(x)]))
             else:
                 x1, interval = self.percentile_discrete(x, self._WOE_N)
                 X_interval.append(interval)
                 temp.append(x1)
                 # logging.info("continue_after: " + " ".join([str(i), str(set(x1)), str(x1)]))
     elif self._DISCRETION == "interval_discrete":
         for i in range(0, X.shape[-1]):
             x = X[:, i]
             # logging.info("before: "+" ".join([str(i), str(set(X[:, i]))]))
             x1, interval = self.interval_discrete(x, self._WOE_N)
             X_interval.append(interval)
             temp.append(x1)
             # logging.info("interval_after: " + " ".join([str(i), str(set(x1)), str(x1)]))
     elif self._DISCRETION == "rf_discrete":
         for i in range(0, X.shape[-1]):
             x = X[:, i]
             # logging.info("before: "+" ".join([str(i), str(set(X[:, i]))]))
             x1, interval = self.rf_discrete(x, y)
             X_interval.append(interval)
             temp.append(x1)
             # logging.info("rf_after: " + " ".join([str(i), str(set(x1)), str(x1)]))
     return np.array(temp).T, X_interval
Example #53
def train_nb(X, y):
    m, n = X.shape
    p1 = (len(y[y == '是']) + 1) / (m + 2)  # Laplace smoothing

    p1_list = []  # conditional probabilities of each attribute given the positive class
    p0_list = []

    X1 = X[y == '是']
    X0 = X[y == '否']

    m1, _ = X1.shape
    m0, _ = X0.shape

    for i in range(n):
        xi = X.iloc[:, i]
        p_xi = namedtuple(X.columns[i],
                          ['is_continuous', 'conditional_pro'])  # stores the statistics for each attribute

        is_continuous = type_of_target(xi) == 'continuous'
        xi1 = X1.iloc[:, i]
        xi0 = X0.iloc[:, i]
        if is_continuous:  # for continuous values, conditional_pro stores [mean, var], i.e. the mean and variance
            xi1_mean = np.mean(xi1)
            xi1_var = np.var(xi1)
            xi0_mean = np.mean(xi0)
            xi0_var = np.var(xi0)

            p1_list.append(p_xi(is_continuous, [xi1_mean, xi1_var]))
            p0_list.append(p_xi(is_continuous, [xi0_mean, xi0_var]))
        else:  # for discrete values, compute the conditional probability of each category directly
            unique_value = xi.unique()  # distinct values
            nvalue = len(unique_value)  # number of distinct values

            xi1_value_count = pd.value_counts(xi1)[unique_value].fillna(
                0) + 1  # count of each value among positive samples, plus 1 for Laplace smoothing
            xi0_value_count = pd.value_counts(xi0)[unique_value].fillna(0) + 1

            p1_list.append(
                p_xi(is_continuous, np.log(xi1_value_count / (m1 + nvalue))))
            p0_list.append(
                p_xi(is_continuous, np.log(xi0_value_count / (m0 + nvalue))))

    return p1, p1_list, p0_list
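A quick worked instance of the Laplace-smoothed estimates above (hypothetical counts): with m = 17 samples of which 8 are positive, the prior is p1 = (8 + 1) / (17 + 2) ≈ 0.474, and a discrete attribute value seen 2 times among the m1 = 8 positive samples contributes log((2 + 1) / (8 + nvalue)) to p1_list when the attribute has nvalue distinct values.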
Example #54
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if ((pos_label == 0 or neg_label != 0) and sparse_output):
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            inversed = _inverse_binarize_thresholding(binarized,
                                                      output_type=y_type,
                                                      classes=classes,
                                                      threshold=((neg_label +
                                                                 pos_label) /
                                                                 2.))

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
Example #55
    def __call__(self, clf, X, y_true, sample_weight=None, lamb=None):
        """Evaluate decision function output for X relative to y_true.

        Parameters
        ----------
        clf : object
            Trained classifier to use for scoring. Must have either a
            decision_function method or a predict_proba method; the output of
            that is used to compute the score.

        X : array-like or sparse matrix
            Test data that will be fed to clf.decision_function or
            clf.predict_proba.

        y_true : array-like
            Gold standard target values for X. These must be class labels,
            not decision function values.

        sample_weight : array-like, optional (default=None)
            Sample weights.

        lamb : array, shape (n_lambda,)
            Values of lambda from lambda_path_ from which to score predictions.

        Returns
        -------
        score : array, shape (n_lambda,)
            Score function applied to prediction of estimator on X.
        """
        y_type = type_of_target(y_true)
        if y_type not in ("binary", "multilabel-indicator"):
            raise ValueError("{0} format is not supported".format(y_type))

        y_pred = clf.decision_function(X, lamb=lamb)
        if sample_weight is not None:
            scores = np.apply_along_axis(lambda y_hat: self._score_func(y_true, y_hat, sample_weight=sample_weight, **self._kwargs), 0, y_pred)
        else:
            scores = np.apply_along_axis(lambda y_hat: self._score_func(y_true, y_hat, **self._kwargs), 0, y_pred)
        return self._sign * scores
Example #56
def _sampling_strategy_float(sampling_strategy, y, sampling_type):
    """Take a proportion of the majority (over-sampling) or minority
    (under-sampling) class in binary classification."""
    type_y = type_of_target(y)
    if type_y != 'binary':
        raise ValueError(
            '"sampling_strategy" can be a float only when the type '
            'of target is binary. For multi-class, use a dict.')
    target_stats = _count_class_sample(y)
    if sampling_type == 'over-sampling':
        n_sample_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_majority * sampling_strategy - value)
            for (key, value) in target_stats.items() if key != class_majority
        }
        if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]):
            raise ValueError("The specified ratio required to remove samples "
                             "from the minority class while trying to "
                             "generate new samples. Please increase the "
                             "ratio.")
    elif (sampling_type == 'under-sampling'):
        n_sample_minority = min(target_stats.values())
        class_minority = min(target_stats, key=target_stats.get)
        sampling_strategy_ = {
            key: int(n_sample_minority / sampling_strategy)
            for (key, value) in target_stats.items() if key != class_minority
        }
        if any([n_samples > target_stats[target]
               for target, n_samples in sampling_strategy_.items()]):
            raise ValueError("The specified ratio required to generate new "
                             "sample in the majority class while trying to "
                             "remove samples. Please increase the ratio.")
    else:
        raise ValueError("'clean-sampling' methods do let the user "
                         "specify the sampling ratio.")
    return sampling_strategy_
Example #57
def analyse_results(
        regular_cv_results, permutation_cv_results, labels, estimator,
        base_folder=None, analysis_folder='analysis', feature_names=None,
        learning_task=None, vs_analysis=None,
        threshold=.75, model_assessment_options=None,
        score_surfaces_options=None):
    """Summary and plot generation."""

    # learning_task follows the convention of
    # sklearn.utils.multiclass.type_of_target
    if learning_task is None:
        if is_regressor(estimator):
            learning_task = 'continuous'
        else:
            learning_task = type_of_target(labels)

    # Create an empty dictionary which will contain the key results
    # of the analysis
    analysis_summary = dict()

    # Run the appropriate analysis according to the learning_task
    is_regression = learning_task.lower() in ('continuous', 'regression')
    if is_regression:
        # Perform regression analysis
        target = 'regression'
    elif learning_task.lower() == 'multiclass':
        target = 'multiclass'
    else:
        # Perform classification analysis
        target = 'classification'

    # Support for empty regular or permutation tests
    performance_regular = performance_metrics(
        regular_cv_results, labels, target)
    performance_permutation = performance_metrics(
        permutation_cv_results, labels, target)
    if base_folder is not None and analysis_folder is not None:
        analysis_folder = os.path.join(base_folder, analysis_folder)
        if not os.path.exists(analysis_folder):
            os.makedirs(analysis_folder)

        # ### Create two separate folders for figures in different formats
        try:
            os.mkdir(os.path.join(analysis_folder, 'figures_pdf'))
            os.mkdir(os.path.join(analysis_folder, 'figures_png'))
        except OSError:
            pass  # if folder already exists, ignore it
    else:
        analysis_folder = None

    if model_assessment_options is None:
        model_assessment_options = {}
    # Handle variable selection step
    if vs_analysis is not None:
        # Get feature names
        if feature_names is None:
            # what follows creates [feat_0, feat_1, ..., feat_d]
            # feature_names = 'feat_' + np.arange(
            #     labels.size).astype(str).astype(object)
            raise ValueError(
                "Variable selection analysis was specified, but no feature "
                "names were provided.")

        feature_names = np.array(feature_names)  # force feature names to array
        if threshold is None:
            threshold = .75
        selected = {}
        # Init variable selection containers
        selected['regular'] = dict(zip(feature_names,
                                       np.zeros(len(feature_names))))
        selected['permutation'] = selected['regular'].copy()

        n_splits_regular = len(list(regular_cv_results.values() or [[]])[0])
        n_splits_permutation = len(
            list(permutation_cv_results.values() or [[]])[0])
        n_jobs = {'regular': n_splits_regular,
                  'permutation': n_splits_permutation}
        names_ = ('regular', 'permutation')
        cv_results_ = (regular_cv_results, permutation_cv_results)
        for batch_name, cv_result in zip(names_, cv_results_):
            # cv_result['estimator'] is a list containing
            # the grid-search estimators
            estimators = cv_result.get('estimator', None)
            if estimators is None:
                continue  # in case of no permutations skip this iter
            for estimator in estimators:
                selected_list = get_selected_list(
                    estimator, vs_analysis)
                if len(selected_list) < 1:
                    continue
                selected_variables = feature_names[selected_list]

                for var in selected_variables:
                    selected[batch_name][var] += 1. / n_jobs[batch_name]

            # Save selected variables textual summary
            if analysis_folder is not None:
                save_signature(os.path.join(
                    analysis_folder, 'signature_%s.txt' % batch_name),
                    selected[batch_name], threshold)

            # Also save the frequency list as an entry of the analysis summary

            # Create an empty pandas dataframe to store the frequencies
            df_tmp = pd.DataFrame(columns=['Frequency'])

            for k in reversed(sorted(
                    selected[batch_name],
                    key=selected[batch_name].__getitem__)):
                df_tmp.loc[k] = selected[batch_name][k] * 100

            # Add the dataframe to the analysis summary
            analysis_summary['selection_frequency_{}'.format(batch_name)] = df_tmp

        feat_arr_r = np.array(list(iteritems(selected['regular'])), dtype=object)
        feat_arr_p = np.array(list(iteritems(selected['permutation'])), dtype=object)

        # sort by name
        feat_arr_r = feat_arr_r[feat_arr_r[:, 0].argsort()]
        feat_arr_p = feat_arr_p[feat_arr_p[:, 0].argsort()]

        # Save graphical summary
        plotting.feature_frequencies(
            feat_arr_r, analysis_folder,
            threshold=threshold)

        plotting.features_manhattan(
            feat_arr_r, feat_arr_p, analysis_folder,
            threshold=threshold)

        plotting.select_over_threshold(
            feat_arr_r, feat_arr_p, analysis_folder,
            threshold=threshold)

    # Generate distribution plots
    # And save distributions in analysis summary
    for i, metric in enumerate(performance_regular):
        plotting.distributions(
            v_regular=performance_regular[metric],
            v_permutation=performance_permutation.get(metric, []),
            base_folder=analysis_folder,
            metric=metric,
            first_run=i == 0,
            is_regression=is_regression)

        v_regular = performance_regular[metric]
        v_permutation = performance_permutation.get(metric, [])

        metric_values = dict()
        metric_values['values_regular'] = v_regular
        metric_values['values_permutation'] = v_permutation

        r_mean, r_sd = np.nanmean(v_regular), np.nanstd(v_regular)
        p_mean, p_sd = np.nanmean(v_permutation), np.nanstd(v_permutation)
        rstest = stats.ks_2samp(v_regular, v_permutation)

        metric_values['mean_regular'] = r_mean
        metric_values['sd_regular'] = r_sd

        metric_values['mean_permutation'] = p_mean
        metric_values['sd_permutation'] = p_sd

        metric_values['rstest'] = rstest

        analysis_summary['metric_{}'.format(metric)] = metric_values

    # Generate surfaces
    # This is meaningful only if the estimator is an instance of GridSearchCV
    if isinstance(estimator, BaseSearchCV):
        if score_surfaces_options is None:
            score_surfaces_options = {}
        plotting.score_surfaces(
            param_grid=estimator.param_grid,
            results=regular_cv_results,
            base_folder=analysis_folder,
            is_regression=is_regression,
            **score_surfaces_options)

    # Finally, save in the analysis folder the pickled summary
    if analysis_folder is not None:
        # pickle requires a binary file handle
        with open(os.path.join(analysis_folder, 'summary.pkl'), 'wb') as af:
            pkl.dump(analysis_summary, af)
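To make the per-metric summary computed in the loop above concrete, here is a minimal standalone sketch; the two score arrays are invented and stand in for the regular and permutation cross-validation results.

import numpy as np
from scipy import stats

# Hypothetical accuracy scores from regular CV splits and label-permuted splits.
v_regular = np.array([0.82, 0.85, 0.80, 0.83, 0.84])
v_permutation = np.array([0.51, 0.49, 0.52, 0.48, 0.50])

metric_values = {
    'values_regular': v_regular,
    'values_permutation': v_permutation,
    'mean_regular': np.nanmean(v_regular),
    'sd_regular': np.nanstd(v_regular),
    'mean_permutation': np.nanmean(v_permutation),
    'sd_permutation': np.nanstd(v_permutation),
    # Two-sample Kolmogorov-Smirnov test comparing the two score distributions;
    # a small p-value suggests the regular scores are not explained by chance.
    'rstest': stats.ks_2samp(v_regular, v_permutation),
}
print(metric_values['mean_regular'], metric_values['rstest'].pvalue)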
Example #58
0
    def fit(self, X, y, **kwargs):
        """
        Fit is the entry point for the visualizer. Given instances described
        by X and binary classes described in the target y, fit performs n
        trials by shuffling and splitting the dataset then computing the
        precision, recall, f1, and queue rate scores for each trial. The
        scores are aggregated at the requested quantiles and then drawn.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values. The target y must
            be a binary classification target.

        kwargs: dict
            keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : instance
            Returns the instance of the visualizer

        raises: YellowbrickValueError
            If the target y is not a binary classification target.
        """
        # Check target before metrics raise crazy exceptions
        if type_of_target(y) != 'binary':
            raise YellowbrickValueError("multiclass format is not supported")

        # Make arrays indexable for cross validation
        X, y = indexable(X, y)

        # TODO: parallelize trials with joblib (using sklearn utility)
        # NOTE: parallelization with matplotlib is tricky at best!
        trials = [
            metric
            for idx in range(self.n_trials)
            for metric in self._split_fit_score_trial(X, y, idx)
        ]

        # Compute maximum number of uniform thresholds across all trials
        n_thresholds = np.array([len(t['thresholds']) for t in trials]).min()
        self.thresholds_ = np.linspace(0.0, 1.0, num=n_thresholds)

        # Filter metrics and collect values for uniform thresholds
        metrics = frozenset(METRICS) - self._check_exclude(self.exclude)
        uniform_metrics = defaultdict(list)

        for trial in trials:
            rows = defaultdict(list)
            for t in self.thresholds_:
                idx = bisect.bisect_left(trial['thresholds'], t)
                for metric in metrics:
                    rows[metric].append(trial[metric][idx])

            for metric, row in rows.items():
                uniform_metrics[metric].append(row)

        # Convert metrics to metric arrays
        uniform_metrics = {
            metric: np.array(values)
            for metric, values in uniform_metrics.items()
        }

        # Perform aggregation and store cv_scores_
        quantiles = self._check_quantiles(self.quantiles)
        self.cv_scores_ = {}

        for metric, values in uniform_metrics.items():
            # Compute the lower, median, and upper plots
            lower, median, upper = mstats.mquantiles(
                values, prob=quantiles, axis=0
            )

            # Store the aggregates in cv scores
            self.cv_scores_[metric] = median
            self.cv_scores_["{}_lower".format(metric)] = lower
            self.cv_scores_["{}_upper".format(metric)] = upper

        # Draw and always return self
        self.draw()
        return self
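For reference, a small self-contained sketch of the threshold-alignment step above: per-trial metrics computed on an irregular threshold grid are looked up against a common uniform grid with bisect_left. The trial data below is invented, and the metric array is assumed to be one element longer than the threshold grid (as with sklearn's precision_recall_curve output).

import bisect
import numpy as np

# Hypothetical trial: its own threshold grid plus a per-threshold metric.
# The metric has one more entry than the thresholds, mirroring
# sklearn.metrics.precision_recall_curve output shapes.
trial = {
    'thresholds': [0.1, 0.4, 0.7, 0.9],
    'precision': [0.50, 0.65, 0.80, 0.95, 1.00],
}

# Common uniform grid, analogous to self.thresholds_ above.
uniform_thresholds = np.linspace(0.0, 1.0, num=4)

# For each uniform threshold, take the metric value at the insertion point.
row = [trial['precision'][bisect.bisect_left(trial['thresholds'], t)]
       for t in uniform_thresholds]
print(row)  # [0.5, 0.65, 0.8, 1.0]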
Example #59
0
def test_check_classification_targets():
    # Test that check_classification_targets raises an error naming the
    # detected target type. #5782
    y = np.array([0.0, 1.1, 2.0, 3.0])
    msg = type_of_target(y)
    assert_raise_message(ValueError, msg, check_classification_targets, y)
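As a sanity check of what the test above exercises, here is a brief sketch using only public scikit-learn utilities to show why a continuous target is rejected.

import numpy as np
from sklearn.utils.multiclass import type_of_target, check_classification_targets

y = np.array([0.0, 1.1, 2.0, 3.0])
print(type_of_target(y))  # 'continuous'

try:
    check_classification_targets(y)  # continuous targets are not valid class labels
except ValueError as exc:
    print(exc)  # the error message mentions the detected target type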