Example no. 1
    def _fit_resample(self, X, y):
        n_samples = X.shape[0]

        # convert y to z-scores
        y_z = (y - y.mean()) / y.std()

        index0 = np.arange(n_samples)
        index_negative = index0[y_z > self.negative_thres]
        index_positive = index0[y_z <= self.positive_thres]
        index_unclassified = np.setdiff1d(
            index0, np.union1d(index_negative, index_positive))

        y_z[index_negative] = 0
        y_z[index_positive] = 1
        y_z[index_unclassified] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state)
        _, _ = ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (_safe_indexing(X, sample_indices),
                    _safe_indexing(y, sample_indices),
                    sample_indices)
        return (_safe_indexing(X, sample_indices),
                _safe_indexing(y, sample_indices))
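Every snippet on this page leans on `_safe_indexing`, scikit-learn's private helper that indexes rows or columns uniformly across lists, NumPy arrays, sparse matrices and pandas objects. A minimal sketch of its behaviour (assuming a recent scikit-learn; being private, its import location may change between versions):

import numpy as np
from sklearn.utils import _safe_indexing

X = np.arange(12).reshape(4, 3)
rows = _safe_indexing(X, [0, 2])              # select rows 0 and 2
col = _safe_indexing(X, [1], axis=1)          # select the second column
masked = _safe_indexing(X, np.array([True, False, True, False]))  # boolean mask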
Example no. 2
    def train_submission(self, module_path, X, y, train_idx=None):
        """Train the estimator of a given submission.

        Parameters
        ----------
        module_path : str
            The path to the submission where `filename` is located.
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            The data matrix.
        y : array-like of shape (n_samples,)
            The target vector.
        train_idx : array-like of shape (n_training_samples,), default=None
            The training indices. By default, the full dataset will be used
            to train the model. If an array is provided, `X` and `y` will be
            subsampled using these indices.

        Returns
        -------
        estimator : estimator object
            The scikit-learn estimator fitted on (`X`, `y`).
        """
        train_idx = slice(None, None, None) if train_idx is None else train_idx
        submission_module = import_module_from_source(
            os.path.join(module_path, self.filename),
            os.path.splitext(self.filename)[0],  # keep the module name only
            sanitize=True)
        estimator = submission_module.get_estimator()
        X_train = _safe_indexing(X, train_idx)
        y_train = _safe_indexing(y, train_idx)
        return estimator.fit(X_train, y_train)
Example no. 3
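The `indices` argument comes from pytest parametrization; a plausible decorator (an assumption, not part of the snippet) would be:

@pytest.mark.parametrize("indices", [None, [1, 3]])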
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']:
        assert result[key] is fit_params[key]

    assert result['list'] == _safe_indexing(fit_params['list'], indices_)
    assert_array_equal(
        result['array'], _safe_indexing(fit_params['array'], indices_)
    )
    assert_allclose_dense_sparse(
        result['sparse-col'],
        _safe_indexing(fit_params['sparse-col'], indices_)
    )
Example no. 4
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    indices_converted = _convert_container(indices_converted, indices_type)

    if isinstance(indices[0], str) and array_type != "dataframe":
        err_msg = (
            "Specifying the columns using strings is only supported "
            "for pandas DataFrames"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices_converted, axis=1)
    else:
        subset = _safe_indexing(array, indices_converted, axis=1)
        assert_allclose_dense_sparse(
            subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        )
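As a quick illustration of the axis=1 behaviour this test exercises, a minimal sketch (assuming pandas is installed; string keys are only accepted for DataFrames, exactly as the test asserts):

import pandas as pd
from sklearn.utils import _safe_indexing

df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["col_0", "col_1", "col_2"])
subset = _safe_indexing(df, ["col_1", "col_2"], axis=1)  # two columns by name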
Example no. 5
    def __getitem__(self, index):
        # indices of the samples belonging to the current mini-batch
        batch_indices = self.indices_[
            index * self.batch_size:(index + 1) * self.batch_size
        ]
        X_resampled = _safe_indexing(self.X, batch_indices)
        y_resampled = _safe_indexing(self.y, batch_indices)
        if issparse(X_resampled) and not self.keep_sparse:
            X_resampled = X_resampled.toarray()
        if self.sample_weight is not None:
            sample_weight_resampled = _safe_indexing(
                self.sample_weight, batch_indices
            )

        if self.sample_weight is None:
            return X_resampled, y_resampled
        else:
            return X_resampled, y_resampled, sample_weight_resampled
Example no. 6
    def _fit_resample(self, X, y):
        random_state = check_random_state(self.random_state)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                index_target_class = random_state.choice(
                    range(np.count_nonzero(y == target_class)),
                    size=n_samples,
                    replace=self.replacement,
                )
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (
                    idx_under,
                    np.flatnonzero(y == target_class)[index_target_class],
                ),
                axis=0,
            )

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
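The two methods above are the core of imbalanced-learn's RandomUnderSampler; a hedged sketch of the public API they implement:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
rus = RandomUnderSampler(random_state=0)
X_res, y_res = rus.fit_resample(X, y)
print(Counter(y_res))           # classes are now balanced
print(rus.sample_indices_[:5])  # the indices built by _fit_resample above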
Example no. 7
    def _fit_and_predict_oof_model(
        self,
        estimator: RegressorMixin,
        X: ArrayLike,
        y: ArrayLike,
        train_index: ArrayLike,
        val_index: ArrayLike,
        sample_weight: Optional[ArrayLike] = None,
    ) -> Tuple[RegressorMixin, NDArray, ArrayLike]:
        """
        Fit a single out-of-fold model on a given training set and
        perform predictions on a test set.

        Parameters
        ----------
        estimator : RegressorMixin
            Estimator to train.

        X : ArrayLike of shape (n_samples, n_features)
            Input data.

        y : ArrayLike of shape (n_samples,)
            Input labels.

        train_index : ArrayLike of shape (n_samples_train,)
            Training data indices.

        val_index : ArrayLike of shape (n_samples_val,)
            Validation data indices.

        sample_weight : Optional[ArrayLike] of shape (n_samples,)
            Sample weights. If None, then samples are equally weighted.
            By default ``None``.

        Returns
        -------
        Tuple[RegressorMixin, NDArray, ArrayLike]

        - [0]: RegressorMixin, fitted estimator.
        - [1]: NDArray of shape (n_samples_val,),
          estimator predictions on the validation fold.
        - [2]: ArrayLike of shape (n_samples_val,),
          validation data indices.
        """
        X_train = _safe_indexing(X, train_index)
        y_train = _safe_indexing(y, train_index)
        X_val = _safe_indexing(X, val_index)
        if sample_weight is None:
            estimator = fit_estimator(estimator, X_train, y_train)
        else:
            sample_weight_train = _safe_indexing(sample_weight, train_index)
            estimator = fit_estimator(
                estimator, X_train, y_train, sample_weight_train
            )
        if _num_samples(X_val) > 0:
            y_pred = estimator.predict(X_val)
        else:
            y_pred = np.array([])
        return estimator, y_pred, val_index
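The train/validation indexing pattern above generalizes beyond MAPIE; a minimal, self-contained sketch of the same out-of-fold loop with a plain scikit-learn regressor (the names here are illustrative, not MAPIE's):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.utils import _safe_indexing

X, y = np.random.randn(40, 3), np.random.randn(40)
for train_index, val_index in KFold(n_splits=5).split(X):
    est = LinearRegression().fit(
        _safe_indexing(X, train_index), _safe_indexing(y, train_index)
    )
    y_pred = est.predict(_safe_indexing(X, val_index))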
Example no. 8
    def _split_fit_score_trial(self, X, y, idx=0):
        """
        Splits the dataset, fits a clone of the estimator, then scores it
        according to the required metrics.

        The index of the split is added to the random_state if the
        random_state is not None; this ensures that every split is shuffled
        differently but in a deterministic fashion for testing purposes.
        """
        random_state = self.random_state
        if random_state is not None:
            random_state += idx

        splitter = self._check_cv(self.cv, random_state)

        for train_index, test_index in splitter.split(X, y):
            # Safe indexing handles multiple types of inputs including
            # DataFrames and structured arrays - required for generic splits.
            X_train = _safe_indexing(X, train_index)
            y_train = _safe_indexing(y, train_index)
            X_test = _safe_indexing(X, test_index)
            y_test = _safe_indexing(y, test_index)

            model = clone(self.estimator)
            model.fit(X_train, y_train)

            if hasattr(model, "predict_proba"):
                # Get the probabilities for the positive class
                y_scores = model.predict_proba(X_test)[:, 1]
            else:
                # Use the decision function to get the scores
                y_scores = model.decision_function(X_test)

            # Compute the curve metrics and thresholds
            curve_metrics = precision_recall_curve(y_test, y_scores)
            precision, recall, thresholds = curve_metrics

            # Compute the F-beta score from precision and recall.
            # No need to warn here: precision_recall_curve would already
            # have warned about ill-defined values.
            with np.errstate(divide="ignore", invalid="ignore"):
                beta_sq = self.fbeta ** 2
                f_score = ((1 + beta_sq) * precision * recall
                           / (beta_sq * precision + recall))

            # Ensure thresholds ends at 1
            thresholds = np.append(thresholds, 1)

            # Compute the queue rate
            queue_rate = np.array([(y_scores >= threshold).mean()
                                   for threshold in thresholds])

            yield {
                "thresholds": thresholds,
                "precision": precision,
                "recall": recall,
                "fscore": f_score,
                "queue_rate": queue_rate,
            }
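The score above is the standard F-beta formula, F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); note the snippet keeps beta squared in `beta_sq`. A scalar sanity check:

precision, recall, beta = 0.8, 0.5, 1.0
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
# beta = 1 gives the harmonic mean: 2 * 0.8 * 0.5 / 1.3 ≈ 0.615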
Example no. 9
    def _fit_resample(self, X, y):
        self._validate_estimator()
        enn = EditedNearestNeighbours(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.n_neighbors,
            kind_sel="mode",
            n_jobs=self.n_jobs,
        )
        enn.fit_resample(X, y)
        index_not_a1 = enn.sample_indices_
        index_a1 = np.ones(y.shape, dtype=bool)
        index_a1[index_not_a1] = False
        index_a1 = np.flatnonzero(index_a1)

        # clean the neighborhood
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)
        # compute which classes to consider for cleaning for the A2 group
        classes_under_sample = [
            c
            for c, n_samples in target_stats.items()
            if (
                c in self.sampling_strategy_.keys()
                and (n_samples > X.shape[0] * self.threshold_cleaning)
            )
        ]
        self.nn_.fit(X)
        class_minority_indices = np.flatnonzero(y == class_minority)
        X_class = _safe_indexing(X, class_minority_indices)
        y_class = _safe_indexing(y, class_minority_indices)
        nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
        nnhood_label = y[nnhood_idx]
        if self.kind_sel == "mode":
            nnhood_label_majority, _ = mode(nnhood_label, axis=1)
            nnhood_bool = np.ravel(nnhood_label_majority) == y_class
        elif self.kind_sel == "all":
            nnhood_label_majority = nnhood_label == class_minority
            nnhood_bool = np.all(nnhood_label_majority, axis=1)
        else:
            raise NotImplementedError
        # compute a2 group
        index_a2 = np.ravel(nnhood_idx[~nnhood_bool])
        index_a2 = np.unique(
            [index for index in index_a2 if y[index] in classes_under_sample]
        )

        union_a1_a2 = np.union1d(index_a1, index_a2).astype(int)
        selected_samples = np.ones(y.shape, dtype=bool)
        selected_samples[union_a1_a2] = False
        self.sample_indices_ = np.flatnonzero(selected_samples)

        return (
            _safe_indexing(X, self.sample_indices_),
            _safe_indexing(y, self.sample_indices_),
        )
Example no. 10
def check_null_weight(
        sample_weight: Optional[ArrayLike], X: ArrayLike,
        y: ArrayLike) -> Tuple[Optional[NDArray], ArrayLike, ArrayLike]:
    """
    Check sample weights and remove samples with null sample weights.

    Parameters
    ----------
    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights.
    X : ArrayLike of shape (n_samples, n_features)
        Training samples.
    y : ArrayLike of shape (n_samples,)
        Training labels.

    Returns
    -------
    sample_weight : Optional[NDArray] of shape (n_samples,)
        Non-null sample weights.

    X : ArrayLike of shape (n_samples, n_features)
        Training samples with non-null weights.

    y : ArrayLike of shape (n_samples,)
        Training labels with non-null weights.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.utils import check_null_weight
    >>> X = np.array([[0], [1], [2], [3], [4], [5]])
    >>> y = np.array([5, 7, 9, 11, 13, 15])
    >>> sample_weight = np.array([0, 1, 1, 1, 1, 1])
    >>> sample_weight, X, y = check_null_weight(sample_weight, X, y)
    >>> print(sample_weight)
    [1. 1. 1. 1. 1.]
    >>> print(X)
    [[1]
     [2]
     [3]
     [4]
     [5]]
    >>> print(y)
    [ 7  9 11 13 15]
    """
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        non_null_weight = sample_weight != 0
        X = _safe_indexing(X, non_null_weight)
        y = _safe_indexing(y, non_null_weight)
        sample_weight = _safe_indexing(sample_weight, non_null_weight)
    sample_weight = cast(Optional[NDArray], sample_weight)
    return sample_weight, X, y
Example no. 11
    def _fit_resample(self, X, y):
        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
        nn.fit(X)
        nns = nn.kneighbors(X, return_distance=False)[:, 1]

        links = self.is_tomek(y, nns, self.sampling_strategy_)
        self.sample_indices_ = np.flatnonzero(np.logical_not(links))

        return (
            _safe_indexing(X, self.sample_indices_),
            _safe_indexing(y, self.sample_indices_),
        )
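`is_tomek` and `sample_indices_` belong to imbalanced-learn's TomekLinks cleaner; a hedged usage sketch of the public API:

from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks

X, y = make_classification(weights=[0.8, 0.2], random_state=0)
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X, y)  # drops majority samples in Tomek links
kept = tl.sample_indices_             # set by _fit_resample above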
Example no. 12
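The `X_constructor` argument is supplied by pytest parametrization; a plausible decorator (an assumption, not shown in the snippet) would be:

@pytest.mark.parametrize("X_constructor", ["array", "series"])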
def test_safe_indexing_1d_array_error(X_constructor):
    # check that we are raising an error if the array-like passed is 1D and
    # we try to index on the 2nd dimension
    X = list(range(5))
    if X_constructor == 'array':
        X_constructor = np.asarray(X)
    elif X_constructor == 'series':
        pd = pytest.importorskip("pandas")
        X_constructor = pd.Series(X)

    err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(X_constructor, [0, 1], axis=1)
Example no. 13
def generator(X, y, sample_weight, indices, batch_size):
    # `keep_sparse` is captured from the enclosing scope
    while True:
        for index in range(0, len(indices), batch_size):
            X_res = _safe_indexing(X, indices[index:index + batch_size])
            y_res = _safe_indexing(y, indices[index:index + batch_size])
            if issparse(X_res) and not keep_sparse:
                X_res = X_res.toarray()
            if sample_weight is None:
                yield X_res, y_res
            else:
                sw_res = _safe_indexing(sample_weight,
                                        indices[index:index + batch_size])
                yield X_res, y_res, sw_res
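This closure backs imbalanced-learn's `balanced_batch_generator`; a hedged sketch of the public entry point, assuming Keras/TensorFlow is installed:

from sklearn.datasets import make_classification
from imblearn.keras import balanced_batch_generator

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
training_generator, steps_per_epoch = balanced_batch_generator(
    X, y, batch_size=16, random_state=0
)
X_batch, y_batch = next(training_generator)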
Example no. 14
def accumulated_local_effects(est, x, feature, n_quantiles):
    """Calculate the accumulated local effects (ALE) for a single feature."""
    features_indices = np.asarray(
        _get_column_indices(x, feature), dtype=np.int32, order='C'
    ).ravel()
    quantiles = _quantiles_from_x(
        _safe_indexing(x, features_indices, axis=1), n_quantiles
    )
    x_feat = _safe_indexing(x, feature, axis=1)
    # check the pandas dtype directly: converting to NumPy first would turn
    # a categorical column into object dtype and hide the category dtype
    if x_feat.dtype.name == "category" or x_feat.dtype == object:
        ale = _ale_for_categorical(est, quantiles, x, x_feat)
    else:
        ale = _ale_for_numeric(est, quantiles, x, x_feat)
    return ale
Example no. 15
    def _fit_resample(self, X, y, sample_weight=None):
        self._validate_estimator()

        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                # select a sample from the current class
                idx_maj = np.flatnonzero(y == target_class)
                sel_idx_maj = random_state.randint(
                    low=0,
                    high=target_stats[target_class],
                    size=self.n_seeds_S)
                idx_maj_sample = idx_maj[sel_idx_maj]

                minority_class_indices = np.flatnonzero(y == class_minority)
                C_indices = np.append(minority_class_indices, idx_maj_sample)

                # create the set composed of all minority samples and one
                # sample from the current class.
                C_x = _safe_indexing(X, C_indices)
                C_y = _safe_indexing(y, C_indices)

                # create the set S with removing the seed from S
                # since that it will be added anyway
                idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0)
                S_x = _safe_indexing(X, idx_maj_extracted)
                S_y = _safe_indexing(y, idx_maj_extracted)
                self.estimator_.fit(C_x, C_y)
                pred_S_y = self.estimator_.predict(S_x)

                S_misclassified_indices = np.flatnonzero(pred_S_y != S_y)
                idx_tmp = idx_maj_extracted[S_misclassified_indices]
                idx_under = np.concatenate(
                    (idx_under, idx_maj_sample, idx_tmp), axis=0)
            else:
                idx_under = np.concatenate(
                    (idx_under, np.flatnonzero(y == target_class)), axis=0)

        X_resampled = _safe_indexing(X, idx_under)
        y_resampled = _safe_indexing(y, idx_under)

        # apply Tomek cleaning
        tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys()))
        X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled)

        self.sample_indices_ = _safe_indexing(idx_under, tl.sample_indices_)

        idx_under = self.sample_indices_
        if sample_weight is not None:
            # sample_weight is already validated in self.fit_resample()
            sample_weight_under = _safe_indexing(sample_weight, idx_under)
            return X_cleaned, y_cleaned, sample_weight_under
        else:
            return X_cleaned, y_cleaned
Example no. 16
        def evaluate(D, sol):
            phenotype = SamplingBenchmark.map_to_phenotype(
                CustomSamplingBenchmark.to_phenotype(sol))
            X_sampled = _safe_indexing(self.X_train, phenotype)
            y_sampled = _safe_indexing(self.y_train, phenotype)

            if X_sampled.shape[0] > 0:
                cls = self.evaluator.fit(X_sampled, y_sampled)
                y_predicted = cls.predict(self.X_valid)
                quality = accuracy_score(self.y_valid, y_predicted)
                size_percentage = len(y_sampled) / len(sol)

                return (1 - quality) * size_percentage
            else:
                return math.inf
Example no. 17
    def _fit_transformer(self, y):
        """Check transformer and fit transformer.

        Create the default transformer, fit it and make additional inverse
        check on a subset (optional).

        """
        if (self.transformer is not None and
                (self.func is not None or self.inverse_func is not None)):
            raise ValueError("'transformer' and functions 'func'/"
                             "'inverse_func' cannot both be set.")
        elif self.transformer is not None:
            self.transformer_ = clone(self.transformer)
        else:
            if self.func is not None and self.inverse_func is None:
                raise ValueError("When 'func' is provided, 'inverse_func' must"
                                 " also be provided")
            self.transformer_ = FunctionTransformer(
                func=self.func, inverse_func=self.inverse_func, validate=True,
                check_inverse=self.check_inverse)
        # XXX: sample_weight is not currently passed to the transformer.
        # If the transformer starts using sample_weight, this code should
        # be updated accordingly. This is also a relevant use case to keep
        # in mind when the sample_prop feature is considered.
        self.transformer_.fit(y)
        if self.check_inverse:
            idx_selected = slice(None, None, max(1, y.shape[0] // 10))
            y_sel = _safe_indexing(y, idx_selected)
            y_sel_t = self.transformer_.transform(y_sel)
            if not np.allclose(y_sel,
                               self.transformer_.inverse_transform(y_sel_t)):
                warnings.warn("The provided functions or transformer are"
                              " not strictly inverse of each other. If"
                              " you are sure you want to proceed regardless"
                              ", set 'check_inverse=False'", UserWarning)
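`_fit_transformer` is part of scikit-learn's TransformedTargetRegressor, and the subset round-trip above is what raises the warning when `func` and `inverse_func` are not inverses. A minimal sketch of the public API:

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

X = np.random.rand(100, 2)
y = np.random.rand(100) + 1.0  # strictly positive so log is defined
reg = TransformedTargetRegressor(
    regressor=LinearRegression(), func=np.log, inverse_func=np.exp
)
reg.fit(X, y)  # check_inverse=True by default; log/exp pass the subset check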
Example no. 18
    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_resampled = [X.copy()]
        y_resampled = [y.copy()]
        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                              X_class, nns, n_samples, 1.0)
            X_resampled.append(X_new)
            y_resampled.append(y_new)

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled, format=X.format)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, y_resampled
Example no. 19
def _local_parallel_build_trees(sampler,
                                tree,
                                forest,
                                X,
                                y,
                                sample_weight,
                                tree_idx,
                                n_trees,
                                verbose=0,
                                class_weight=None,
                                n_samples_bootstrap=None):
    # resample before to fit the tree
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    if sample_weight is not None:
        sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_)
    if n_samples_bootstrap is not None:
        n_samples_bootstrap = min(n_samples_bootstrap, X_resampled.shape[0])
    tree = _parallel_build_trees(
        tree,
        forest,
        X_resampled,
        y_resampled,
        sample_weight,
        tree_idx,
        n_trees,
        verbose=verbose,
        class_weight=class_weight,
        n_samples_bootstrap=n_samples_bootstrap,
    )
    return sampler, tree
Example no. 20
def make_sample(imbalanced_data_arr2, diff):
    # split the dataset into minority-class and majority-class samples
    minor_data_arr2, major_data_arr2 = seperate_minor_and_major_data(
        imbalanced_data_arr2)
    imbalanced_featured_data = imbalanced_data_arr2[:, :-1]
    imbalanced_label_data = imbalanced_data_arr2[:, -1]
    # feature set of the original minority samples
    old_feature_data = minor_data_arr2[:, :-1]
    # label value of the original minority samples
    old_label_data = minor_data_arr2[0][-1]
    danger_index = in_danger(imbalanced_featured_data, old_feature_data,
                             old_label_data, imbalanced_label_data)
    # the "danger" subset of the minority samples, i.e. the set from which
    # the new samples will be synthesized
    danger_index_data = _safe_indexing(old_feature_data, danger_index)
    # position matrix of the n_neighbors-1 nearest neighbours around each
    # minority sample
    nns = NearestNeighbors(n_neighbors=6).fit(old_feature_data).kneighbors(
        danger_index_data, return_distance=False)[:, 1:]
    # draw diff random indices selecting the base samples from which the
    # new samples will be generated
    samples_indices = np.random.randint(
        low=0, high=np.shape(danger_index_data)[0], size=diff)
    # draw diff random step sizes for the interpolation
    steps = np.random.uniform(size=diff)
    cols = np.mod(samples_indices, nns.shape[1])
    reshaped_feature = np.zeros((diff, danger_index_data.shape[1]))
    for i, (col, step) in enumerate(zip(cols, steps)):
        row = samples_indices[i]
        reshaped_feature[i] = danger_index_data[row] - step * (
            danger_index_data[row] - old_feature_data[nns[row, col]])
    new_min_feature_data = np.vstack((reshaped_feature, old_feature_data))
    return new_min_feature_data
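The update inside the loop is the standard SMOTE interpolation, new = x + step * (neighbor - x), written here as x - step * (x - neighbor). For a single pair:

import numpy as np

x = np.array([1.0, 2.0])
neighbor = np.array([3.0, 6.0])
step = 0.25
new_sample = x - step * (x - neighbor)  # [1.5, 3.0], 25% of the way to the neighbor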
Example no. 21
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    indices = [False] + [True] * 2 + [False] * 6
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3],
                                                            array_type))
Example no. 22
    def _init_pretest(self, features, target):
        """Set the sample of data used to verify pipelines work
        with the passed data set.

        This is not intended for anything other than perfunctory dataset
        pipeline compatibility testing.
        """
        num_unique_target = len(np.unique(target))
        # make sure train_size is at least num_unique_target
        train_size = max(min(50, int(0.9 * features.shape[0])),
                         num_unique_target)

        self.pretest_X, _, self.pretest_y, _ = train_test_split(
            features,
            target,
            random_state=self.random_state,
            test_size=None,
            train_size=train_size,
        )
        # Make sure there is at least one example from each class
        # for this evaluative test sample
        if not np.array_equal(np.unique(target), np.unique(self.pretest_y)):
            unique_target_idx = np.unique(target, return_index=True)[1]
            self.pretest_y[0:unique_target_idx.shape[0]] = \
                    _safe_indexing(target, unique_target_idx)
Example no. 23
    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_resampled = [X.copy()]
        y_resampled = [y.copy()]

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(
                X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
            )
            X_resampled.append(X_new)
            y_resampled.append(y_new)

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled, format=X.format)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, y_resampled
Example no. 24
    def calculate(self, chromosome: ndarray) -> float:
        labels = self.cluster.run(chromosome, self.samples)
        self.samples, labels = check_X_y(self.samples, labels)
        le = LabelEncoder()
        labels = le.fit_transform(labels)
        n_samples, _ = self.samples.shape
        n_labels = len(le.classes_)
        check_number_of_labels(n_labels, n_samples)

        intra_dists = np.zeros(n_labels)
        centroids = np.zeros((n_labels, len(self.samples[0])), dtype=float)
        for k in range(n_labels):
            cluster_k = _safe_indexing(self.samples, labels == k)
            centroid = chromosome[k]
            centroids[k] = centroid
            intra_dists[k] = np.average(
                pairwise_distances(cluster_k, [centroid]))

        centroid_distances = pairwise_distances(centroids)

        if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
            return 0.0

        centroid_distances[centroid_distances == 0] = np.inf
        combined_intra_dists = intra_dists[:, None] + intra_dists
        scores = np.max(combined_intra_dists / centroid_distances, axis=1)
        return 1 / np.mean(scores)
Example no. 25
    def _boost_real(self, iboost, X, y, sample_weight, random_state):
        """Implement a single boost using the SAMME.R real algorithm."""
        estimator, sampler = self._make_sampler_estimator(random_state=random_state)

        X_res, y_res = sampler.fit_resample(X, y)
        sample_weight_res = _safe_indexing(sample_weight, sampler.sample_indices_)
        estimator.fit(X_res, y_res, sample_weight=sample_weight_res)

        y_predict_proba = estimator.predict_proba(X)

        if iboost == 0:
            self.classes_ = getattr(estimator, "classes_", None)
            self.n_classes_ = len(self.classes_)

        y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0)

        # Instances incorrectly classified
        incorrect = y_predict != y

        # Error fraction
        estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))

        # Stop if classification is perfect
        if estimator_error <= 0:
            return sample_weight, 1.0, 0.0

        # Construct y coding as described in Zhu et al [2]:
        #
        #    y_k = 1 if c == k else -1 / (K - 1)
        #
        # where K == n_classes_ and c, k in [0, K) are indices along the second
        # axis of the y coding with c being the index corresponding to the true
        # class label.
        n_classes = self.n_classes_
        classes = self.classes_
        y_codes = np.array([-1.0 / (n_classes - 1), 1.0])
        y_coding = y_codes.take(classes == y[:, np.newaxis])

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        proba = y_predict_proba  # alias for readability
        np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)

        # Boost weight using multi-class AdaBoost SAMME.R alg
        estimator_weight = (
            -1.0
            * self.learning_rate
            * ((n_classes - 1.0) / n_classes)
            * (y_coding * np.log(y_predict_proba)).sum(axis=1)
        )

        # Only boost the weights if it will fit again
        if not iboost == self.n_estimators - 1:
            # Only boost positive weights
            sample_weight *= np.exp(
                estimator_weight * ((sample_weight > 0) | (estimator_weight < 0))
            )

        return sample_weight, 1.0, estimator_error
Example no. 26
def permutations(estimator,
                 X,
                 y,
                 cv=None,
                 n_permutations=100,
                 random_state=0,
                 scoring=None):
    """
    This follows the scikit-learn API of
    sklearn.model_selection.permutation_test_score. I have modified it
    accordingly to accommodate filtering of features using a correlation
    matrix before running cross-validation with the model.
    """

    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # corr = CorrMatrix()
    # corr.fit(X,y)
    # Xs, ys = corr.transform()
    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv,
                                              scorer)

    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)

    return score, permutation_scores, pvalue
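The function mirrors the stock scorer the docstring cites; for reference, a hedged sketch of the scikit-learn API (which lives in sklearn.model_selection):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import permutation_test_score

X, y = load_iris(return_X_y=True)
score, perm_scores, pvalue = permutation_test_score(
    LogisticRegression(max_iter=1000), X, y, n_permutations=100, random_state=0
)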
Example no. 27
    def _fit_resample(self, X, y):
        self._validate_estimator()

        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                              X_class, nns, n_samples, 1.0)

            if sparse.issparse(X_new):
                X_resampled = sparse.vstack([X_resampled, X_new])
                sparse_func = "tocsc" if X.format == "csc" else "tocsr"
                X_resampled = getattr(X_resampled, sparse_func)()
            else:
                X_resampled = np.vstack((X_resampled, X_new))
            y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled, y_resampled
Example no. 28
	def db(X, labels):
		X, labels = check_X_y(X, labels)
		le = LabelEncoder()
		labels = le.fit_transform(labels)
		n_samples, _ = X.shape
		n_labels = len(le.classes_)
		check_number_of_labels(n_labels, n_samples)

		intra_dists = np.zeros(n_labels)
		centroids = np.zeros((n_labels, len(X[0])), dtype=float)
		for k in range(n_labels):
			cluster_k = _safe_indexing(X, labels == k)
			centroid = cluster_k.mean(axis=0)
			centroids[k] = centroid
			intra_dists[k] = np.average(pairwise_distances(
			    cluster_k, [centroid], metric='euclidean'))

		centroid_distances = pairwise_distances(centroids, metric='euclidean')

		if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
			return 0.0

		centroid_distances[centroid_distances == 0] = np.inf
		combined_intra_dists = intra_dists[:, None] + intra_dists
		scores = np.max(combined_intra_dists / centroid_distances, axis=1)
		return np.mean(scores)
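`db` reproduces the Davies-Bouldin index; since scikit-learn 0.20 the same value is exposed as a public metric. A hedged equivalence check (assuming `db` is reachable at module scope):

import numpy as np
from sklearn.metrics import davies_bouldin_score

X = np.random.rand(30, 2)
labels = np.array([0] * 10 + [1] * 10 + [2] * 10)
assert np.isclose(db(X, labels), davies_bouldin_score(X, labels))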
Example no. 29
def SMOTE_Borderline_D(imbalanced_data_arr2):
    # split the dataset into minority-class and majority-class samples
    minor_data_arr2, major_data_arr2 = seperate_minor_and_major_data(
        imbalanced_data_arr2)
    print('majority samples:', len(major_data_arr2),
          ', minority samples:', len(minor_data_arr2))
    imbalanced_featured_data = imbalanced_data_arr2[:, :-1]
    imbalanced_label_data = imbalanced_data_arr2[:, -1]
    # compute the count difference between majority and minority samples,
    # i.e. the number of new samples to generate by oversampling
    n = major_data_arr2.shape[0] - minor_data_arr2.shape[0]

    # feature set of the original minority samples
    old_feature_data = minor_data_arr2[:, :-1]
    # label value of the original minority samples
    old_label_data = minor_data_arr2[0][-1]

    danger_index = in_danger(imbalanced_featured_data, old_feature_data,
                             old_label_data, imbalanced_label_data)
    # the "danger" subset of the minority samples, i.e. the set from which
    # the new samples will be synthesized
    danger_index_data = _safe_indexing(old_feature_data, danger_index)

    # feature set of the new samples generated with the k-nearest-neighbour
    # method (extends the minority sample set)
    new_feature_data = make_sample(old_feature_data, danger_index_data, n)
    # merge the label column into the new minority features to build the
    # new minority sample set
    new_labels_data = np.array([old_label_data] * len(new_feature_data))
    new_minor_data_arr2 = np.column_stack((new_feature_data, new_labels_data))

    # balanced_data_arr2 = np.row_stack((new_minor_data_arr2, major_data_arr2))
    balanced_data_arr2 = np.row_stack((major_data_arr2, new_minor_data_arr2))
    # merge the minority and majority sets and shuffle the sample order:
    # balanced_data_arr2 = concat_and_shuffle_data(new_minor_data_arr2, major_data_arr2)
    return balanced_data_arr2
Example no. 30
    def _fit_resample(self, X, y):
        # FIXME: to be removed in 0.12
        if self.n_jobs is not None:
            warnings.warn(
                "The parameter `n_jobs` has been deprecated in 0.10 and will be "
                "removed in 0.12. You can pass an nearest neighbors estimator where "
                "`n_jobs` is already set instead.",
                FutureWarning,
            )

        self._validate_estimator()

        X_resampled = [X.copy()]
        y_resampled = [y.copy()]

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                              X_class, nns, n_samples, 1.0)
            X_resampled.append(X_new)
            y_resampled.append(y_new)

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled, format=X.format)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, y_resampled
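This `_fit_resample` is the core of imbalanced-learn's SMOTE; a hedged sketch of the public API it implements:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print(Counter(y_res))  # the minority class is oversampled to parity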