Esempio n. 1
0
    def subsample(self, X, y, pos_label=1, neg_label=0):
        """ Sub-sample the training set, keeping the examples closest to the decision boundary

        The surrogate estimator is fitted on (X, y) when it reports that it is not fitted yet. Every row
        of X is then scored with ``predict_proba`` and, separately for the positive and the negative
        class, the rows whose positive-class probability lies nearest to the threshold are retained.
        Rows of X and entries of y are matched by position, not by index label.

        Parameters
        -----------
        X : pandas.DataFrame
            input examples representing the training set
        y : pandas.Series
            target labels associated with the training set, one per row of X (same order)
        pos_label : int
            label of the positive class; also used to select the column of the ``predict_proba``
            output that holds the positive-class probability
        neg_label : int
            label of the negative class

        Returns
        --------
        X_, y_ : pandas.DataFrame, pandas.Series
            sub-sampled input examples and their labels, shuffled consistently

        Raises
        ------
        exceptions.DataSetError
            if X is not a pandas.DataFrame, y is not a pandas.Series, or their row counts differ
        """
        if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.Series):
            raise exceptions.DataSetError(
                "Only pandas.DataFrame (X) and pandas.Series (y) are currently supported as input types")

        # validate the consistency of the input data
        if X.shape[0] != y.shape[0]:
            raise exceptions.DataSetError("mismatch in the shape of X and y")

        # probe with a single row: an unfitted estimator raises NotFittedError, in which case fit it here
        try:
            self.surrogate_estimator.predict_proba(X.iloc[:1])
        except NotFittedError:
            self.surrogate_estimator.fit(X, y)

        est_prob_scores = pd.DataFrame(self.surrogate_estimator.predict_proba(X))

        # compute the distance from the decision boundary
        # NOTE(review): the attribute is spelled `threhold`; confirm it matches the name set by the class
        distance_from_threshold = (self.threhold - est_prob_scores[pos_label]).abs()
        pos_label_index = np.where(y == pos_label)[0]
        neg_label_index = np.where(y == neg_label)[0]

        # sort the distances from the threshold in ascending order, per class, so that the points closest
        # to the decision boundary come first
        sorted_dist_pos_label = distance_from_threshold[pos_label_index].sort_values()
        sorted_dist_neg_label = distance_from_threshold[neg_label_index].sort_values()

        # sub-sample the data; each class keeps a share proportional to the positive / non-positive
        # split of y, and the `+ 1` means a class that is present always contributes at least one row
        number_of_rows = len(y) * self.sample_percentage
        pos_fraction = len(pos_label_index) / float(len(y))
        neg_fraction = 1 - pos_fraction

        pos_positions = sorted_dist_pos_label.iloc[:int(number_of_rows * pos_fraction) + 1].index
        neg_positions = sorted_dist_neg_label.iloc[:int(number_of_rows * neg_fraction) + 1].index
        selected_positions = np.concatenate([pos_positions.values, neg_positions.values])

        # select the rows of X and the labels of y with the same positions so that they stay aligned even
        # when X and y carry different (or duplicated) index labels
        new_X = X.iloc[selected_positions]
        new_y = y.iloc[selected_positions]

        # Randomly shuffle the newly formed data-set
        X_, y_ = shuffle(new_X, new_y)
        return X_, y_
Esempio n. 2
0
    def fit(self,
            X,
            y_true,
            n_quantiles=None,
            bin_labels='default',
            undiscretize_feature_list=None,
            precision=3):
        """ Fit the estimator.

        Parameters
        -----------
            X : pandas.DataFrame object, that could be used by the model for training.
                 It must not have a column named 'label'. When discretization is disabled the frame
                 is copied before the 'label' column is attached, so the caller's object is not altered
                 by this method.

            y_true : pandas.Series, 1-D array to store ground truth labels. It must contain exactly two
                 distinct values.

            n_quantiles : forwarded to the discretizer as ``no_of_quantiles`` (default None)

            bin_labels : forwarded to the discretizer as ``labels_for_bin`` (default 'default')

            undiscretize_feature_list : collection of column names to leave out of discretization;
                 when None, every column of X is a candidate (default None)

            precision : forwarded to the discretizer as ``precision`` (default 3)

        Returns
        -------
            SBRL model instance: rpy2.robjects.vectors.ListVector

        Raises
        ------
            Exception : if y_true does not contain exactly two distinct values
            exceptions.DataSetError : if X is not a pandas.DataFrame

        Examples
        ---------
        >>> from skater.core.global_interpretation.interpretable_models.brlc import BRLC
        >>> sbrl_model = BRLC(min_rule_len=1, max_rule_len=10, iterations=10000, n_chains=20, drop_features=True)
        >>> # Train a model; by default the discretizer is enabled. To keep some features out of the
        >>> # discretization step, list them in the undiscretize_feature_list parameter
        >>> model = sbrl_model.fit(Xtrain, ytrain, bin_labels="default")
        """
        if len(np.unique(y_true)) != 2:
            raise Exception("Supports only binary classification right now")

        if not isinstance(X, pd.DataFrame):
            raise exceptions.DataSetError(
                "Only pandas.DataFrame as input type is currently supported")

        # Conditions being managed
        # 1. if 'undiscretize_feature_list' is not given and the discretization flag is enabled,
        #    'all' columns are candidates for discretization
        # 2. if 'undiscretize_feature_list' is given and the discretization flag is enabled,
        #    the listed columns are filtered out of the candidates
        if undiscretize_feature_list is None:
            for_discretization_clmns = tuple(X.columns)
        else:
            for_discretization_clmns = tuple(
                c_name for c_name in X.columns if c_name not in undiscretize_feature_list)

        if self.__discretize is True:
            data = self.discretizer(X, self._filter_continuous_features(X, for_discretization_clmns),
                                    no_of_quantiles=n_quantiles, labels_for_bin=bin_labels, precision=precision)
        else:
            data = X

        # the 'label' column is attached below; work on a copy whenever `data` is the caller's own frame
        # so that X does not gain a 'label' column as a side effect of fitting
        if data is X:
            data = X.copy()

        # record all the feature names (captured before the 'label' column is added)
        self.feature_names = data.columns
        data.loc[:, "label"] = y_true
        data_as_r_frame = self.__r_frame(self.__s_apply(
            data, self.__as_factor))
        self.model = self.__r_sbrl.sbrl(data_as_r_frame, **self.model_params)
        return self.model
Esempio n. 3
0
    def predict_proba(self, X):
        """ Score the rows of 'X' with the stored model and return the class probabilities

        Parameters
        -----------
            X: pandas.DataFrame object

        Returns
        -------
            pandas.DataFrame of shape (#datapoints, 2), the possible probability of each class for each observation

        Raises
        ------
            exceptions.DataSetError : if X is not a pandas.DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            error_message = "Only pandas.DataFrame as input type is currently supported"
            raise exceptions.DataSetError(error_message)

        # same preparation pipeline as used for the training data: apply `__as_factor` through
        # `__s_apply`, then wrap the result as an R frame
        factorized_input = self.__s_apply(X, self.__as_factor)
        r_input_frame = self.__r_frame(factorized_input)

        r_predictions = self.__r_sbrl.predict_sbrl(self.model, r_input_frame)

        # convert the R result back to pandas and transpose it before handing it to the caller
        predictions_frame = pandas2ri.ri2py_dataframe(r_predictions)
        return predictions_frame.T