Python DataSetError Examples

Programming Language: Python

Namespace/Package Name: skater.util.exceptions

Method/Function: DataSetError

Examples at hotexamples.com: 3

Python DataSetError - 3 examples found. These are the top-rated real-world Python examples of skater.util.exceptions.DataSetError, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: bigdatabrlc.py Project: ytalhatamer/Skater

    def subsample(self, X, y, pos_label=1, neg_label=0):
        """ Sub-sample the input examples that lie closest to the decision boundary

        The surrogate estimator is fitted on (X, y) when it has not been fitted yet. For each
        class, the rows whose estimated probability for ``pos_label`` is nearest to the
        threshold are retained, and the retained rows are returned in shuffled order.

        Parameters
        -----------
        X : pandas.DataFrame
            input examples representing the training set
        y : pandas.Series
            target labels associated with the training set, in the same row order as X
        pos_label : int
            label of the positive class; also selects the column of the surrogate
            estimator's ``predict_proba`` output that is compared with the threshold
        neg_label : int
            label of the negative class

        Returns
        --------
        X_, y_ : pandas.DataFrame, pandas.Series
            shuffled sub-sampled input examples and their matching labels

        Raises
        -------
        exceptions.DataSetError
            if X is not a pandas.DataFrame, y is not a pandas.Series, or their
            number of rows differ
        """
        if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.Series):
            raise exceptions.DataSetError(
                "Only pandas.DataFrame for X and pandas.Series for y are currently supported as input types")

        # validate the consistency of the input data
        if X.shape[0] != y.shape[0]:
            raise exceptions.DataSetError("mismatch in the shape of X and y")

        # probe with a single row to find out whether the surrogate estimator still needs fitting
        try:
            self.surrogate_estimator.predict_proba(X.iloc[0:1])
        except NotFittedError:
            self.surrogate_estimator.fit(X, y)

        # built from the raw predict_proba output, so its row labels are the row positions of X
        est_prob_scores = pd.DataFrame(self.surrogate_estimator.predict_proba(X))

        # compute the distance from the decision boundary
        # NOTE(review): the attribute is spelled 'threhold' here; confirm it matches the name
        # assigned where the instance is initialised before renaming it.
        distance_from_threshold = (est_prob_scores[pos_label] - self.threhold).abs()
        pos_label_index = np.where(y == pos_label)[0]
        neg_label_index = np.where(y == neg_label)[0]

        pos_label_dist = distance_from_threshold.iloc[pos_label_index]
        neg_label_dist = distance_from_threshold.iloc[neg_label_index]

        # sort the neighboring distances from the threshold in the ascending order to select points which
        # are closer to the decision boundary
        sorted_dist_pos_label = pos_label_dist.sort_values()
        sorted_dist_neg_label = neg_label_dist.sort_values()

        # sub-sample the data, keeping the class proportions of the original labels
        number_of_rows = len(y) * self.sample_percentage
        pos_fraction = len(pos_label_index) / float(len(y))
        neg_fraction = 1 - pos_fraction

        pos_positions = sorted_dist_pos_label.index[:int(number_of_rows * pos_fraction) + 1]
        neg_positions = sorted_dist_neg_label.index[:int(number_of_rows * neg_fraction) + 1]
        selected_positions = np.concatenate([pos_positions.values, neg_positions.values])

        # Index X and y by the same row positions so the labels stay aligned with the examples
        # even when the two objects carry different (or duplicated) index labels.
        new_X = X.iloc[selected_positions]
        new_y = y.iloc[selected_positions]

        # Randomly shuffle the newly formed data-set
        X_, y_ = shuffle(new_X, new_y)
        return X_, y_

Example #2

Show file

    def fit(self,
            X,
            y_true,
            n_quantiles=None,
            bin_labels='default',
            undiscretize_feature_list=None,
            precision=3):
        """ Fit the estimator.

        Parameters
        -----------
            X : pandas.DataFrame object, that could be used by the model for training.
                 It must not have a column named 'label'. The 'label' column needed for
                 training is added to a working copy, not to X itself.

            y_true : pandas.Series, 1-D array to store ground truth labels.
                 It must contain exactly two distinct values.

            n_quantiles : passed to the discretizer as ``no_of_quantiles``

            bin_labels : passed to the discretizer as ``labels_for_bin``

            undiscretize_feature_list : collection of column names to keep out of discretization.
                 When None, every column of X is a candidate for discretization.

            precision : passed to the discretizer as ``precision``

        Returns
        -------
            SBRL model instance: rpy2.robjects.vectors.ListVector

        Raises
        ------
            Exception: if y_true does not contain exactly two distinct values
            exceptions.DataSetError: if X is not a pandas.DataFrame


        Examples
        ---------
        >>> from skater.core.global_interpretation.interpretable_models.brlc import BRLC
        >>> sbrl_model = BRLC(min_rule_len=1, max_rule_len=10, iterations=10000, n_chains=20, drop_features=True)
        >>> # Train a model; the discretizer is enabled by default. To keep some features out of
        >>> # discretization, list them in the undiscretize_feature_list parameter
        >>> model = sbrl_model.fit(Xtrain, ytrain, bin_labels="default")
        """
        if len(np.unique(y_true)) != 2:
            raise Exception("Supports only binary classification right now")

        if not isinstance(X, pd.DataFrame):
            raise exceptions.DataSetError(
                "Only pandas.DataFrame as input type is currently supported")

        # Conditions being managed
        # 1. if 'undiscretize_feature_list' is empty and discretization flag is enabled,
        #    discretize 'all' continuous features
        # 2. if undiscretize_feature_list is not empty and discretization flag is enabled, filter out the
        #    ones that are not needed
        if undiscretize_feature_list is None:
            for_discretization_clmns = tuple(X.columns)
        else:
            for_discretization_clmns = tuple(
                c_name for c_name in X.columns if c_name not in undiscretize_feature_list)

        if self.__discretize is True:
            data = self.discretizer(X, self._filter_continuous_features(X, for_discretization_clmns),
                                    no_of_quantiles=n_quantiles, labels_for_bin=bin_labels, precision=precision)
        else:
            data = X

        # Work on a copy whenever 'data' is the caller's own frame, so that adding the
        # 'label' column below does not leak into X.
        if data is X:
            data = X.copy()

        # record all the feature names (captured before the 'label' column is appended)
        self.feature_names = data.columns
        data.loc[:, "label"] = y_true
        data_as_r_frame = self.__r_frame(self.__s_apply(
            data, self.__as_factor))
        self.model = self.__r_sbrl.sbrl(data_as_r_frame, **self.model_params)
        return self.model

Example #3

Show file

    def predict_proba(self, X):
        """ Estimate the class probabilities for every observation in 'X'

        Parameters
        -----------
            X: pandas.DataFrame object

        Returns
        -------
            pandas.DataFrame of shape (#datapoints, 2), the possible probability of each class for each observation

        Raises
        ------
            exceptions.DataSetError: if X is not a pandas.DataFrame
        """
        is_data_frame = isinstance(X, pd.DataFrame)
        if not is_data_frame:
            raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")

        # apply the factor conversion to the input, then wrap it as an R data frame
        factorized_input = self.__s_apply(X, self.__as_factor)
        r_input_frame = self.__r_frame(factorized_input)

        # score the converted input with the stored model
        r_predictions = self.__r_sbrl.predict_sbrl(self.model, r_input_frame)

        # convert the result back to pandas; it is transposed before being returned
        predictions = pandas2ri.ri2py_dataframe(r_predictions)
        return predictions.T