def subsample(self, X, y, pos_label=1, neg_label=0):
    """ Sub-sample the input examples, keeping those closest to the decision boundary

    The surrogate estimator is fitted on (X, y) when it reports that it is not fitted yet.
    Every example is scored by the absolute difference between the configured threshold and
    its predicted probability for ``pos_label``. For each of the two classes, the examples
    with the smallest distance are kept; the number kept per class is
    ``int(len(y) * self.sample_percentage * class_fraction) + 1``.

    Parameters
    -----------
    X : pandas.DataFrame
        input examples representing the training set
    y : pandas.Series
        target labels associated with the training set; must have one entry per row of X
    pos_label : int
        label of the positive class; also used to select the column of the surrogate
        estimator's ``predict_proba`` output
    neg_label : int
        label of the negative class

    Returns
    --------
    X_, y_ : pandas.DataFrame, pandas.Series
        sub-sampled input examples and their labels, passed together through ``shuffle``

    Raises
    ------
    exceptions.DataSetError
        if X is not a pandas.DataFrame, y is not a pandas.Series, or their lengths differ
    """
    if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.Series):
        raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")

    # validate the consistency of the input data
    if X.shape[0] != y.shape[0]:
        raise exceptions.DataSetError("mismatch in the shape of X and y")

    # probe with a single row to find out whether the surrogate still needs fitting
    try:
        self.surrogate_estimator.predict_proba(X[0:1])
    except NotFittedError:
        self.surrogate_estimator.fit(X, y)

    est_prob_scores = pd.DataFrame(self.surrogate_estimator.predict_proba(X))

    # compute the distance from the decision boundary (vectorized, one value per row of X)
    # NOTE(review): the probability column is looked up by `pos_label` itself, which assumes the
    # column labels of the predict_proba output coincide with the class labels - confirm.
    # NOTE(review): `threhold` looks like a misspelling of `threshold`; kept as-is because the
    # attribute is defined outside this method.
    distances = np.abs(self.threhold - est_prob_scores[pos_label].values)
    # A fresh Series gets a 0..n-1 index, so index labels below are row positions
    distance_from_threshold = pd.Series(distances)

    label_values = y.values
    pos_label_index = np.where(label_values == pos_label)[0]
    neg_label_index = np.where(label_values == neg_label)[0]

    # sort the distances from the threshold in ascending order to select points which
    # are closer to the decision boundary
    sorted_dist_pos_label = distance_from_threshold.iloc[pos_label_index].sort_values()
    sorted_dist_neg_label = distance_from_threshold.iloc[neg_label_index].sort_values()

    # sub-sample the data, preserving the class proportions of the input
    number_of_rows = len(y) * self.sample_percentage
    pos_fraction = len(pos_label_index) / float(len(y))
    neg_fraction = 1 - pos_fraction
    pos_positions = sorted_dist_pos_label[:int(number_of_rows * pos_fraction) + 1].index.values
    neg_positions = sorted_dist_neg_label[:int(number_of_rows * neg_fraction) + 1].index.values
    selected = np.concatenate([pos_positions, neg_positions]).astype(np.intp)

    # Select rows and labels with the SAME positional indices. Looking the labels up by
    # X's index labels would mis-align (or fail) when X and y carry different or duplicated
    # index labels.
    new_X = X.iloc[selected]
    new_y = y.iloc[selected]

    # Randomly shuffle the newly formed data-set
    X_, y_ = shuffle(new_X, new_y)
    return X_, y_
def fit(self, X, y_true, n_quantiles=None, bin_labels='default', undiscretize_feature_list=None, precision=3):
    """ Fit the estimator.

    Parameters
    -----------
    X : pandas.DataFrame
        object that could be used by the model for training.
        It must not have a column named 'label'. X is not modified by this method.
    y_true : pandas.Series
        1-D array to store ground truth labels; must contain exactly two distinct values
    n_quantiles : int or None
        forwarded to the discretizer as ``no_of_quantiles``; only used when discretization is enabled
    bin_labels : str
        forwarded to the discretizer as ``labels_for_bin``; only used when discretization is enabled
    undiscretize_feature_list : list or None
        column names to exclude from discretization; when None, all columns of X are
        candidates for discretization
    precision : int
        forwarded to the discretizer as ``precision``; only used when discretization is enabled

    Returns
    -------
    SBRL model instance: rpy2.robjects.vectors.ListVector

    Raises
    ------
    Exception
        if y_true does not contain exactly two distinct values
    exceptions.DataSetError
        if X is not a pandas.DataFrame

    Examples
    ---------
    >>> from skater.core.global_interpretation.interpretable_models.brlc import BRLC
    >>> sbrl_model = BRLC(min_rule_len=1, max_rule_len=10, iterations=10000, n_chains=20, drop_features=True)
    >>> # Train a model, by default discretizer is enabled. So, you wish to exclude features then exclude them using
    >>> # the undiscretize_feature_list parameter
    >>> model = sbrl_model.fit(Xtrain, ytrain, bin_labels="default")
    """
    if len(np.unique(y_true)) != 2:
        raise Exception("Supports only binary classification right now")

    if not isinstance(X, pd.DataFrame):
        raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")

    # Conditions being managed
    # 1. if 'undiscretize_feature_list' is empty and discretization flag is enabled,
    #    discretize 'all' continuous features
    # 2. if undiscretize_feature_list is not empty and discretization flag is enabled,
    #    filter out the ones that are not needed
    if undiscretize_feature_list is not None:
        for_discretization_clmns = tuple(
            c_name for c_name in X.columns if c_name not in undiscretize_feature_list)
    else:
        for_discretization_clmns = tuple(X.columns)

    if self.__discretize is True:
        data = self.discretizer(X, self._filter_continuous_features(X, for_discretization_clmns),
                                no_of_quantiles=n_quantiles, labels_for_bin=bin_labels,
                                precision=precision)
    else:
        data = X

    # record all the feature names (captured before the label column is attached)
    self.feature_names = data.columns

    # Attach the label to a copy: without discretization `data` IS the caller's X, and adding
    # the column in place would leak 'label' into the caller's frame (and into the recorded
    # feature names on a subsequent fit with the same X).
    data = data.copy()
    data.loc[:, "label"] = y_true

    data_as_r_frame = self.__r_frame(self.__s_apply(data, self.__as_factor))
    self.model = self.__r_sbrl.sbrl(data_as_r_frame, **self.model_params)
    return self.model
def predict_proba(self, X):
    """ Estimate the class probabilities for each observation in 'X'

    Parameters
    -----------
    X : pandas.DataFrame
        observations to score with the stored model

    Returns
    -------
    pandas.DataFrame of shape (#datapoints, 2), the possible probability of each class for each observation

    Raises
    ------
    exceptions.DataSetError
        if X is not a pandas.DataFrame
    """
    if isinstance(X, pd.DataFrame):
        # apply the factor conversion to X, then wrap the result as an R data frame
        factorized = self.__s_apply(X, self.__as_factor)
        r_input = self.__r_frame(factorized)
        raw_scores = self.__r_sbrl.predict_sbrl(self.model, r_input)
        # convert the R result to pandas and transpose it before returning
        scores = pandas2ri.ri2py_dataframe(raw_scores)
        return scores.T

    raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")