def test_subset(dataset_df):
    """Check ``Dataset.subset`` with a boolean mask, index labels and positions."""
    # Shift the index so that row labels no longer coincide with row positions.
    df = dataset_df.set_index(dataset_df.index + 51)
    df["avg_col"] = (df["a"] + df["b"]) / 2
    ds = Dataset(df, "label", feature_cols=["a", "b", "c", "d", "e"])

    def check_subset(subset, expected_size, expected_labels, expected_frame):
        # Assertions shared by every selection mode, in the same order for each.
        assert isinstance(subset, Dataset)
        assert subset.size == expected_size
        assert subset.feature_names == ["a", "b", "c", "d", "e"]
        assert subset.column_names == ["a", "b", "c", "d", "e", "avg_col"]
        assert_series_equal(subset.labels, expected_labels)
        assert list(subset.other_cols.columns) == ["avg_col"]
        assert_frame_equal(subset.df, expected_frame)

    # Selection through a boolean mask.
    ds_C = ds.subset(ds.df["c"] == "C")
    check_subset(
        ds_C,
        15,
        df.loc[df["c"] == "C", "label"],
        df[df["c"] == "C"],
    )

    # Selection through index labels: labels 51..60 are the first ten rows.
    ii = list(range(51, 61))
    ds_ind = ds.subset(ii)
    check_subset(
        ds_ind,
        10,
        df.iloc[range(10)]["label"],
        df.iloc[range(10)],
    )

    # Selection through positions: the same integers now address rows 51..60.
    ds_pos = ds.subset(ii, by_position=True)
    check_subset(
        ds_pos,
        10,
        df.iloc[ii]["label"],
        df.iloc[ii],
    )
def test_dataset_other_cols(dataset_df):
    """Check that columns outside ``feature_cols`` are exposed as ``other_cols``."""
    selected_features = ["a", "b", "d"]
    ds = Dataset(dataset_df, "label", feature_cols=["a", "b", "d"])

    assert ds.size == 100
    assert ds.feature_names == selected_features
    assert ds.column_names == selected_features + ["c", "e"]
    assert_frame_equal(ds.features, dataset_df[["a", "b", "d"]])
    assert_series_equal(ds.labels, dataset_df["label"])
    assert_frame_equal(ds.other_cols, dataset_df[["c", "e"]])
    assert ds.df is dataset_df

    # A column added to the underlying frame afterwards must appear among the
    # other columns while the feature list stays untouched.
    averaged = (dataset_df["a"] + dataset_df["b"]) / 2
    ds.df["avg_col"] = averaged
    assert_frame_equal(ds.other_cols, dataset_df[["c", "e", "avg_col"]])
    assert ds.feature_names == selected_features
    assert ds.column_names == selected_features + ["c", "e", "avg_col"]
def labeling(X, original_classifier, label_col="class"):
    """Label the samples of a dataset using a classifier's predictions.

    Parameters
    ----------
    X : pandas DataFrame
        Dataset with the features but not the labels.
    original_classifier : sklearn-type classifier
        Classifier used to predict the label of each sample.
    label_col : str
        Name of the column in which the predicted labels are stored.

    Returns
    -------
    presc.dataset.Dataset
        A PRESC Dataset holding a copy of the samples plus their labels.
    """
    # Work on a copy so the caller's frame is left without the label column.
    samples = X.copy()

    # Predict on the features only, then attach the predictions as labels.
    predicted_labels = original_classifier.predict(samples)
    samples[label_col] = predicted_labels

    # Wrap the labeled frame in the PRESC dataset wrapper.
    return Dataset(samples, label_col=label_col)
def test_summary_metrics():
    """Check ``summary_metrics`` on a tiny two-feature problem with fixed seeds."""
    random_seed = 42
    column_order = ["x", "y", "label"]

    # Original data
    train_values = {"x": [0, 1, 0, 2, 1], "y": [1, 0, 2, 0, 1], "label": [0, 0, 1, 1, 1]}
    train_data = pd.DataFrame(train_values, columns=column_order)

    test_values = {"x": [2, 0, 0, 1, 2], "y": [1, 0, 2, 0, 2], "label": [0, 0, 1, 0, 1]}
    test_frame = pd.DataFrame(test_values, columns=column_order)
    test_data = Dataset(test_frame, label_col="label")

    # Original classifier
    original_classifier = SVC(kernel="linear", random_state=random_seed)
    original_classifier.fit(train_data[["x", "y"]], train_data["label"])

    # Copy classifier, trained on grid-sampled synthetic data
    feature_parameters = {"x": {"min": 0, "max": 2}, "y": {"min": 0, "max": 2}}
    classifier_copy = DecisionTreeClassifier(max_depth=2, random_state=random_seed)
    copy_grid = ClassifierCopy(
        original_classifier,
        classifier_copy,
        grid_sampling,
        nsamples=20,
        label_col="label",
        feature_parameters=feature_parameters,
    )
    copy_grid.copy_classifier()

    # Generated data
    synthetic_test_data = copy_grid.generate_synthetic_data(
        generated_nsamples=5, random_state=random_seed, label_col="label"
    )

    metrics = summary_metrics(
        original_model=original_classifier,
        copy_model=copy_grid,
        test_data=test_data,
        synthetic_data=synthetic_test_data,
        show_results=True,
    )

    expected_results = {
        "Original Model Accuracy (test)": 0.6,
        "Copy Model Accuracy (test)": 0.8,
        "Empirical Fidelity Error (synthetic)": 0.0625,
        "Empirical Fidelity Error (test)": 0.2,
        "Replacement Capability (synthetic)": 0.9375,
        "Replacement Capability (test)": 1.33333333,
    }

    # Every metric that was returned must match its expected value.
    for name in metrics:
        np.testing.assert_almost_equal(
            metrics[name], expected_results[name], decimal=6
        )
def test_dataset(dataset_df):
    """Check the default ``Dataset`` view: every non-label column is a feature."""
    ds = Dataset(dataset_df, "label")
    all_features = ["a", "b", "c", "d", "e"]

    assert ds.size == 100
    assert ds.feature_names == all_features
    assert ds.column_names == all_features

    expected_features = dataset_df.drop(columns=["label"])
    assert_frame_equal(ds.features, expected_features)
    assert_series_equal(ds.labels, dataset_df["label"])

    # With no explicit feature list there are no leftover columns, and the
    # wrapper keeps a reference to the very same frame.
    assert ds.other_cols.size == 0
    assert ds.df is dataset_df
def generate_synthetic_data(self, **k_mod_sampling_parameters):
    """Generate a labeled synthetic dataset using the original model.

    Samples are drawn with the sampling strategy chosen on instantiation for
    the numerical features and a discrete distribution for the categorical
    features, and are then labeled with the original model. To regenerate the
    same data, use a specific random seed.

    Parameters
    ----------
    **k_mod_sampling_parameters :
        Overrides for the "nsamples" and/or "random_state" parameters of the
        sampling function, used to obtain a different set of synthetic data.
        Other keys are not read by this method.

    Returns
    -------
    presc.dataset.Dataset
        Outputs a PRESC Dataset with the generated samples and their labels.
    """
    # Start from a copy of the stored sampling parameters so the overrides
    # below do not leak back into the instance.
    sampling_kwargs = self.k_sampling_parameters.copy()
    for overridable in ("nsamples", "random_state"):
        if overridable in k_mod_sampling_parameters:
            sampling_kwargs[overridable] = k_mod_sampling_parameters[overridable]

    X_generated = mixed_data_sampling(
        numerical_sampling=self.sampling_function, **sampling_kwargs
    )

    # A non-balancing sampler returns only the features, so the samples still
    # have to be labeled with the original model.
    if not self.balancing_sampler:
        return labeling(X_generated, self.original, label_col=self.label_col)

    # A balancing sampler returns the features AND the labels, so the result
    # only needs to be wrapped.
    return Dataset(X_generated, label_col=self.label_col)
def test_dataset(dataset_df, in_test_set):
    """Wrap the rows of ``dataset_df`` selected by ``in_test_set`` in a Dataset."""
    selected_rows = dataset_df[in_test_set]
    return Dataset(selected_rows, label_col="label")
def multiclass_gaussians(
    nsamples=3000,
    nfeatures=30,
    nclasses=15,
    center_low=2,
    center_high=10,
    scale_low=1,
    scale_high=1,
):
    """Generates a multidimensional gaussian dataset with multiple classes.

    This function generates a multidimensional normal distribution centered at
    the origin for class zero, with a random standard deviation between
    `scale_low` and `scale_high` (one with the default arguments). It then adds
    an additional gaussian distribution per class, centered at a random
    distance between `center_low` and `center_high` in a random direction, and
    with random standard deviation between `scale_low` and `scale_high`.

    Random numbers are drawn from the global `numpy.random` state.

    Parameters
    ----------
    nsamples : int
        Maximum number of samples to generate. Actual number of samples depends
        on the number of classes, because the function yields a balanced
        dataset with the same number of samples per class.
    nfeatures : int
        Number of features of the generated samples.
    nclasses : int
        Number of classes in the generated dataset.
    center_low : float
        Minimum translation from the origin of the center of the gaussian
        distributions corresponding to additional classes.
    center_high : float
        Maximum translation from the origin of the center of the gaussian
        distributions corresponding to additional classes.
    scale_low : float
        Minimum value for the standard deviation of the gaussian distributions.
    scale_high : float
        Maximum value for the standard deviation of the gaussian distributions.

    Returns
    -------
    presc.dataset.Dataset
        Outputs a PRESC Dataset with the generated samples and their labels
        stored in the "class" column.
    """
    class_samples = int(nsamples / nclasses)

    # Create class zero drawing samples from a `nfeatures`-dimensional normal
    # distribution centered at the origin and with a standard deviation between
    # `scale_low` and `scale_high`.
    scale = np.random.uniform(low=scale_low, high=scale_high)
    class_zero = pd.DataFrame(
        scale * np.random.normal(0, 1, (class_samples, nfeatures))
    )
    class_zero["class"] = 0

    # Per-class frames are collected and concatenated once at the end:
    # concatenating inside the loop recopies all accumulated rows every time.
    class_frames = [class_zero]

    # Create additional classes centered at `center` with standard deviation
    # `scale`. The order of the random draws below is kept fixed so that a
    # seeded run produces the same data.
    for class_label in range(1, nclasses):
        # Generate a normalized vector in a random direction
        direction = np.random.normal(0, 1, nfeatures)
        direction = direction / np.linalg.norm(direction)

        # Generate a random distance from the origin to define the center of
        # each gaussian
        distance = np.random.uniform(low=center_low, high=center_high)
        center = distance * direction

        # Generate a random scaling for each gaussian
        scale = np.random.uniform(low=scale_low, high=scale_high)

        # Generate normally distributed random samples for this class
        class_frame = pd.DataFrame(
            center + scale * np.random.normal(0, 1, (class_samples, nfeatures))
        )
        class_frame["class"] = class_label
        class_frames.append(class_frame)

    # Stack all classes into one frame with a fresh 0..n-1 index
    df_pred = pd.concat(class_frames, ignore_index=True)

    # Convert into PRESC Dataset
    return Dataset(df_pred, label_col="class")
def vehicles_dataset_wrapper():
    """Load the vehicles CSV into a Dataset and split it into test and train."""
    vehicles_df = pd.read_csv(VEHICLES_DATA_PATH)
    dataset_wrapper = Dataset(vehicles_df, VEHICLES_LABEL_COL)
    # 40% of the data goes to the test set; the seed comes from the enclosing
    # module's `random_state`.
    dataset_wrapper.split_test_train(test_size=0.4, random_state=random_state)
    return dataset_wrapper
from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC from sklearn.model_selection import ShuffleSplit from pathlib import Path THIS_DIR = Path(__file__).parent DATASET_DIR = THIS_DIR / ".." / ".." / "datasets" / "winequality.csv" # Load the dataset. df = pd.read_csv(DATASET_DIR) df = df.drop(columns=["quality"]) dataset = Dataset(df, label_col="recommend") splitter = ShuffleSplit(n_splits=1, test_size=0.3, random_state=543) train_ind, test_ind = next(splitter.split(dataset.features)) train_dataset = dataset.subset(train_ind, by_position=True) test_dataset = dataset.subset(test_ind, by_position=True) # Set up the model model = Pipeline([("scaler", StandardScaler()), ("clf", SVC(class_weight="balanced"))]) cm = ClassificationModel(model) cm.train(train_dataset) presc_report = ReportRunner() presc_report.run(model=cm,
# Example script: load the wine quality data, train a scaled SVC classifier on
# it through the PRESC wrappers, and define the evaluation config options.
import pandas as pd
from presc.dataset import Dataset
from presc.model import ClassificationModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Better quality plots
from IPython.display import set_matplotlib_formats

set_matplotlib_formats("svg")

# Load the dataset.
df = pd.read_csv("../../datasets/winequality.csv")
# The "quality" column is dropped; "recommend" is the column used as the label.
df = df.drop(columns=["quality"])

dataset = Dataset(df, label="recommend")
# NOTE(review): 0.3 is presumably the test-set fraction -- confirm against
# Dataset.split_test_train.
dataset.split_test_train(0.3)

# Set up the model: standardize the features, then fit an SVC with balanced
# class weights.
model = Pipeline([("scaler", StandardScaler()), ("clf", SVC(class_weight="balanced"))])
# NOTE(review): should_train=True presumably fits the model on construction --
# confirm against ClassificationModel.
cm = ClassificationModel(model, dataset, should_train=True)

# Config options (TODO: read from file)
config = {"misclass_rate": {"num_bins": 20}}
def wine_dataset_wrapper(expected_wine_dataset):
    """Wrap the expected wine frame in a Dataset labeled by ``WINE_LABEL_COL``."""
    return Dataset(expected_wine_dataset, WINE_LABEL_COL)
def test_label_not_in_dataset(expected_wine_dataset):
    """Constructing a Dataset with an unknown label column raises ``KeyError``."""
    missing_label = "wrong_label"
    with pytest.raises(KeyError):
        Dataset(expected_wine_dataset, missing_label)