def test_subset(dataset_df):
    """Exercise ``Dataset.subset`` with a boolean mask, labels, and positions.

    The fixture frame is re-indexed with an offset of 51 so that index labels
    and row positions no longer coincide; this lets the test distinguish
    label-based selection from ``by_position=True`` selection.
    """
    feature_names = ["a", "b", "c", "d", "e"]
    extra_col = "avg_col"

    # Offset the index so that labels do not correspond to positions.
    df = dataset_df.set_index(dataset_df.index + 51)
    # A non-feature, non-label column that should surface in `other_cols`.
    df[extra_col] = (df["a"] + df["b"]) / 2
    ds = Dataset(df, "label", feature_cols=list(feature_names))

    def check_subset(sub, n_rows, expected_rows):
        # Expectations shared by every subsetting mode: the result is still a
        # Dataset with the same column layout, restricted to `expected_rows`.
        assert isinstance(sub, Dataset)
        assert sub.size == n_rows
        assert sub.feature_names == feature_names
        assert sub.column_names == feature_names + [extra_col]
        assert_series_equal(sub.labels, expected_rows["label"])
        assert list(sub.other_cols.columns) == [extra_col]
        assert_frame_equal(sub.df, expected_rows)

    # 1. Boolean mask selection.
    ds_C = ds.subset(ds.df["c"] == "C")
    check_subset(ds_C, 15, df[df["c"] == "C"])

    ii = list(range(51, 61))

    # 2. Label selection: labels 51..60 are expected to be the first ten rows.
    ds_ind = ds.subset(ii)
    check_subset(ds_ind, 10, df.iloc[range(10)])

    # 3. Positional selection: the same integers now address rows 51..60.
    ds_pos = ds.subset(ii, by_position=True)
    check_subset(ds_pos, 10, df.iloc[ii])
from pathlib import Path

from sklearn.model_selection import ShuffleSplit

THIS_DIR = Path(__file__).parent
# Path to the wine-quality CSV, two directory levels above this file.
DATASET_DIR = THIS_DIR.joinpath("..", "..", "datasets", "winequality.csv")

# Read the data, discard the "quality" column, and use "recommend" as label.
df = pd.read_csv(DATASET_DIR).drop(columns=["quality"])
dataset = Dataset(df, label_col="recommend")

# One shuffled 70/30 split with a fixed seed; the splitter yields positional
# indices, hence `by_position=True` below.
splitter = ShuffleSplit(n_splits=1, test_size=0.3, random_state=543)
train_ind, test_ind = next(splitter.split(dataset.features))
train_dataset = dataset.subset(train_ind, by_position=True)
test_dataset = dataset.subset(test_ind, by_position=True)

# Build and train the model: feature scaling followed by a class-balanced SVC.
model = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", SVC(class_weight="balanced")),
    ]
)
cm = ClassificationModel(model)
cm.train(train_dataset)

# Generate the report and tell the user where to find it.
presc_report = ReportRunner()
presc_report.run(
    model=cm,
    test_dataset=test_dataset,
    train_dataset=train_dataset,
)
print(f"The report is available at {presc_report.report_html}")