Example #1
0
def test_subset(dataset_df):
    """Exercise ``Dataset.subset`` with a boolean mask, index labels and positions.

    Each resulting subset must be a ``Dataset`` of the expected size that keeps
    the feature/column bookkeeping and whose labels and frame match the
    equivalent selection made directly on the underlying DataFrame.
    """
    # Offset the index by 51 so that labels no longer coincide with positions.
    df = dataset_df.set_index(dataset_df.index + 51)
    df["avg_col"] = (df["a"] + df["b"]) / 2
    ds = Dataset(df, "label", feature_cols=["a", "b", "c", "d", "e"])

    def check_subset(sub, expected_df, expected_size):
        # Expectations shared by every selection mode.
        assert isinstance(sub, Dataset)
        assert sub.size == expected_size
        assert sub.feature_names == ["a", "b", "c", "d", "e"]
        assert sub.column_names == ["a", "b", "c", "d", "e", "avg_col"]
        assert_series_equal(sub.labels, expected_df["label"])
        assert list(sub.other_cols.columns) == ["avg_col"]
        assert_frame_equal(sub.df, expected_df)

    # Selection through a boolean mask.
    by_mask = ds.subset(ds.df["c"] == "C")
    check_subset(by_mask, df[df["c"] == "C"], 15)

    # Selection through index labels: the test expects labels 51..60 to be
    # the first ten rows (positions 0..9) of the shifted frame.
    wanted = list(range(51, 61))
    by_label = ds.subset(wanted)
    check_subset(by_label, df.iloc[range(10)], 10)

    # The same integers interpreted as row positions.
    by_position = ds.subset(wanted, by_position=True)
    check_subset(by_position, df.iloc[wanted], 10)
Example #2
0
from sklearn.model_selection import ShuffleSplit

from pathlib import Path

# Locate the CSV file relative to this script.
THIS_DIR = Path(__file__).parent
DATASET_DIR = THIS_DIR / ".." / ".." / "datasets" / "winequality.csv"

# Read the data, discard the "quality" column and wrap the frame in a Dataset
# that uses "recommend" as its label column.
df = pd.read_csv(DATASET_DIR).drop(columns=["quality"])
dataset = Dataset(df, label_col="recommend")

# Draw a single shuffled split (test_size=0.3, fixed seed) and build the
# train/test subsets from the returned positional indices.
splitter = ShuffleSplit(n_splits=1, test_size=0.3, random_state=543)
train_ind, test_ind = next(splitter.split(dataset.features))
train_dataset, test_dataset = (
    dataset.subset(indices, by_position=True) for indices in (train_ind, test_ind)
)

# Build the model (standard scaling followed by a class-balanced SVC) and
# train it on the training subset.
model = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", SVC(class_weight="balanced")),
    ]
)
cm = ClassificationModel(model)
cm.train(train_dataset)

# Run the report on the trained model and print where the HTML output is.
presc_report = ReportRunner()
presc_report.run(model=cm, test_dataset=test_dataset, train_dataset=train_dataset)

print(f"The report is available at {presc_report.report_html}")