def test_load_train_and_test_columns_dont_intersect(temp_file_pair):
    """Loading must fail when training and test files share no columns.

    The training file is written with ``column_prefix="A"`` and the test
    file with ``column_prefix="B"``; the loader is then expected to raise
    a ``ValueError`` whose message reports the empty intersection.
    """
    train_file, test_file = temp_file_pair
    _make_and_write_data(train_file, 100, 19, True, True, 0,
                         column_prefix="A")
    _make_and_write_data(test_file, 20, 11, True, True, 0,
                         column_prefix="B")

    expected_message = "columns of training and test data do not intersect"
    with pytest.raises(ValueError, match=expected_message):
        sdata.load_arff_files_standardized(
            train_file.name,
            ["event", "time"],
            1,
            path_testing=test_file.name,
            survival=True,
            standardize_numeric=False,
            to_numeric=False,
        )
def test_load_train_and_test_with_different_columns(temp_file_pair):
    """Loading warns when training and test files differ in their columns.

    The two files are written with different size arguments (19 vs. 11 --
    presumably the number of feature columns; confirm against
    ``_make_and_write_data``). The loader is expected to emit a
    ``UserWarning`` about restricting to the column intersection.
    """
    train_file, test_file = temp_file_pair
    _make_and_write_data(train_file, 100, 19, False, True, 0)
    _make_and_write_data(test_file, 20, 11, False, True, 0)

    expected_warning = ("Restricting columns to intersection between "
                        "training and testing data")
    with pytest.warns(UserWarning, match=expected_warning):
        sdata.load_arff_files_standardized(
            train_file.name,
            ["event", "time"],
            1,
            path_testing=test_file.name,
            survival=True,
            standardize_numeric=False,
            to_numeric=False,
        )
def test_load_with_categorical_index_2(arff_2):
    """Load the ``arff_2`` fixture alone and verify index, label and features.

    Only a training path is given, so both test outputs must be ``None``.
    The training data is expected to hold five samples with a string index
    named ``"index"``, a categorical ``label`` target, a numeric ``value``
    feature and a categorical ``size`` feature.
    """
    loaded = sdata.load_arff_files_standardized(
        arff_2,
        ["label"],
        pos_label="yes",
        survival=False,
        standardize_numeric=False,
        to_numeric=False,
    )
    x_train, y_train, x_test, y_test = loaded

    # Expected contents of the loaded training data.
    sample_names = ["ASampleOne", "ASampleTwo", "ASampleThree",
                    "ASampleFour", "ASampleFive"]
    expected_index = pandas.Index(sample_names, name="index", dtype=object)
    expected_label = pandas.Series(
        pandas.Categorical(
            ["no", "no", "yes", "yes", "no"],
            categories=["yes", "no"],
            ordered=False,
        ),
        name="label",
        index=expected_index,
    )
    expected_value = pandas.Series(
        [1.51, 1.38, -20, 245.3, 3.14],
        name="value",
        index=expected_index,
    )
    expected_size = pandas.Series(
        pandas.Categorical(
            ["small", "small", "large", "small", "large"],
            categories=["small", "medium", "large"],
            ordered=False,
        ),
        name="size",
        index=expected_index,
    )

    # No testing file was supplied.
    assert x_test is None
    assert y_test is None

    assert x_train.shape == (5, 2)
    assert y_train.shape == (5, 1)

    tm.assert_index_equal(x_train.index, expected_index, exact=True)
    tm.assert_series_equal(y_train["label"], expected_label,
                           check_exact=True)
    tm.assert_series_equal(x_train["value"], expected_value,
                           check_exact=True)
    tm.assert_series_equal(x_train["size"], expected_size,
                           check_exact=True)
def test_load_with_index(temp_file):
    """Round-trip a generated dataset through a single training file.

    The data returned by ``_make_and_write_data`` is the reference: the
    loaded features must equal it with the ``event`` and ``time`` columns
    removed, and the loaded targets are checked against it with
    ``assert_y_equal``. No test outputs are expected.
    """
    written = _make_and_write_data(temp_file, 100, 10, True, True, 0)

    loaded = sdata.load_arff_files_standardized(
        temp_file.name,
        ["event", "time"],
        1,
        survival=True,
        standardize_numeric=False,
        to_numeric=False,
    )
    x_train, y_train, x_test, y_test = loaded

    # No testing file was supplied.
    assert x_test is None
    assert y_test is None

    expected_features = written.drop(columns=["event", "time"])
    assert_x_equal(expected_features, x_train)
    assert_y_equal(written, y_train)
def test_load_train_and_test_no_labels(temp_file_pair):
    """Load a training file together with a test file that has no targets.

    The test file is written with the fifth positional argument of
    ``_make_and_write_data`` set to ``False`` (presumably "omit the label
    columns" -- confirm against the helper). The loaded test features must
    equal the written test data as a whole, and ``y_test`` must be ``None``.
    """
    train_file, test_file = temp_file_pair
    written_train = _make_and_write_data(train_file, 100, 10, True, True, 0)
    written_test = _make_and_write_data(test_file, 20, 10, True, False, 0)

    loaded = sdata.load_arff_files_standardized(
        train_file.name,
        ["event", "time"],
        1,
        path_testing=test_file.name,
        survival=True,
        standardize_numeric=False,
        to_numeric=False,
    )
    x_train, y_train, x_test, y_test = loaded

    # Training side: features are everything except the target columns.
    expected_train_features = written_train.drop(columns=["event", "time"])
    assert_x_equal(expected_train_features, x_train)
    assert_y_equal(written_train, y_train)

    # Testing side: features only, no targets.
    assert_x_equal(written_test, x_test)
    assert y_test is None
def test_load_train_and_test_with_categorical_index(arff_1, arff_2):
    """Load ``arff_1`` as training and ``arff_2`` as testing data.

    Checks the shapes of all four outputs, then verifies for each split the
    string index (named ``"index"``), the categorical ``label`` target, the
    numeric ``value`` feature and the categorical ``size`` feature.
    """

    def _unordered(values, categories):
        # Build the unordered categorical used for an expected column.
        return pandas.Categorical(values, categories=categories,
                                  ordered=False)

    x_train, y_train, x_test, y_test = sdata.load_arff_files_standardized(
        arff_1,
        ["label"],
        pos_label="yes",
        path_testing=arff_2,
        survival=False,
        standardize_numeric=False,
        to_numeric=False,
    )

    assert x_train.shape == (4, 2)
    assert x_test.shape == (5, 2)
    assert y_train.shape == (4, 1)
    assert y_test.shape == (5, 1)

    # Check train data
    expected_train_index = pandas.Index(
        ["SampleOne", "SampleTwo", "SampleThree", "SampleFour"],
        name="index",
        dtype=object,
    )
    tm.assert_index_equal(x_train.index, expected_train_index, exact=True)

    expected_train_label = pandas.Series(
        _unordered(["yes", "no", "yes", "yes"], ["no", "yes"]),
        name="label",
        index=expected_train_index,
    )
    tm.assert_series_equal(y_train["label"], expected_train_label,
                           check_exact=True)

    expected_train_value = pandas.Series(
        [15.1, 13.8, -0.2, 2.453],
        name="value",
        index=expected_train_index,
    )
    tm.assert_series_equal(x_train["value"], expected_train_value,
                           check_exact=True)

    expected_train_size = pandas.Series(
        _unordered(["medium", "large", "small", "large"],
                   ["small", "medium", "large"]),
        name="size",
        index=expected_train_index,
    )
    tm.assert_series_equal(x_train["size"], expected_train_size,
                           check_exact=True)

    # Check test data
    expected_test_index = pandas.Index(
        ["ASampleOne", "ASampleTwo", "ASampleThree", "ASampleFour",
         "ASampleFive"],
        name="index",
        dtype=object,
    )
    tm.assert_index_equal(x_test.index, expected_test_index, exact=True)

    expected_test_label = pandas.Series(
        _unordered(["no", "no", "yes", "yes", "no"], ["yes", "no"]),
        name="label",
        index=expected_test_index,
    )
    tm.assert_series_equal(y_test["label"], expected_test_label,
                           check_exact=True)

    expected_test_value = pandas.Series(
        [1.51, 1.38, -20, 245.3, 3.14],
        name="value",
        index=expected_test_index,
    )
    tm.assert_series_equal(x_test["value"], expected_test_value,
                           check_exact=True)

    expected_test_size = pandas.Series(
        _unordered(["small", "small", "large", "small", "large"],
                   ["small", "medium", "large"]),
        name="size",
        index=expected_test_index,
    )
    tm.assert_series_equal(x_test["size"], expected_test_size,
                           check_exact=True)