Example #1
def test_fetch_openml_australian_pandas_error_sparse(monkeypatch):
    data_id = 292

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = 'Cannot return dataframe with sparse data'
    with pytest.raises(ValueError, match=msg):
        fetch_openml(data_id=data_id, as_frame=True, cache=False)
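Since sparse ARFF data cannot be returned as a DataFrame, the supported path is as_frame=False. A minimal sketch under the same mocked dataset (not part of the original test):

# Sketch: with as_frame=False the fetch succeeds and bunch.data is a
# scipy.sparse matrix rather than a pandas DataFrame.
bunch = fetch_openml(data_id=292, as_frame=False, cache=False)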
Example #2
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    pytest.importorskip('pandas')

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = 'Could not adhere to working_memory config.'
    with pytest.warns(UserWarning, match=msg):
        with config_context(working_memory=1e-6):
            fetch_openml(data_id=data_id, as_frame=True, cache=False)
Example #3
def test_fetch_openml_adultcensus_pandas(monkeypatch):
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    # Check because of the numeric row attribute (issue #12329)
    data_id = 1119
    data_shape = (10, 14)
    target_shape = (10, )
    frame_shape = (10, 15)

    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    n_categories = len([
        dtype for dtype in data.dtypes if isinstance(dtype, CategoricalDtype)
    ])
    n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Example #4
def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch):
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 1119
    data_shape = (10, 14)
    target_shape = (10, )

    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    X, y = fetch_openml(data_id=data_id,
                        as_frame=True,
                        cache=False,
                        return_X_y=True)
    assert isinstance(X, pd.DataFrame)
    assert X.shape == data_shape
    n_categories = len(
        [dtype for dtype in X.dtypes if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in X.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(y, pd.Series)
    assert y.shape == target_shape
    assert y.name == target_column
Example #5
def test_fetch_openml_iris_pandas_equal_to_no_frame(monkeypatch):
    # as_frame = True returns the same underlying data as as_frame = False
    pytest.importorskip('pandas')
    data_id = 61

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    frame_bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    frame_data = frame_bunch.data
    frame_target = frame_bunch.target

    norm_bunch = fetch_openml(data_id=data_id, as_frame=False, cache=False)
    norm_data = norm_bunch.data
    norm_target = norm_bunch.target

    assert_allclose(norm_data, frame_data)
    assert_array_equal(norm_target, frame_target)
Example #6
def test_fetch_openml_titanic_pandas(monkeypatch):
    # dataset with strings
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40945
    data_shape = (1309, 13)
    target_shape = (1309, )
    frame_shape = (1309, 14)
    name_to_dtype = {
        'pclass': np.float64,
        'name': object,
        'sex': CategoricalDtype(['female', 'male']),
        'age': np.float64,
        'sibsp': np.float64,
        'parch': np.float64,
        'ticket': object,
        'fare': np.float64,
        'cabin': object,
        'embarked': CategoricalDtype(['C', 'Q', 'S']),
        'boat': object,
        'body': np.float64,
        'home.dest': object,
        'survived': CategoricalDtype(['0', '1'])
    }

    frame_columns = [
        'pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
        'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'
    ]
    frame_dtypes = [name_to_dtype[col] for col in frame_columns]
    feature_names = [
        'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
        'cabin', 'embarked', 'boat', 'body', 'home.dest'
    ]
    target_name = 'survived'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.columns == feature_names)

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_name
    assert target.dtype == name_to_dtype[target_name]

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == frame_dtypes)
Example #7
def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Load MNIST, select two classes, shuffle and return only n_samples."""
    # Load data from http://openml.org/d/554
    mnist = fetch_openml('mnist_784', version=1)

    # take only two classes for binary classification
    mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

    X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
    if n_samples is not None:
        X, y = X[:n_samples], y[:n_samples]
    return X, y
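A brief usage sketch for the helper above (the sample count is illustrative):

# Shuffled binary subset (digits '0' vs '8'), truncated to 1000 samples.
X, y = load_mnist(n_samples=1000)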
Example #8
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id,
                        target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
Example #9
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500,
                                 n_features=int(1e4),
                                 effective_rank=100,
                                 tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.randint(0, small_size, sparsity)
        col = np.random.randint(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X
Example #10
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id,
                                        cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(mrex.datasets.openml, 'urlopen', _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id,
                                      cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
Example #11
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    data = fetch_openml('mnist_784')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
Example #12
def test_fetch_openml_iris_multitarget_pandas(monkeypatch):
    # classification dataset with numeric only columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype
    data_id = 61
    data_shape = (150, 3)
    target_shape = (150, 2)
    frame_shape = (150, 5)
    target_column = ['petalwidth', 'petallength']

    cat_dtype = CategoricalDtype(
        ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
    data_dtypes = [np.float64, np.float64] + [cat_dtype]
    data_names = ['sepallength', 'sepalwidth', 'class']
    target_dtypes = [np.float64, np.float64]
    target_names = ['petalwidth', 'petallength']

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    bunch = fetch_openml(data_id=data_id,
                         as_frame=True,
                         cache=False,
                         target_column=target_column)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert np.all(data.dtypes == data_dtypes)
    assert data.shape == data_shape
    assert np.all(data.columns == data_names)
    assert np.all(bunch.feature_names == data_names)

    assert isinstance(target, pd.DataFrame)
    assert np.all(target.dtypes == target_dtypes)
    assert target.shape == target_shape
    assert np.all(target.columns == target_names)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == [np.float64] * 4 + [cat_dtype])
Example #13
def test_fetch_openml_emotions_pandas(monkeypatch):
    # classification dataset with multiple targets (natively)
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40589
    target_column = [
        'amazed.suprised', 'happy.pleased', 'relaxing.calm', 'quiet.still',
        'sad.lonely', 'angry.aggresive'
    ]
    data_shape = (13, 72)
    target_shape = (13, 6)
    frame_shape = (13, 78)

    expected_frame_categories = 6
    expected_frame_floats = 72

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id,
                         as_frame=True,
                         cache=False,
                         target_column=target_column)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape

    assert isinstance(target, pd.DataFrame)
    assert target.shape == target_shape
    assert np.all(target.columns == target_column)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    n_categories = len([
        dtype for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype)
    ])
    n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f'])
    assert expected_frame_categories == n_categories
    assert expected_frame_floats == n_floats
Example #14
def test_fetch_openml_cpu_pandas(monkeypatch):
    # regression dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype
    data_id = 561
    data_shape = (209, 7)
    target_shape = (209, )
    frame_shape = (209, 8)

    cat_dtype = CategoricalDtype([
        'adviser', 'amdahl', 'apollo', 'basf', 'bti', 'burroughs', 'c.r.d',
        'cdc', 'cambex', 'dec', 'dg', 'formation', 'four-phase', 'gould', 'hp',
        'harris', 'honeywell', 'ibm', 'ipl', 'magnuson', 'microdata', 'nas',
        'ncr', 'nixdorf', 'perkin-elmer', 'prime', 'siemens', 'sperry',
        'sratus', 'wang'
    ])
    data_dtypes = [cat_dtype] + [np.float64] * 6
    feature_names = [
        'vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX'
    ]
    target_name = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == data_dtypes)
    assert np.all(data.columns == feature_names)
    assert np.all(bunch.feature_names == feature_names)

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.dtype == np.float64
    assert target.name == target_name

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Example #15
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test
Example #16
def test_fetch_openml_miceprotein_pandas(monkeypatch):
    # JvR: very important check, as this dataset defines several row ids
    # and ignore attributes. Note that the data_features json has 82
    # attributes, and the row id (1) and ignore attributes (3) have been
    # removed.
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40966
    data_shape = (7, 77)
    target_shape = (7, )
    frame_shape = (7, 78)

    target_column = 'class'
    frame_n_categories = 1
    frame_n_floats = 77

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == np.float64)

    assert isinstance(target, pd.Series)
    assert isinstance(target.dtype, CategoricalDtype)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    n_categories = len([
        dtype for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype)
    ])
    n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f'])
    assert frame_n_categories == n_categories
    assert frame_n_floats == n_floats
Example #17
def test_fetch_openml_anneal_pandas(monkeypatch):
    # classification dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 2
    target_column = 'class'
    data_shape = (11, 38)
    target_shape = (11, )
    frame_shape = (11, 39)
    expected_data_categories = 32
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    bunch = fetch_openml(data_id=data_id,
                         as_frame=True,
                         target_column=target_column,
                         cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    n_categories = len([
        dtype for dtype in data.dtypes if isinstance(dtype, CategoricalDtype)
    ])
    n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert isinstance(target.dtype, CategoricalDtype)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Example #18
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [
                None if is_scalar_nan(idx) else cat[int(idx)]
                for idx in data_bunch.data[:, col_idx]
            ]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'], sparse, None,
                                    False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
Example #19
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
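Usage sketch: the returned arrays are shaped for a 1-D regression, e.g. fitting a smooth trend of CO2 concentration over time.

# X is a column vector of decimal years, y the monthly-average CO2 (ppmv).
X, y = load_mauna_loa_atmospheric_co2()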
Example #20
# License: BSD 3 clause

import numpy as np

from mrex.compose import ColumnTransformer
from mrex.datasets import fetch_openml
from mrex.pipeline import Pipeline
from mrex.impute import SimpleImputer
from mrex.preprocessing import StandardScaler, OneHotEncoder
from mrex.linear_model import LogisticRegression
from mrex.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
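A minimal sketch of that preprocessing, using only the imports already shown; the feature lists follow the comments above and the step names are illustrative:

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each pipeline to its own column subset and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])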
Example #21
# Author: Adam Kleczewski
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from mrex.datasets import fetch_openml
from mrex.multioutput import ClassifierChain
from mrex.model_selection import train_test_split
from mrex.multiclass import OneVsRestClassifier
from mrex.metrics import jaccard_score
from mrex.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
Y = Y == 'TRUE'
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of logistic regression classifier chains and take the
# average prediction of all the chains.
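A sketch of that ensemble, assuming ten chains with random label orderings:

chains = [ClassifierChain(base_lr, order='random', random_state=i)
          for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)

# Average the per-chain probability predictions, then threshold at 0.5.
Y_pred_chains = np.array([chain.predict_proba(X_test) for chain in chains])
Y_pred_ensemble = Y_pred_chains.mean(axis=0)
ensemble_jaccard_score = jaccard_score(Y_test,
                                       Y_pred_ensemble >= .5,
                                       average='samples')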
Example #22
import time

from mrex.datasets import fetch_openml
from mrex.linear_model import LogisticRegression
from mrex.model_selection import train_test_split
from mrex.preprocessing import StandardScaler
from mrex.utils import check_random_state

print(__doc__)

# Author: Arthur Mensch <*****@*****.**>
# License: BSD 3 clause

# Turn down for faster convergence
t0 = time.time()
train_samples = 5000

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_samples, test_size=10000)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Turn up tolerance for faster convergence
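One plausible continuation (the hyperparameters are illustrative, not from the excerpt): a sparse L1-penalized model fit with the saga solver.

clf = LogisticRegression(C=50. / train_samples,
                         penalty='l1',
                         solver='saga',
                         tol=0.1)
clf.fit(X_train, y_train)
print("Test score with L1 penalty: %.4f" % clf.score(X_test, y_test))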
Example #23
def _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                               expected_observations, expected_features,
                               expected_missing, expected_data_dtype,
                               expected_target_dtype, expect_sparse,
                               compare_default_target):
    # Fetches a dataset in three different ways from OpenML, using the
    # fetch_openml function, and runs various checks on the validity of
    # the result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function).
    data_by_name_id = fetch_openml(name=data_name,
                                   version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id,
                              cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(
            data_by_id.data)) == expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml,
                         data_id=data_id,
                         cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
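A hypothetical invocation for the iris dataset (data id 61); the expected values are illustrative and would need to match the mocked fixtures:

_fetch_dataset_from_openml(data_id=61, data_name='iris', data_version=1,
                           target_column='class',
                           expected_observations=150, expected_features=4,
                           expected_missing=0,
                           expected_data_dtype=np.float64,
                           expected_target_dtype=object,
                           expect_sparse=False,
                           compare_default_target=True)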
Example #24
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4