Esempio n. 1
0
def test_fetch_openml_australian_pandas_error_sparse(monkeypatch):
    data_id = 292

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = 'Cannot return dataframe with sparse data'
    with pytest.raises(ValueError, match=msg):
        fetch_openml(data_id=data_id, as_frame=True, cache=False)
Esempio n. 2
0
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    pytest.importorskip('pandas')

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = 'Could not adhere to working_memory config.'
    with pytest.warns(UserWarning, match=msg):
        with config_context(working_memory=1e-6):
            fetch_openml(data_id=data_id, as_frame=True, cache=False)
Esempio n. 3
0
def test_fetch_openml_adultcensus_pandas(monkeypatch):
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    # Check because of the numeric row attribute (issue #12329)
    data_id = 1119
    data_shape = (10, 14)
    target_shape = (10, )
    frame_shape = (10, 15)

    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    n_categories = len([dtype for dtype in data.dtypes
                       if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Esempio n. 4
0
def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch):
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 1119
    data_shape = (10, 14)
    target_shape = (10, )

    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                        return_X_y=True)
    assert isinstance(X, pd.DataFrame)
    assert X.shape == data_shape
    n_categories = len([dtype for dtype in X.dtypes
                       if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in X.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(y, pd.Series)
    assert y.shape == target_shape
    assert y.name == target_column
Esempio n. 5
0
def test_fetch_openml_anneal_pandas(monkeypatch):
    # classification dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 2
    target_column = 'class'
    data_shape = (11, 38)
    target_shape = (11,)
    frame_shape = (11, 39)
    expected_data_categories = 32
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    bunch = fetch_openml(data_id=data_id, as_frame=True,
                         target_column=target_column, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    n_categories = len([dtype for dtype in data.dtypes
                       if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in data.dtypes if dtype.kind == 'f'])
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert isinstance(target.dtype, CategoricalDtype)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Esempio n. 6
0
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
Esempio n. 7
0
def test_fetch_openml_iris_pandas_equal_to_no_frame(monkeypatch):
    # as_frame = True returns the same underlying data as as_frame = False
    pytest.importorskip('pandas')
    data_id = 61

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    frame_bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    frame_data = frame_bunch.data
    frame_target = frame_bunch.target

    norm_bunch = fetch_openml(data_id=data_id, as_frame=False, cache=False)
    norm_data = norm_bunch.data
    norm_target = norm_bunch.target

    assert_allclose(norm_data, frame_data)
    assert_array_equal(norm_target, frame_target)
Esempio n. 8
0
def test_fetch_openml_titanic_pandas(monkeypatch):
    # dataset with strings
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40945
    data_shape = (1309, 13)
    target_shape = (1309, )
    frame_shape = (1309, 14)
    name_to_dtype = {
        'pclass': np.float64,
        'name': object,
        'sex': CategoricalDtype(['female', 'male']),
        'age': np.float64,
        'sibsp': np.float64,
        'parch': np.float64,
        'ticket': object,
        'fare': np.float64,
        'cabin': object,
        'embarked': CategoricalDtype(['C', 'Q', 'S']),
        'boat': object,
        'body': np.float64,
        'home.dest': object,
        'survived': CategoricalDtype(['0', '1'])
    }

    frame_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp',
                     'parch', 'ticket', 'fare', 'cabin', 'embarked',
                     'boat', 'body', 'home.dest']
    frame_dtypes = [name_to_dtype[col] for col in frame_columns]
    feature_names = ['pclass', 'name', 'sex', 'age', 'sibsp',
                     'parch', 'ticket', 'fare', 'cabin', 'embarked',
                     'boat', 'body', 'home.dest']
    target_name = 'survived'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.columns == feature_names)
    assert bunch.target_names == [target_name]

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_name
    assert target.dtype == name_to_dtype[target_name]

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == frame_dtypes)
Esempio n. 9
0
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
Esempio n. 10
0
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache'
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn_lib.datasets._openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
Esempio n. 11
0
def test_fetch_openml_cpu_pandas(monkeypatch):
    # regression dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype
    data_id = 561
    data_shape = (209, 7)
    target_shape = (209, )
    frame_shape = (209, 8)

    cat_dtype = CategoricalDtype(['adviser', 'amdahl', 'apollo', 'basf',
                                  'bti', 'burroughs', 'c.r.d', 'cdc',
                                  'cambex', 'dec', 'dg', 'formation',
                                  'four-phase', 'gould', 'hp', 'harris',
                                  'honeywell', 'ibm', 'ipl', 'magnuson',
                                  'microdata', 'nas', 'ncr', 'nixdorf',
                                  'perkin-elmer', 'prime', 'siemens',
                                  'sperry', 'sratus', 'wang'])
    data_dtypes = [cat_dtype] + [np.float64] * 6
    feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH',
                     'CHMIN', 'CHMAX']
    target_name = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == data_dtypes)
    assert np.all(data.columns == feature_names)
    assert np.all(bunch.feature_names == feature_names)
    assert bunch.target_names == [target_name]

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.dtype == np.float64
    assert target.name == target_name

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
Esempio n. 12
0
def test_fetch_openml_iris_multitarget_pandas(monkeypatch):
    # classification dataset with numeric only columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype
    data_id = 61
    data_shape = (150, 3)
    target_shape = (150, 2)
    frame_shape = (150, 5)
    target_column = ['petalwidth', 'petallength']

    cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor',
                                  'Iris-virginica'])
    data_dtypes = [np.float64, np.float64] + [cat_dtype]
    data_names = ['sepallength', 'sepalwidth', 'class']
    target_dtypes = [np.float64, np.float64]
    target_names = ['petalwidth', 'petallength']

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert np.all(data.dtypes == data_dtypes)
    assert data.shape == data_shape
    assert np.all(data.columns == data_names)
    assert np.all(bunch.feature_names == data_names)
    assert bunch.target_names == target_names

    assert isinstance(target, pd.DataFrame)
    assert np.all(target.dtypes == target_dtypes)
    assert target.shape == target_shape
    assert np.all(target.columns == target_names)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == [np.float64] * 4 + [cat_dtype])
Esempio n. 13
0
def test_fetch_openml_miceprotein_pandas(monkeypatch):
    # JvR: very important check, as this dataset defined several row ids
    # and ignore attributes. Note that data_features json has 82 attributes,
    # and row id (1), ignore attributes (3) have been removed.
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40966
    data_shape = (7, 77)
    target_shape = (7, )
    frame_shape = (7, 78)

    target_column = 'class'
    frame_n_categories = 1
    frame_n_floats = 77

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == np.float64)

    assert isinstance(target, pd.Series)
    assert isinstance(target.dtype, CategoricalDtype)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    n_categories = len([dtype for dtype in frame.dtypes
                       if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f'])
    assert frame_n_categories == n_categories
    assert frame_n_floats == n_floats
Esempio n. 14
0
def test_fetch_openml_emotions_pandas(monkeypatch):
    # classification dataset with multiple targets (natively)
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40589
    target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm',
                     'quiet.still', 'sad.lonely', 'angry.aggresive']
    data_shape = (13, 72)
    target_shape = (13, 6)
    frame_shape = (13, 78)

    expected_frame_categories = 6
    expected_frame_floats = 72

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape

    assert isinstance(target, pd.DataFrame)
    assert target.shape == target_shape
    assert np.all(target.columns == target_column)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    n_categories = len([dtype for dtype in frame.dtypes
                       if isinstance(dtype, CategoricalDtype)])
    n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == 'f'])
    assert expected_frame_categories == n_categories
    assert expected_frame_floats == n_floats
Esempio n. 15
0
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three various ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
        assert data_by_id.target_names == [target_column]
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
        assert data_by_id.target_names == target_column
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id