def test_load_digits():
    """Check shape and label cardinality of the bundled digits dataset."""
    bunch = load_digits()
    assert_equal(bunch.data.shape, (1797, 64))
    # 10 distinct digit classes: 0..9
    assert_equal(numpy.unique(bunch.target).size, 10)
    # return_X_y=True must yield the same (data, target) pair as the Bunch
    check_return_X_y(bunch, partial(load_digits))
def test_fetch_rcv1():
    """End-to-end checks on the RCV1 dataset: sparsity, exact shapes and
    nnz counts, category ordering, per-category document counts, the
    shuffled 'train' subset, and the return_X_y option."""
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        # NOTE(review): for any other errno the except clause falls
        # through and `data1` is unbound, raising NameError below —
        # confirm whether other IOErrors should also skip.
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test ordering of categories (lexicographic on category codes)
    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories (nnz in the label column)
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values: the same sample_id must carry identical
    # feature and target rows in the full and shuffled-train views
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
def test_percent10():
    """Smoke-test the kddcup99 loader: full 10% sample plus named subsets."""
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")
    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    # shuffling must not change the shapes
    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    # each named subset has a fixed, documented shape
    subset_shapes = [
        ('SA', (100655, 41), (100655,)),
        ('SF', (73237, 4), (73237,)),
        ('http', (58725, 3), (58725,)),
        ('smtp', (9571, 3), (9571,)),
    ]
    for subset, data_shape, target_shape in subset_shapes:
        data = fetch_kddcup99(subset)
        assert_equal(data.data.shape, data_shape)
        assert_equal(data.target.shape, target_shape)

    # return_X_y consistency, exercised on the last ('smtp') subset
    check_return_X_y(data, partial(fetch_kddcup99, 'smtp'))
def test_load_wine():
    """Validate dimensions and metadata of the bundled wine dataset."""
    wine = load_wine()
    assert_equal(wine.data.shape, (178, 13))
    assert_equal(wine.target.size, 178)
    assert_equal(wine.target_names.size, 3)
    assert wine.DESCR
    # return_X_y=True must match the Bunch contents
    check_return_X_y(wine, partial(load_wine))
def test_load_diabetes():
    """Validate dimensions and metadata of the bundled diabetes dataset."""
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    # BUG FIX: `assert_true(res.target.size, 442)` passed 442 as the
    # failure *message*, so the size was never compared to 442; use an
    # actual equality assertion.
    assert_equal(res.target.size, 442)
    assert_equal(len(res.feature_names), 10)
    assert_true(res.DESCR)
    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
def test_load_wine():
    """Check shapes, class count and description of the wine dataset."""
    wine = load_wine()
    assert wine.data.shape == (178, 13)
    assert wine.target.size == 178
    assert wine.target_names.size == 3
    assert wine.DESCR
    # return_X_y=True must match the Bunch contents
    check_return_X_y(wine, partial(load_wine))
def test_load_diabetes():
    """Validate dimensions and metadata of the bundled diabetes dataset."""
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    # BUG FIX: `assert res.target.size, 442` parsed 442 as the assert
    # message, so the statement only checked truthiness of the size and
    # never compared it to 442; compare explicitly.
    assert res.target.size == 442
    assert_equal(len(res.feature_names), 10)
    assert res.DESCR
    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
def test_load_wine():
    """Check shapes, class count and description of the wine dataset."""
    wine = load_wine()
    assert_equal(wine.data.shape, (178, 13))
    assert_equal(wine.target.size, 178)
    assert_equal(wine.target_names.size, 3)
    assert_true(wine.DESCR)
    # return_X_y=True must match the Bunch contents
    check_return_X_y(wine, partial(load_wine))
def test_fetch_rcv1(fetch_rcv1_fxt):
    """End-to-end checks on the RCV1 dataset (fixture-based fetcher):
    sparsity, exact shapes and nnz counts, DESCR header, category
    ordering, per-category counts, shuffled 'train' subset, return_X_y."""
    data1 = fetch_rcv1_fxt(shuffle=False)
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414, ) == s1.shape
    assert 103 == len(cat_list)

    # test descr
    assert data1.DESCR.startswith(".. _rcv1_dataset:")

    # test ordering of categories (lexicographic on category codes)
    first_categories = ["C11", "C12", "C13", "C14", "C15", "C151"]
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories (nnz in the label column)
    some_categories = ("GMIL", "E143", "CCAT")
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1_fxt(shuffle=True, subset="train", random_state=77)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset="train")
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values: the same sample_id must carry identical
    # feature and target rows in the full and shuffled-train views
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
def test_load_fake_lfw_people(): lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, min_faces_per_person=3, download_if_missing=False) # The data is croped around the center as a rectangular bounding box # around the face. Colors are converted to gray levels: assert lfw_people.images.shape == (10, 62, 47) assert lfw_people.data.shape == (10, 2914) # the target is array of person integer ids assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2]) # names of the persons can be found using the target_names array expected_classes = ["Abdelatif Smith", "Abhati Kepler", "Onur Lopez"] assert_array_equal(lfw_people.target_names, expected_classes) # It is possible to ask for the original data without any croping or color # conversion and not limit on the number of picture per person lfw_people = fetch_lfw_people( data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True, download_if_missing=False, ) assert lfw_people.images.shape == (17, 250, 250, 3) assert lfw_people.DESCR.startswith( ".. _labeled_faces_in_the_wild_dataset:") # the ids and class names are the same as previously assert_array_equal(lfw_people.target, [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2]) assert_array_equal( lfw_people.target_names, [ "Abdelatif Smith", "Abhati Kepler", "Camara Alvaro", "Chen Dupont", "John Lee", "Lin Bauman", "Onur Lopez", ], ) # test return_X_y option fetch_func = partial( fetch_lfw_people, data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True, download_if_missing=False, ) check_return_X_y(lfw_people, fetch_func)
def test_load_iris():
    """Check shapes, class count and on-disk file of the iris dataset."""
    iris = load_iris()
    assert iris.data.shape == (150, 4)
    assert iris.target.size == 150
    assert iris.target_names.size == 3
    assert iris.DESCR
    # the loader also exposes the path of the backing CSV file
    assert os.path.exists(iris.filename)
    # return_X_y=True must match the Bunch contents
    check_return_X_y(iris, partial(load_iris))
def test_load_iris():
    """Check shapes, class count and on-disk file of the iris dataset."""
    iris = load_iris()
    assert_equal(iris.data.shape, (150, 4))
    assert_equal(iris.target.size, 150)
    assert_equal(iris.target_names.size, 3)
    assert_true(iris.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(iris.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(iris, partial(load_iris))
def test_load_boston():
    """Check shapes, feature count and on-disk file of the boston dataset."""
    boston = load_boston()
    assert_equal(boston.data.shape, (506, 13))
    assert_equal(boston.target.size, 506)
    assert_equal(boston.feature_names.size, 13)
    assert_true(boston.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(boston.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(boston, partial(load_boston))
def test_load_iris():
    """Check shapes, class count and on-disk file of the iris dataset."""
    iris = load_iris()
    assert_equal(iris.data.shape, (150, 4))
    assert_equal(iris.target.size, 150)
    assert_equal(iris.target_names.size, 3)
    assert_true(iris.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(iris.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(iris, partial(load_iris))
def test_load_boston():
    """Check shapes, feature count and on-disk file of the boston dataset."""
    boston = load_boston()
    assert boston.data.shape == (506, 13)
    assert boston.target.size == 506
    assert boston.feature_names.size == 13
    assert boston.DESCR
    # the loader also exposes the path of the backing CSV file
    assert os.path.exists(boston.filename)
    # return_X_y=True must match the Bunch contents
    check_return_X_y(boston, partial(load_boston))
def test_load_breast_cancer():
    """Check shapes, class count and on-disk file of the breast cancer set."""
    cancer = load_breast_cancer()
    assert cancer.data.shape == (569, 30)
    assert cancer.target.size == 569
    assert cancer.target_names.size == 2
    assert cancer.DESCR
    # the loader also exposes the path of the backing CSV file
    assert os.path.exists(cancer.filename)
    # return_X_y=True must match the Bunch contents
    check_return_X_y(cancer, partial(load_breast_cancer))
def test_load_boston():
    """Check shapes, feature count and on-disk file of the boston dataset."""
    boston = load_boston()
    assert_equal(boston.data.shape, (506, 13))
    assert_equal(boston.target.size, 506)
    assert_equal(boston.feature_names.size, 13)
    assert_true(boston.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(boston.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(boston, partial(load_boston))
def test_load_breast_cancer():
    """Check shapes, class count and on-disk file of the breast cancer set."""
    cancer = load_breast_cancer()
    assert_equal(cancer.data.shape, (569, 30))
    assert_equal(cancer.target.size, 569)
    assert_equal(cancer.target_names.size, 2)
    assert_true(cancer.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(cancer.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(cancer, partial(load_breast_cancer))
def test_load_breast_cancer():
    """Check shapes, class count and on-disk file of the breast cancer set."""
    cancer = load_breast_cancer()
    assert_equal(cancer.data.shape, (569, 30))
    assert_equal(cancer.target.size, 569)
    assert_equal(cancer.target_names.size, 2)
    assert_true(cancer.DESCR)
    # the loader also exposes the path of the backing CSV file
    assert_true(os.path.exists(cancer.filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(cancer, partial(load_breast_cancer))
def test_fetch():
    """Check shapes of the California housing dataset and return_X_y."""
    try:
        housing = fetch()
    except IOError:
        raise SkipTest("California housing dataset can not be loaded.")
    assert housing.data.shape == (20640, 8)
    assert housing.target.shape == (20640, )
    # test return_X_y option
    check_return_X_y(housing, partial(fetch))
def test_load_linnerud():
    """Check shapes and on-disk files of the multi-output linnerud dataset."""
    linnerud = load_linnerud()
    assert_equal(linnerud.data.shape, (20, 3))
    # linnerud is multi-output: target is a matrix, not a vector
    assert_equal(linnerud.target.shape, (20, 3))
    assert_equal(len(linnerud.target_names), 3)
    assert_true(linnerud.DESCR)
    assert_true(os.path.exists(linnerud.data_filename))
    assert_true(os.path.exists(linnerud.target_filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(linnerud, partial(load_linnerud))
def test_fetch():
    """Check shapes of the California housing dataset and return_X_y."""
    try:
        housing = fetch()
    except IOError:
        raise SkipTest("California housing dataset can not be loaded.")
    assert housing.data.shape == (20640, 8)
    assert housing.target.shape == (20640, )
    # test return_X_y option
    check_return_X_y(housing, partial(fetch))
def test_load_linnerud():
    """Check shapes and on-disk files of the multi-output linnerud dataset."""
    linnerud = load_linnerud()
    assert_equal(linnerud.data.shape, (20, 3))
    # linnerud is multi-output: target is a matrix, not a vector
    assert_equal(linnerud.target.shape, (20, 3))
    assert_equal(len(linnerud.target_names), 3)
    assert_true(linnerud.DESCR)
    assert_true(os.path.exists(linnerud.data_filename))
    assert_true(os.path.exists(linnerud.target_filename))
    # return_X_y=True must match the Bunch contents
    check_return_X_y(linnerud, partial(load_linnerud))
def test_load_linnerud():
    """Check shapes and on-disk files of the multi-output linnerud dataset."""
    linnerud = load_linnerud()
    assert linnerud.data.shape == (20, 3)
    # linnerud is multi-output: target is a matrix, not a vector
    assert linnerud.target.shape == (20, 3)
    assert len(linnerud.target_names) == 3
    assert linnerud.DESCR
    assert os.path.exists(linnerud.data_filename)
    assert os.path.exists(linnerud.target_filename)
    # return_X_y=True must match the Bunch contents
    check_return_X_y(linnerud, partial(load_linnerud))
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    """Check keys, shapes and label range of the Olivetti faces Bunch."""
    data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)
    assert isinstance(data, Bunch)
    # BUG FIX: the Bunch exposes the key 'data', not 'area_data' (see the
    # sibling olivetti test and the attribute accesses below); the old key
    # could never be present, so the membership check always failed.
    for expected_keys in ('data', 'images', 'target', 'DESCR'):
        assert expected_keys in data.keys()
    assert data.data.shape == (400, 4096)
    assert data.images.shape == (400, 64, 64)
    assert data.target.shape == (400, )
    # 40 subjects, labelled 0..39
    assert_array_equal(np.unique(np.sort(data.target)), np.arange(40))
    # test the return_X_y option
    check_return_X_y(data, fetch_olivetti_faces_fxt)
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    """Check keys, shapes, labels and DESCR of the Olivetti faces Bunch."""
    bunch = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)
    assert isinstance(bunch, Bunch)
    for key in ("data", "images", "target", "DESCR"):
        assert key in bunch.keys()
    assert bunch.data.shape == (400, 4096)
    assert bunch.images.shape == (400, 64, 64)
    assert bunch.target.shape == (400, )
    # 40 subjects, labelled 0..39
    assert_array_equal(np.unique(np.sort(bunch.target)), np.arange(40))
    assert bunch.DESCR.startswith(".. _olivetti_faces_dataset:")
    # test the return_X_y option
    check_return_X_y(bunch, fetch_olivetti_faces_fxt)
def test_fetch(fetch_covtype_fxt):
    """Covertype: two shuffles must agree in shape and content sums."""
    first = fetch_covtype_fxt(shuffle=True, random_state=42)
    second = fetch_covtype_fxt(shuffle=True, random_state=37)

    X1, X2 = first['data'], second['data']
    assert X1.shape == (581012, 54)
    assert X2.shape == X1.shape
    # shuffling only permutes rows, so the element sums must be identical
    assert X1.sum() == X2.sum()

    y1, y2 = first['target'], second['target']
    assert y1.shape == (X1.shape[0],)
    assert y2.shape == (X1.shape[0],)

    # test return_X_y option
    check_return_X_y(first, partial(fetch_covtype_fxt))
def test_fetch(fetch_covtype_fxt):
    """Covertype: shuffles agree in shape/content and DESCR has its header."""
    first = fetch_covtype_fxt(shuffle=True, random_state=42)
    second = fetch_covtype_fxt(shuffle=True, random_state=37)

    X1, X2 = first["data"], second["data"]
    assert X1.shape == (581012, 54)
    assert X2.shape == X1.shape
    # shuffling only permutes rows, so the element sums must be identical
    assert X1.sum() == X2.sum()

    y1, y2 = first["target"], second["target"]
    assert y1.shape == (X1.shape[0],)
    assert y2.shape == (X1.shape[0],)

    # both bunches carry the covtype description header
    descr_prefix = ".. _covtype_dataset:"
    assert first.DESCR.startswith(descr_prefix)
    assert second.DESCR.startswith(descr_prefix)

    # test return_X_y option
    check_return_X_y(first, partial(fetch_covtype_fxt))
def test_fetch():
    """Covertype fetch: shape/content stability across random shuffles."""
    try:
        first = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Covertype dataset can not be loaded.")
    second = fetch(shuffle=True, random_state=37)

    X1, X2 = first['data'], second['data']
    assert X1.shape == (581012, 54)
    assert X2.shape == X1.shape
    # shuffling only permutes rows, so the element sums must be identical
    assert X1.sum() == X2.sum()

    y1, y2 = first['target'], second['target']
    assert y1.shape == (X1.shape[0], )
    assert y2.shape == (X1.shape[0], )

    # test return_X_y option
    check_return_X_y(first, partial(fetch))
def test_fetch():
    """Covertype fetch: identical shapes and sums for two shuffles."""
    try:
        first = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Covertype dataset can not be loaded.")
    second = fetch(shuffle=True, random_state=37)

    X1, X2 = first['data'], second['data']
    assert_equal((581012, 54), X1.shape)
    assert_equal(X1.shape, X2.shape)
    # a shuffle reorders rows without changing the values
    assert_equal(X1.sum(), X2.sum())

    y1, y2 = first['target'], second['target']
    assert_equal((X1.shape[0],), y1.shape)
    assert_equal((X1.shape[0],), y2.shape)

    # test return_X_y option
    check_return_X_y(first, partial(fetch))
def test_load_fake_lfw_people(): lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, min_faces_per_person=3, download_if_missing=False) # The data is croped around the center as a rectangular bounding box # around the face. Colors are converted to gray levels: assert_equal(lfw_people.images.shape, (10, 62, 47)) assert_equal(lfw_people.data.shape, (10, 2914)) # the target is array of person integer ids assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2]) # names of the persons can be found using the target_names array expected_classes = ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez'] assert_array_equal(lfw_people.target_names, expected_classes) # It is possible to ask for the original data without any croping or color # conversion and not limit on the number of picture per person lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True, download_if_missing=False) assert_equal(lfw_people.images.shape, (17, 250, 250, 3)) # the ids and class names are the same as previously assert_array_equal(lfw_people.target, [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2]) assert_array_equal(lfw_people.target_names, ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro', 'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez']) # test return_X_y option fetch_func = partial(fetch_lfw_people, data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True, download_if_missing=False) check_return_X_y(lfw_people, fetch_func)
def test_fetch_rcv1():
    """End-to-end checks on the RCV1 dataset: sparsity, exact shapes and
    nnz counts, category ordering, per-category document counts, the
    shuffled 'train' subset, and the return_X_y option."""
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        # NOTE(review): for any other errno the except clause falls
        # through and `data1` is unbound, raising NameError below —
        # confirm whether other IOErrors should also skip.
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414, ) == s1.shape
    assert 103 == len(cat_list)

    # test ordering of categories (lexicographic on category codes)
    first_categories = ['C11', 'C12', 'C13', 'C14', 'C15', 'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories (nnz in the label column)
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values: the same sample_id must carry identical
    # feature and target rows in the full and shuffled-train views
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    """Fetch one OpenML dataset three ways (by name+version, by name only,
    by id) and validate shapes, dtypes, feature names, nominal categories,
    sparsity, missing-value count and the return_X_y option.

    Returns the Bunch fetched by id. Note that this function can be mocked
    (by invoking _monkey_patch_webbased_functions before invoking it).

    NOTE(review): the `expected_data_dtype` parameter is accepted but never
    used in this body — the data dtype is hard-checked against np.float64
    below; confirm whether that is intentional.
    """
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations,
                                     expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    # nominal feature columns must only contain codes 0..len(categories)-1
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    """Fetch one OpenML dataset three ways (by name+version, by name only,
    by id) and validate shapes, dtypes, feature names, nominal categories,
    sparsity, missing-value count and the return_X_y option.

    Returns the Bunch fetched by id. Note that this function can be mocked
    (by invoking _monkey_patch_webbased_functions before invoking it).

    NOTE(review): the `expected_data_dtype` parameter is accepted but never
    used in this body — the data dtype is hard-checked against np.float64
    below; confirm whether that is intentional.
    """
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations,
                                     expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    # nominal feature columns must only contain codes 0..len(categories)-1
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
def test_fetch_kddcup99_return_X_y(fetch_kddcup99_fxt):
    """return_X_y=True must mirror the Bunch for the 'smtp' subset."""
    smtp_fetcher = partial(fetch_kddcup99_fxt, subset='smtp')
    bunch = smtp_fetcher()
    check_return_X_y(bunch, smtp_fetcher)
def test_20news_vectorized():
    """Check CSR format, shapes and dtype of the vectorized 20newsgroups
    splits (train/test/all) and the return_X_y option."""
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option (on the 'test' subset fetched above)
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized,
                         subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all (train + test concatenated)
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)