def test_create_dataset_list(self):
    """Publish a dataset supplied as a plain list of lists and verify
    that the server-side ARFF matches the locally generated one.
    """
    # Modified weather toy data: one row per day, mixed string /
    # nominal / numeric columns.
    rows = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
        ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
        ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
        ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
        ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
        ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
        ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
        ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
        ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
        ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
    ]
    # Explicit (name, type) pairs; nominal columns list their categories.
    attribute_spec = [
        ('rnd_str', 'STRING'),
        ('outlook', ['sunny', 'overcast', 'rainy']),
        ('temperature', 'REAL'),
        ('humidity', 'REAL'),
        ('windy', ['TRUE', 'FALSE']),
        ('play', ['yes', 'no']),
    ]
    dataset = create_dataset(
        name="%s-ModifiedWeather" % self._get_sentinel(),
        description=(
            'Testing dataset upload when the data is a list of lists'
        ),
        creator='OpenML test',
        contributor=None,
        collection_date='21-09-2018',
        language='English',
        licence='MIT',
        default_target_attribute='play',
        row_id_attribute=None,
        ignore_attribute=None,
        citation='None',
        attributes=attribute_spec,
        data=rows,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python',
    )
    upload_did = dataset.publish()
    # The stored ARFF must be identical to the local rendering, and the
    # server must have recognised the upload as dense ARFF.
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'arff',
        "Wrong format for dataset",
    )
def test_create_dataset_numpy(self):
    """Publish a dataset created from a NumPy array and verify the
    ARFF round-trip.

    NOTE(review): a method with this exact name is defined again later
    in this file; Python keeps only the last definition, so this earlier
    copy is silently shadowed and never run — one of the two should be
    removed or renamed.
    """
    # Four columns of three values each; transposed so that rows become
    # samples (shape (3, 4): 3 samples, 4 columns incl. target).
    data = np.array([[1, 2, 3],
                     [1.2, 2.5, 3.8],
                     [2, 5, 8],
                     [0, 1, 0]]).T
    # One REAL attribute per column: col_0 .. col_3.
    attributes = [('col_{}'.format(i), 'REAL')
                  for i in range(data.shape[1])]
    dataset = create_dataset(
        name='%s-NumPy_testing_dataset' % self._get_sentinel(),
        description='Synthetic dataset created from a NumPy array',
        creator='OpenML tester',
        contributor=None,
        collection_date='01-01-2018',
        language='English',
        licence='MIT',
        # last column is the target
        default_target_attribute='col_{}'.format(data.shape[1] - 1),
        row_id_attribute=None,
        ignore_attribute=None,
        citation='None',
        attributes=attributes,
        data=data,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python')
    upload_did = dataset.publish()
    # Server-side ARFF must equal the local rendering, in dense format.
    self.assertEqual(_get_online_dataset_arff(upload_did),
                     dataset._dataset,
                     "Uploaded arff does not match original one")
    self.assertEqual(_get_online_dataset_format(upload_did),
                     'arff',
                     "Wrong format for dataset")
def test_create_dataset_row_id_attribute_inference(self):
    """Check ``row_id_attribute`` inference when publishing a pandas
    DataFrame, over every combination of an explicit row-id argument
    and a named/unnamed index.
    """
    # shared meta-information for every upload
    name = '%s-pandas_testing_dataset' % self._get_sentinel()
    description = 'Synthetic dataset created from a Pandas DataFrame'
    creator = 'OpenML tester'
    collection_date = '01-01-2018'
    language = 'English'
    licence = 'MIT'
    default_target_attribute = 'target'
    citation = 'None'
    original_data_url = 'http://openml.github.io/openml-python'
    paper_url = 'http://openml.github.io/openml-python'
    # Check that the index name is well inferred.
    frame = pd.DataFrame(
        [['a', 1, 0],
         ['b', 2, 1],
         ['c', 3, 0],
         ['d', 4, 1],
         ['e', 5, 0]],
        columns=['rnd_str', 'integer', 'target'],
    )
    # Cartesian product of (explicit row-id argument) x (index name);
    # the expected inferred row id follows the same ordering.
    row_id_options = [None, 'integer']
    index_names = [None, 'index_name']
    expected_ids = [None, 'index_name', 'integer', 'integer']
    combos = product(row_id_options, index_names)
    for expected, (row_id_arg, idx_name) in zip(expected_ids, combos):
        frame.index.name = idx_name
        dataset = openml.datasets.functions.create_dataset(
            name=name,
            description=description,
            creator=creator,
            contributor=None,
            collection_date=collection_date,
            language=language,
            licence=licence,
            default_target_attribute=default_target_attribute,
            ignore_attribute=None,
            citation=citation,
            attributes='auto',
            data=frame,
            row_id_attribute=row_id_arg,
            version_label='test',
            original_data_url=original_data_url,
            paper_url=paper_url,
        )
        self.assertEqual(dataset.row_id_attribute, expected)
        upload_did = dataset.publish()
        decoded = arff.loads(_get_online_dataset_arff(upload_did))
        values = np.array(decoded['data'], dtype=object)
        # if we set the name of the index then the index will be added
        # to the data as an extra column
        self.assertEqual(
            values.shape,
            (5, 3) if idx_name is None else (5, 4),
        )
def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. dataset = openml.datasets.get_dataset(dataset_id) decoder = arff.ArffDecoder() # check if the arff from the dataset is # the same as the arff from _get_arff function d_format = (dataset.format).lower() self.assertEqual( dataset._get_arff(d_format), decoder.decode( _get_online_dataset_arff(dataset_id), encode_nominal=True, return_type=arff.DENSE if d_format == 'arff' else arff.COO), "ARFF files are not equal")
def test_create_dataset_numpy(self):
    """Upload a small synthetic NumPy dataset and verify the server
    stores exactly the locally generated ARFF, in plain 'arff' format.
    """
    raw = np.array([
        [1, 2, 3],
        [1.2, 2.5, 3.8],
        [2, 5, 8],
        [0, 1, 0],
    ])
    samples = raw.T  # transpose: rows become samples
    n_cols = samples.shape[1]
    # one REAL attribute per column: col_0 .. col_{n_cols - 1}
    col_specs = [('col_{}'.format(i), 'REAL') for i in range(n_cols)]
    dataset = create_dataset(
        name='%s-NumPy_testing_dataset' % self._get_sentinel(),
        description='Synthetic dataset created from a NumPy array',
        creator='OpenML tester',
        contributor=None,
        collection_date='01-01-2018',
        language='English',
        licence='MIT',
        default_target_attribute='col_{}'.format(n_cols - 1),
        row_id_attribute=None,
        ignore_attribute=None,
        citation='None',
        attributes=col_specs,
        data=samples,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python',
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded arff does not match original one",
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'arff',
        "Wrong format for dataset",
    )
def test_get_online_dataset_arff(self):
    """The ARFF fetched via ``_get_online_dataset_arff`` must decode to
    the same structure returned by ``OpenMLDataset._get_arff``.
    """
    # Australian dataset
    dataset_id = 100
    dataset = openml.datasets.get_dataset(dataset_id)
    fmt = (dataset.format).lower()
    # non-'arff' formats are decoded in sparse coordinate (COO) form
    expected_return_type = arff.DENSE if fmt == 'arff' else arff.COO
    decoded = arff.ArffDecoder().decode(
        _get_online_dataset_arff(dataset_id),
        encode_nominal=True,
        return_type=expected_return_type,
    )
    self.assertEqual(
        dataset._get_arff(fmt),
        decoded,
        "ARFF files are not equal",
    )
def test_create_dataset_pandas(self):
    """Publish datasets built from pandas containers and verify the
    ARFF round-trip in three scenarios: a dense DataFrame with inferred
    attributes, a SparseDataFrame, and a DataFrame whose attribute
    categories are supplied explicitly.
    """
    data = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
    ]
    column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                    'windy', 'play']
    df = pd.DataFrame(data, columns=column_names)
    # enforce the type of each column
    df['outlook'] = df['outlook'].astype('category')
    df['windy'] = df['windy'].astype('bool')
    df['play'] = df['play'].astype('category')
    # meta-information
    name = '%s-pandas_testing_dataset' % self._get_sentinel()
    description = 'Synthetic dataset created from a Pandas DataFrame'
    creator = 'OpenML tester'
    collection_date = '01-01-2018'
    language = 'English'
    licence = 'MIT'
    default_target_attribute = 'play'
    citation = 'None'
    original_data_url = 'http://openml.github.io/openml-python'
    paper_url = 'http://openml.github.io/openml-python'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',  # infer attribute types from the dtypes
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    # Check that SparseDataFrame are supported properly
    # NOTE(review): pd.SparseDataFrame was removed in pandas >= 1.0;
    # presumably this file targets an older pinned pandas — confirm.
    sparse_data = scipy.sparse.coo_matrix((
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
    ))
    column_names = ['input1', 'input2', 'y']
    df = pd.SparseDataFrame(sparse_data, columns=column_names)
    # meta-information
    description = 'Synthetic dataset created from a Pandas SparseDataFrame'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    # sparse input must be stored in the sparse ARFF representation
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'sparse_arff',
        "Wrong format for dataset"
    )
    # Check that we can overwrite the attributes
    data = [['a'], ['b'], ['c'], ['d'], ['e']]
    column_names = ['rnd_str']
    df = pd.DataFrame(data, columns=column_names)
    df['rnd_str'] = df['rnd_str'].astype('category')
    # categories 'f' and 'g' are not present in the data; the explicit
    # attribute spec must still carry them into the uploaded ARFF
    attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes=attributes,
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    downloaded_data = _get_online_dataset_arff(upload_did)
    self.assertEqual(
        downloaded_data,
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    self.assertTrue(
        '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}' in downloaded_data)
def test_create_dataset_sparse(self):
    """Publish sparse data in two representations — a
    ``scipy.sparse.coo_matrix`` and a list of dicts — and verify each
    round-trips through the server as 'sparse_arff'.
    """
    xor_attributes = [
        ('input1', 'REAL'),
        ('input2', 'REAL'),
        ('y', 'REAL'),
    ]
    # COO representation of the XOR truth table (explicit 0.0 at [0, 0]).
    coo_payload = scipy.sparse.coo_matrix((
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
    ))
    # Same data as a list of {column_index: value} dicts.
    dict_payload = [
        {0: 0.0},
        {1: 1.0, 2: 1.0},
        {0: 1.0, 2: 1.0},
        {0: 1.0, 1: 1.0},
    ]
    # Both representations go through the identical upload-and-check cycle.
    for payload in (coo_payload, dict_payload):
        xor_dataset = create_dataset(
            name="%s-XOR" % self._get_sentinel(),
            description='Dataset representing the XOR operation',
            creator=None,
            contributor=None,
            collection_date=None,
            language='English',
            licence=None,
            default_target_attribute='y',
            row_id_attribute=None,
            ignore_attribute=None,
            citation=None,
            attributes=xor_attributes,
            data=payload,
            version_label='test',
        )
        upload_did = xor_dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            xor_dataset._dataset,
            "Uploaded ARFF does not match original one",
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'sparse_arff',
            "Wrong format for dataset",
        )