コード例 #1
0
    def test_create_dataset_list(self):
        """Publish a list-of-lists dataset and verify the uploaded copy.

        Attribute types are declared explicitly. Once published, the ARFF
        returned by ``_get_online_dataset_arff`` has to equal
        ``dataset._dataset`` and ``_get_online_dataset_format`` has to
        report ``'arff'``.
        """
        # Nominal domains, named so the attribute declaration reads cleanly.
        outlook_values = ['sunny', 'overcast', 'rainy']
        windy_values = ['TRUE', 'FALSE']
        play_values = ['yes', 'no']
        schema = [
            ('rnd_str', 'STRING'),
            ('outlook', outlook_values),
            ('temperature', 'REAL'),
            ('humidity', 'REAL'),
            ('windy', windy_values),
            ('play', play_values),
        ]

        # One row per observation; column order follows ``schema``.
        weather_rows = [
            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
            ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
            ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
            ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
            ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
            ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
            ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
            ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
            ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
            ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
        ]

        project_url = 'http://openml.github.io/openml-python'
        dataset_name = "%s-ModifiedWeather" % self._get_sentinel()
        summary = 'Testing dataset upload when the data is a list of lists'

        dataset = create_dataset(
            name=dataset_name,
            description=summary,
            creator='OpenML test',
            contributor=None,
            collection_date='21-09-2018',
            language='English',
            licence='MIT',
            default_target_attribute='play',
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=schema,
            data=weather_rows,
            version_label='test',
            original_data_url=project_url,
            paper_url=project_url,
        )

        published_id = dataset.publish()

        # The server-side ARFF must be identical to the locally built one.
        online_arff = _get_online_dataset_arff(published_id)
        self.assertEqual(
            online_arff,
            dataset._dataset,
            "Uploaded ARFF does not match original one",
        )

        online_format = _get_online_dataset_format(published_id)
        self.assertEqual(online_format, 'arff', "Wrong format for dataset")
コード例 #2
0
    def test_create_dataset_numpy(self):
        """Publish a dataset built from a NumPy array and verify the upload.

        Every column is declared as ``'REAL'`` and the last column is used
        as the default target. The test asserts that the ARFF obtained from
        ``_get_online_dataset_arff`` equals ``dataset._dataset`` and that
        ``_get_online_dataset_format`` reports ``'arff'``.
        """
        # Written row-wise, then transposed so each inner list is a column.
        data = np.transpose(
            np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]])
        )

        n_columns = data.shape[1]
        column_names = ['col_{}'.format(idx) for idx in range(n_columns)]
        attributes = [(column, 'REAL') for column in column_names]

        project_url = 'http://openml.github.io/openml-python'
        creation_kwargs = {
            'name': '%s-NumPy_testing_dataset' % self._get_sentinel(),
            'description': 'Synthetic dataset created from a NumPy array',
            'creator': 'OpenML tester',
            'contributor': None,
            'collection_date': '01-01-2018',
            'language': 'English',
            'licence': 'MIT',
            'default_target_attribute': column_names[-1],
            'row_id_attribute': None,
            'ignore_attribute': None,
            'citation': 'None',
            'attributes': attributes,
            'data': data,
            'version_label': 'test',
            'original_data_url': project_url,
            'paper_url': project_url,
        }
        dataset = create_dataset(**creation_kwargs)

        published_id = dataset.publish()

        online_arff = _get_online_dataset_arff(published_id)
        self.assertEqual(
            online_arff,
            dataset._dataset,
            "Uploaded arff does not match original one",
        )

        online_format = _get_online_dataset_format(published_id)
        self.assertEqual(online_format, 'arff', "Wrong format for dataset")
コード例 #3
0
    def test_get_online_dataset_format(self):
        """Compare a dataset's ``format`` with the one reported online.

        Dataset 77 (Phoneme) is fetched with ``download_data=False``; its
        lower-cased ``format`` attribute must equal the value returned by
        ``_get_online_dataset_format`` for the same id.
        """
        phoneme_id = 77  # Phoneme dataset
        phoneme = openml.datasets.get_dataset(
            phoneme_id,
            download_data=False,
        )

        local_format = phoneme.format.lower()
        online_format = _get_online_dataset_format(phoneme_id)

        self.assertEqual(
            local_format,
            online_format,
            "The format of the ARFF files is different",
        )
コード例 #4
0
    def test_get_online_dataset_format(self):
        """Check that local and online dataset formats agree.

        Fetches dataset 77 (Phoneme) through ``openml.datasets.get_dataset``
        and asserts that its lower-cased ``format`` equals the result of
        ``_get_online_dataset_format`` for that id.
        """
        # Phoneme dataset
        did = 77
        fetched = openml.datasets.get_dataset(did)

        expected = fetched.format.lower()
        reported = _get_online_dataset_format(did)
        self.assertEqual(
            expected, reported, "The format of the ARFF files is different"
        )
コード例 #5
0
    def test_create_dataset_numpy(self):
        """Round-trip a NumPy-backed dataset through ``publish``.

        The array is built from four feature vectors and transposed, every
        column is typed ``'REAL'``, and the last column is the default
        target. The uploaded ARFF must equal ``dataset._dataset`` and the
        reported format must be ``'arff'``.
        """
        # Each inner list becomes one column after the transpose below.
        feature_vectors = [
            [1, 2, 3],
            [1.2, 2.5, 3.8],
            [2, 5, 8],
            [0, 1, 0],
        ]
        data = np.array(feature_vectors).T

        width = data.shape[1]
        names = ['col_{}'.format(position) for position in range(width)]
        attributes = list(zip(names, ['REAL'] * width))
        target_name = 'col_{}'.format(width - 1)

        docs_url = 'http://openml.github.io/openml-python'
        dataset = create_dataset(
            name='%s-NumPy_testing_dataset' % self._get_sentinel(),
            description='Synthetic dataset created from a NumPy array',
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            default_target_attribute=target_name,
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=attributes,
            data=data,
            version_label='test',
            original_data_url=docs_url,
            paper_url=docs_url,
        )

        new_id = dataset.publish()

        remote_arff = _get_online_dataset_arff(new_id)
        self.assertEqual(
            remote_arff,
            dataset._dataset,
            "Uploaded arff does not match original one",
        )

        remote_format = _get_online_dataset_format(new_id)
        self.assertEqual(remote_format, 'arff', "Wrong format for dataset")
コード例 #6
0
    def test_create_dataset_pandas(self):
        """Publish datasets built from pandas frames and verify the uploads.

        Three scenarios are exercised, each ending in ``publish()``:

        1. A dense ``DataFrame`` with ``attributes='auto'``.
        2. A ``SparseDataFrame`` with ``attributes='auto'``; the reported
           online format must be ``'sparse_arff'``.
        3. A single categorical column whose nominal values are overridden
           through an explicit ``attributes`` mapping; the override must
           show up in the downloaded ARFF header.

        In every scenario the ARFF returned by ``_get_online_dataset_arff``
        must equal ``dataset._dataset``.
        """
        arff_mismatch = "Uploaded ARFF does not match original one"
        project_url = 'http://openml.github.io/openml-python'

        # Meta-information shared by all three uploads.
        shared_metadata = dict(
            name='%s-pandas_testing_dataset' % self._get_sentinel(),
            creator='OpenML tester',
            contributor=None,
            collection_date='01-01-2018',
            language='English',
            licence='MIT',
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            version_label='test',
            original_data_url=project_url,
            paper_url=project_url,
        )

        # --- 1. dense DataFrame, attributes inferred ----------------------
        data = [
            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
        ]
        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                        'windy', 'play']
        df = pd.DataFrame(data, columns=column_names)
        # enforce the type of each column
        df['outlook'] = df['outlook'].astype('category')
        # ``astype('bool')`` would map both 'TRUE' and 'FALSE' to True because
        # any non-empty string is truthy; compare against the literal instead.
        df['windy'] = df['windy'] == 'TRUE'
        df['play'] = df['play'].astype('category')

        dataset = openml.datasets.functions.create_dataset(
            description='Synthetic dataset created from a Pandas DataFrame',
            default_target_attribute='play',
            attributes='auto',
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            dataset._dataset,
            arff_mismatch
        )

        # --- 2. SparseDataFrame -------------------------------------------
        # NOTE(review): pd.SparseDataFrame was removed in pandas 1.0. On newer
        # pandas this needs pd.DataFrame.sparse.from_spmatrix, provided the
        # create_dataset under test accepts such frames - confirm.
        sparse_data = scipy.sparse.coo_matrix((
            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
        ))
        column_names = ['input1', 'input2', 'y']
        df = pd.SparseDataFrame(sparse_data, columns=column_names)
        dataset = openml.datasets.functions.create_dataset(
            description=(
                'Synthetic dataset created from a Pandas SparseDataFrame'
            ),
            # The target has to be a column of *this* frame; 'play' only
            # exists in the dense frame above.
            default_target_attribute='y',
            attributes='auto',
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        self.assertEqual(
            _get_online_dataset_arff(upload_did),
            dataset._dataset,
            arff_mismatch
        )
        self.assertEqual(
            _get_online_dataset_format(upload_did),
            'sparse_arff',
            "Wrong format for dataset"
        )

        # --- 3. explicit attributes override the inferred ones ------------
        data = [['a'], ['b'], ['c'], ['d'], ['e']]
        column_names = ['rnd_str']
        df = pd.DataFrame(data, columns=column_names)
        df['rnd_str'] = df['rnd_str'].astype('category')
        # Declare more nominal values than actually occur in the data.
        attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
        dataset = openml.datasets.functions.create_dataset(
            description=(
                'Synthetic dataset created from a Pandas DataFrame with '
                'overwritten attributes'
            ),
            # 'rnd_str' is the only column of this frame.
            default_target_attribute='rnd_str',
            attributes=attributes,
            data=df,
            **shared_metadata
        )
        upload_did = dataset.publish()
        downloaded_data = _get_online_dataset_arff(upload_did)
        self.assertEqual(
            downloaded_data,
            dataset._dataset,
            arff_mismatch
        )
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(
            '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}', downloaded_data)
コード例 #7
0
    def test_create_dataset_sparse(self):
        """Publish the same sparse data in two representations.

        The data is supplied first as a ``scipy.sparse.coo_matrix`` and then
        as a list of ``{column_index: value}`` dicts. For each upload the
        ARFF returned by ``_get_online_dataset_arff`` must equal the local
        ``_dataset`` and the reported format must be ``'sparse_arff'``.
        """
        column_names = [
            (label, 'REAL') for label in ('input1', 'input2', 'y')
        ]

        # scipy.sparse.coo_matrix built from (values, (rows, cols)).
        values = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        row_indices = [0, 1, 1, 2, 2, 3, 3]
        col_indices = [0, 1, 2, 0, 2, 0, 1]
        as_coo_matrix = scipy.sparse.coo_matrix(
            (values, (row_indices, col_indices))
        )

        # list of dicts sparse representation: one dict per row.
        as_list_of_dicts = [
            {0: 0.0},
            {1: 1.0, 2: 1.0},
            {0: 1.0, 2: 1.0},
            {0: 1.0, 1: 1.0}
        ]

        for sparse_data in (as_coo_matrix, as_list_of_dicts):
            xor_dataset = create_dataset(
                name="%s-XOR" % self._get_sentinel(),
                description='Dataset representing the XOR operation',
                creator=None,
                contributor=None,
                collection_date=None,
                language='English',
                licence=None,
                default_target_attribute='y',
                row_id_attribute=None,
                ignore_attribute=None,
                citation=None,
                attributes=column_names,
                data=sparse_data,
                version_label='test',
            )

            published_id = xor_dataset.publish()

            online_arff = _get_online_dataset_arff(published_id)
            self.assertEqual(
                online_arff,
                xor_dataset._dataset,
                "Uploaded ARFF does not match original one",
            )

            online_format = _get_online_dataset_format(published_id)
            self.assertEqual(
                online_format,
                'sparse_arff',
                "Wrong format for dataset",
            )