def test_predict_proba_hardclassifier(self):
        # task 1 (test server) is important, as it is a task with an unused class
        tasks = [1, 3, 115]

        for task_id in tasks:
            task = openml.tasks.get_task(task_id)
            clf1 = sklearn.pipeline.Pipeline(steps=[
                ('imputer', sklearn.preprocessing.Imputer()),
                ('estimator', GaussianNB()),
            ])
            clf2 = sklearn.pipeline.Pipeline(steps=[
                ('imputer', sklearn.preprocessing.Imputer()),
                ('estimator', HardNaiveBayes()),
            ])
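            # clf2's estimator presumably cannot emit soft probability
            # estimates (hence the "hard" classifier in the test name); the
            # run should then fall back to 0/1 confidences, and the
            # prediction and correct-label columns must still match those of
            # the soft GaussianNB pipeline.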

            arff_content1, arff_header1, _, _, _ = _run_task_get_arffcontent(
                clf1, task)
            arff_content2, arff_header2, _, _, _ = _run_task_get_arffcontent(
                clf2, task)
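            # both runs use the same task splits, so the returned rows line
            # up one-to-one and can be compared elementwise below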

            # verify the last two arff columns (prediction and correct label)
            # TODO: programmatically check whether these are indeed the
            # (prediction, correct) features
            predictions_a = np.array(arff_content1)[:, -2:]
            predictions_b = np.array(arff_content2)[:, -2:]

            np.testing.assert_array_equal(predictions_a, predictions_b)

    def test_run_on_dataset_with_missing_labels(self):
        # Check that _run_task_get_arffcontent works when one of the class
        # labels is only declared in the arff file, but is not present in
        # the actual data

        task = openml.tasks.get_task(2)
        class_labels = task.class_labels

        model = Pipeline(steps=[
            ('Imputer', Imputer(strategy='median')),
            ('Estimator', DecisionTreeClassifier()),
        ])

        data_content, _, _, _, _ = _run_task_get_arffcontent(model, task)
        # 2 folds, 5 repeats; keep in mind that this task comes from the
        # test server; the corresponding task on the live server is different
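        # (each instance appears in exactly one test fold per repeat, so
        # 4490 rows imply 4490 / 5 = 898 instances in the dataset)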
        self.assertEqual(len(data_content), 4490)
        for row in data_content:
            # repeat, fold, sample, row_id, 6 confidences, prediction and
            # correct label: 12 columns in total
            self.assertEqual(len(row), 12)