Beispiel #1
0
    def test_get_dataset_properties(self):
        """Round-trip a TabularDataset through the backend and check that
        get_dataset_properties() yields a dict covering every requirement.
        """
        # Get data to train
        fit_dictionary = get_data_to_train()

        # Build a repository with random fitted models. create() raises
        # FileExistsError when the directories are left over from a previous
        # run; in that case skip the test instead of failing.
        try:
            backend = create(
                temporary_directory='/tmp/autoPyTorch_ensemble_test_tmp',
                output_directory='/tmp/autoPyTorch_ensemble_test_out',
                delete_tmp_folder_after_terminate=False)
        except FileExistsError:
            # BUG FIX: the original branch called
            # self.assertRaises(FileExistsError) with no callable (a no-op)
            # and returned unittest.skip(...), which never skips a running
            # test. self.skipTest() is the correct way to skip from inside
            # a test body. Any other exception now propagates as a failure.
            self.skipTest("Backend directories already exist")

        fit_dictionary['backend'] = backend

        # Create the directory structure
        backend._make_internals_directory()

        # Create a datamanager for this toy problem and persist it, then
        # reload it to exercise the backend's save/load round trip.
        datamanager = TabularDataset(
            X=fit_dictionary['X_train'],
            Y=fit_dictionary['y_train'],
            X_test=fit_dictionary['X_test'],
            Y_test=fit_dictionary['y_test'],
        )
        backend.save_datamanager(datamanager)

        datamanager = backend.load_datamanager()
        info = {
            'task_type': datamanager.task_type,
            'output_type': datamanager.output_type,
            'issparse': datamanager.issparse,
            'numerical_columns': datamanager.numerical_columns,
            'categorical_columns': datamanager.categorical_columns
        }
        dataset_requirements = get_dataset_requirements(info)

        dataset_properties = datamanager.get_dataset_properties(
            dataset_requirements)

        # Every declared requirement must be present and of a supported type.
        self.assertIsInstance(dataset_properties, dict)
        for dataset_requirement in dataset_requirements:
            self.assertIn(dataset_requirement.name, dataset_properties.keys())
            self.assertIsInstance(dataset_properties[dataset_requirement.name],
                                  dataset_requirement.supported_types)
Beispiel #2
0
def backend(request):
    """pytest fixture: build a fresh autoPyTorch backend with per-test
    temporary/output directories and register a finalizer that removes them.

    The directory names embed the module and test names so concurrent tests
    do not collide.
    """

    def _remove_dir_with_retries(path, attempts=10, delay=1.0):
        # rmtree can transiently fail (e.g. files still held open on
        # Windows/NFS); retry a bounded number of times. Gives up silently
        # after `attempts` tries, matching the original best-effort cleanup.
        for _ in range(attempts):
            if not os.path.exists(path):
                return
            try:
                shutil.rmtree(path)
                return
            except OSError:
                time.sleep(delay)

    test_dir = os.path.dirname(__file__)
    tmp = os.path.join(
        test_dir,
        '.tmp__%s__%s' % (request.module.__name__, request.node.name))
    output = os.path.join(
        test_dir,
        '.output__%s__%s' % (request.module.__name__, request.node.name))

    # Make sure the folders we wanna create do not already exist.
    # (Original duplicated this retry loop inline and shadowed the builtin
    # `dir`; both fixed by the helper above.)
    for stale_dir in (tmp, output):
        _remove_dir_with_retries(stale_dir)

    backend = create(
        tmp,
        output,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
    )

    def get_finalizer(tmp_dir, output_dir):
        # Capture the paths now; the finalizer runs after the test ends.
        def session_run_at_end():
            for leftover_dir in (tmp_dir, output_dir):
                _remove_dir_with_retries(leftover_dir)

        return session_run_at_end

    request.addfinalizer(get_finalizer(tmp, output))

    return backend
Beispiel #3
0
    def setUp(self):
        """Prepare a toy binary-classification task plus a clean backend.

        Populates self.X / self.y with a reproducible synthetic dataset,
        records its dataset properties, recreates the run directories,
        and persists a TabularDataset through the backend.
        """
        self.num_features = 4
        self.num_classes = 2

        # Synthetic, deterministic data (fixed random_state).
        self.X, self.y = make_classification(
            n_samples=200,
            n_features=self.num_features,
            n_informative=3,
            n_redundant=1,
            n_repeated=0,
            n_classes=self.num_classes,
            n_clusters_per_class=2,
            shuffle=True,
            random_state=0,
        )

        self.dataset_properties = {
            'task_type': 'tabular_classification',
            'output_type': 'binary',
            'numerical_columns': list(range(4)),
            'categorical_columns': [],
        }

        # Start every test from empty run directories.
        tmp_dir = '/tmp/autoPyTorch_ensemble_test_tmp'
        output_dir = '/tmp/autoPyTorch_ensemble_test_out'
        for run_dir in (tmp_dir, output_dir):
            if os.path.exists(run_dir):
                shutil.rmtree(run_dir)

        self.backend = create(
            temporary_directory=tmp_dir,
            output_directory=output_dir,
            delete_tmp_folder_after_terminate=False,
        )

        # Create the directory structure the backend expects.
        self.backend._make_internals_directory()

        # Persist a datamanager for this toy problem (train data doubles
        # as the test split here).
        datamanager = TabularDataset(
            X=self.X,
            Y=self.y,
            X_test=self.X,
            Y_test=self.y,
        )
        self.backend.save_datamanager(datamanager)
Beispiel #4
0
    'task_type': 'tabular_classification',
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'output_type': output_type,
}

# Save data via backend to fit the pipeline
# Wrap the pre-split arrays in a TabularDataset so the backend can persist
# them for later pipeline fitting.
datamanager = TabularDataset(
    X=X_train,
    Y=y_train,
    X_test=X_test,
    Y_test=y_test,
)

# NOTE(review): create() presumably fails if these directories already exist
# from a prior run — confirm and clean up beforehand if needed.
backend = create(
    temporary_directory='./tmp/autoPyTorch_tabular_classification_tmp',
    output_directory='./tmp/autoPyTorch_tabular_classification_out',
    delete_tmp_folder_after_terminate=False)
backend.save_datamanager(datamanager)

# Build the classification pipeline from the properties assembled above.
pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties)

# Create a fit dictionary
fit_dictionary = {
    'categorical_columns': categorical_columns,
    'numerical_columns': numerical_columns,
    'num_features': X.shape[1],
    'num_classes': len(np.unique(y)),
    'is_small_preprocess': True,
    'categories': categories,
    'X_train': X_train,
    'y_train': y_train,
Beispiel #5
0
        )

        score = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
        print(f"Fitted a pipeline {idx} with score = {score}")

    return


if __name__ == "__main__":

    # Get data to train
    fit_dictionary = get_data_to_train()

    # Build a repository with random fitted models.
    # NOTE(review): create() presumably raises if these directories already
    # exist from a previous run — confirm cleanup is handled elsewhere.
    backend = create(temporary_directory='./tmp/autoPyTorch_ensemble_test_tmp',
                     output_directory='./tmp/autoPyTorch_ensemble_test_out',
                     delete_tmp_folder_after_terminate=False)
    # Downstream fitting code reads the backend out of the fit dictionary.
    fit_dictionary['backend'] = backend

    # Create the directory structure the backend expects.
    backend._make_internals_directory()

    # Create a datamanager for this toy problem and persist it via the
    # backend so fitted models can be associated with the data later.
    datamanager = TabularDataset(
        X=fit_dictionary['X_train'],
        Y=fit_dictionary['y_train'],
        X_test=fit_dictionary['X_test'],
        Y_test=fit_dictionary['y_test'],
    )
    backend.save_datamanager(datamanager)