def calculate_metafeatures(task_id):
    """Compute the plain and encoded metafeatures of a task and merge them.

    The task is fetched with ``load_task(task_id)``. Its textual task type is
    mapped onto one of the task constants: ``'classification'`` with exactly
    two distinct values in ``y_train`` becomes ``BINARY_CLASSIFICATION``, any
    other ``'classification'`` becomes ``MULTICLASS_CLASSIFICATION`` and
    everything else becomes ``REGRESSION``.

    Returns the object produced by ``_calculate_metafeatures`` after its
    ``metafeature_values`` mapping has been updated in place with the
    ``metafeature_values`` of the ``_calculate_metafeatures_encoded`` result.
    """
    # The test split is part of the loaded tuple but is not used here.
    X_train, y_train, _X_test, _y_test, cat, task_type, dataset_name = \
        load_task(task_id)
    watch = StopWatch()

    if task_type != 'classification':
        task_type = REGRESSION
    elif len(np.unique(y_train)) == 2:
        task_type = BINARY_CLASSIFICATION
    else:
        task_type = MULTICLASS_CLASSIFICATION

    # Keyword arguments that both helpers receive unchanged; the same
    # stopwatch instance is handed to both calls.
    shared_kwargs = dict(
        x_train=X_train,
        y_train=y_train,
        data_feat_type=cat,
        basename=dataset_name,
        logger_=logger,
        watcher=watch,
    )

    # The two helpers name their task-type parameter differently
    # (``data_info_task`` vs. ``task``).
    merged = _calculate_metafeatures(data_info_task=task_type, **shared_kwargs)
    encoded = _calculate_metafeatures_encoded(task=task_type, **shared_kwargs)

    # On a key clash the encoded value replaces the plain one (dict.update).
    merged.metafeature_values.update(encoded.metafeature_values)
    return merged
def test_metalearning(self):
    # For every (dataset, task, metric) combination listed below, ask
    # suggest_via_metalearning for one suggestion and check that the first
    # returned string starts with the expected "--initial-challengers" text.

    # Shared building blocks of the expected strings.
    prefix = "--initial-challengers \" "
    weighting = prefix + "-balancing:strategy 'weighting' "
    weighting_proj_logit = weighting + "-classifier:__choice__ 'proj_logit'"
    mean_imputed_random_forest = (
        prefix
        + "-imputation:strategy 'mean' "
        + "-one_hot_encoding:minimum_fraction '0.01' "
        + "-one_hot_encoding:use_minimum_fraction 'True' "
        + "-preprocessor:__choice__ 'no_preprocessing' "
        + "-regressor:__choice__ 'random_forest'"
    )

    classification_dataset = 'digits'
    classification_expected = {
        "ACC_METRIC": weighting_proj_logit,
        "AUC_METRIC": weighting + "-classifier:__choice__ 'liblinear_svc'",
        "BAC_METRIC": weighting_proj_logit,
        "F1_METRIC": weighting_proj_logit,
        "PAC_METRIC": (prefix
                       + "-balancing:strategy 'none' "
                       + "-classifier:__choice__ 'random_forest'"),
    }

    regression_dataset = 'diabetes'
    regression_expected = {
        "A_METRIC": mean_imputed_random_forest,
        "R2_METRIC": mean_imputed_random_forest,
    }

    # Regression is exercised first, then multiclass classification.
    scenarios = (
        (regression_dataset, REGRESSION, regression_expected),
        (classification_dataset, MULTICLASS_CLASSIFICATION,
         classification_expected),
    )

    for dataset_name, task, initial_challengers in scenarios:
        for metric, expected_start in initial_challengers.items():
            search_space_info = {
                'metric': metric,
                'task': task,
                'is_sparse': False,
            }
            configuration_space = get_configuration_space(
                search_space_info,
                include_preprocessors=['no_preprocessing'])

            X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)

            # Every column is declared non-categorical.
            n_columns = X_train.shape[1]
            categorical = [False] * n_columns

            plain_metafeatures = _calculate_metafeatures(
                X_train, Y_train, categorical, dataset_name, task)
            encoded_metafeatures = _calculate_metafeatures_encoded(
                X_train, Y_train, categorical, dataset_name, task)

            suggestions = suggest_via_metalearning(
                plain_metafeatures,
                encoded_metafeatures,
                configuration_space,
                dataset_name,
                metric,
                task,
                False,
                1,
                None)
            first_suggestion = suggestions[0]

            print(metric)
            print(first_suggestion)
            self.assertTrue(first_suggestion.startswith(expected_start))