def test_compute_effects_on_compute(self):
    """
    Tests whether computing metafeatures has any side effects on the
    instance Metafeatures object. Fails if there are any side effects.
    """
    required_checks = []
    test_failures = {}
    test_name = inspect.stack()[0][3]
    for dataset_filename, dataset in self.datasets.items():
        metafeatures_instance = Metafeatures()
        # First run: prime any internal state on the instance.
        metafeatures_instance.compute(
            X=dataset["X"], Y=dataset["Y"], seed=CORRECTNESS_SEED,
            column_types=dataset["column_types"])
        # Second run: results must still match the known values, proving
        # the first run left no state behind that changes the output.
        computed_mfs = metafeatures_instance.compute(
            X=dataset["X"], Y=dataset["Y"], seed=CORRECTNESS_SEED,
            column_types=dataset["column_types"])
        known_mfs = dataset["known_metafeatures"]
        required_checks.append(
            (self._check_correctness,
             [computed_mfs, known_mfs, dataset_filename]))
        test_failures.update(self._perform_checks(required_checks))
    self._report_test_failures(test_failures, test_name)
def test_n_folds_with_small_dataset(self):
    # Should raise an error with a small (few instances) dataset,
    # unless landmarking metafeatures are not being computed.
    X_small = pd.DataFrame(np.random.rand(3, 7))
    Y_small = pd.Series([0, 1, 0], name="target").astype("str")
    metafeatures = Metafeatures()
    with self.assertRaises(ValueError) as cm:
        metafeatures.compute(X_small, Y_small, n_folds=2)
    self.assertEqual(
        str(cm.exception),
        "The minimum number of instances in each class of Y is n_folds=2."
        + " Class 1 has 1.")
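# The error above stems from cross-validation for the landmarking
# metafeatures, per the comment in the test. A minimal sketch of the
# workaround, assuming consts.MetafeatureGroup.LANDMARKING exists
# alongside the TARGET_DEPENDENT group used elsewhere in this suite:
def test_n_folds_with_small_dataset_no_landmarkers(self):
    # Same 3-instance dataset as above.
    X_small = pd.DataFrame(np.random.rand(3, 7))
    Y_small = pd.Series([0, 1, 0], name="target").astype("str")
    landmarking_mfs = Metafeatures.list_metafeatures(
        consts.MetafeatureGroup.LANDMARKING.value)
    # With landmarking metafeatures excluded, n_folds never applies, so
    # this should complete without raising ValueError.
    Metafeatures().compute(X_small, Y_small, exclude=landmarking_mfs)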
def test_numeric_targets(self):
    """ Test Metafeatures().compute() with numeric targets """
    test_failures = {}
    test_name = inspect.stack()[0][3]
    for dataset_filename, dataset in self.datasets.items():
        metafeatures = Metafeatures()
        column_types = dataset["column_types"].copy()
        column_types[dataset["Y"].name] = consts.NUMERIC
        # Replace the target with random numeric values so every
        # target-dependent metafeature should report NUMERIC_TARGETS.
        computed_mfs = metafeatures.compute(
            X=dataset["X"],
            Y=pd.Series(np.random.rand(dataset["Y"].shape[0]),
                        name=dataset["Y"].name),
            seed=CORRECTNESS_SEED, column_types=column_types)
        known_mfs = dataset["known_metafeatures"]
        target_dependent_metafeatures = Metafeatures.list_metafeatures(
            consts.MetafeatureGroup.TARGET_DEPENDENT.value)
        for mf_name in target_dependent_metafeatures:
            known_mfs[mf_name] = {
                consts.VALUE_KEY: consts.NUMERIC_TARGETS,
                consts.COMPUTE_TIME_KEY: 0.
            }
        required_checks = [
            (self._check_correctness,
             [computed_mfs, known_mfs, dataset_filename]),
            (self._check_compare_metafeature_lists,
             [computed_mfs, known_mfs, dataset_filename])
        ]
        test_failures.update(self._perform_checks(required_checks))
    self._report_test_failures(test_failures, test_name)
def test_soft_timeout(self):
    """ Tests Metafeatures().compute() with timeout set """
    test_name = inspect.stack()[0][3]
    test_failures = {}
    for dataset_filename, dataset in self.datasets.items():
        metafeatures = Metafeatures()
        # Time a full, unconstrained computation first ...
        start_time = time.time()
        metafeatures.compute(
            X=dataset["X"], Y=dataset["Y"], seed=CORRECTNESS_SEED,
            column_types=dataset["column_types"])
        full_compute_time = time.time() - start_time
        # ... then recompute with a timeout of half that duration.
        start_time = time.time()
        computed_mfs = metafeatures.compute(
            X=dataset["X"], Y=dataset["Y"], seed=CORRECTNESS_SEED,
            column_types=dataset["column_types"],
            timeout=full_compute_time / 2)
        limited_compute_time = time.time() - start_time
        self.assertGreater(
            full_compute_time, limited_compute_time,
            f"Compute metafeatures exceeded timeout on '{dataset_filename}'")
        # Metafeatures that timed out hold the TIMEOUT sentinel; check
        # correctness only on the ones that actually finished.
        computed_mfs_timeout = {
            k: v for k, v in computed_mfs.items()
            if v[consts.VALUE_KEY] != consts.TIMEOUT
        }
        known_mfs = dataset["known_metafeatures"]
        required_checks = [
            (self._check_correctness,
             [computed_mfs_timeout, known_mfs, dataset_filename]),
            (self._check_compare_metafeature_lists,
             [computed_mfs, known_mfs, dataset_filename])
        ]
        test_failures.update(self._perform_checks(required_checks))
    self._report_test_failures(test_failures, test_name)
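# Standalone sketch of the soft-timeout contract exercised above:
# compute() still returns an entry for every requested metafeature, and
# entries that did not finish in time hold the consts.TIMEOUT sentinel.
# `X`, `Y`, and the function name are illustrative placeholders, not
# part of the library:
def compute_with_soft_timeout(X, Y, seconds=1.0):
    mfs = Metafeatures().compute(X=X, Y=Y, timeout=seconds)
    finished = {k: v for k, v in mfs.items()
                if v[consts.VALUE_KEY] != consts.TIMEOUT}
    print(f"{len(finished)}/{len(mfs)} metafeatures finished in time")
    return finished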
def test_sampling_shape_correctness(self):
    sample_shape = (7, 13)
    metafeatures = Metafeatures()
    metafeatures.compute(self.dummy_features, self.dummy_target,
                         sample_shape=sample_shape)
    X_sample = metafeatures._resources["XSample"]["value"]
    self.assertEqual(
        X_sample.shape, sample_shape,
        f"Sampling produced incorrect shape {X_sample.shape}; should have"
        f" been {sample_shape}.")
def run_metafeature_benchmark(benchmark_name, iters=100):
    """
    Computes metafeatures `iters` times over the test datasets and stores
    comparable information in ./<benchmark_name>.json.
    """
    with open(METADATA_PATH, "r") as f:
        dataset_descriptions = json.load(f)
    benchmark_data = {}
    for dataset_metadata in dataset_descriptions:
        print(dataset_metadata["filename"])
        X, Y, column_types = read_dataset(dataset_metadata)
        init_times = []
        total_compute_times = []
        metafeature_compute_times = {mf_id: [] for mf_id in Metafeatures.IDS}
        for i in range(iters):
            print(f"iter {i}")
            # Time instantiation and computation separately.
            start_timestamp = time.time()
            mf = Metafeatures()
            init_timestamp = time.time()
            computed_mfs = mf.compute(X=X, Y=Y, column_types=column_types,
                                      seed=CORRECTNESS_SEED)
            compute_timestamp = time.time()
            init_times.append(init_timestamp - start_timestamp)
            total_compute_times.append(compute_timestamp - init_timestamp)
            # Also record the per-metafeature compute times reported by
            # the library itself.
            for mf_id, result in computed_mfs.items():
                metafeature_compute_times[mf_id].append(
                    result[consts.COMPUTE_TIME_KEY])
        benchmark_data[dataset_metadata["filename"]] = {
            "init_time": {
                "mean": np.mean(init_times),
                "std_dev": np.std(init_times)
            },
            "total_compute_time": {
                "mean": np.mean(total_compute_times),
                "std_dev": np.std(total_compute_times)
            },
            "metafeature_compute_time": {
                mf_id: {
                    "mean": np.mean(mf_times),
                    "std_dev": np.std(mf_times)
                } for mf_id, mf_times in metafeature_compute_times.items()
            }
        }
    write_benchmark_data(benchmark_name, benchmark_data)
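# write_benchmark_data is called above but not defined in this section.
# A minimal sketch consistent with the docstring ("stores comparable
# information in ./<benchmark_name>.json"); the project's actual helper
# may differ:
def write_benchmark_data(benchmark_name, benchmark_data):
    # np.mean/np.std return np.float64, a float subclass, so the nested
    # dict serializes directly.
    with open(f"./{benchmark_name}.json", "w") as f:
        json.dump(benchmark_data, f, indent=4)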
def get_list_metafeatures(list_X, list_y, type_metafeatures):
    metafeatures = Metafeatures()
    list_dataset_metafeatures = []
    for X, y in tqdm(zip(list_X, list_y), total=7084):
        mfs = metafeatures.compute(
            pd.DataFrame(X),
            Y=pd.Series(y, dtype="category"),
            metafeature_ids=metafeatures.list_metafeatures(
                group=type_metafeatures),
            exclude=None,
            seed=0,
            # verbose=True,
            timeout=60,
            # return_times=True,
        )
        list_dataset_metafeatures.append(
            pd.DataFrame(mfs).reset_index(drop=True))
    df_metafeatures = pd.concat(list_dataset_metafeatures).fillna(0)
    # list_files is a module-level global that supplies one identifier
    # per row of the concatenated frame.
    df_metafeatures["index"] = list_files
    df_metafeatures.set_index("index", inplace=True)
    return df_metafeatures
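# Illustrative call, assuming list_X and list_y hold one feature matrix
# and one label vector per dataset, and that the module-level global
# list_files (read inside the function) holds one identifier per row of
# the resulting frame. All names below are placeholders:
#
#   list_files = ["dataset_0.arff", "dataset_1.arff", ...]
#   df_landmarking = get_list_metafeatures(list_X, list_y, "landmarking")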
import pandas as pd
from metalearn import Metafeatures

base = pd.read_csv('kddcup99.csv')

print("Dataset information")
print("Number of rows and columns: ", base.shape)
print("Index description: ", base.index)
print("Columns present: ", base.columns)
print("Non-null values per column: ", base.count())

# Split the features from the target column.
X = base.drop('label', axis=1)
Y = base['label']

metafeatures = Metafeatures()
mfs = metafeatures.compute(X, Y)
print(mfs)

# Write the computed metafeatures out instead of leaving the file empty.
with open('metafeatures_output.txt', 'w') as metafeatures_output:
    metafeatures_output.write(str(mfs))