class MetaFeatures:
    """Extract pymfe meta-features from on-disk datasets and persist them.

    Supports datasets stored as ``.json`` or ``.arff`` files containing a
    ``class`` column with the target labels.
    """

    def __init__(self, datasets_dir="mock_datasets/"):
        self.mfe = MFE()
        self.le = preprocessing.LabelEncoder()
        # BUG FIX: previously this attribute was only assigned inside
        # apply(), so calling calculate() first raised AttributeError.
        self.datasets_dir = datasets_dir

    def calculate(self, dataset_filename):
        """Compute, clean and persist meta-features for one dataset file.

        Returns a ``(labels, results)`` tuple with NaN-valued features
        removed and complex values reduced to their real part.
        """
        # Reading dataset
        dataset = Dataset.get_or_insert(dataset_filename)
        if dataset.name.endswith("json"):
            data = pd.read_json(self.datasets_dir + dataset.name)
        elif dataset.name.endswith("arff"):
            data = arff_io.loadarff(self.datasets_dir + dataset.name)
            data = pd.DataFrame(data[0])
        else:
            # BUG FIX: fail loudly on unknown formats instead of raising a
            # confusing NameError on the undefined 'data' below.
            raise ValueError(
                "Unsupported dataset format: {}".format(dataset.name))
        # Getting target column
        target = data["class"].values
        # Separating data from labels
        values = data.drop("class", axis=1).values
        ft = self.metafeatures(values, target)
        # Meta-feature names (labels) and the calculated values (results)
        labels = np.array(ft[0])
        results = np.array(ft[1])
        # Drop meta-features whose value evaluated to NaN
        not_nan = np.invert(np.isnan(results))
        labels = labels[not_nan].tolist()
        results = results[not_nan].tolist()
        # Sometimes the result is a complex number; keep just the real part
        for indx, result in enumerate(results):
            if isinstance(result, complex):
                results[indx] = result.real
        Metadata(dataset=dataset.name, features=labels,
                 values=results).save()
        return (labels, results)

    def metafeatures(self, values, target):
        """Fit pymfe on ``values``/``target`` and return (names, values)."""
        # Label-encode non-numeric targets.
        # BUG FIX: np.object was deprecated and removed in NumPy 1.24;
        # the builtin 'object' is the supported spelling.
        if target.dtype == object:
            target = self.le.fit_transform(target)
        # Calculating metafeatures
        self.mfe.fit(values, target)
        try:
            ft = self.mfe.extract()
        except AttributeError:
            # Some extractors require a float feature matrix; retry.
            self.mfe.fit(values.astype(float), target)
            ft = self.mfe.extract()
        return ft

    def apply(self, datasets_fd="mock_datasets/"):
        """Calculate meta-features for every dataset in the directory."""
        self.datasets_dir = datasets_fd
        # Datasets are the json/arff files directly inside the directory
        self.datasets = [
            f for f in listdir(self.datasets_dir)
            if isfile(join(self.datasets_dir, f))
            and f.endswith(("json", "arff"))
        ]
        for dataset in self.datasets:
            self.calculate(dataset)
def _extract_meta_features(mfe, files, scores_data):
    """Extract unsupervised meta-features for each parquet file.

    Returns a list of dicts mapping meta-feature name -> value, plus the
    best classifier index ("best_clf") looked up from ``scores_data``.
    """
    rows = []
    for fname in tqdm(files):
        df = pd.read_parquet(fname)
        X = df.drop(columns=["class"]).values
        # First evaluate only unsupervised features, so the target column
        # is intentionally not passed to fit().
        mfe.fit(X)
        names, values = mfe.extract()
        ft = dict(zip(names, values))
        ft["best_clf"] = scores_data.loc[fname.name].argmax()
        rows.append(ft)
    return rows


def main():
    """Extract meta-features with pyMFE and evaluate MSE with LightGBM."""
    args = parse_args()
    wandb.init(project='DeepMetaLearning', name='classical', config=args)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    mfe = MFE(random_state=args.seed)
    scores_data = pd.read_csv("augment_data.csv", index_col="filename")

    print("Extracting meta-features for train files")
    train_path = pathlib.Path(args.data_path) / 'train'
    train_df = _extract_meta_features(
        mfe, list(train_path.glob('*.parquet')), scores_data)

    print("Extracting meta-features for validation files")
    valid_path = pathlib.Path(args.data_path) / 'valid'
    valid_df = _extract_meta_features(
        mfe, list(valid_path.glob('*.parquet')), scores_data)

    train_df = pd.DataFrame(train_df)
    valid_df = pd.DataFrame(valid_df)
    if args.save_mfe:
        train_df.to_csv("mfe.train.csv", index=False)
        # BUG FIX: the train frame was previously written to both files;
        # the test csv must come from the validation split.
        valid_df.to_csv("mfe.test.csv", index=False)

    # Train a gradient-boosted classifier to predict the best classifier
    # from the meta-features, then score it on the validation split.
    drop_columns = ["best_clf"]
    xtrain = train_df.drop(columns=drop_columns).values
    xtest = valid_df.drop(columns=drop_columns).values
    ytrain = train_df[drop_columns]
    ytrue = valid_df[drop_columns]
    lg = LGBMClassifier(random_state=args.seed, objective='multiclass')
    lg.fit(xtrain, ytrain)
    yhat = lg.predict(xtest)
    recall = metrics.recall_score(ytrue, yhat, average="micro")
    precis = metrics.precision_score(ytrue, yhat, average="micro")
    wandb.log({"recall": recall})
    wandb.log({"precision": precis})
def test_verbose(self, capsys):
    """Verbose extraction must print one message per expected step."""
    X, y = load_xy(0)
    feature_subset = ["freq_class", "mean", "class_conc", "one_nn", "nodes"]
    model = MFE(features=feature_subset).fit(X=X.values, y=y.values)
    model.extract(verbose=True)
    out = capsys.readouterr().out
    # Expected number of messages in verbose mode of mtf extraction
    assert out.count("\n") == 21
def extract_from_object(dataset: Union[np.ndarray, list],
                        mfe_params: dict = None) -> Sequence:
    """Extract meta-feature values from a raw (unlabeled) dataset.

    Falls back to the module-level default MFE parameters when
    ``mfe_params`` is ``None`` or empty. Returns only the extracted
    values (second element of pymfe's ``extract`` output).
    """
    # Idiomatic falsiness check covers both None and an empty dict.
    params = mfe_params if mfe_params else __default_mfe_params
    mfe = MFE(**params)
    mfe.fit(dataset, suppress_warnings=True)
    return mfe.extract(suppress_warnings=True)[1]
def test_extract_with_time_output_dictionary(self):
    """Timed extraction as dict yields names, values and times."""
    X, y = load_xy(2)
    extractor = MFE(groups="general", measure_time="total")
    extractor.fit(X.values, y.values)
    result = extractor.extract(out_type=dict)
    assert isinstance(result, dict)
    # Three entries: meta-feature names, values, and measured times.
    assert len(result) == 3
def test_ft_methods_model_based_02(self, dt_id, ft_name, exp_value,
                                   precompute):
    """Function to test each meta-feature belongs to model-based group."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    mfe = MFE(
        groups=[GNAME],
        features=[ft_name],
        hypparam_model_dt={
            "max_depth": 5,
            "min_samples_split": 10,
            "criterion": "entropy",
        },
        random_state=1234,
    )
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)
    if precomp_group is None:
        # Note: the precomputation of 'model-based' group is always
        # forced due to the need of the 'dt_model' value; keep only it.
        mfe._precomp_args_ft = {
            "dt_model": mfe._precomp_args_ft.get("dt_model")
        }
    extracted = mfe.extract()[1]
    if exp_value is np.nan:
        # NaN expectations are compared by identity, not equality.
        assert extracted[0] is exp_value
    else:
        assert np.allclose(extracted, exp_value)
def extract_mtf_by_group():
    """Extract each group separately and return values sorted by name."""
    names = []
    vals = []
    for cur_group in mtf_groups:
        cur_precomp_group = cur_group if precompute else None
        extractor = MFE(groups=cur_group, summary="mean", random_state=1234)
        extractor.fit(
            X.values,
            y.values if supervised else None,
            precomp_groups=cur_precomp_group,
        )
        cur_names, cur_vals = extractor.extract()
        names.extend(cur_names)
        vals.extend(cur_vals)
    # Order values by meta-feature name for a stable comparison order.
    _, ordered_vals = zip(
        *sorted(zip(names, vals), key=lambda item: item[0])
    )
    return ordered_vals
def test_none_cancor(self):
    """Empty canonical correlations must produce all-NaN features."""
    X, y = load_xy(0)
    feats = ["w_lambda", "p_trace", "lh_trace", "roy_root"]
    mfe = MFE(groups=[GNAME], features=feats)
    # Force empty canonical-correlation precomputations for every feature.
    custom_args = {
        "can_cors": np.array([]),
        "can_cor_eigvals": np.array([]),
    }
    mfe.fit(X.values, y.values, precomp_groups=None)
    kwargs = {feat: custom_args for feat in feats}
    vals = mfe.extract(**kwargs, suppress_warnings=True)[1]
    expected = np.full(shape=len(vals), fill_value=np.nan)
    assert np.allclose(vals, expected, equal_nan=True)
def test_extract_output_pandas_dataframe(self):
    """DataFrame output holds one row of values, one column per name."""
    X, y = load_xy(2)
    extractor = MFE(groups="general").fit(X.values, y.values)
    expected_mtfs = extractor.extract_metafeature_names()
    res = extractor.extract(out_type=pd.DataFrame)
    assert isinstance(res, pd.DataFrame)
    assert res.values.shape == (1, len(expected_mtfs))
    assert np.array_equal(res.columns, expected_mtfs)
def test_extract_with_time_output_pandas_dataframe_unsupervised(self):
    """Timed unsupervised DataFrame output has a values row and a times row."""
    X, _ = load_xy(2)
    extractor = MFE(measure_time="total", groups="general").fit(X.values)
    expected_mtfs = extractor.extract_metafeature_names()
    res = extractor.extract(out_type=pd.DataFrame)
    assert isinstance(res, pd.DataFrame)
    # Two rows (values + times), one column per meta-feature name.
    assert res.values.shape == (2, len(expected_mtfs))
    assert np.array_equal(res.columns, expected_mtfs)
def test_silhouette_subsampling(self, precompute):
    """Silhouette on a 50% subsample must be reproducible with a fixed seed."""
    X, y = load_xy(0)
    precomp_group = GNAME if precompute else None
    extractor = MFE(groups="clustering", features="sil", random_state=1234)
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    # Subsample half of the instances when computing the silhouette.
    result = extractor.extract(sil={"sample_frac": 0.5})[1]
    assert np.allclose(result, -0.07137712254830314)
def test_integration_general(self, dt_id, exp_value, precompute):
    """Integration test covering the whole general group at once."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], summary="mean")
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    assert np.allclose(result, exp_value, equal_nan=True)
def _get_feats(cls):
    """Return iris meta-feature names with '.' replaced by '_'."""
    from sklearn.datasets import load_iris
    from pymfe.mfe import MFE

    data = load_iris()
    extractor = MFE()
    extractor.fit(data.data, data.target)
    names, _ = extractor.extract()
    return [name.replace(".", "_") for name in names]
def test_threshold_attr_conc(self):
    """attr_conc must respect the max_attr_num threshold argument."""
    X, y = load_xy(1)
    extractor = MFE(features="attr_conc", random_state=1234)
    extractor.fit(X.values, y.values, precomp_groups=False)
    # Limit the number of attributes used in the concentration measure.
    result = extractor.extract(attr_conc={"max_attr_num": 25})[1]
    assert np.allclose(result, [0.01682327, 0.04715381], rtol=0.2)
def test_integration_infotheo(self, dt_id, exp_value, precompute):
    """Function to test all info-theory meta-features."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    mfe = MFE(groups=[GNAME], summary="mean").fit(
        X.values, y.values, precomp_groups=precomp_group
    )
    value = mfe.extract()[1]
    # BUG FIX: the np.allclose result was previously discarded, so this
    # test could never fail; the comparison must be asserted.
    assert np.allclose(value, exp_value, atol=0.001, rtol=0.05,
                       equal_nan=True)
def transform(self, X, y):
    """Extract 'general' meta-features summarized by five statistics.

    Accepts numpy arrays or pandas objects; returns a 1-D float array of
    meta-feature values with NaNs replaced by zero.
    """
    if isinstance(X, pd.DataFrame):
        # NOTE(review): int8 casting assumes small integer features —
        # TODO confirm against the callers' data.
        X = X.to_numpy(dtype='int8')
    if isinstance(y, pd.Series):
        y = y.to_numpy(dtype='int32')
    mfe = MFE(groups=["general"],
              summary=['kurtosis', 'min', 'max', 'median', 'skewness'])
    mfe.fit(X, y)
    ft = mfe.extract()[1]
    # BUG FIX: np.nan_to_num(arr, 0) passed 0 to the positional *copy*
    # parameter, not the NaN replacement value (which only worked by
    # coincidence since nan defaults to 0.0). Be explicit about both.
    return np.nan_to_num(np.array(ft), copy=True, nan=0.0)
def test_integration_clustering(self, dt_id, exp_value, precompute):
    """Function to test each all clustering meta-features."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], summary="mean")
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    assert np.allclose(result, exp_value, equal_nan=True)
def test_integration_model_based(self, dt_id, exp_value, precompute):
    """Function to test all model-based meta-features."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], summary="mean", random_state=1234).fit(
        X.values, y.values, precomp_groups=precomp_group
    )
    assert np.allclose(extractor.extract()[1], exp_value, equal_nan=True)
def extract_all_mtf():
    """Extract every requested group in a single MFE pass."""
    extractor = MFE(groups=mtf_groups, summary="mean", random_state=1234)
    extractor.fit(
        X.values,
        y.values if supervised else None,
        precomp_groups=precomp_group,
    )
    return extractor.extract()[1]
def test_integration_complexity(self, dt_id, exp_value, precompute):
    """Function to test each meta-feature belongs to complexity group."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], summary="mean", random_state=1234)
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    assert np.allclose(result, exp_value, equal_nan=True, rtol=0.025)
def meta_features(X, y, groups=None, suppress=True):
    '''
    Extracts and returns the meta-features from a dataset using the Pymfe
    package.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target
        column.
    y: pd.Series
        Contains the series of the target of a given dataset.
    groups: list
        Contains the names of the meta-feature groups as available in the
        Pymfe package (pymfe.readthedocs.io).

    Returns:
    --------
    list
        Contains a list of lists where one list denotes the meta-feature
        names and the other denoted the meta-feature values respective to
        the names.
    '''
    # Accept pandas objects as well as plain arrays/lists. The explicit
    # attribute check replaces bare 'except:' clauses that silently
    # swallowed every exception, including typos and KeyboardInterrupt.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()
    # Single construction path (the two branches previously duplicated
    # the fit/extract code); 'is None' instead of '== None'.
    if groups is None:
        mfe = MFE(suppress_warnings=suppress)
    else:
        mfe = MFE(groups=groups, suppress_warnings=suppress)
    mfe.fit(X, y)
    return mfe.extract()
def test_extract_from_model(self):
    """Model-extracted meta-features must match a regular extraction."""
    X, y = utils.load_xy(2)
    model = sklearn.tree.DecisionTreeClassifier(random_state=1234).fit(
        X.values, y.values)
    # Meta-features taken directly from the fitted model...
    mtf_name, mtf_vals = MFE(random_state=1234).extract_from_model(model)
    # ...must agree with a model-based extraction on the same data.
    extractor = MFE(groups="model-based", random_state=1234)
    extractor.fit(X=X.values, y=y.values, transform_num=False)
    mtf_name2, mtf_vals2 = extractor.extract()
    assert np.all(mtf_name == mtf_name2)
    assert np.allclose(mtf_vals, mtf_vals2)
def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
    """Function to test each meta-feature belongs to general group."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], features=[ft_name])
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    if exp_value is np.nan:
        # NaN expectations are compared by identity, not equality.
        assert result[0] is exp_value
    else:
        assert np.allclose(result, exp_value)
def test_roy_largest_root(self, dt_id, exp_value, precompute, criterion):
    """Roy's largest root must match for each selection criterion."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], features="roy_root")
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract(roy_root={"criterion": criterion})[1]
    assert np.allclose(result, exp_value, atol=0.001, rtol=0.05,
                       equal_nan=True)
def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
    """Function to test each meta-feature belongs to itemset group."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], features=[ft_name], random_state=1234)
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    if exp_value is np.nan:
        # NaN expectations are compared by identity, not equality.
        assert result[0] is exp_value
    else:
        assert np.allclose(result, exp_value, equal_nan=True)
def test_integration_statistical(self, dt_id, exp_value, precompute):
    """Function to test all statistical meta-features simultaneously."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], summary="mean")
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    assert np.allclose(result, exp_value, atol=0.001, rtol=0.05,
                       equal_nan=True)
def test_ft_method_relative(self, dt_id, summary, precompute, sample_size,
                            exp_value):
    """Test relative and subsampling relative landmarking."""
    precomp_group = "relative" if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=["relative"], summary=summary,
                    sample_size=sample_size, random_state=1234)
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    _, vals = extractor.extract()
    assert np.allclose(vals, exp_value)
def test_ft_methods_statistical(self, dt_id, ft_name, exp_value,
                                precompute):
    """Function to test each meta-feature belongs to statistical group."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], features=[ft_name])
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    result = extractor.extract()[1]
    assert np.allclose(result, exp_value, atol=0.001, rtol=0.05,
                       equal_nan=True)
def test_normality_tests(self, dt_id, exp_value, precompute, test, failure):
    """Test normality tests included in ``nr_norm`` statistical method."""
    precomp_group = GNAME if precompute else None
    X, y = load_xy(dt_id)
    extractor = MFE(groups=[GNAME], features="nr_norm")
    extractor.fit(X.values, y.values, precomp_groups=precomp_group)
    # Select the normality test and its failure-handling strategy.
    result = extractor.extract(nr_norm={"failure": failure,
                                        "method": test})[1]
    assert np.allclose(result, exp_value, atol=0.001, rtol=0.05,
                       equal_nan=True)
def test_parse_valid_metafeatures(self, groups):
    """Check the length of valid metafeatures per group."""
    X, y = utils.load_xy(0)
    extractor = MFE(groups="all", summary=None, lm_sample_frac=0.5,
                    random_state=1234)
    extractor.fit(X.values, y.values)
    res = extractor.extract()
    target_mtf = extractor.valid_metafeatures(groups=groups)
    names, _ = extractor.parse_by_group(groups, res)
    # Parsed names must exactly match the valid set for the group.
    assert not set(names).symmetric_difference(target_mtf)