def test_none_cancor(self):
    """Check that empty canonical correlations produce NaN for every feature."""
    X, y = load_xy(0)

    feats = ["w_lambda", "p_trace", "lh_trace", "roy_root"]

    mfe = MFE(groups=[GNAME], features=feats)

    # Empty arrays stand in for "no canonical correlations available".
    empty_cancor_args = {
        "can_cors": np.array([]),
        "can_cor_eigvals": np.array([]),
    }

    mfe.fit(X.values, y.values, precomp_groups=None)

    # Every requested feature receives the very same custom-argument dict.
    extract_args = dict.fromkeys(feats, empty_cancor_args)
    extracted = mfe.extract(**extract_args, suppress_warnings=True)
    vals = extracted[1]

    all_nan = np.full(len(vals), np.nan)
    assert np.allclose(vals, all_nan, equal_nan=True)
def test_ft_methods_model_based_02(self, dt_id, ft_name, exp_value, precompute):
    """Check one meta-feature of the GNAME group with custom tree hyperparameters."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = GNAME

    tree_hyperparams = {
        "max_depth": 5,
        "min_samples_split": 10,
        "criterion": "entropy",
    }
    mfe = MFE(
        groups=[GNAME],
        features=[ft_name],
        hypparam_model_dt=tree_hyperparams,
        random_state=1234,
    )
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    if precomp_group is None:
        # Note: the precomputation of 'model-based' group is always
        # forced due to the need of the 'dt_model' value, so keep only
        # that entry and discard every other precomputed argument.
        dt_model = mfe._precomp_args_ft.get("dt_model")
        mfe._precomp_args_ft = {"dt_model": dt_model}

    value = mfe.extract()[1]

    if exp_value is not np.nan:
        assert np.allclose(value, exp_value)
    else:
        assert value[0] is exp_value
class MetaFeatures:
    """Compute pymfe meta-features for dataset files and store them as Metadata.

    ``apply`` walks a directory of ``.json``/``.arff`` files and calls
    ``calculate`` on each one; ``metafeatures`` does the actual extraction.
    """

    def __init__(self):
        self.mfe = MFE()
        self.le = preprocessing.LabelEncoder()
        # Same default as ``apply`` so that ``calculate`` can be called
        # directly without hitting an unset attribute.
        self.datasets_dir = "mock_datasets/"

    def calculate(self, dataset_filename):
        """Extract and persist the meta-features of one dataset file.

        Parameters
        ----------
        dataset_filename : str
            Name handed to ``Dataset.get_or_insert``; the resulting
            ``dataset.name`` is resolved relative to ``self.datasets_dir``.

        Returns
        -------
        tuple
            ``(labels, results)``: meta-feature names and values, with
            NaN-valued entries removed.

        Raises
        ------
        ValueError
            If the file name ends with neither ``json`` nor ``arff``.
        """
        # Reading dataset
        dataset = Dataset.get_or_insert(dataset_filename)
        dataset_path = join(self.datasets_dir, dataset.name)
        if dataset.name.endswith("json"):
            data = pd.read_json(dataset_path)
        elif dataset.name.endswith("arff"):
            data = pd.DataFrame(arff_io.loadarff(dataset_path)[0])
        else:
            # Previously this fell through and failed later on an unbound name.
            raise ValueError(
                "Unsupported dataset format: {!r}".format(dataset.name))

        # The target column is expected to be named "class"
        target = data["class"].values
        # Separating data from labels
        values = data.drop("class", axis=1).values

        ft = self.metafeatures(values, target)
        # Meta-feature names (labels) and the calculated values (results)
        labels = np.array(ft[0])
        results = np.array(ft[1])

        # Drop every meta-feature whose value is NaN
        not_nan = np.invert(np.isnan(results))
        labels = labels[not_nan].tolist()
        results = results[not_nan].tolist()

        # Sometimes the result is a complex number, use just the real part
        results = [
            result.real if isinstance(result, complex) else result
            for result in results
        ]

        Metadata(dataset=dataset.name, features=labels, values=results).save()
        return (labels, results)

    def metafeatures(self, values, target):
        """Fit the extractor on ``values``/``target`` and return its output.

        An object-dtype target is label-encoded first. If extraction raises
        ``AttributeError``, the extractor is refitted on ``values`` cast to
        float and extraction is retried once.
        """
        # Dealing with object columns (non numeric). ``np.object`` was only
        # an alias of the builtin and no longer exists in current NumPy.
        if target.dtype == object:
            self.le.fit(target)
            target = self.le.transform(target)

        # Calculating metafeatures
        self.mfe.fit(values, target)
        try:
            ft = self.mfe.extract()
        except AttributeError:
            self.mfe.fit(values.astype(float), target)
            ft = self.mfe.extract()
        return ft

    def apply(self, datasets_fd="mock_datasets/"):
        """Run ``calculate`` on every ``.json``/``.arff`` file in ``datasets_fd``."""
        self.datasets_dir = datasets_fd
        # Getting list of dataset files inside the directory
        self.datasets = [
            f for f in listdir(self.datasets_dir)
            if isfile(join(self.datasets_dir, f))
            and f.endswith(("json", "arff"))
        ]
        for dataset in self.datasets:
            self.calculate(dataset)
def extract_from_object(dataset: Union[np.ndarray, list], mfe_params: dict = None) -> Sequence:
    """Return the meta-feature values extracted from ``dataset``.

    When ``mfe_params`` is ``None`` or empty, the module-level default
    parameters are used to build the extractor. Warnings are suppressed
    during both fitting and extraction.
    """
    if not mfe_params:
        mfe_params = __default_mfe_params

    extractor = MFE(**mfe_params)
    extractor.fit(dataset, suppress_warnings=True)

    extracted = extractor.extract(suppress_warnings=True)
    # Index 1 holds the values; index 0 holds the names.
    return extracted[1]
def test_one_hot_encoding_02(self):
    """Full one-hot encoding must yield one column per distinct attribute value."""
    X, y = utils.load_xy(1)

    mfe = MFE()
    mfe.fit(X.values, y.values, transform_cat="one-hot-full")
    n_encoded_columns = mfe._custom_args_ft["N"].shape[1]

    # Total number of distinct values, summed over the original attributes.
    exp_value = sum(np.unique(attr).size for attr in X.values.T)

    assert n_encoded_columns == exp_value
def test_one_hot_encoding_03(self):
    """For dataset 2, 'one-hot' must leave the number of columns unchanged."""
    X, y = utils.load_xy(2)

    mfe = MFE()
    mfe.fit(X.values, y.values, transform_cat="one-hot")
    n_encoded_columns = mfe._custom_args_ft["N"].shape[1]

    _, exp_value = X.values.shape

    assert n_encoded_columns == exp_value
def _extract_split_features(mfe, split_dir, scores_data):
    """Return a DataFrame with one row of meta-features per parquet file in ``split_dir``."""
    rows = []
    for fname in tqdm(list(split_dir.glob('*.parquet'))):
        df = pd.read_parquet(fname)
        # Only unsupervised features are evaluated for now, so the "class"
        # column is dropped and no target is passed to ``fit``.
        X = df.drop(columns=["class"]).values
        mfe.fit(X)
        ft = dict(zip(*mfe.extract()))
        # Label: position of the highest score in this file's row of scores.
        ft["best_clf"] = scores_data.loc[fname.name].argmax()
        rows.append(ft)
    return pd.DataFrame(rows)


def main():
    """Extract meta-features with pyMFE and evaluate a LightGBM classifier.

    Meta-features are computed for every ``*.parquet`` file under the
    ``train`` and ``valid`` sub-directories of ``args.data_path``. A
    LightGBM classifier is fitted on the train split to predict
    ``best_clf``; micro-averaged recall and precision on the validation
    split are logged to wandb. With ``args.save_mfe`` set, both feature
    tables are also written to CSV.
    """
    args = parse_args()
    wandb.init(project='DeepMetaLearning', name='classical', config=args)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    mfe = MFE(random_state=args.seed)
    data_path = pathlib.Path(args.data_path)
    scores_data = pd.read_csv("augment_data.csv", index_col="filename")

    print("Extracting meta-features for train files")
    train_df = _extract_split_features(mfe, data_path / 'train', scores_data)

    print("Extracting meta-features for validation files")
    valid_df = _extract_split_features(mfe, data_path / 'valid', scores_data)

    if args.save_mfe:
        train_df.to_csv("mfe.train.csv", index=False)
        # The validation features go to the test file; the train frame
        # used to be written here a second time by mistake.
        valid_df.to_csv("mfe.test.csv", index=False)

    drop_columns = ["best_clf"]
    xtrain = train_df.drop(columns=drop_columns).values
    xtest = valid_df.drop(columns=drop_columns).values
    ytrain = train_df[drop_columns]
    ytrue = valid_df[drop_columns]

    lg = LGBMClassifier(random_state=args.seed, objective='multiclass')
    lg.fit(xtrain, ytrain)
    yhat = lg.predict(xtest)

    recall = metrics.recall_score(ytrue, yhat, average="micro")
    precis = metrics.precision_score(ytrue, yhat, average="micro")
    wandb.log({"recall": recall})
    wandb.log({"precision": precis})
def _get_feats(cls):
    """Return pymfe's default meta-feature names on iris, with '.' replaced by '_'."""
    from sklearn.datasets import load_iris
    from pymfe.mfe import MFE

    iris = load_iris()

    extractor = MFE()
    extractor.fit(iris.data, iris.target)
    extracted = extractor.extract()

    # Index 0 of the extraction result holds the meta-feature names.
    return [name.replace(".", "_") for name in extracted[0]]
def test_one_hot_encoding_04(self):
    """'one-hot' must raise ValueError once a constant string column is appended."""
    X, y = utils.load_xy(2)
    mfe = MFE()

    # Extra attribute holding the same string value in every row.
    constant_column = np.ones((y.size, 1), dtype=str)
    X = np.hstack((X.values, constant_column))
    y = y.values

    with pytest.raises(ValueError):
        mfe.fit(X=X, y=y, transform_cat="one-hot")
def transform(self, X, y):
    """Return the summarized 'general' meta-features of ``(X, y)`` as an array.

    A ``DataFrame`` ``X`` is converted to an ``int8`` array and a ``Series``
    ``y`` to an ``int32`` array before fitting; other inputs are passed to
    the extractor unchanged. NaN values in the result are replaced by 0
    (``np.nan_to_num`` also maps infinities to large finite numbers).
    """
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy(dtype='int8')
    if isinstance(y, pd.Series):
        y = y.to_numpy(dtype='int32')

    mfe = MFE(groups=["general"],
              summary=['kurtosis', 'min', 'max', 'median', 'skewness'])
    mfe.fit(X, y)
    ft = mfe.extract()[1]

    # The second positional parameter of ``nan_to_num`` is ``copy``, not the
    # fill value, so the replacement for NaN is spelled out by keyword.
    return np.nan_to_num(np.array(ft), nan=0.0)
def test_gray_encoding_missing_value(self):
    """Gray encoding must raise ValueError when the data contains a NaN."""
    frame, labels = utils.load_xy(1)
    mfe = MFE()

    # Work on a copy so the loaded dataset itself is left untouched.
    X = np.copy(frame.values)
    X[5, 0] = np.nan
    y = labels.values

    with pytest.raises(ValueError):
        mfe.fit(X, y, transform_cat="gray")
def test_integration_complexity(self, dt_id, exp_value, precompute):
    """Compare all mean-summarized GNAME-group values with expectations (rtol 2.5%)."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = GNAME

    mfe = MFE(groups=[GNAME], summary="mean", random_state=1234)
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    extracted = mfe.extract()
    value = extracted[1]

    assert np.allclose(value, exp_value, equal_nan=True, rtol=0.025)
def test_integration_model_based(self, dt_id, exp_value, precompute):
    """Compare all mean-summarized GNAME-group values with the expected ones."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = GNAME

    mfe = MFE(groups=[GNAME], summary="mean", random_state=1234)
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    extracted = mfe.extract()
    value = extracted[1]

    # NaN entries count as equal when they appear on both sides.
    assert np.allclose(value, exp_value, equal_nan=True)
def test_extract_from_model(self):
    """``extract_from_model`` must agree with a regular model-based extraction."""
    X, y = utils.load_xy(2)

    # Path 1: hand an already fitted decision tree to the extractor.
    classifier = sklearn.tree.DecisionTreeClassifier(random_state=1234)
    model = classifier.fit(X.values, y.values)
    mtf_name, mtf_vals = MFE(random_state=1234).extract_from_model(model)

    # Path 2: let the extractor handle the same data itself.
    extractor = MFE(groups="model-based", random_state=1234)
    extractor.fit(X=X.values, y=y.values, transform_num=False)
    mtf_name2, mtf_vals2 = extractor.extract()

    assert np.all(mtf_name == mtf_name2)
    assert np.allclose(mtf_vals, mtf_vals2)
def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
    """Check a single meta-feature of the GNAME group against its expected value."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = GNAME

    mfe = MFE(groups=[GNAME], features=[ft_name], random_state=1234)
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    value = mfe.extract()[1]

    if exp_value is not np.nan:
        assert np.allclose(value, exp_value, equal_nan=True)
    else:
        assert value[0] is exp_value
def test_parse_valid_metafeatures(self, groups):
    """Names parsed by group must be exactly the group's valid meta-features."""
    X, y = utils.load_xy(0)

    mfe = MFE(
        groups="all",
        summary=None,
        lm_sample_frac=0.5,
        random_state=1234,
    )
    mfe.fit(X.values, y.values)
    res = mfe.extract()

    target_mtf = mfe.valid_metafeatures(groups=groups)
    parsed = mfe.parse_by_group(groups, res)
    names = parsed[0]

    # An empty symmetric difference means both collections hold the same names.
    mismatched = set(names).symmetric_difference(target_mtf)
    assert not mismatched
def single_group_meta_features(X):
    """Return the values of a fixed set of meta-features for one group of points.

    Every row of ``X`` is given the same dummy label 0 when fitting.
    """
    # Extract single group (source/target) features
    features = [
        "cohesiveness", "cor", "cov", "eigenvalues", "nr_cor_attr",
        "min", "mean", "median", "max", "iq_range", "kurtosis",
        "skewness", "t_mean", "var", "sd", "range", "nr_norm",
        "nr_outliers",
    ]
    n_rows = X.shape[0]
    dummy_labels = [0] * n_rows

    mfe = MFE(features=features, suppress_warnings=True)
    mfe.fit(X, dummy_labels)
    extracted = mfe.extract()

    # Only the values (index 1) are returned; the names are discarded.
    return extracted[1]
def test_ft_method_relative(self, dt_id, summary, precompute, sample_size, exp_value):
    """Test relative and subsampling relative landmarking."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = "relative"

    mfe = MFE(
        groups=["relative"],
        summary=summary,
        sample_size=sample_size,
        random_state=1234,
    )
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    extracted = mfe.extract()
    vals = extracted[1]

    assert np.allclose(vals, exp_value)
def bigroup_meta_features(source_pt_emb, target_pt_emb):
    """Return meta-features describing a source group against a target group.

    Both groups are stacked into one dataset labelled 0 (source) and
    1 (target). The selected pymfe meta-features are returned in extraction
    order, followed by three distances between the two groups.

    Parameters
    ----------
    source_pt_emb, target_pt_emb : array-like with a ``shape`` attribute
        Points of each group; they are concatenated along axis 0.

    Returns
    -------
    list
        Values of the meta-features of interest, then the results of
        ``permutation_dist``, ``wasserstein_dist`` and ``hausdorff_dist``.
    """
    # One label per row: the target part must be sized from the target
    # group, otherwise ``y`` and ``X`` disagree when the group sizes differ.
    n_source = source_pt_emb.shape[0]
    n_target = target_pt_emb.shape[0]
    y = [0] * n_source + [1] * n_target
    X = np.concatenate([source_pt_emb, target_pt_emb], axis=0)

    # Extract several meta-features (more than for single group)
    mfe = MFE(groups=["Statistical", "complexity", "concept", "clustering"],
              suppress_warnings=True)
    mfe.fit(X, y)
    ft = mfe.extract()

    # A set gives constant-time membership tests; output order still
    # follows the extraction order.
    interest_features = {
        'ch', 'cohesiveness.mean', 'cohesiveness.sd', 'conceptvar.mean',
        'conceptvar.sd', 'cor.mean', 'cor.sd', 'cov.mean', 'cov.sd',
        'eigenvalues.mean', 'eigenvalues.sd', 'f3.mean', 'f4.mean',
        'gravity', 'impconceptvar.mean', 'impconceptvar.sd', 'int',
        'iq_range.mean', 'iq_range.sd', 'kurtosis.mean', 'kurtosis.sd',
        'mad.mean', 'mad.sd', 'max.mean', 'max.sd', 'mean.mean', 'mean.sd',
        'median.mean', 'median.sd', 'min.mean', 'min.sd', 'nr_cor_attr',
        'nr_norm', 'nr_outliers', 'pb', 'range.mean', 'range.sd',
        'sd.mean', 'sd.sd', 'sil', 'skewness.mean', 'skewness.sd', 't4',
        't_mean.mean', 't_mean.sd', 'var.mean', 'var.sd', 'vdb', 'vdu',
        'wg_dist.mean', 'wg_dist.sd',
    }
    feat_list = [
        val for feat, val in zip(ft[0], ft[1]) if feat in interest_features
    ]

    # We add 3 extra "distances"
    feat_list.append(permutation_dist(source_pt_emb, target_pt_emb))
    feat_list.append(wasserstein_dist(source_pt_emb, target_pt_emb))
    feat_list.append(hausdorff_dist(source_pt_emb, target_pt_emb))
    return feat_list
def test_t1_arguments(self, orig_dist_mat_min, orig_dist_mat_ptp):
    """'t1' must give the same result whether or not its optional args are None."""
    exp_val = [0.015151516, 0.024628395]

    X, y = load_xy(2)
    extractor = MFE(groups="complexity", features="t1")
    extractor.fit(X.values, y.values, transform_num=False)

    # A falsy flag means the matching argument is explicitly passed as None.
    t1_args = {}
    if not orig_dist_mat_min:
        t1_args["orig_dist_mat_min"] = None
    if not orig_dist_mat_ptp:
        t1_args["orig_dist_mat_ptp"] = None

    extracted = extractor.extract(t1=t1_args)
    res = extracted[1]

    assert np.allclose(res, exp_val)
class MFE(Element):
    """Element that fits a ``PYMFE`` extractor and stores the extracted values.

    NOTE(review): ``PYMFE`` is presumably pymfe's ``MFE`` imported under an
    alias - confirm against the file's imports.
    """

    def cs_impl(self):
        """Not available yet: always raises ``Exception``."""
        # The trailing space keeps the two adjacent literals from fusing
        # into '"unsupervised"in'.
        raise Exception('Specify parameters like "supervised"/"unsupervised" '
                        'in the HP tree?')

    def build_impl(self):
        """Create the underlying extractor with its default settings."""
        self.model = PYMFE()

    def apply_impl(self, data):
        """Delegate to ``use_impl``; applying and using are the same here."""
        return self.use_impl(data)

    def use_impl(self, data):
        """Fit on ``data.Xy`` and return ``data`` updated with the values as ``l``."""
        self.model.fit(*data.Xy)
        _, values = self.model.extract(suppress_warnings=True)
        feats = np.array(values)
        # TODO: suppressing NaNs with 0s!! (infinite values are zeroed too)
        feats[~np.isfinite(feats)] = 0
        return data.updated(self, l=feats)
def test_extract_metafeature_names_unsupervised_01(self, groups, summary):
    """Names reported before fitting must match those of an unsupervised extract."""
    X, _ = utils.load_xy(0)

    mfe = MFE(groups=groups, summary=summary)

    # Ask for the names first, on the still unfitted extractor.
    mtf_names_1 = mfe.extract_metafeature_names(supervised=False)

    fitted = mfe.fit(X.values)
    extracted = fitted.extract(suppress_warnings=True)
    mtf_names_2 = extracted[0]

    assert mtf_names_1 == tuple(mtf_names_2)
def test_ft_methods_model_based(self, dt_id, ft_name, exp_value, precompute):
    """Check a single model-based meta-feature against its expected value."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = "model-based"

    mfe = MFE(groups=["model-based"], features=[ft_name], random_state=1234)
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    value = mfe.extract()[1]

    if exp_value is not np.nan:
        assert np.allclose(value, exp_value)
    else:
        assert value[0] is exp_value
def test_relative_correctness(self, summary, dt_id):
    """Meta-features carrying a '.relative' tag must be the landmarking ones."""
    X, y = load_xy(dt_id)

    mfe = MFE(
        groups="all",
        summary=summary,
        sample_size=0.5,
        random_state=1234,
    )
    mfe.fit(X.values, y.values)
    names = mfe.extract()[0]

    target_mtf = mfe.valid_metafeatures(groups="landmarking")

    # Keep the base name (text before the first dot) of every relative entry.
    relative_names = set()
    for name in names:
        if ".relative" in name:
            relative_names.add(name.partition(".")[0])

    mismatched = relative_names.symmetric_difference(target_mtf)
    assert not mismatched
def test_ft_methods_landmarking(self, dt_id, ft_name, exp_value, precompute, sample_size):
    """Check a single landmarking meta-feature against its expected value."""
    X, y = load_xy(dt_id)

    precomp_group = None
    if precompute:
        precomp_group = "landmarking"

    mfe = MFE(
        groups=["landmarking"],
        features=[ft_name],
        sample_size=sample_size,
        random_state=1234,
    )
    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    value = mfe.extract()[1]

    if exp_value is not np.nan:
        assert np.allclose(value, exp_value)
    else:
        assert value[0] is exp_value
def meta_features(X, y, groups=None, suppress=True):
    '''
    Extracts and returns the meta-features from a dataset using the Pymfe package.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
        Objects without a ``to_numpy`` method are passed on unchanged.
    y: pd.Series
        Contains the series of the target of a given dataset.
        Objects without a ``to_numpy`` method are passed on unchanged.
    groups: list
        Contains the names of the meta-feature groups as available
        in the Pymfe package (pymfe.readthedocs.io). When None, the
        extractor is built without a ``groups`` argument.
    suppress: bool
        Forwarded to the extractor as ``suppress_warnings``.

    Returns:
    --------
    sequence
        The result of ``MFE.extract``: one sequence with the meta-feature
        names and one with the meta-feature values respective to the names.
    '''
    # Only a missing ``to_numpy`` is expected here; a bare ``except`` would
    # also hide KeyboardInterrupt and genuine conversion problems.
    try:
        X = X.to_numpy()
    except AttributeError:
        pass
    try:
        y = y.to_numpy()
    except AttributeError:
        pass

    mfe_kwargs = {"suppress_warnings": suppress}
    # Identity check: ``groups == None`` is ambiguous for array-like groups.
    if groups is not None:
        mfe_kwargs["groups"] = groups

    mfe = MFE(**mfe_kwargs)
    mfe.fit(X, y)
    return mfe.extract()
def get_window_features(X, mfe_features, tsfel_config, summary_funcs, n_classes=None, last_window_acc=None, current_acc=None):
    """Build a one-row DataFrame of features describing the window ``X``.

    The row joins the pymfe meta-features with the columns returned by
    ``gen_tsfel_features``. Optional columns:

    * ``window_acc_delta`` - added when both accuracies are given;
    * ``n_classes`` and ``max_possible_entropy`` (base-2 log of
      ``n_classes``) - added when ``n_classes`` is given.
    """
    extractor = MFE(features=mfe_features, summary=summary_funcs)
    extractor.fit(X)
    mfe_feats = extractor.extract()
    mfe_names, mfe_values = mfe_feats[0], mfe_feats[1]

    tsfel_feats = gen_tsfel_features(
        tsfel_config, pd.DataFrame(X), summary=summary_funcs
    )

    # One single-element column per meta-feature gives a one-row frame.
    mfe_columns = {}
    for name, value in zip(mfe_names, mfe_values):
        mfe_columns[name] = [value]
    stream_feats = pd.concat([pd.DataFrame(mfe_columns), tsfel_feats], axis=1)

    if last_window_acc is not None:
        if current_acc is not None:
            stream_feats["window_acc_delta"] = current_acc - last_window_acc

    if n_classes is not None:
        stream_feats["n_classes"] = n_classes
        stream_feats["max_possible_entropy"] = math.log(n_classes, 2)

    return stream_feats
def test_extract_metafeature_names_unsupervised_02(self, groups, summary):
    """After an unsupervised fit, the ``supervised`` flag must not change the names."""
    X, _ = utils.load_xy(0)

    mfe = MFE(groups=groups, summary=summary)

    fitted = mfe.fit(X.values)
    extracted = fitted.extract(suppress_warnings=True)
    mtf_names_1 = tuple(extracted[0])

    # Note: by default, .extract_metafeature_names should check whether
    # 'y' was fitted or not if .fit was called before. Therefore, here,
    # supervised=True is expected to be ignored and behave like
    # supervised=False.
    mtf_names_2 = mfe.extract_metafeature_names(supervised=True)
    mtf_names_3 = mfe.extract_metafeature_names(supervised=False)

    assert mtf_names_1 == mtf_names_2 == mtf_names_3
def test_no_cat_transformation(self):
    """With ``transform_cat=None`` the custom argument "N" must be empty."""
    X, y = utils.load_xy(1)

    mfe = MFE()
    mfe.fit(X.values, y.values, transform_cat=None)

    numeric_part = mfe._custom_args_ft["N"]
    assert numeric_part.size == 0
# The standard way to extract meta-features is using the MFE class.
# The parameters are the dataset and the group of measures to be extracted.
# By default, the method extracts all the measures. For instance:
from sklearn.datasets import load_iris
from pymfe.mfe import MFE

# Load a dataset
data = load_iris()
X, y = data.data, data.target

###############################################################################
# Extracting all measures
mfe = MFE()
mfe.fit(X, y)
ft = mfe.extract()
# One line per measure: the name followed by its value.
print("\n".join("{:50} {:30}".format(*row) for row in zip(ft[0], ft[1])))

###############################################################################
# Extracting general, statistical and information-theoretic measures
mfe = MFE(groups=["general", "statistical", "info-theory"])
mfe.fit(X, y)
ft = mfe.extract()
print("\n".join("{:50} {:30}".format(*row) for row in zip(ft[0], ft[1])))

###############################################################################
# Changing summarization function
# -------------------------------
#
# Several measures return more than one value. To aggregate them, post