def test_error__set_data_numeric(self):
    """Check that ``_set_data_numeric`` raises ``TypeError`` on unusable data.

    Two scenarios are exercised: an ``MFE`` object on which nothing was
    assigned, and one whose ``X`` attribute is an empty array.
    """
    # Scenario 1: the extractor is used straight after construction.
    with pytest.raises(TypeError):
        untouched = MFE()
        untouched._set_data_numeric(True)

    # Scenario 2: ``X`` is assigned by hand to an empty array.
    with pytest.raises(TypeError):
        empty_data = MFE()
        empty_data.X = np.array([])
        empty_data._set_data_numeric(True)
def test_extract_with_confidence_random_state1(self):
    """Two runs with the same ``random_state`` must produce equal output.

    Both the meta-feature values and the confidence intervals returned by
    ``extract_with_confidence`` are compared with ``np.allclose``.
    """
    X, y = utils.load_xy(2)

    def run_once():
        # A fresh extractor per run, always seeded with the same value.
        extractor = MFE(features=["mean", "sd"], random_state=16)
        fitted = extractor.fit(X=X.values, y=y.values)
        return fitted.extract_with_confidence(sample_num=3)

    _, vals_first, conf_first = run_once()
    _, vals_second, conf_second = run_once()

    assert np.allclose(vals_first, vals_second)
    assert np.allclose(conf_first, conf_second)
def test_extract_with_confidence_random_state3(self):
    """Run twice without ``random_state`` while reseeding NumPy globally.

    NumPy's global generator is reseeded with the same value before each
    run. The test requires the meta-feature values to match and at least
    one confidence-interval entry to differ between the two runs.
    """
    X, y = utils.load_xy(2)

    runs = []
    for _ in range(2):
        np.random.seed(1234)
        fitted = MFE(features=["mean", "sd"]).fit(X=X.values, y=y.values)
        runs.append(fitted.extract_with_confidence(sample_num=3))

    (_, vals_first, conf_first), (_, vals_second, conf_second) = runs

    assert np.allclose(vals_first, vals_second)
    assert np.any(~np.isclose(conf_first, conf_second))
def test_extract_from_model(self):
    """Compare ``extract_from_model`` with a regular model-based extraction.

    A decision tree is trained on the dataset and handed to
    ``extract_from_model``. The names and values obtained must match those
    of an ``MFE`` restricted to the "model-based" group and fitted on the
    same data with ``transform_num=False``.
    """
    X, y = utils.load_xy(2)

    tree = sklearn.tree.DecisionTreeClassifier(random_state=1234)
    fitted_tree = tree.fit(X.values, y.values)

    from_model = MFE(random_state=1234).extract_from_model(fitted_tree)
    names_from_model, vals_from_model = from_model

    reference = MFE(groups="model-based", random_state=1234)
    reference.fit(X=X.values, y=y.values, transform_num=False)
    names_from_fit, vals_from_fit = reference.extract()

    assert np.all(names_from_model == names_from_fit)
    assert np.allclose(vals_from_model, vals_from_fit)
def test_extract_with_confidence_output_dictionary_unsupervised(self):
    """``extract_with_confidence`` honours ``out_type=dict`` without ``y``.

    The extractor is fitted on ``X`` only; the result must be a dictionary
    holding exactly three entries.
    """
    X, _ = load_xy(2)
    fitted = MFE(groups="general").fit(X.values)

    extract_kwargs = {"out_type": dict}
    output = fitted.extract_with_confidence(
        3, arguments_extract=extract_kwargs
    )

    assert isinstance(output, dict)
    assert len(output) == 3
def test_extract_with_time_output_dictionary(self):
    """``extract(out_type=dict)`` with time measurement yields three entries."""
    X, y = load_xy(2)

    timed_extractor = MFE(groups="general", measure_time="total")
    fitted = timed_extractor.fit(X.values, y.values)
    output = fitted.extract(out_type=dict)

    assert isinstance(output, dict)
    assert len(output) == 3
def extract_from_object(dataset: Union[np.ndarray, list],
                        mfe_params: dict = None) -> Sequence:
    """Fit an ``MFE`` on ``dataset`` and return the item at index 1 of
    its ``extract`` output.

    Parameters
    ----------
    dataset : np.ndarray or list
        Data handed to ``MFE.fit`` as its only positional argument.
    mfe_params : dict, optional
        Keyword arguments for the ``MFE`` constructor. When ``None`` or
        empty, the module-level ``__default_mfe_params`` is used instead.

    Returns
    -------
    Sequence
        Element ``[1]`` of the value returned by ``MFE.extract``.
    """
    use_defaults = mfe_params is None or len(mfe_params) == 0
    ctor_kwargs = __default_mfe_params if use_defaults else mfe_params

    extractor = MFE(**ctor_kwargs)
    extractor.fit(dataset, suppress_warnings=True)

    extracted = extractor.extract(suppress_warnings=True)
    return extracted[1]
def test_extract_with_confidence_invalid3(self):
    """A ``confidence`` value above 1 must raise ``ValueError``."""
    X, y = utils.load_xy(2)

    # Fit outside the ``raises`` block: a ``ValueError`` coming from
    # ``fit`` must not be able to satisfy the expectation below.
    fitted = MFE().fit(X.values, y.values)

    with pytest.raises(ValueError):
        fitted.extract_with_confidence(confidence=1.0001)
def test_extract_with_confidence(self, confidence):
    """Check the empirical coverage of the returned confidence intervals.

    For each meta-feature, the fraction of its extracted values lying
    inside the reported interval (bounds included) is computed. Every
    fraction must be at least ``confidence - 0.05``.
    """
    X, y = utils.load_xy(2)

    extractor = MFE(
        groups="all",
        features=["mean", "best_node", "sil"],
        random_state=1234,
    )
    fitted = extractor.fit(X=X.values, y=y.values, precomp_groups=None)

    names, sampled_vals, conf_bounds = fitted.extract_with_confidence(
        sample_num=64,
        return_avg_val=False,
        confidence=confidence,
        verbose=0,
    )

    coverage = np.zeros(len(names), dtype=float)

    for idx, samples in enumerate(sampled_vals):
        lower, upper = conf_bounds[idx, :]
        inside = np.logical_and(lower <= samples, samples <= upper)
        coverage[idx] = np.sum(inside) / len(samples)

    assert np.all(confidence - 0.05 <= coverage)
def test_none_cancor(self):
    """Empty canonical-correlation inputs must give NaN for every feature.

    Each listed feature receives empty ``can_cors`` and ``can_cor_eigvals``
    arrays as custom extraction arguments; all resulting values are
    expected to be NaN.
    """
    X, y = load_xy(0)

    feats = ["w_lambda", "p_trace", "lh_trace", "roy_root"]
    extractor = MFE(groups=[GNAME], features=feats)

    empty_cancor_args = {
        "can_cors": np.array([]),
        "can_cor_eigvals": np.array([]),
    }

    extractor.fit(X.values, y.values, precomp_groups=None)

    # Every feature is mapped to the very same arguments dictionary.
    per_feature_args = dict.fromkeys(feats, empty_cancor_args)
    vals = extractor.extract(**per_feature_args, suppress_warnings=True)[1]

    all_nan = np.full(shape=len(vals), fill_value=np.nan)
    assert np.allclose(vals, all_nan, equal_nan=True)
def test_verbosity_3(self, verbosity, msg_expected, capsys):
    """Check whether ``extract`` writes to stdout for a verbosity level.

    Parameters
    ----------
    verbosity
        Value forwarded to ``extract`` as ``verbose``.
    msg_expected
        Truthy when some stdout output is expected, falsy when stdout must
        stay empty.
    capsys
        Pytest fixture used to read the captured stdout.
    """
    X, y = load_xy(0)

    MFE().fit(X=X.values, y=y.values).extract(verbose=verbosity)

    captured = capsys.readouterr().out

    # Two-way check: output must be present exactly when it is expected.
    # The previous form only checked the "expected" direction, so the
    # silent case was never verified.
    assert bool(captured) == bool(msg_expected)
def extract_mtf_by_group():
    """Extract meta-features one group at a time and return sorted values.

    Iterates over ``mtf_groups`` (taken from the enclosing scope), fitting a
    new ``MFE`` per group. The target is passed only when ``supervised`` is
    truthy, and the group is precomputed only when ``precompute`` is truthy.
    The collected values are returned as a tuple ordered by meta-feature
    name.
    """
    names_acc = []
    vals_acc = []

    for group in mtf_groups:
        extractor = MFE(groups=group, summary="mean", random_state=1234)
        fitted = extractor.fit(
            X.values,
            y.values if supervised else None,
            precomp_groups=group if precompute else None,
        )

        group_names, group_vals = fitted.extract()
        names_acc.extend(group_names)
        vals_acc.extend(group_vals)

    # Order the (name, value) pairs by name only, then keep the values.
    ordered_pairs = sorted(zip(names_acc, vals_acc), key=lambda pair: pair[0])
    _, vals_by_name = zip(*ordered_pairs)

    return vals_by_name
def test_extract_from_model_invalid4(self):
    """``extract_from_model`` must raise ``ValueError`` when the extractor
    is configured with ``groups="general"``."""
    X, y = utils.load_xy(2)

    tree = sklearn.tree.DecisionTreeClassifier()
    fitted_tree = tree.fit(X, y)

    with pytest.raises(ValueError):
        MFE(groups="general").extract_from_model(fitted_tree)
def test_extract_from_model_invalid1(self):
    """Passing a ``DecisionTreeRegressor`` to ``extract_from_model`` must
    raise ``TypeError``."""
    X, y = utils.load_xy(2)

    regressor = sklearn.tree.DecisionTreeRegressor()
    fitted_regressor = regressor.fit(X.values, y.values)

    with pytest.raises(TypeError):
        MFE().extract_from_model(fitted_regressor)
def test_output_lengths_2(self, dt_id, measure_time):
    """With time measurement on, ``extract`` returns three sequences of
    equal length."""
    X, y = load_xy(dt_id)

    extractor = MFE(measure_time=measure_time)
    output = extractor.fit(X=X.values, y=y.values).extract()

    # Only the lengths are compared, so the roles of the three sequences
    # are irrelevant here.
    first, second, third = output

    assert len(first) == len(second) == len(third)
def test_ft_methods_model_based_02(self, dt_id, ft_name, exp_value,
                                   precompute):
    """Test a single model-based meta-feature with custom tree settings.

    Parameters
    ----------
    dt_id
        Identifier passed to ``load_xy`` to select the dataset.
    ft_name
        Name of the meta-feature under test.
    exp_value
        Expected value(s); a scalar NaN means the first extracted value is
        expected to be NaN.
    precompute
        When truthy, the group is also passed as ``precomp_groups``.
    """
    precomp_group = GNAME if precompute else None

    X, y = load_xy(dt_id)
    mfe = MFE(
        groups=[GNAME],
        features=[ft_name],
        hypparam_model_dt={
            "max_depth": 5,
            "min_samples_split": 10,
            "criterion": "entropy",
        },
        random_state=1234,
    )

    mfe.fit(X.values, y.values, precomp_groups=precomp_group)

    if precomp_group is None:
        # Note: the precomputation of 'model-based' group is always
        # forced due to the need of the 'dt_model' value
        mfe._precomp_args_ft = {
            "dt_model": mfe._precomp_args_ft.get("dt_model")
        }

    value = mfe.extract()[1]

    # NaN is detected by value rather than by identity: ``x is np.nan`` is
    # only true for that one object, while NaN produced by a computation is
    # a different object and would fail the identity check.
    expects_nan = (
        isinstance(exp_value, (float, np.floating)) and np.isnan(exp_value)
    )

    if expects_nan:
        assert np.isnan(value[0])
    else:
        assert np.allclose(value, exp_value)
def test_scaling_error_1(self):
    """An unknown ``rescale`` option must make ``fit`` raise ``ValueError``."""
    # Data loading and construction happen outside the ``raises`` block so
    # that only the ``fit`` call under test can satisfy the expectation.
    X, y = load_xy(0)
    extractor = MFE()

    with pytest.raises(ValueError):
        extractor.fit(
            X=X.values,
            y=y.values,
            rescale="invalid",
            transform_cat=False,
        )
def test_extract_from_model_invalid2(self):
    """Supplying ``dt_model`` through ``arguments_fit`` must raise
    ``KeyError`` in ``extract_from_model``."""
    X, y = utils.load_xy(2)

    tree = sklearn.tree.DecisionTreeClassifier(random_state=1234)
    fitted_tree = tree.fit(X.values, y.values)

    conflicting_fit_args = {"dt_model": fitted_tree}

    with pytest.raises(KeyError):
        MFE().extract_from_model(
            fitted_tree, arguments_fit=conflicting_fit_args
        )
def test_one_hot_encoding_03(self):
    """After fitting with ``transform_cat="one-hot"`` on this dataset, the
    stored ``N`` array must have as many columns as the input ``X``."""
    X, y = utils.load_xy(2)

    extractor = MFE()
    extractor.fit(X.values, y.values, transform_cat="one-hot")

    input_col_count = X.values.shape[1]
    stored_col_count = extractor._custom_args_ft["N"].shape[1]

    assert stored_col_count == input_col_count
def test_one_hot_encoding_02(self):
    """With ``transform_cat="one-hot-full"`` the stored ``N`` array must
    have one column per distinct value of every input attribute."""
    X, y = utils.load_xy(1)

    extractor = MFE()
    extractor.fit(X.values, y.values, transform_cat="one-hot-full")

    # Total number of distinct values, summed over the columns of X.
    distinct_total = sum(np.unique(column).size for column in X.values.T)
    stored_col_count = extractor._custom_args_ft["N"].shape[1]

    assert stored_col_count == distinct_total
def test_extract_output_pandas_dataframe(self):
    """``extract(out_type=pd.DataFrame)`` returns a one-row frame whose
    columns equal the names from ``extract_metafeature_names``."""
    X, y = load_xy(2)

    fitted = MFE(groups="general").fit(X.values, y.values)
    expected_names = fitted.extract_metafeature_names()

    frame = fitted.extract(out_type=pd.DataFrame)

    assert isinstance(frame, pd.DataFrame)
    assert frame.values.shape == (1, len(expected_names))
    assert np.array_equal(frame.columns, expected_names)
def test_verbosity_2(self, capsys):
    """``extract(verbose=0)`` must not write anything to stdout."""
    X, y = load_xy(0)

    fitted = MFE().fit(X=X.values, y=y.values)
    fitted.extract(verbose=0)

    stdout_text = capsys.readouterr().out

    assert not stdout_text
def test_extract_with_time_output_pandas_dataframe_unsupervised(self):
    """DataFrame output with time measurement, fitted without a target.

    The frame must have two rows and one column per name returned by
    ``extract_metafeature_names``, in the same order.
    """
    X, _ = load_xy(2)

    timed_extractor = MFE(measure_time="total", groups="general")
    fitted = timed_extractor.fit(X.values)
    expected_names = fitted.extract_metafeature_names()

    frame = fitted.extract(out_type=pd.DataFrame)

    assert isinstance(frame, pd.DataFrame)
    assert frame.values.shape == (2, len(expected_names))
    assert np.array_equal(frame.columns, expected_names)
def main():
    """Extract meta-features with pyMFE and score a LightGBM classifier.

    For every ``*.parquet`` file under ``<data_path>/train`` and
    ``<data_path>/valid``, meta-features are extracted from the data with
    the ``class`` column removed. Each record is labelled with the
    ``argmax`` of the file's row in ``augment_data.csv``. A LightGBM
    multiclass classifier is trained on the train records, and its
    micro-averaged recall and precision on the validation records are
    logged to wandb.

    When ``args.save_mfe`` is set, the train records are written to
    ``mfe.train.csv`` and the validation records to ``mfe.test.csv``.
    """
    args = parse_args()
    wandb.init(project='DeepMetaLearning', name='classical', config=args)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    mfe = MFE(random_state=args.seed)
    data_root = pathlib.Path(args.data_path)
    scores_data = pd.read_csv("augment_data.csv", index_col="filename")

    def extract_split(split_name):
        """Return one meta-feature record (dict) per parquet file of a split."""
        records = []
        split_files = list((data_root / split_name).glob('*.parquet'))
        for fname in tqdm(split_files):
            df = pd.read_parquet(fname)
            # Only unsupervised features are evaluated for now, so the
            # "class" column is dropped and no target is passed to fit().
            X = df.drop(columns=["class"]).values
            mfe.fit(X)
            ft = dict(zip(*mfe.extract()))
            ft["best_clf"] = scores_data.loc[fname.name].argmax()
            records.append(ft)
        return records

    print("Extracting meta-features for train files")
    train_records = extract_split('train')

    print("Extracting meta-features for validation files")
    valid_records = extract_split('valid')

    train_df = pd.DataFrame(train_records)
    valid_df = pd.DataFrame(valid_records)

    if args.save_mfe:
        train_df.to_csv("mfe.train.csv", index=False)
        # Fixed: the train frame used to be written here as well, so the
        # validation meta-features were never saved.
        valid_df.to_csv("mfe.test.csv", index=False)

    drop_columns = ["best_clf"]
    xtrain = train_df.drop(columns=drop_columns).values
    xtest = valid_df.drop(columns=drop_columns).values
    ytrain = train_df[drop_columns]
    ytrue = valid_df[drop_columns]

    lg = LGBMClassifier(random_state=args.seed, objective='multiclass')
    lg.fit(xtrain, ytrain)
    yhat = lg.predict(xtest)

    recall = metrics.recall_score(ytrue, yhat, average="micro")
    precis = metrics.precision_score(ytrue, yhat, average="micro")
    wandb.log({"recall": recall})
    wandb.log({"precision": precis})
def test_integration_general(self, dt_id, exp_value, precompute):
    """Extract the whole group with the "mean" summary and compare the
    values with ``exp_value`` (NaN entries compare as equal)."""
    X, y = load_xy(dt_id)

    extractor = MFE(groups=[GNAME], summary="mean")
    fitted = extractor.fit(
        X.values,
        y.values,
        precomp_groups=GNAME if precompute else None,
    )

    extracted = fitted.extract()
    value = extracted[1]

    assert np.allclose(value, exp_value, equal_nan=True)
def test_verbosity_with_confidence(self, verbosity, msg_expected, capsys):
    """``extract_with_confidence`` must write to stdout exactly when
    ``msg_expected`` is truthy for the given verbosity level."""
    X, y = load_xy(2)

    fitted = MFE().fit(X.values, y.values)
    fitted.extract_with_confidence(verbose=verbosity)

    stdout_text = capsys.readouterr().out

    # Both truthy or both falsy: output is present iff it is expected.
    assert bool(msg_expected) == bool(stdout_text)
def test_default_alias_groups(self):
    """``valid_groups`` must match ``_internal.VALID_GROUPS`` regardless
    of how the "default" alias is passed to the constructor.

    Covered forms: the bare string, a single-element list, and a list that
    combines the alias with another group name.
    """
    group_specs = ("default", ["default"], ["general", "default"])

    for spec in group_specs:
        model = MFE(groups=spec)
        res = model.valid_groups()

        assert len(res) == len(_internal.VALID_GROUPS)
        assert not set(res).symmetric_difference(_internal.VALID_GROUPS)
def _get_feats(cls):
    """Return the meta-feature names extracted from the iris dataset, with
    every "." replaced by "_"."""
    from sklearn.datasets import load_iris
    from pymfe.mfe import MFE

    iris = load_iris()

    extractor = MFE()
    extractor.fit(iris.data, iris.target)

    # The first item of the extraction output holds the feature names.
    raw_names = extractor.extract()[0]

    return [name.replace(".", "_") for name in raw_names]
def test_threshold_attr_conc(self):
    """Extract ``attr_conc`` with ``max_attr_num=25`` and compare it with
    the reference values using a relative tolerance of 0.2."""
    X, y = load_xy(1)

    extractor = MFE(features="attr_conc", random_state=1234)
    fitted = extractor.fit(X.values, y.values, precomp_groups=False)

    value = fitted.extract(attr_conc={"max_attr_num": 25})[1]

    reference = [0.01682327, 0.04715381]
    assert np.allclose(value, reference, rtol=0.2)
def test_one_hot_encoding_04(self):
    """``fit`` with ``transform_cat="one-hot"`` must raise ``ValueError``
    for data extended with a string-typed column of ones."""
    X, y = utils.load_xy(2)
    mfe = MFE()

    # One extra column of ones with ``dtype=str``, one row per target entry.
    ones_as_str = np.ones((y.size, 1), dtype=str)
    data_with_str_col = np.hstack((X.values, ones_as_str))
    targets = y.values

    with pytest.raises(ValueError):
        mfe.fit(X=data_with_str_col, y=targets, transform_cat="one-hot")