コード例 #1
0
ファイル: metafeatures.py プロジェクト: jhosoume/curumin_mtl
class MetaFeatures:
    """Compute pymfe meta-features for dataset files and persist them.

    ``apply`` walks a directory of ``.json``/``.arff`` datasets and calls
    ``calculate`` for each file; ``calculate`` stores the result through the
    project's ``Metadata`` model.
    """

    def __init__(self):
        self.mfe = MFE()
        self.le = preprocessing.LabelEncoder()
        # Same default as ``apply`` so ``calculate`` can be called on its own.
        self.datasets_dir = "mock_datasets/"

    def calculate(self, dataset_filename):
        """Extract, clean and save the meta-features of one dataset file.

        Parameters
        ----------
        dataset_filename
            Name of a ``.json`` or ``.arff`` file inside ``self.datasets_dir``.
            The dataset must contain a ``"class"`` column used as target.

        Returns
        -------
        tuple
            ``(labels, results)``: the meta-feature names and their values,
            with NaN entries removed and complex values reduced to their
            real part.

        Raises
        ------
        ValueError
            If the dataset name ends with neither ``json`` nor ``arff``.
        """
        # Reading dataset
        dataset = Dataset.get_or_insert(dataset_filename)
        dataset_path = join(self.datasets_dir, dataset.name)
        if dataset.name.endswith("json"):
            data = pd.read_json(dataset_path)
        elif dataset.name.endswith("arff"):
            # ``loadarff`` returns (records, metadata); only records are used.
            data = pd.DataFrame(arff_io.loadarff(dataset_path)[0])
        else:
            raise ValueError(
                "Unsupported dataset format: {!r}".format(dataset.name))
        # Separating the target column from the remaining data
        target = data["class"].values
        values = data.drop("class", axis=1).values
        ft = self.metafeatures(values, target)
        # Meta-feature names (labels) and the calculated values (results)
        labels = np.array(ft[0])
        results = np.array(ft[1])
        # Drop every meta-feature whose value is NaN
        not_nan = ~np.isnan(results)
        labels = labels[not_nan].tolist()
        # Sometimes the result is a complex number, use just the real part
        results = [
            result.real if isinstance(result, complex) else result
            for result in results[not_nan].tolist()
        ]
        Metadata(dataset=dataset.name,
                 features=labels,
                 values=results).save()
        return (labels, results)

    def metafeatures(self, values, target):
        """Fit the extractor on ``values``/``target`` and return its output.

        A target array of ``object`` dtype (non-numeric labels) is label
        encoded first. If extraction raises ``AttributeError``, the extractor
        is refit on ``values`` cast to ``float`` and extraction is retried.

        Returns
        -------
        The value returned by ``self.mfe.extract()``.
        """
        # ``np.object`` was removed in NumPy 1.24; the builtin ``object``
        # compares equal to the same dtype on every NumPy version.
        if target.dtype == object:
            self.le.fit(target)
            target = self.le.transform(target)
        # Calculating metafeatures
        self.mfe.fit(values, target)
        try:
            ft = self.mfe.extract()
        except AttributeError:
            # NOTE(review): presumably triggered by non-float value arrays;
            # confirm against pymfe before narrowing this fallback.
            self.mfe.fit(values.astype(float), target)
            ft = self.mfe.extract()
        return ft

    def apply(self, datasets_fd="mock_datasets/"):
        """Calculate meta-features for every dataset file in a directory.

        Parameters
        ----------
        datasets_fd
            Directory scanned for regular files ending in ``json`` or
            ``arff``. It is stored in ``self.datasets_dir`` and the matching
            file names in ``self.datasets``.
        """
        self.datasets_dir = datasets_fd
        # Getting list of datasets inside directory
        self.datasets = [
            f for f in listdir(self.datasets_dir)
            if isfile(join(self.datasets_dir, f))
            and f.endswith(("json", "arff"))
        ]
        for dataset in self.datasets:
            self.calculate(dataset)
コード例 #2
0
def main():
    """Extract meta-features with pyMFE and score a LightGBM classifier.

    Unsupervised meta-features are extracted from every ``*.parquet`` file
    under ``<data_path>/train`` and ``<data_path>/valid``. The label of each
    file is the ``argmax`` of its row in ``augment_data.csv``. A multiclass
    ``LGBMClassifier`` is trained on the train split, and micro-averaged
    recall and precision on the validation split are logged to wandb.
    """
    args = parse_args()
    wandb.init(project='DeepMetaLearning', name='classical', config=args)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)
    mfe = MFE(random_state=args.seed)
    scores_data = pd.read_csv("augment_data.csv", index_col="filename")
    data_path = pathlib.Path(args.data_path)

    def extract_split(split):
        """Return one row of meta-features plus ``best_clf`` per file."""
        rows = []
        for fname in tqdm(list((data_path / split).glob('*.parquet'))):
            df = pd.read_parquet(fname)
            X = df.drop(columns=["class"]).values
            # Only unsupervised features are evaluated: the target column
            # is deliberately not passed to ``fit``.
            mfe.fit(X)
            ft = dict(zip(*mfe.extract()))
            ft["best_clf"] = scores_data.loc[fname.name].argmax()
            rows.append(ft)
        return pd.DataFrame(rows)

    print("Extracting meta-features for train files")
    train_df = extract_split('train')
    print("Extracting meta-features for validation files")
    valid_df = extract_split('valid')

    if args.save_mfe:
        train_df.to_csv("mfe.train.csv", index=False)
        # The validation split goes to the test file; previously the train
        # frame was written to both files.
        valid_df.to_csv("mfe.test.csv", index=False)

    drop_columns = ["best_clf"]
    xtrain = train_df.drop(columns=drop_columns).values
    xtest = valid_df.drop(columns=drop_columns).values
    ytrain = train_df[drop_columns]
    ytrue = valid_df[drop_columns]
    lg = LGBMClassifier(random_state=args.seed, objective='multiclass')
    lg.fit(xtrain, ytrain)
    yhat = lg.predict(xtest)

    recall = metrics.recall_score(ytrue, yhat, average="micro")
    precis = metrics.precision_score(ytrue, yhat, average="micro")
    wandb.log({"recall": recall})
    wandb.log({"precision": precis})
コード例 #3
0
    def test_verbose(self, capsys):
        """Check how many lines a verbose extraction writes to stdout."""
        X, y = load_xy(0)
        selected = ["freq_class", "mean", "class_conc", "one_nn", "nodes"]
        model = MFE(features=selected)
        model.fit(X=X.values, y=y.values)
        model.extract(verbose=True)

        # Expected number of newline-terminated messages in verbose mode
        expected_msg_num = 21
        printed = capsys.readouterr().out

        assert printed.count("\n") == expected_msg_num
コード例 #4
0
def extract_from_object(dataset: Union[np.ndarray, list], mfe_params: dict = None) -> Sequence:
    """Fit an MFE on ``dataset`` only and return the extracted values.

    The module-level default parameters are used when ``mfe_params`` is
    ``None`` or empty. Warnings are suppressed for both fit and extract, and
    only element ``[1]`` of the ``extract`` result is returned.
    """
    if mfe_params is not None and len(mfe_params) > 0:
        params = mfe_params
    else:
        params = __default_mfe_params

    extractor = MFE(**params)
    extractor.fit(dataset, suppress_warnings=True)
    extracted = extractor.extract(suppress_warnings=True)
    return extracted[1]
コード例 #5
0
 def test_extract_with_time_output_dictionary(self):
     """Dict output with total time measurement must hold three entries."""
     X, y = load_xy(2)
     extractor = MFE(groups="general", measure_time="total")
     extractor.fit(X.values, y.values)
     output = extractor.extract(out_type=dict)
     assert isinstance(output, dict)
     assert len(output) == 3
コード例 #6
0
    def test_ft_methods_model_based_02(self, dt_id, ft_name, exp_value,
                                       precompute):
        """Check one ``GNAME`` meta-feature with custom tree hyperparameters."""
        X, y = load_xy(dt_id)
        precomp_group = GNAME if precompute else None

        tree_params = {
            "max_depth": 5,
            "min_samples_split": 10,
            "criterion": "entropy",
        }
        mfe = MFE(groups=[GNAME],
                  features=[ft_name],
                  hypparam_model_dt=tree_params,
                  random_state=1234)
        mfe.fit(X.values, y.values, precomp_groups=precomp_group)

        if precomp_group is None:
            # Note: the precomputation of 'model-based' group is always
            # forced due to the need of the 'dt_model' value, so only that
            # entry is kept among the precomputed arguments.
            dt_model = mfe._precomp_args_ft.get("dt_model")
            mfe._precomp_args_ft = {"dt_model": dt_model}

        value = mfe.extract()[1]

        if exp_value is not np.nan:
            assert np.allclose(value, exp_value)
        else:
            assert value[0] is exp_value
コード例 #7
0
ファイル: test_system_testing.py プロジェクト: paulasb/pymfe
        def extract_mtf_by_group():
            """Extract each group with its own MFE; return values sorted by name."""
            names = []
            vals = []

            for cur_group in mtf_groups:
                mfe = MFE(groups=cur_group, summary="mean", random_state=1234)
                mfe.fit(
                    X.values,
                    y.values if supervised else None,
                    precomp_groups=cur_group if precompute else None,
                )

                cur_names, cur_vals = mfe.extract()
                names.extend(cur_names)
                vals.extend(cur_vals)

            # Order the values by meta-feature name, then discard the names.
            pairs = sorted(zip(names, vals), key=lambda pair: pair[0])
            _, sorted_vals = zip(*pairs)

            return sorted_vals
コード例 #8
0
    def test_none_cancor(self):
        """All listed features must be NaN when given empty canonical correlations."""
        X, y = load_xy(0)

        feats = ["w_lambda", "p_trace", "lh_trace", "roy_root"]
        empty_cancor = {
            "can_cors": np.array([]),
            "can_cor_eigvals": np.array([]),
        }

        mfe = MFE(groups=[GNAME], features=feats)
        mfe.fit(X.values, y.values, precomp_groups=None)

        # Every feature receives the same custom-argument dictionary.
        extract_args = dict.fromkeys(feats, empty_cancor)
        vals = mfe.extract(**extract_args, suppress_warnings=True)[1]

        expected = np.full(shape=len(vals), fill_value=np.nan)
        assert np.allclose(vals, expected, equal_nan=True)
コード例 #9
0
 def test_extract_output_pandas_dataframe(self):
     """DataFrame output must be one row with one column per meta-feature name."""
     X, y = load_xy(2)
     extractor = MFE(groups="general")
     extractor.fit(X.values, y.values)
     expected_mtfs = extractor.extract_metafeature_names()
     frame = extractor.extract(out_type=pd.DataFrame)
     assert isinstance(frame, pd.DataFrame)
     assert frame.values.shape == (1, len(expected_mtfs))
     assert np.array_equal(frame.columns, expected_mtfs)
コード例 #10
0
 def test_extract_with_time_output_pandas_dataframe_unsupervised(self):
     """With time measurement and no target, the DataFrame must have two rows."""
     X, _ = load_xy(2)
     extractor = MFE(measure_time="total", groups="general")
     extractor.fit(X.values)
     expected_mtfs = extractor.extract_metafeature_names()
     frame = extractor.extract(out_type=pd.DataFrame)
     assert isinstance(frame, pd.DataFrame)
     assert frame.values.shape == (2, len(expected_mtfs))
     assert np.array_equal(frame.columns, expected_mtfs)
コード例 #11
0
ファイル: test_clustering.py プロジェクト: paulasb/pymfe
    def test_silhouette_subsampling(self, precompute):
        """Check the ``sil`` value extracted with ``sample_frac=0.5``."""
        X, y = load_xy(0)
        mfe = MFE(groups="clustering", features="sil", random_state=1234)
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)

        sil_args = {"sample_frac": 0.5}
        observed = mfe.extract(sil=sil_args)[1]

        assert np.allclose(observed, -0.07137712254830314)
コード例 #12
0
    def test_integration_general(self, dt_id, exp_value, precompute):
        """Compare mean-summarized ``GNAME`` meta-features with expected values."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], summary="mean")
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)
        observed = mfe.extract()[1]

        assert np.allclose(observed, exp_value, equal_nan=True)
コード例 #13
0
ファイル: Metadata.py プロジェクト: Sanyam07/curumin_mtl
 def _get_feats(cls):
     """Return default MFE meta-feature names with "." replaced by "_".

     The names come from extracting meta-features of the iris dataset.
     """
     from sklearn.datasets import load_iris
     from pymfe.mfe import MFE
     iris = load_iris()
     extractor = MFE()
     extractor.fit(iris.data, iris.target)
     names = extractor.extract()[0]
     return [name.replace(".", "_") for name in names]
コード例 #14
0
    def test_threshold_attr_conc(self):
        """Check ``attr_conc`` extracted with ``max_attr_num=25``."""
        X, y = load_xy(1)
        mfe = MFE(features="attr_conc", random_state=1234)
        mfe.fit(X.values, y.values, precomp_groups=False)

        attr_conc_args = {"max_attr_num": 25}
        observed = mfe.extract(attr_conc=attr_conc_args)[1]

        expected = [0.01682327, 0.04715381]
        assert np.allclose(observed, expected, rtol=0.2)
コード例 #15
0
    def test_integration_infotheo(self, dt_id, exp_value, precompute):
        """Function to test all info-theory meta-features.

        Extracts the mean-summarized ``GNAME`` group, with or without
        precomputation, and compares the values with ``exp_value``.
        """
        precomp_group = GNAME if precompute else None
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], summary="mean").fit(
            X.values, y.values, precomp_groups=precomp_group
        )
        value = mfe.extract()[1]

        # The comparison must be asserted; its result was previously
        # discarded, so the test could never fail.
        assert np.allclose(
            value, exp_value, atol=0.001, rtol=0.05, equal_nan=True
        )
コード例 #16
0
 def transform(self, X, y):
     """Return the "general" group meta-features of ``(X, y)`` as an array.

     A ``pd.DataFrame`` ``X`` is converted to an ``int8`` array and a
     ``pd.Series`` ``y`` to an ``int32`` array; other inputs are passed to
     the extractor unchanged. NaN values in the result are replaced by 0.
     """
     # NOTE(review): the int8 cast wraps or truncates values outside
     # [-128, 127]; confirm the callers only pass small integer features.
     if isinstance(X, pd.DataFrame):
         X = X.to_numpy(dtype='int8')
     if isinstance(y, pd.Series):
         y = y.to_numpy(dtype='int32')
     mfe = MFE(groups=["general"],
               summary=['kurtosis', 'min', 'max', 'median', 'skewness'])
     mfe.fit(X, y)
     ft = mfe.extract()[1]
     # The second positional parameter of ``np.nan_to_num`` is ``copy``, not
     # the fill value, so the previous ``nan_to_num(arr, 0)`` only produced
     # zeros because ``nan`` defaults to 0.0. Both are now spelled out.
     return np.nan_to_num(np.array(ft), copy=False, nan=0.0)
コード例 #17
0
ファイル: test_clustering.py プロジェクト: paulasb/pymfe
    def test_integration_clustering(self, dt_id, exp_value, precompute):
        """Test all clustering meta-features together (mean summary)."""
        X, y = load_xy(dt_id)

        mfe = MFE(groups=[GNAME], summary="mean")
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)
        observed = mfe.extract()[1]

        assert np.allclose(observed, exp_value, equal_nan=True)
コード例 #18
0
    def test_integration_model_based(self, dt_id, exp_value, precompute):
        """Test all model-based meta-features together (mean summary)."""
        X, y = load_xy(dt_id)

        mfe = MFE(groups=[GNAME], summary="mean", random_state=1234).fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None,
        )
        observed = mfe.extract()[1]

        assert np.allclose(observed, exp_value, equal_nan=True)
コード例 #19
0
ファイル: test_system_testing.py プロジェクト: paulasb/pymfe
        def extract_all_mtf():
            """Extract every group in ``mtf_groups`` with one MFE; return the values."""
            mfe = MFE(groups=mtf_groups, summary="mean", random_state=1234)
            # The target is only passed in the supervised setting.
            target = y.values if supervised else None
            mfe.fit(X.values, target, precomp_groups=precomp_group)

            return mfe.extract()[1]
コード例 #20
0
ファイル: test_complexity.py プロジェクト: paulasb/pymfe
    def test_integration_complexity(self, dt_id, exp_value, precompute):
        """Test all complexity meta-features together (mean summary)."""
        X, y = load_xy(dt_id)

        mfe = MFE(groups=[GNAME], summary="mean", random_state=1234).fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None,
        )
        observed = mfe.extract()[1]

        assert np.allclose(observed, exp_value, equal_nan=True, rtol=0.025)
コード例 #21
0
def meta_features(X, y, groups=None, suppress=True):
    ''' Extracts and returns the meta-features from a dataset using the Pymfe
    package.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
        Objects without a ``to_numpy`` method are passed to Pymfe unchanged.
    y: pd.Series
        Contains the series of the target of a given dataset. Objects without
        a ``to_numpy`` method are passed to Pymfe unchanged.
    groups: list
        Contains the names of the meta-feature groups as available in the
        Pymfe package (pymfe.readthedocs.io). When None, the ``MFE`` default
        groups are used.
    suppress: bool
        Forwarded to ``MFE`` as ``suppress_warnings``.

    Returns:
    --------
    The value returned by ``MFE.extract()``: a sequence whose first item
        holds the meta-feature names and whose second item holds the
        respective meta-feature values.
    '''
    # Convert pandas objects to arrays; the check replaces a bare ``except``
    # that also hid genuine conversion errors.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()

    # ``groups`` is only forwarded when given so that MFE keeps its own
    # default; ``is None`` also avoids element-wise ``==`` on array inputs.
    mfe_kwargs = {"suppress_warnings": suppress}
    if groups is not None:
        mfe_kwargs["groups"] = groups

    mfe = MFE(**mfe_kwargs)
    mfe.fit(X, y)
    return mfe.extract()
コード例 #22
0
ファイル: test_architecture.py プロジェクト: ealcobaca/pymfe
    def test_extract_from_model(self):
        """Extraction from a fitted tree must match model-based extraction from data."""
        X, y = utils.load_xy(2)

        tree = sklearn.tree.DecisionTreeClassifier(random_state=1234)
        model = tree.fit(X.values, y.values)

        from_model = MFE(random_state=1234).extract_from_model(model)
        mtf_name, mtf_vals = from_model

        extractor = MFE(groups="model-based", random_state=1234)
        extractor.fit(X=X.values, y=y.values, transform_num=False)
        mtf_name2, mtf_vals2 = extractor.extract()

        assert np.all(mtf_name == mtf_name2)
        assert np.allclose(mtf_vals, mtf_vals2)
コード例 #23
0
    def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
        """Test a single meta-feature of the general group."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], features=[ft_name])
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)
        observed = mfe.extract()[1]

        if exp_value is not np.nan:
            assert np.allclose(observed, exp_value)
        else:
            # NaN is checked by identity against the expected object.
            assert observed[0] is exp_value
コード例 #24
0
    def test_roy_largest_root(self, dt_id, exp_value, precompute, criterion):
        """Check ``roy_root`` extracted with the given ``criterion``."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], features="roy_root")
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)

        roy_root_args = {"criterion": criterion}
        observed = mfe.extract(roy_root=roy_root_args)[1]

        tolerances = {"atol": 0.001, "rtol": 0.05}
        assert np.allclose(observed, exp_value, equal_nan=True, **tolerances)
コード例 #25
0
ファイル: test_itemset.py プロジェクト: paulasb/pymfe
    def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
        """Test a single meta-feature of the itemset group."""
        X, y = load_xy(dt_id)

        mfe = MFE(groups=[GNAME], features=[ft_name], random_state=1234).fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None,
        )
        observed = mfe.extract()[1]

        if exp_value is not np.nan:
            assert np.allclose(observed, exp_value, equal_nan=True)
        else:
            # NaN is checked by identity against the expected object.
            assert observed[0] is exp_value
コード例 #26
0
    def test_integration_statistical(self, dt_id, exp_value, precompute):
        """Test all statistical meta-features at once (mean summary)."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], summary="mean")
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)
        observed = mfe.extract()[1]

        tolerances = {"atol": 0.001, "rtol": 0.05}
        assert np.allclose(observed, exp_value, equal_nan=True, **tolerances)
コード例 #27
0
ファイル: test_landmarking.py プロジェクト: smastelini/pymfe
    def test_ft_method_relative(self, dt_id, summary, precompute, sample_size,
                                exp_value):
        """Test relative and subsampling relative landmarking."""
        X, y = load_xy(dt_id)

        mfe = MFE(
            groups=["relative"],
            summary=summary,
            sample_size=sample_size,
            random_state=1234,
        ).fit(
            X.values,
            y.values,
            precomp_groups="relative" if precompute else None,
        )

        _, observed = mfe.extract()

        assert np.allclose(observed, exp_value)
コード例 #28
0
    def test_ft_methods_statistical(self, dt_id, ft_name, exp_value,
                                    precompute):
        """Test a single meta-feature of the statistical group."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], features=[ft_name])
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)
        observed = mfe.extract()[1]

        tolerances = {"atol": 0.001, "rtol": 0.05}
        assert np.allclose(observed, exp_value, equal_nan=True, **tolerances)
コード例 #29
0
    def test_normality_tests(self, dt_id, exp_value, precompute, test,
                             failure):
        """Test normality tests included in ``nr_norm`` statistical method."""
        X, y = load_xy(dt_id)
        mfe = MFE(groups=[GNAME], features="nr_norm")
        mfe.fit(X.values,
                y.values,
                precomp_groups=GNAME if precompute else None)

        nr_norm_args = {"failure": failure, "method": test}
        observed = mfe.extract(nr_norm=nr_norm_args)[1]

        tolerances = {"atol": 0.001, "rtol": 0.05}
        assert np.allclose(observed, exp_value, equal_nan=True, **tolerances)
コード例 #30
0
    def test_parse_valid_metafeatures(self, groups):
        """Names parsed for ``groups`` must equal the valid meta-features of ``groups``."""
        X, y = utils.load_xy(0)

        mfe = MFE(
            groups="all", summary=None, lm_sample_frac=0.5, random_state=1234
        ).fit(X.values, y.values)

        extracted = mfe.extract()

        target_mtf = mfe.valid_metafeatures(groups=groups)
        names, _ = mfe.parse_by_group(groups, extracted)

        # An empty symmetric difference is the same as set equality.
        assert set(names) == set(target_mtf)