Exemple #1
0
def test_load_precomputed_dataset():
    """Fetch the figshare dataset, then check that a corrupted copy is rejected.

    Warning: Requires ~10 GB of memory.

    NB: The dataset is not downloaded again if it is already present. Note
    that the file at the dataset path is removed when this test finishes.

    """

    from pathlib import Path

    from modnet.ext_data import load_ext_dataset

    # The archive is expected in the `data` directory two levels above this file.
    expected_path = Path(__file__).parent.parent / "data" / "MP_2018.6.zip"

    dataset_path = load_ext_dataset("MP_2018.6", "MODData")
    assert dataset_path == expected_path
    assert dataset_path.is_file()

    # Swap the real archive for an empty file: loading it again must fail
    # with a RuntimeError because the hash can no longer match.
    dataset_path.unlink()
    dataset_path.touch()

    hash_error = "Precomputed MODData did not match expected MD5 from"
    try:
        with pytest.raises(RuntimeError, match=hash_error):
            dataset_path = load_ext_dataset("MP_2018.6", "MODData")
    finally:
        # Always remove the garbage file so it cannot be picked up later.
        dataset_path.unlink(missing_ok=True)
Exemple #2
0
def test_load_datasets():
    """Check dataset lookup errors and the shape of the cross NMI dataset.

    Two requests must be rejected with a ``ValueError``; the cross NMI
    dataset must then unpickle to a square 1304 x 1304 dataframe whose
    column labels equal its index labels.

    """

    rejected_requests = (
        ("abcd", "MODData"),
        ("MP_2018.6_CROSS_NMI", "MODData"),
    )
    for dataset_name, requested_type in rejected_requests:
        with pytest.raises(ValueError):
            load_ext_dataset(dataset_name, requested_type)

    cross_nmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
    cross_nmi = pandas.read_pickle(cross_nmi_path)

    assert cross_nmi.shape == (1304, 1304)
    np.testing.assert_array_equal(cross_nmi.columns, cross_nmi.index.values)
Exemple #3
0
    def load_precomputed(cls, dataset_name: str):
        """Build a `MODData` object from one of the pre-computed datasets.

        Note:
            Loading a dataset may need a significant amount of memory
            (~10 GB).

        Arguments:
            dataset_name: the name of the precomputed dataset to load.
                Currently available: 'MP_2018.6'.

        Returns:
            MODData: the precomputed dataset.

        """
        # Imported lazily, inside the method, as in the rest of this module.
        from modnet.ext_data import load_ext_dataset

        # The location returned by the loader is passed to `cls.load` as a string.
        return cls.load(str(load_ext_dataset(dataset_name, "MODData")))
Exemple #4
0
    def feature_selection(
        self,
        n: int = 1500,
        cross_nmi: Optional[pd.DataFrame] = None,
        use_precomputed_cross_nmi: bool = False,
        n_jobs: Optional[int] = None,
    ):
        """Compute the mutual information between features and targets,
        then apply relevance-redundancy rankings to choose the top `n`
        features.

        Sets the `self.optimal_features` attribute to the merged ranking
        over all targets, and `self.optimal_features_by_target` to a dict
        mapping each target name to its own ranking. `self.cross_nmi` is
        set as well (and `self.feature_entropy` when the cross NMI is
        computed here). `self.target_nmi` is overwritten for every target,
        so after the call it holds the values for the last target only.

        Args:
            n: number of desired features.
            cross_nmi: specify the cross NMI between features as a
                dataframe.
            use_precomputed_cross_nmi: Whether or not to use the cross NMI
                that was computed on Materials Project features, instead of
                computing it. When set, the precomputed matrix replaces
                any `cross_nmi` passed in or already stored on the object.
            n_jobs: max. number of processes to use when calculating cross NMI.

        Raises:
            RuntimeError: if the object has no featurized data, has no
                target properties, or if the cross NMI contains NaN values.

        """
        if getattr(self, "df_featurized", None) is None:
            raise RuntimeError(
                "Mutual information feature selection requires featurized data, please call `.featurize()`"
            )
        if getattr(self, "df_targets", None) is None:
            raise RuntimeError(
                "Mutual information feature selection requires target properties"
            )

        ranked_lists = []
        optimal_features_by_target = {}

        if cross_nmi is not None:
            self.cross_nmi = cross_nmi
        elif getattr(self, "cross_nmi", None) is None:
            # Make sure the attribute exists before it is read below.
            self.cross_nmi = None

        # Loading mutual information between features
        if use_precomputed_cross_nmi:
            LOG.info("Loading cross NMI from 'Features_cross' file.")
            from modnet.ext_data import load_ext_dataset

            cnmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
            # NOTE(review): unpickling a downloaded file is only safe if
            # `load_ext_dataset` verifies its integrity -- confirm.
            self.cross_nmi = pd.read_pickle(cnmi_path)
            precomputed_cols = set(self.cross_nmi.columns)
            featurized_cols = set(self.df_featurized.columns)
            # Featurized columns with no entry in the precomputed matrix.
            missing_cols = featurized_cols - precomputed_cols
            if missing_cols:
                LOG.warning(
                    "Feature mismatch between precomputed `Features_cross` and `df_featurized`. "
                    f"Missing columns: {missing_cols}")

        if self.cross_nmi is None:
            # Nothing supplied, stored or precomputed: compute it now,
            # working on a copy of the featurized dataframe.
            df = self.df_featurized.copy()
            self.cross_nmi, self.feature_entropy = get_cross_nmi(
                df, return_entropy=True, n_jobs=n_jobs)

        if self.cross_nmi.isna().sum().sum() > 0:
            raise RuntimeError(
                "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
            )

        n_targets = len(self.names)
        for i, name in enumerate(self.names):
            LOG.info(f"Starting target {i + 1}/{n_targets}: {name} ...")

            # Computing mutual information with target
            LOG.info(
                "Computing mutual information between features and target...")
            # A target counts as classification only when `num_classes` is
            # set and lists at least two classes for it.
            if getattr(self, "num_classes",
                       None) and self.num_classes[name] >= 2:
                task_type = "classification"
            else:
                task_type = "regression"
            self.target_nmi = nmi_target(self.df_featurized,
                                         self.df_targets[[name]],
                                         task_type)[name]

            LOG.info("Computing optimal features...")
            optimal_features_by_target[name] = get_features_dyn(
                n, self.cross_nmi, self.target_nmi)
            ranked_lists.append(optimal_features_by_target[name])

            LOG.info(f"Done with target {i + 1}/{n_targets}: {name}.")

        LOG.info("Merging all features...")
        self.optimal_features = merge_ranked(ranked_lists)
        self.optimal_features_by_target = optimal_features_by_target
        LOG.info("Done.")