Beispiel #1
0
def load_ext_dataset(dataset_name: str, expected_type: Union[Usage, str]):
    """Load one of the preset datasets from the `DATASETS` constant. Will not
    overwrite any existing local data with remote datasets. Checks hashes against
    what is expected and will not depickle if unrecognised.

    Parameters:
        dataset_name: The name (key) of the dataset in `DATASETS`.
        expected_type: The expected usage of the dataset, either as a `Usage`
            member or as the name of one, e.g. `'_MODData'` or `'cross_nmi'`.
            The string `'MODData'` is accepted as an alias of `'_MODData'`.

    Returns:
        The path to the downloaded or previously installed dataset file.

    Raises:
        ValueError: If the dataset name is unknown, if its usage differs from
            `expected_type`, or if the download fails.
        RuntimeError: If the dataset declares an MD5 hash and the local file
            does not match it.

    """
    import urllib.error
    import urllib.request

    if dataset_name not in DATASETS:
        raise ValueError(
            f"No dataset {dataset_name} found, must be one of {list(DATASETS.keys())}"
        )

    dataset = DATASETS[dataset_name]
    if isinstance(expected_type, str):
        if expected_type == "MODData":
            expected_type = "_MODData"
        expected_type = Usage[expected_type]
    if dataset.usage != expected_type:
        raise ValueError(
            f"Cannot load {dataset_name} as it has the wrong type {dataset.usage}."
        )

    data_dir = Path(__file__).parent.joinpath("data")
    model_path = data_dir.joinpath(dataset.filename)
    if not model_path.is_file():
        LOG.info(
            f"Downloading featurized dataset {dataset_name} from {dataset.url} into {model_path}"
        )
        # `exist_ok` avoids the check-then-create race of is_dir()/makedirs().
        data_dir.mkdir(parents=True, exist_ok=True)

        try:
            urllib.request.urlretrieve(dataset.url, model_path)
        except urllib.error.URLError as exc:
            # HTTPError and ContentTooShortError are subclasses of URLError.
            # Do not leave a truncated file behind: the next call would
            # mistake it for an installed dataset and skip the download.
            if model_path.is_file():
                model_path.unlink()
            raise ValueError(
                f"There was a problem downloading {dataset.url}: {exc.reason}"
            ) from exc

    if dataset.md5 is not None:
        from modnet.utils import get_hash_of_file

        file_md5 = get_hash_of_file(model_path, algo="md5")
        if file_md5 != dataset.md5:
            raise RuntimeError(
                f"Precomputed {str(dataset.usage.name.strip('_'))} did not match expected MD5 from {dataset.url}, will not depickle."
                f"\nExpected: {str(dataset.md5)}"
                f"\nReceived: {str(file_md5)}")

    return model_path
Beispiel #2
0
    def save(self, filename: str):
        """Write this `MODData` object to `filename` as a pickle.

        The file can be read back in with `MODData.load()`. Filenames
        ending in "tgz", "bz2" or "zip" are compressed accordingly by
        `pandas.to_pickle(...)`.

        Parameters:
            filename: The path of the file to write.

        """
        destination = filename
        pd.to_pickle(self, destination)

        message = f"Data successfully saved as {filename}!"
        LOG.info(message)
Beispiel #3
0
    def save(self, filename: str):
        """Save the `MODNetModel` to filename:

        Parameters:
            filename: The base filename to save to.

        If the filename ends in "tgz", "bz2" or "zip", the pickle
        will be compressed accordingly by `pandas.to_pickle(...)`.

        The object is put through `_make_picklable()` before pickling and
        `_restore_model()` afterwards; the restore step runs even when
        writing the pickle fails, so the in-memory object is not left in
        its pickling state.

        """
        self._make_picklable()
        try:
            pd.to_pickle(self, filename)
        finally:
            # Undo `_make_picklable()` whether or not the write succeeded.
            self._restore_model()
        LOG.info(f'Model successfully saved as {filename}!')
Beispiel #4
0
    def save(self, filename: str) -> None:
        """Save the `MODNetModel` to filename:

        If the filename ends in "tgz", "bz2" or "zip", the pickle
        will be compressed accordingly by :meth:`pandas.DataFrame.to_pickle`.

        The object is put through `_make_picklable()` before pickling and
        `_restore_model()` afterwards; the restore step runs even when
        writing the pickle fails, so the in-memory object is not left in
        its pickling state.

        Parameters:
            filename: The base filename to save to.

        """
        self._make_picklable()
        try:
            pd.to_pickle(self, filename)
        finally:
            # Undo `_make_picklable()` whether or not the write succeeded.
            self._restore_model()
        LOG.info(f"Model successfully saved as {filename}!")
Beispiel #5
0
    def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
        """Decorate input `pandas.DataFrame` of structures with composition
        features from matminer, specified by the MODFeaturizer preset.

        Applies `self.composition_featurizers` and then
        `self.oxid_composition_featurizers`, each only if non-empty. The
        input is never modified; a copy is decorated instead.

        Arguments:
            df: the input dataframe with a `"structure"` column
                containing `pymatgen.Structure` objects.

        Returns:
            pandas.DataFrame: the decorated DataFrame with flat,
                `"|"`-joined column names, or an undecorated copy of the
                input if no composition/oxidation featurizers exist for
                this class.

        """

        def _flatten_columns(frame: pd.DataFrame) -> pd.DataFrame:
            # Drop the "Input Data" level name and join the remaining
            # column levels with "|" (e.g. "Featurizer|feature").
            frame = frame.rename(columns={"Input Data": ""})
            frame.columns = frame.columns.map("|".join).str.strip("|")
            return frame

        df = df.copy()

        if self.composition_featurizers:

            LOG.info("Applying composition featurizers...")
            df["composition"] = df["structure"].apply(lambda s: s.composition)
            df = self._fit_apply_featurizers(
                df, self.composition_featurizers, "composition"
            )
            df = _flatten_columns(df)

        if self.oxid_composition_featurizers:
            LOG.info("Applying oxidation state featurizers...")
            # The oxidation featurizers read the "composition" column, which
            # is otherwise only created by the branch above; build it here
            # when no plain composition featurizers were configured.
            if "composition" not in df.columns:
                df["composition"] = df["structure"].apply(lambda s: s.composition)
            if getattr(self, "fast_oxid", False):
                df = CompositionToOxidComposition(
                    all_oxi_states=False, max_sites=-1
                ).featurize_dataframe(df, "composition")
            else:
                df = CompositionToOxidComposition().featurize_dataframe(
                    df, "composition"
                )
            df = self._fit_apply_featurizers(
                df, self.oxid_composition_featurizers, "composition_oxid"
            )
            df = _flatten_columns(df)

        return df
Beispiel #6
0
    def featurize_site(
            self,
            df: pd.DataFrame,
            aliases: Optional[Dict[str, str]] = None) -> pd.DataFrame:
        """Add site-statistics features to a copy of a DataFrame of structures.

        Each featurizer in `self.site_featurizers` is wrapped in a
        `SiteStatsFingerprint` (using `self.site_stats`) and applied to the
        structure column. Newly created columns are prefixed with the
        featurizer's class name, or its alias, followed by `"|"`.

        Arguments:
            df: the input dataframe with a `"structure"` column
                containing `pymatgen.Structure` objects.
            aliases: optional dictionary to map matminer output column
                names to new aliases, mostly used for
                backwards-compatibility.

        Returns:
            pandas.DataFrame: the decorated DataFrame.

        """

        LOG.info("Applying site featurizers...")

        featurized = df.copy()
        # Mark the original columns so they can be told apart from new ones.
        featurized.columns = ["Input data|" + name for name in featurized.columns]

        for site_featurizer in self.site_featurizers:
            wrapped = SiteStatsFingerprint(site_featurizer, stats=self.site_stats)
            featurized = wrapped.featurize_dataframe(
                featurized,
                "Input data|structure",
                multiindex=False,
                ignore_errors=True,
            )

            prefix = site_featurizer.__class__.__name__
            if aliases:
                prefix = aliases.get(prefix, prefix)
            if "|" not in prefix:
                prefix = prefix + "|"

            # Columns without a "|" are the ones just added by this featurizer.
            relabelled = []
            for column in featurized.columns:
                if "|" in column:
                    relabelled.append(column)
                else:
                    relabelled.append(f"{prefix}{column}")
            featurized.columns = relabelled

        return featurized
Beispiel #7
0
    def load(filename: str) -> "MODNetModel":
        """Load `MODNetModel` object pickled by the :meth:`MODNetModel.save` method.

        If the filename ends in "tgz", "bz2" or "zip", the pickle
        will be decompressed accordingly by :func:`pandas.read_pickle`.
        Zip archives are opened explicitly so that archives created on
        macOS, which carry extra `__MACOSX/` and `.DS_Store` entries, can
        still be read when they contain exactly one real file.

        NOTE: unpickling can execute arbitrary code; only load files from
        a trusted source.

        Parameters:
            filename: Path of the file to load, as a string or a
                `pathlib.Path`.

        Returns:
            The loaded `MODNetModel` object.

        Raises:
            ValueError: If the file does not contain a `MODNetModel`.
        """
        pickled_data = None

        if isinstance(filename, Path):
            filename = str(filename)

        # handle .zip files explicitly for OS X/macOS compatibility
        if filename.endswith(".zip"):
            from zipfile import ZipFile

            with ZipFile(filename, "r") as zf:
                # Keep only real payload entries: skip the `__MACOSX/`
                # resource-fork tree and any `.DS_Store` file, wherever it
                # sits in the archive.
                members = [
                    name
                    for name in zf.namelist()
                    if not name.startswith("__MACOSX/")
                    and not name.rsplit("/", 1)[-1].lower().startswith(".ds_store")
                ]
                if len(members) == 1:
                    with zf.open(members[0]) as f:
                        pickled_data = pd.read_pickle(f)

        # Anything else (or an ambiguous archive) is left to pandas.
        if pickled_data is None:
            pickled_data = pd.read_pickle(filename)

        if isinstance(pickled_data, MODNetModel):
            if not hasattr(pickled_data, "__modnet_version__"):
                pickled_data.__modnet_version__ = "unknown"
            pickled_data._restore_model()
            LOG.info(
                f"Loaded {pickled_data} object, created with modnet version {pickled_data.__modnet_version__}"
            )
            return pickled_data

        raise ValueError(
            f"File {filename} did not contain compatible data to create a MODNetModel object, "
            f"instead found {pickled_data.__class__.__name__}.")
Beispiel #8
0
    def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add structural features to a copy of a DataFrame of structures.

        Fits and applies `self.structure_featurizers` to the `"structure"`
        column, then flattens the resulting column labels by joining their
        levels with `"|"`.

        Arguments:
            df: the input dataframe with a `"structure"` column
                containing `pymatgen.Structure` objects.

        Returns:
            pandas.DataFrame: the decorated DataFrame.

        """

        LOG.info("Applying structure featurizers...")
        featurized = self._fit_apply_featurizers(
            df.copy(), self.structure_featurizers, "structure"
        )

        flat_labels = featurized.columns.map("|".join).str.strip("|")
        featurized.columns = flat_labels
        return featurized
Beispiel #9
0
def get_features_dyn(n_feat, cross_nmi, target_nmi):
    """Rank features by a relevance-redundancy score.

    The first feature is the one with the largest target NMI. Each further
    feature is the remaining one that maximises, over the features already
    chosen, the minimum of `target_nmi / (cross_nmi ** p + c)`, where `p`
    and `c` come from `get_rr_p_parameter_default` and
    `get_rr_c_parameter_default` evaluated at the iteration number.

    Only features present in both inputs are considered, and infinite or
    missing target NMI values are treated as zero.

    Arguments:
        n_feat: number of features to select, or -1 for all of them. The
            value is capped at the number of available features.
        cross_nmi: square DataFrame of NMI between features, indexed and
            labelled by feature name.
        target_nmi: Series of NMI between each feature and the target,
            indexed by feature name.

    Returns:
        list: the selected feature names, in order of selection.

    """
    missing = [x for x in cross_nmi.index if x not in target_nmi.index]
    cross_nmi = cross_nmi.drop(missing, axis=0).drop(missing, axis=1)

    missing = [x for x in target_nmi.index if x not in cross_nmi.index]
    target_nmi = target_nmi.drop(missing, axis=0)
    target_nmi = target_nmi.replace([np.inf, -np.inf, np.nan], 0)

    first_feature = target_nmi.nlargest(1).index[0]
    feature_set = [first_feature]
    get_p = get_rr_p_parameter_default
    get_c = get_rr_c_parameter_default

    if n_feat == -1:
        n_feat = len(cross_nmi.index)
    else:
        n_feat = min(len(cross_nmi.index), n_feat)

    for n in range(n_feat - 1):
        if (n + 1) % 50 == 0:
            LOG.info("Selected {}/{} features...".format(n + 1, n_feat))

        p = get_p(n)
        c = get_c(n)

        # Rows: features still available; columns: features already chosen.
        # `drop` and list-indexing both return new objects, so no explicit
        # copy of the full matrix is needed on every iteration.
        redundancy = cross_nmi.drop(feature_set, axis=0)[feature_set]

        # Score every (candidate, chosen) pair at once:
        # target_nmi[candidate] / (cross_nmi[candidate, chosen] ** p + c).
        relevance = target_nmi.loc[redundancy.index]
        score = (redundancy**p + c).rdiv(relevance, axis=0)

        next_feature = score.min(axis=1).idxmax(axis=0)
        feature_set.append(next_feature)

    return feature_set
Beispiel #10
0
    def _fit_apply_featurizers(
        self,
        df: pd.DataFrame,
        featurizers: Iterable[BaseFeaturizer],
        column: str,
        fit_to_df: bool = True,
    ) -> pd.DataFrame:
        """Apply a group of featurizers to one column as a MultipleFeaturizer.

        Arguments:
            df: The DataFrame to featurize.
            featurizers: The matminer featurizers to combine and apply
                to the DataFrame.
            column: The name of the column to apply the featurizers to.
            fit_to_df: If true, each featurizer is first fitted to
                `df[column]`. Otherwise the featurizers are used as
                passed, on the assumption that any that need fitting
                have already been fitted.

        Returns:
            pandas.DataFrame: the decorated DataFrame, produced with
                `multiindex=True` and `ignore_errors=True`.

        """
        LOG.info(f"Applying featurizers {featurizers} to column {column!r}.")

        if not fit_to_df:
            combined = MultipleFeaturizer(featurizers)
        else:
            fitted = []
            for featurizer in featurizers:
                fitted.append(featurizer.fit(df[column]))
            combined = MultipleFeaturizer(fitted)

        # Only override the featurizer's own job count when one was set.
        if self._n_jobs is not None:
            combined.set_n_jobs(self._n_jobs)

        result = combined.featurize_dataframe(
            df, column, multiindex=True, ignore_errors=True
        )
        return result
Beispiel #11
0
    def fit(
        self,
        training_data: MODData,
        n_jobs=1,
        **kwargs,
    ) -> None:
        """Train every model of the ensemble on the passed training `MODData` object.

        When `self.bootstrap` is set, each model is trained on its own
        resampling (with replacement) of the training data; otherwise
        every model sees the full training data.

        Parameters:
            training_data: the data to train on.
            n_jobs: number of worker processes. Values of 1 or less train
                the models one after another in this process.
            **kwargs: passed on to the `fit` method of each model.
        """

        if self.bootstrap:
            LOG.info("Generating bootstrap data...")
            n_samples = len(training_data.df_targets)
            # Each model needs a *different* seed: with a single fixed seed
            # every resample is identical and the ensemble would be trained
            # on the same data n_models times. Offsetting a fixed base keeps
            # the run reproducible.
            train_datas = [
                training_data.split(
                    (
                        resample(
                            np.arange(n_samples),
                            replace=True,
                            random_state=2943 + i,
                        ),
                        [],
                    )
                )[0]
                for i in range(self.n_models)
            ]
        else:
            train_datas = [training_data for _ in range(self.n_models)]

        if n_jobs <= 1:
            for i in range(self.n_models):
                LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
                self.model[i].fit(train_datas[i], **kwargs)
                history = self.model[i].history
                model_summary = "".join(
                    "{}: {:.4f}\t".format(k, history[k][-1]) for k in history.keys()
                )
                LOG.info(model_summary)
        else:
            tasks = []
            for i, m in enumerate(self.model):
                # Models are sent to worker processes, so they must pickle.
                m._make_picklable()
                tasks.append({'model': m, 'training_data': train_datas[i], 'model_id': i, **kwargs})

            ctx = multiprocessing.get_context('spawn')
            # The context manager terminates the workers if anything below
            # raises, instead of leaking the pool.
            with ctx.Pool(processes=n_jobs) as pool:
                for model, model_id in tqdm.tqdm(
                    pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
                    total=self.n_models,
                ):
                    model._restore_model()
                    # Results arrive in completion order; `model_id` puts
                    # each trained model back in its original slot.
                    self.model[model_id] = model
                    model_summary = f"Model #{model_id}\t" + "".join(
                        "{}: {:.4f}\t".format(k, model.history[k][-1])
                        for k in model.history.keys()
                    )
                    LOG.info(model_summary)
                pool.close()
                pool.join()
Beispiel #12
0
    def feature_selection(
        self,
        n: int = 1500,
        cross_nmi: Optional[pd.DataFrame] = None,
        use_precomputed_cross_nmi: bool = False,
        n_jobs: Optional[int] = None,
    ):
        """Compute the mutual information between features and targets,
        then apply relevance-redundancy rankings to choose the top `n`
        features.

        Sets the `self.optimal_features` attribute to a list of feature
        names, and `self.optimal_features_by_target` to a dictionary of
        such lists keyed by target name. `self.cross_nmi` and
        `self.target_nmi` (for the last target processed) are also set.

        Args:
            n: number of desired features.
            cross_nmi: specify the cross NMI between features as a
                dataframe.
            use_precomputed_cross_nmi: Whether or not to use the cross NMI
                that was computed on Materials Project features, instead of
                precomputing. Takes precedence over `cross_nmi`.
            n_jobs: max. number of processes to use when calculating cross NMI.

        Raises:
            RuntimeError: If the data has not been featurized, has no
                targets, or if the cross NMI contains NaN values.

        """
        if getattr(self, "df_featurized", None) is None:
            raise RuntimeError(
                "Mutual information feature selection requires featurized data, please call `.featurize()`"
            )
        if getattr(self, "df_targets", None) is None:
            raise RuntimeError(
                "Mutual information feature selection requires target properties"
            )

        ranked_lists = []
        optimal_features_by_target = {}

        if cross_nmi is not None:
            self.cross_nmi = cross_nmi
        elif getattr(self, "cross_nmi", None) is None:
            # Also covers objects that never had the attribute set.
            self.cross_nmi = None

        # Loading mutual information between features
        if use_precomputed_cross_nmi:
            LOG.info("Loading cross NMI from 'Features_cross' file.")
            from modnet.ext_data import load_ext_dataset

            cnmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
            self.cross_nmi = pd.read_pickle(cnmi_path)
            precomputed_cols = set(self.cross_nmi.columns)
            featurized_cols = set(self.df_featurized.columns)
            # True exactly when some featurized column is not precomputed.
            if len(precomputed_cols | featurized_cols) > len(precomputed_cols):
                LOG.warning(
                    "Feature mismatch between precomputed `Features_cross` and `df_featurized`. "
                    f"Missing columns: {featurized_cols - precomputed_cols}")

        if self.cross_nmi is None:
            df = self.df_featurized.copy()
            self.cross_nmi, self.feature_entropy = get_cross_nmi(
                df, return_entropy=True, n_jobs=n_jobs)

        if self.cross_nmi.isna().sum().sum() > 0:
            raise RuntimeError(
                "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
            )

        n_targets = len(self.names)
        for i, name in enumerate(self.names):
            LOG.info(f"Starting target {i + 1}/{n_targets}: {name} ...")

            # Computing mutual information with target
            LOG.info(
                "Computing mutual information between features and target...")
            # A target missing from `num_classes` is treated as regression
            # rather than raising a KeyError.
            num_classes = getattr(self, "num_classes", None)
            if num_classes and num_classes.get(name, 0) >= 2:
                task_type = "classification"
            else:
                task_type = "regression"
            self.target_nmi = nmi_target(self.df_featurized,
                                         self.df_targets[[name]],
                                         task_type)[name]

            LOG.info("Computing optimal features...")
            optimal_features_by_target[name] = get_features_dyn(
                n, self.cross_nmi, self.target_nmi)
            ranked_lists.append(optimal_features_by_target[name])

            LOG.info("Done with target {}/{}: {}.".format(
                i + 1, n_targets, name))

        LOG.info("Merging all features...")
        self.optimal_features = merge_ranked(ranked_lists)
        self.optimal_features_by_target = optimal_features_by_target
        LOG.info("Done.")
Beispiel #13
0
    def featurize(self,
                  fast: bool = False,
                  db_file: str = "feature_database.pkl",
                  n_jobs=None):
        """For the input structures, construct many matminer features
        and save a featurized dataframe. If `fast` is set, this
        method will try to load previous feature calculations for each
        structure ID from `db_file` instead of recomputing.

        Sets the `self.df_featurized` attribute. Infinite and missing
        feature values are replaced by zero.

        Args:
            fast (bool): whether or not to try to load from a backup.
            db_file (str): filename of a pickled dataframe containing
                with the same ID index as this `MODData` object. It is read
                only once and cached in the module-level `DATABASE`.
            n_jobs: if not None, passed to the featurizer's `set_n_jobs`.

        Raises:
            RuntimeError: If a featurized dataframe already exists.

        """

        LOG.info("Computing features, this can take time...")

        df_done = None
        df_todo = None

        if n_jobs is not None:
            self.featurizer.set_n_jobs(n_jobs)

        if self.df_featurized is not None:
            raise RuntimeError(
                "Not overwriting existing featurized dataframe.")

        if fast:
            LOG.info("Fast featurization on, retrieving from database...")

            global DATABASE
            if DATABASE.empty:
                DATABASE = pd.read_pickle(db_file)

            ids_done = [x for x in self.structure_ids if x in DATABASE.index]

            LOG.info(
                f"Retrieved features for {len(ids_done)} out of {len(self.structure_ids)} materials"
            )
            df_done = DATABASE.loc[ids_done]
            df_todo = self.df_structure.drop(ids_done, axis=0)

        # if any structures were already loaded
        if fast and not df_done.empty:
            # if any are left to compute, do them
            if len(df_todo) > 0:
                df_finished = self.featurizer.featurize(df_todo)
                # `DataFrame.append` was removed in pandas 2.0; `pd.concat`
                # is the equivalent row-wise concatenation.
                df_final = pd.concat([df_done, df_finished])
                # restore the original ordering of the structures
                df_final = df_final.reindex(self.structure_ids)

            # otherwise, all structures were successfully loaded
            else:
                df_final = df_done

        # otherwise, no structures were loaded, so we need to compute all
        else:
            df_final = self.featurizer.featurize(self.df_structure)

        df_final = df_final.replace([np.inf, -np.inf, np.nan], 0)

        self.df_featurized = df_final
        LOG.info("Data has successfully been featurized!")
Beispiel #14
0
    def __init__(
        self,
        materials: Optional[List[Union[Structure, Composition]]] = None,
        targets: Optional[Union[List[float], np.ndarray]] = None,
        target_names: Optional[Iterable] = None,
        structure_ids: Optional[Iterable] = None,
        num_classes: Optional[Dict[str, int]] = None,
        df_featurized: Optional[pd.DataFrame] = None,
        featurizer: Optional[Union[MODFeaturizer, str]] = None,
        structures: Optional[List[Union[Structure, Composition]]] = None,
    ):
        """Initialise the MODData object either from a list of structures
        or from an already featurized dataframe. Prediction targets per
        structure can be specified as lists or an array alongside their
        target names. A list of unique IDs can be provided to label the
        structures.

        Args:
            materials: list of structures or compositions to featurize and predict.
            targets: optional List of targets corresponding to each structure. When learning on multiple targets this
             is a ndarray where each column corresponds to a target, i.e. of shape (n_materials,n_targets).
            target_names: optional Iterable (e.g. list) of names of target properties to use in the dataframe.
                Defaults to `prop0`, `prop1`, ..., one per target column.
            structure_ids: optional Iterable of unique IDs to use instead of generated integers.
            num_classes: Dictionary defining the target types (classification or regression).
                Should be constructed as follows: key: string giving the target name; value: integer n,
                 with n=0 for regression and n>=2 for classification with n the number of classes.
            df_featurized: optional featurized dataframe to use instead of
                featurizing a new one. Should be passed without structures.
            featurizer: optional MODFeaturizer object to use for featurization, or string
                preset to look up in presets dictionary.
            structures: deprecated (alias to materials for backward compatibility) do not use this.

        Raises:
            RuntimeError: If neither materials nor a featurized dataframe is
                given, if their lengths differ, or if a featurizer preset
                name is unknown.
            ValueError: If targets, target names or structure IDs are
                inconsistent with the materials or with each other.

        """

        from modnet.featurizers.presets import FEATURIZER_PRESETS

        self.__modnet_version__ = __version__
        self.df_featurized = df_featurized
        self.featurizer = featurizer
        self.cross_nmi = None

        if structures is not None:  # overwrite materials for backward compatibility
            materials = structures

        if materials is not None and self.df_featurized is not None:
            if len(materials) != len(self.df_featurized):
                raise RuntimeError(
                    "Mismatched shape of structures and passed df_featurized")

        if materials is None and self.df_featurized is None:
            raise RuntimeError(
                "At least one of `structures` or `df_featurized` should be passed to `MODData`."
            )

        if targets is not None:
            # always work with a 2D array of shape (n_materials, n_targets)
            targets = np.array(targets).reshape((len(targets), -1))

        if materials is not None and targets is not None:
            if np.shape(targets)[0] != len(materials):
                raise ValueError(
                    f"Targets must have same length as structures: {np.shape(targets)} vs {len(materials)}"
                )

        # An empty list has no first element to inspect.
        if (materials is not None and len(materials) > 0
                and isinstance(materials[0], Composition)):
            materials = [CompositionContainer(s) for s in materials]
            self._composition_only = True

        if isinstance(featurizer, str):
            # Look the preset up *before* instantiating it, so that an
            # unknown name gives the intended error instead of a TypeError
            # from calling `None`.
            preset = FEATURIZER_PRESETS.get(featurizer)
            if preset is None:
                raise RuntimeError(
                    f"Requested preset {featurizer} not found in available presets: {FEATURIZER_PRESETS.keys()}"
                )
            self.featurizer = preset()
        elif isinstance(featurizer, MODFeaturizer):
            self.featurizer = featurizer
        elif featurizer is None and self.df_featurized is None:
            if getattr(self, "_composition_only", False):
                self.featurizer = FEATURIZER_PRESETS["CompositionOnly"]()
            else:
                self.featurizer = FEATURIZER_PRESETS["DeBreuck2020"]()

        if self.featurizer is not None:
            LOG.info(
                f"Loaded {self.featurizer.__class__.__name__} featurizer.")

        if target_names is not None:
            if targets is None:
                raise ValueError(
                    "Target names were supplied without any targets.")
            if np.shape(targets)[-1] != len(target_names):
                raise ValueError(
                    "Target names must be supplied for every target.")
        elif targets is not None:
            # one default name per target *column*, not per material
            target_names = ["prop" + str(i) for i in range(targets.shape[1])]

        # `materials` may be None when only `df_featurized` was passed.
        num_entries = (len(materials)
                       if materials is not None else len(df_featurized))

        if structure_ids is not None:
            # for backwards compat, always store the *passed* list of
            # IDs, so they can be used when loading from a database file
            # check ids are unique
            if len(set(structure_ids)) != len(structure_ids):
                raise ValueError(
                    "List of IDs (`structure_ids`) provided must be unique.")

            if len(structure_ids) != num_entries:
                raise ValueError(
                    "List of IDs (`structure_ids`) must have same length as list of structure."
                )

        else:
            structure_ids = [f"id{i}" for i in range(num_entries)]

        if targets is not None:
            # set up dataframe for targets with columns (id, property_1, ..., property_n)
            self.df_targets = pd.DataFrame(targets,
                                           index=structure_ids,
                                           columns=target_names)
            # set up number of classes
            self.num_classes = {name: 0 for name in self.target_names}
            if num_classes is not None:
                self.num_classes.update(num_classes)

        # set up dataframe for structures with columns (id, structure)
        self.df_structure = pd.DataFrame({
            "id": structure_ids,
            "structure": materials
        })
        self.df_structure.set_index("id", inplace=True)
Beispiel #15
0
def _resolve_rr_parameter(spec, name: str):
    """Turn one entry of `rr_parameters` into a callable taking `n`."""
    if callable(spec):
        return spec

    if isinstance(spec, (int, float)):
        # A bare number is used as a constant value.
        return lambda _n: spec

    try:
        is_constant = spec.get("function") == "constant"
    except AttributeError:
        is_constant = False
    if not is_constant:
        raise ValueError(
            f'If not passing a callable, "{name}" dict must contain keys "function" and "value".'
        )

    def constant(_n):
        # Looked up on every call, so a missing "value" key surfaces as a
        # KeyError when the parameter is first needed.
        return spec["value"]

    return constant


def get_features_relevance_redundancy(
    target_nmi: pd.DataFrame,
    cross_nmi: pd.DataFrame,
    n_feat: Optional[int] = None,
    rr_parameters: Optional[Dict[str, Union[float, Callable[[int],
                                                            float]]]] = None,
    return_pc: bool = False,
) -> List:
    """
    Select features from the Relevance Redundancy (RR) score between the input
    features and the target output.

    The RR is defined following Equation 2 of De Breuck et al, arXiv:2004.14766,
    with default values,

    .. math:: p = \\max{0.1, 4.5 -  n^{0.4}},

    and

    .. math:: c = 10^{-6} n^3,

    where :math:`n` is the number of features in the "chosen" subset for that iteration.
    These values can be overriden with the `rr_parameters` dictionary argument.

    Args:
        target_nmi (pandas.DataFrame): dataframe  containing the Normalized
            Mutual Information (NMI) between a list of input features and a
            target variable, as computed from :py:func:`nmi_target`. Only
            its first column is used.
        cross_nmi (pandas.DataFrame): dataframe containing the NMI between the
            input features, as computed from :py:func:`get_cross_nmi`.
        n_feat (int): Number of features for which the RR score needs to be computed (default: all features).
            Values larger than the number of features in `target_nmi` are
            capped at that number.
        rr_parameters (dict): Allows tuning of p and c parameters. Currently
            allows fixing of p and c to constant values instead of using the
            dynamical evaluation. Expects to find keys `"p"` and `"c"`, containing
            either a callable that takes `n` as an argument and returns the
            desired `p` or `c`, a plain number used as a constant, or another
            dictionary with `"function"` set to `"constant"` and the key
            `"value"` that stores a constant value of `p` or `c`.
        return_pc: Whether to return p and c values in the output dictionaries.

    Returns:
        list: List of dictionaries containing the results of the relevance-redundancy selection algorithm,
            one per selected feature in order of selection, with keys
            `"feature"`, `"RR_score"`, `"NMI_target"` and, if `return_pc`
            is set, `"RR_p"` and `"RR_c"`. The scores of the first feature
            are None.

    Raises:
        ValueError: If the inputs are inconsistent with each other or
            `rr_parameters` is malformed.

    """
    # Initial checks
    if set(cross_nmi.index) != set(cross_nmi.columns):
        raise ValueError(
            "The cross_nmi DataFrame should have its indices and columns identical."
        )
    if not set(target_nmi.index).issubset(set(cross_nmi.index)):
        raise ValueError(
            "The indices of the target DataFrame should be included in the cross_nmi DataFrame indices."
        )

    # Define the functions for the parameters
    if rr_parameters is None:
        get_p = get_rr_p_parameter_default
        get_c = get_rr_c_parameter_default
    else:
        if "p" not in rr_parameters or "c" not in rr_parameters:
            raise ValueError(
                "When tuning p and c with rr_parameters in get_features_relevance_redundancy, "
                "both parameters should be tuned")
        get_p = _resolve_rr_parameter(rr_parameters["p"], "p")
        get_c = _resolve_rr_parameter(rr_parameters["c"], "c")

    # Set up the output list
    out = []

    # The first feature is the one with the largest target NMI
    target_column = target_nmi.columns[0]
    relevance = target_nmi[target_column]
    first_feature = target_nmi.nlargest(1, columns=target_column).index[0]
    feature_set = [first_feature]
    feat_out = {
        "feature": first_feature,
        "RR_score": None,
        "NMI_target": relevance[first_feature],
    }
    if return_pc:
        feat_out["RR_p"] = None
        feat_out["RR_c"] = None
    out.append(feat_out)

    # Default is to get the RR score for all features; never ask for more
    # features than exist, which would leave no candidate to pick from.
    n_available = len(target_nmi.index)
    if n_feat is None:
        n_feat = n_available
    else:
        n_feat = min(n_feat, n_available)

    missing = [x for x in cross_nmi.index if x not in target_nmi.index]
    cross_nmi = cross_nmi.drop(missing, axis=0).drop(missing, axis=1)

    # Loop on the number of features; at the start of iteration n, exactly
    # n features have been selected.
    for n in range(1, n_feat):
        LOG.debug("In selection of feature {}/{} features...".format(
            n + 1, n_feat))
        if n % 50 == 0:
            LOG.info("Selected {}/{} features...".format(n, n_feat))
        p = get_p(n)
        c = get_c(n)

        # Rows: features not yet selected; columns: features already
        # selected. `drop` and list-indexing return new objects, so the
        # full matrix does not need to be copied on every iteration.
        redundancy = cross_nmi.drop(feature_set, axis=0)[feature_set]

        # RR score of every (candidate, selected) pair:
        # NMI(candidate, target) / (NMI(candidate, selected) ** p + c)
        score = (redundancy**p + c).rdiv(relevance.loc[redundancy.index],
                                         axis=0)

        # Get the next feature (the one with the highest worst-case score)
        scores_remaining_features = score.min(axis=1)
        next_feature = scores_remaining_features.idxmax(axis=0)
        feature_set.append(next_feature)

        # Add the results for the next feature to the list
        feat_out = {
            "feature": next_feature,
            "RR_score": scores_remaining_features[next_feature],
            "NMI_target": relevance[next_feature],
        }
        if return_pc:
            feat_out["RR_p"] = p
            feat_out["RR_c"] = c

        out.append(feat_out)

    return out
Beispiel #16
0
def get_cross_nmi(
    df_feat: pd.DataFrame,
    drop_thr: float = 0.2,
    return_entropy=False,
    n_jobs: int = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Computes the Normalized Mutual Information (NMI) between input features.

    The "self" mutual information of every column is computed first. Columns
    whose self-MI is below `drop_thr`, or whose value range is smaller than
    `EPS`, are removed from the result. The MI of every remaining pair of
    columns is then computed and divided by the mean of the two self-MI values.

    Args:
        df_feat (pandas.DataFrame): Dataframe containing the input features for
            which the cross NMI is to be computed.
        drop_thr: Features having an information entropy (or self mutual
            information) below this value will be dropped.
        return_entropy: If set to True, the information entropy of each feature
            is also returned.
        n_jobs: Number of worker processes used to evaluate the MI tasks.
            `None` means a single worker.
        **kwargs: Only `random_state` and `n_neighbors` are read; they are
            placed in every task handed to `map_mi`. `random_state` defaults
            to a fresh `numpy.random.RandomState()` and `n_neighbors` to 3.
            Any other keyword is accepted but currently ignored.

    Returns:
        mutual_info: pandas.DataFrame containing the Normalized Mutual
            Information between the retained features (missing entries are
            filled with 0).
        if return_entropy=True : (mutual_info, diag): With diag a dictionary
            with all features (including dropped ones) as keys and
            information entropy as values.
    """

    # Compare against None instead of relying on truthiness, so that an
    # explicit `random_state=0` is honoured rather than silently replaced
    # by an unseeded generator.
    seed = kwargs.pop("random_state", None)
    if seed is None:
        seed = np.random.RandomState()

    # A missing (or falsy) n_neighbors falls back to 3.
    n_neighbors = kwargs.pop("n_neighbors", None) or 3

    def _make_task(x_feat, y_feat):
        # One unit of work for `map_mi`: a pair of columns plus MI settings.
        return {
            "x": df_feat[x_feat].values,
            "y": df_feat[y_feat].values,
            "x_name": x_feat,
            "y_name": y_feat,
            "random_state": seed,
            "n_neighbors": n_neighbors,
        }

    # Prepare the output DataFrame and compute the mutual information
    mutual_info = pd.DataFrame([],
                               columns=df_feat.columns,
                               index=df_feat.columns)

    if n_jobs is None:
        n_jobs = 1

    diag = {}

    # The context manager terminates the workers if anything below raises;
    # on the normal path the pool is closed and joined explicitly first.
    with Pool(processes=n_jobs) as pool:
        LOG.info(f"Multiprocessing on {n_jobs} workers.")

        # Compute the "self" mutual information (i.e. information entropy) of the features
        LOG.info('Computing "self" MI (i.e. information entropy) of features')
        tasks = [_make_task(x_feat, x_feat) for x_feat in df_feat.columns]

        for res in tqdm.tqdm(pool.imap_unordered(map_mi, tasks, chunksize=100),
                             total=len(tasks)):
            # res[0] is the MI value, res[1] the name of the feature.
            feat_name = res[1]
            diag[feat_name] = res[0]
            value_range = abs(df_feat[feat_name].max() - df_feat[feat_name].min())
            if diag[feat_name] < drop_thr or value_range < EPS:
                # Low-entropy or (near-)constant feature: remove row and column.
                mutual_info.drop(feat_name, axis=0, inplace=True)
                mutual_info.drop(feat_name, axis=1, inplace=True)
            else:
                mutual_info.loc[feat_name, feat_name] = 1.0

        LOG.info("Computing cross NMI between all features...")
        # Only the upper triangle is computed; the result is mirrored below.
        retained = mutual_info.columns
        tasks = [
            _make_task(x_feat, y_feat)
            for idx, x_feat in enumerate(retained)
            for y_feat in retained[idx + 1:]
        ]

        for res in tqdm.tqdm(pool.imap_unordered(map_mi, tasks, chunksize=100),
                             total=len(tasks)):
            # Normalise by the mean entropy of the two features.
            nmi = res[0] / (0.5 * (diag[res[1]] + diag[res[2]]))
            mutual_info.loc[res[1], res[2]] = nmi
            mutual_info.loc[res[2], res[1]] = nmi

        pool.close()
        pool.join()

    mutual_info.fillna(0, inplace=True)  # if na => no relation => set to zero

    if return_entropy:
        # diag can be useful for future elimination based on entropy without
        # the need of recomputing the cross NMI
        return (
            mutual_info,
            diag,
        )
    else:
        return mutual_info
Beispiel #17
0
def train_fold(
    fold: Tuple[int, Tuple[MODData, MODData]],
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Dict[str, Any],
    model_type: Type[MODNetModel] = MODNetModel,
    presets=None,
    hp_optimization=True,
    classification=False,
    save_folds=False,
    fast=False,
    save_models=False,
    nested=False,
    n_jobs=None,
    **model_kwargs,
) -> dict:
    """Train one fold of a CV.

    Unless stated, all arguments have the same meaning as in `matbench_benchmark(...)`.

    Arguments:
        fold: A tuple containing the fold index, and another tuple of the
            training MODData and test MODData.
        fit_settings: Settings used to build and fit the model when
            `hp_optimization` is false. Note that when `classification` is
            true, a `"num_classes"` entry is written into this dictionary.
        model_type: The class instantiated as
            `model_type(target, target_weights, **settings)`.
        model_kwargs: Extra keyword arguments for the model constructor;
            they override the values derived from `fit_settings`.

    Returns:
        A dictionary summarising the fold results, with keys `"predictions"`,
        `"targets"`, `"errors"`, `"scores"`, `"best_presets"` and `"model"`.
        `"stds"` is present only when the model returned uncertainties, and
        `"nested_losses"`, `"nested_learning_curves"` and
        `"best_learning_curves"` only when `hp_optimization` is true.
        If prediction or scoring fails, the traceback is printed and
        `"predictions"`, `"errors"` and `"scores"` are `None`.
        `"best_presets"` is `None` when `hp_optimization` is false.

    """

    fold_ind, (train_data, test_data) = fold

    results = {}
    if classification:
        fit_settings["num_classes"] = {t: 2 for t in target_weights}

    multi_target = len(target) != 1

    # If not performing hp_optimization, load model init settings from fit_settings
    model_settings = {}
    if not hp_optimization:
        model_settings = {
            "num_neurons": fit_settings["num_neurons"],
            "num_classes": fit_settings.get("num_classes"),
            "act": fit_settings.get("act"),
            "out_act": fit_settings.get("out_act", "linear"),
            "n_feat": fit_settings["n_feat"],
        }

    model_settings.update(model_kwargs)

    model = model_type(target, target_weights, **model_settings)

    # Only the hyperparameter search produces a best preset; give it a
    # defined value so the summary below never hits an unbound name.
    best_presets = None

    if hp_optimization:
        (
            models,
            val_losses,
            best_learning_curve,
            learning_curves,
            best_presets,
        ) = model.fit_preset(
            train_data,
            presets=presets,
            fast=fast,
            classification=classification,
            nested=nested,
            n_jobs=n_jobs,
        )
        if save_models:
            for ind, nested_model in enumerate(models):
                score = val_losses[ind]
                nested_model.save(
                    f"results/nested_model_{fold_ind}_{ind}_{score:3.3f}")

            # NOTE(review): `score` here is the loss of the last nested model,
            # not necessarily that of the best model -- confirm intent.
            model.save(f"results/best_model_{fold_ind}_{score:3.3f}")

        results["nested_losses"] = val_losses
        results["nested_learning_curves"] = learning_curves
        results["best_learning_curves"] = best_learning_curve
    else:
        if fit_settings["increase_bs"]:
            # Two-stage fit: an "mse" stage, then a second stage with a
            # reduced learning rate, half the epochs and twice the batch size.
            model.fit(
                train_data,
                lr=fit_settings["lr"],
                epochs=fit_settings["epochs"],
                batch_size=fit_settings["batch_size"],
                loss="mse",
            )
            model.fit(
                train_data,
                lr=fit_settings["lr"] / 7,
                epochs=fit_settings["epochs"] // 2,
                batch_size=fit_settings["batch_size"] * 2,
                loss=fit_settings["loss"],
            )
        else:
            model.fit(train_data, **fit_settings)

    # These are assigned inside the try block; pre-set them so that a failure
    # before their assignment still leaves them defined for the summary.
    stds = None
    targets = None

    try:
        predict_kwargs = {}
        if classification:
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results

        targets = test_data.df_targets

        if classification:
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            LOG.info(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            # One mean absolute error per target column.
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))
    except Exception:
        # Deliberately best-effort: report the problem and return a fold
        # summary with empty predictions instead of aborting.
        print_exc()
        print("Something went wrong benchmarking this model.")
        predictions = None
        errors = None
        score = None

    if save_folds:
        opt_feat = train_data.optimal_features[:fit_settings["n_feat"]]
        df_train = train_data.df_featurized[opt_feat]
        # Files are named after the (1-based) fold index.
        df_train.to_csv("folds/train_f{}.csv".format(fold_ind + 1))
        df_test = test_data.df_featurized[opt_feat]
        if errors is not None:
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
        df_test.to_csv("folds/test_f{}.csv".format(fold_ind + 1))

    results["predictions"] = predictions
    if stds is not None:
        results["stds"] = stds
    results["targets"] = targets
    results["errors"] = errors
    results["scores"] = score
    results["best_presets"] = best_presets
    results["model"] = model

    return results
Beispiel #18
0
    def fit_preset(
        self,
        data: MODData,
        presets: List[Dict[str, Any]] = None,
        val_fraction: float = 0.15,
        verbose: int = 0,
        classification: bool = False,
        refit: bool = True,
        fast: bool = False,
        nested: int = 5,
        callbacks: List[Any] = None,
        n_jobs=None,
    ) -> Tuple[List[List[Any]], np.ndarray, Optional[List[float]],
               List[List[float]], Dict[str, Any], ]:
        """Chooses an optimal hyper-parametered MODNet model from different presets.

        This function implements the "inner loop" of a cross-validation workflow. By
        modifying the `nested` argument, it can be run in full nested mode (i.e.
        train n_fold * n_preset models) or just with a simple random hold-out set.

        Every preset is trained once per split (through `map_validate_model`,
        in a pool of worker processes) and the preset with the lowest mean
        validation loss across its splits is selected.

        If `refit` is true, a new network is built with the best preset and
        fitted on all of `data` (with `val_fraction=0`); otherwise
        `self.model`, `self.n_feat` and `self._scaler` are taken from the
        best single model of the best preset.

        Args:
            data: MODData object contain training and validation samples.
            presets: A list of dictionaries containing custom presets. If
                `None`, presets are generated with `gen_presets`.
            val_fraction: The fraction of the data to use for validation
                when `nested` is falsy (0.15 by default).
            verbose: The verbosity level to pass to tf.keras
            classification: Whether or not we are performing classification.
            refit: Whether or not to refit the final model on all the data with
                the best-performing settings.
            fast: Used for debugging. If `True` and at least 2 presets are
                available, only fit the first 2 presets with the number of
                epochs set to 100. The presets passed in are not modified.
            nested: integer specifying whether or not to perform a full nested CV. If 0,
                a simple validation split is performed based on val_fraction argument.
                If an integer, use this number of inner CV folds, ignoring the `val_fraction` argument.
                Note: If set to 1, the value will be overwritten to a default of 5 folds.
            callbacks: Keras callbacks used for every fit. If `None`, a single
                `EarlyStopping` callback monitoring `"loss"` is used.
            n_jobs: number of jobs for multiprocessing

        Returns:
            - A nested list of the trained models, indexed as `[preset][split]`.
            - An array of validation losses of shape (number of presets, number of splits).
            - The learning curve recorded during validation for the best model of the best preset.
            - A nested list of learning curves, indexed as `[preset][split]`.
            - The settings of the best-performing preset.

        """

        from modnet.matbench.benchmark import matbench_kfold_splits

        if callbacks is None:
            es = tf.keras.callbacks.EarlyStopping(
                monitor="loss",
                min_delta=0.001,
                patience=100,
                verbose=verbose,
                mode="auto",
                baseline=None,
                restore_best_weights=False,
            )
            callbacks = [es]

        if presets is None:
            from modnet.model_presets import gen_presets

            presets = gen_presets(
                len(data.optimal_features),
                len(data.df_targets),
                classification=classification,
            )

        if fast and len(presets) >= 2:
            # Work on copies so the caller's preset dictionaries keep their
            # original number of epochs.
            presets = [{**preset, "epochs": 100} for preset in presets[:2]]

        # create the train/validation splits
        if nested:
            # Fewer than 2 folds makes no sense for a k-fold: fall back to 5.
            n_splits = nested if nested > 1 else 5
            splits = matbench_kfold_splits(data,
                                           n_splits=n_splits,
                                           classification=classification)
        else:
            n_splits = 1
            splits = [
                train_test_split(range(len(data.df_featurized)),
                                 test_size=val_fraction)
            ]
        train_val_datas = [data.split((train, val)) for train, val in splits]

        # create tasks: one per (preset, split) pair, ordered preset-major
        tasks = []
        for i, params in enumerate(presets):
            n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])

            for ind in range(n_splits):
                train_data, val_data = train_val_datas[ind]

                tasks.append({
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": i,
                    "fold_id": ind,
                    "verbose": verbose,
                    "val_data": val_data,
                })

        # Losses start at a very large value so that any slot that is not
        # filled in can never be selected as the best one.
        val_losses = 1e20 * np.ones((len(presets), n_splits))
        learning_curves = [[None for _ in range(n_splits)]
                           for _ in range(len(presets))]
        models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

        ctx = multiprocessing.get_context("spawn")
        LOG.info(
            f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
        )

        # The context manager terminates the workers if collecting the results
        # fails; on the normal path the pool is closed and joined explicitly.
        with ctx.Pool(processes=n_jobs) as pool:
            for res in tqdm.tqdm(
                    pool.imap_unordered(map_validate_model, tasks, chunksize=1),
                    total=len(tasks),
            ):
                val_loss, learning_curve, model, preset_id, fold_id = res
                LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
                # reload the model object after serialization
                model._restore_model()

                val_losses[preset_id, fold_id] = val_loss
                learning_curves[preset_id][fold_id] = learning_curve
                models[preset_id][fold_id] = model

            pool.close()
            pool.join()

        # Best preset: lowest mean loss over its splits; best model: the
        # split of that preset with the lowest loss.
        val_loss_per_preset = np.mean(val_losses, axis=1)
        best_preset_idx = int(np.argmin(val_loss_per_preset))
        best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
        best_preset = presets[best_preset_idx]
        best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
        best_model = models[best_preset_idx][best_model_idx]

        LOG.info(
            "Preset #{} resulted in lowest validation loss with params {}".
            format(best_preset_idx + 1,
                   tasks[n_splits * best_preset_idx + best_model_idx]))

        if refit:
            LOG.info("Refitting with all data and parameters: {}".format(
                best_preset))
            # Building final model

            n_feat = min(len(data.get_optimal_descriptors()),
                         best_preset["n_feat"])
            self.model = MODNetModel(
                self.targets,
                self.weights,
                num_neurons=best_preset["num_neurons"],
                n_feat=n_feat,
                act=best_preset["act"],
                out_act=self.out_act,
                num_classes=self.num_classes,
            ).model
            self.n_feat = n_feat
            self.fit(
                data,
                val_fraction=0,
                lr=best_preset["lr"],
                epochs=best_preset["epochs"],
                batch_size=best_preset["batch_size"],
                loss=best_preset["loss"],
                callbacks=callbacks,
                verbose=verbose,
            )
        else:
            self.n_feat = best_model.n_feat
            self.model = best_model.model
            self._scaler = best_model._scaler

        return models, val_losses, best_learning_curve, learning_curves, best_preset
Beispiel #19
0
def gen_presets(n_feat: int,
                n_samples: int,
                classification: bool = False) -> List[Dict[str, Any]]:
    """Generates sensible preset architectures and learning parameters
    based on number of samples and features.

    The candidate numbers of input features are the values among
    64, 128, 256 and 512 that lie in the interval (n_feat / 20, n_feat].
    If none qualifies, `n_feat` itself is used. When fewer than three
    candidates are available, `n_feat` and/or the midpoint of the first two
    candidates are added. Duplicate candidates are removed so that no
    preset is generated twice.

    Arguments:
        n_feat: The number of training features available to the model.
        n_samples: The number of training samples available to the model.
        classification: If `True`, the `"categorical_crossentropy"` loss is
            used in the presets, otherwise `"mae"`.

    Returns:
        List of dictionaries to individually pass as kwargs to `model.fit(...)`.
        Each one has the keys `"batch_size"`, `"lr"`, `"n_feat"`,
        `"num_neurons"`, `"epochs"`, `"loss"`, `"act"` and `"xscale"`.

    Raises:
        ValueError: If `n_feat` is smaller than 1.

    """
    if n_feat < 1:
        raise ValueError(
            f"Cannot generate presets for n_feat={n_feat}, at least one feature is required."
        )

    if n_samples < 1000:
        batch_sizes = [32, 64]
    else:
        batch_sizes = [64]
    learning_rates = [0.001, 0.005, 0.01]
    epochs = [1000]

    if classification:
        losses = ["categorical_crossentropy"]
    else:
        losses = ["mae"]

    activations = ["elu"]
    xscale = ["minmax", "standard"]

    n_feat_list = [n for n in (64, 128, 256, 512) if n_feat / 20 < n <= n_feat]
    if not n_feat_list:
        # Very small or very large feature counts leave no standard size:
        # fall back to using all the available features.
        n_feat_list = [n_feat]
    if len(n_feat_list) == 1:
        n_feat_list.append(n_feat)
    if len(n_feat_list) < 3:
        n_feat_list.append((n_feat_list[0] + n_feat_list[1]) // 2)
    # Remove duplicates (e.g. n_feat == 64 would give [64, 64, 64]).
    n_feat_list = sorted(set(n_feat_list))

    archs = []
    for nf in n_feat_list:
        # Keep at least one neuron per layer for small feature counts.
        half = max(1, nf // 2)
        quarter = max(1, nf // 4)
        eighth = max(1, nf // 8)
        archs += [
            (nf, [[nf * 2], [half], [eighth], [eighth]]),
            (nf, [[nf], [half], [eighth], [eighth]]),
            (nf, [[half], [quarter], [eighth], [eighth]]),
        ]

    LOG.info(
        "Proceeding with grid search: archs: {}, batch sizes: {}, learning_rates: {}"
        .format(archs, batch_sizes, learning_rates))

    # Full grid over every combination of the options above.
    hyperparam_presets = [
        {
            "batch_size": bs,
            "lr": lr,
            "n_feat": arch[0],
            "num_neurons": arch[1],
            "epochs": n_epochs,
            "loss": loss,
            "act": act,
            "xscale": scaler,
        }
        for arch, bs, lr, n_epochs, loss, act, scaler in itertools.product(
            archs, batch_sizes, learning_rates, epochs, losses, activations,
            xscale)
    ]

    return hyperparam_presets