def test_load_precomputed_dataset():
    """Fetch the precomputed ``MP_2018.6`` MODData archive and check hash validation.

    The test first asserts that the archive ends up at ``<repo>/data/MP_2018.6.zip``
    (two directories above this file), then swaps it for an empty file and
    expects the next load to fail with a ``RuntimeError`` about an MD5 mismatch.

    Warning (per the original author): requires ~10 GB of memory.

    NB (per the original author): the dataset is not redownloaded if it is
    already present when the test starts.
    """
    from pathlib import Path

    from modnet.ext_data import load_ext_dataset

    expected_location = Path(__file__).parent.parent / "data" / "MP_2018.6.zip"

    archive = load_ext_dataset("MP_2018.6", "MODData")
    assert archive == expected_location
    assert archive.is_file()

    # Swap the real archive for an empty placeholder: its hash can no longer
    # match, so loading it again must be rejected.
    archive.unlink()
    archive.touch()
    try:
        with pytest.raises(
            RuntimeError,
            match="Precomputed MODData did not match expected MD5 from",
        ):
            archive = load_ext_dataset("MP_2018.6", "MODData")
    finally:
        # Always remove the placeholder so a corrupt file is never left behind.
        archive.unlink(missing_ok=True)
def test_load_datasets():
    """Check name validation in ``load_ext_dataset`` and the cross-NMI dataset shape.

    Both an unknown dataset name and the cross-NMI dataset name paired with
    ``"MODData"`` as second argument must raise ``ValueError``. Loading the
    cross-NMI dataset with ``"cross_nmi"`` must yield a pickled frame of shape
    (1304, 1304) whose columns equal its index.
    """
    for rejected_name in ("abcd", "MP_2018.6_CROSS_NMI"):
        with pytest.raises(ValueError):
            load_ext_dataset(rejected_name, "MODData")

    cross_nmi_file = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
    cross_nmi = pandas.read_pickle(cross_nmi_file)

    assert cross_nmi.shape == (1304, 1304)
    # Row and column labels must be identical and in the same order.
    np.testing.assert_array_equal(cross_nmi.columns, cross_nmi.index.values)
def load_precomputed(cls, dataset_name: str):
    """Load a `MODData` object from a pre-computed dataset.

    The dataset location is resolved through
    `modnet.ext_data.load_ext_dataset` and the resulting path is handed to
    `cls.load` as a string.

    Note:
        Datasets may require significant (~10 GB) amounts of memory to load.

    Arguments:
        dataset_name: the name of the precomputed dataset to load.
            Currently available: 'MP_2018.6'.

    Returns:
        MODData: the precomputed dataset.

    """
    # Imported lazily, as in the rest of this file, so the external-data
    # machinery is only pulled in when a precomputed dataset is requested.
    from modnet.ext_data import load_ext_dataset

    return cls.load(str(load_ext_dataset(dataset_name, "MODData")))
def feature_selection(
    self,
    n: int = 1500,
    cross_nmi: Optional[pd.DataFrame] = None,
    use_precomputed_cross_nmi: bool = False,
    n_jobs: Optional[int] = None,
):
    """Compute the mutual information between features and targets,
    then apply relevance-redundancy rankings to choose the top `n`
    features.

    Sets the `self.optimal_features` attribute to a list of feature
    names, and `self.optimal_features_by_target` to a dict mapping each
    target name to its own ranked selection. `self.cross_nmi` is set (or
    reused), `self.feature_entropy` is set when the cross NMI is computed
    here, and `self.target_nmi` holds the values for the last target
    processed.

    Args:
        n: number of desired features.
        cross_nmi: specify the cross NMI between features as a
            dataframe.
        use_precomputed_cross_nmi: Whether or not to use the cross NMI
            that was computed on Materials Project features, instead of
            precomputing. When True, this replaces any `cross_nmi`
            passed in or already stored on the object.
        n_jobs: max. number of processes to use when calculating cross NMI.

    Raises:
        RuntimeError: if `df_featurized` or `df_targets` is missing, or
            if the cross NMI contains NaN values.

    """
    if getattr(self, "df_featurized", None) is None:
        raise RuntimeError(
            "Mutual information feature selection requires featurized data, please call `.featurize()`"
        )
    if getattr(self, "df_targets", None) is None:
        raise RuntimeError(
            "Mutual information feature selection requires target properties"
        )

    ranked_lists = []
    optimal_features_by_target = {}

    # An explicitly supplied cross NMI wins over a stored one; otherwise make
    # sure the attribute exists so the checks below can rely on it.
    if cross_nmi is not None:
        self.cross_nmi = cross_nmi
    elif getattr(self, "cross_nmi", None) is None:
        self.cross_nmi = None

    # Loading mutual information between features
    if use_precomputed_cross_nmi:
        LOG.info("Loading cross NMI from 'Features_cross' file.")
        from modnet.ext_data import load_ext_dataset

        cnmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
        # NOTE(review): unpickling can execute arbitrary code; this relies on
        # `load_ext_dataset` returning a trusted/verified file -- confirm.
        self.cross_nmi = pd.read_pickle(cnmi_path)

        # Features present in the data but absent from the precomputed matrix.
        missing_cols = set(self.df_featurized.columns) - set(self.cross_nmi.columns)
        if missing_cols:
            LOG.warning(
                "Feature mismatch between precomputed `Features_cross` and `df_featurized`. "
                f"Missing columns: {missing_cols}"
            )

    if self.cross_nmi is None:
        # Work on a copy so the stored featurized frame is left untouched.
        df = self.df_featurized.copy()
        self.cross_nmi, self.feature_entropy = get_cross_nmi(
            df, return_entropy=True, n_jobs=n_jobs
        )

    if self.cross_nmi.isna().sum().sum() > 0:
        raise RuntimeError(
            "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
        )

    # Loop-invariant: per-target class counts, if the object defines them.
    num_classes = getattr(self, "num_classes", None)
    n_targets = len(self.names)

    for position, name in enumerate(self.names, start=1):
        LOG.info(f"Starting target {position}/{n_targets}: {name} ...")

        # Computing mutual information with target
        LOG.info("Computing mutual information between features and target...")
        # A target with at least two classes is treated as classification;
        # everything else (including no class information) as regression.
        if num_classes and num_classes[name] >= 2:
            task_type = "classification"
        else:
            task_type = "regression"
        self.target_nmi = nmi_target(
            self.df_featurized, self.df_targets[[name]], task_type
        )[name]

        LOG.info("Computing optimal features...")
        optimal_features_by_target[name] = get_features_dyn(
            n, self.cross_nmi, self.target_nmi
        )
        ranked_lists.append(optimal_features_by_target[name])

        LOG.info(f"Done with target {position}/{n_targets}: {name}.")

    LOG.info("Merging all features...")
    self.optimal_features = merge_ranked(ranked_lists)
    self.optimal_features_by_target = optimal_features_by_target
    LOG.info("Done.")