def test_multiindex_in_multifeaturizer(self):
    # Check the multiindex keys MultipleFeaturizer assigns to the columns of
    # each sub-featurizer, for both iteration modes (entries / featurizers).
    def first_value(frame, key):
        # First row of the column addressed by a (level0, level1) key.
        return frame[key].iloc[0]

    for by_entry in (True, False):
        combined = MultipleFeaturizer([self.multi, self.single],
                                      iterate_over_entries=by_entry)

        flat = pd.DataFrame({'x': [1, 2, 3]})

        two_level = pd.DataFrame({'x': [1, 2, 3]})
        two_level.columns = pd.MultiIndex.from_product(
            (["Custom"], two_level.columns.values))

        three_level = pd.DataFrame({'x': [1, 2, 3]})
        three_level.columns = pd.MultiIndex.from_product(
            (["Custom"], ["Custom2"], three_level.columns.values))

        # Input with a flat column index
        flat = combined.featurize_dataframe(flat, 'x', multiindex=True)
        self.assertEqual(first_value(flat, ("Input Data", "x")), 1)
        self.assertEqual(
            first_value(flat, ("MultipleFeatureFeaturizer", "w")), 0)
        self.assertEqual(first_value(flat, ("SingleFeaturizer", "y")), 2)

        # Input with a 2-level column index
        two_level = combined.featurize_dataframe(
            two_level, ("Custom", 'x'), multiindex=True)
        self.assertEqual(first_value(two_level, ("Custom", "x")), 1)
        self.assertEqual(
            first_value(two_level, ("MultipleFeatureFeaturizer", "w")), 0)
        self.assertEqual(
            first_value(two_level, ("SingleFeaturizer", "y")), 2)

        # Input with more than two column levels must raise IndexError
        with self.assertRaises(IndexError):
            self.multi.featurize_dataframe(
                three_level, ("Custom", "Custom2", 'x'), multiindex=True)
def test_multi_featurizer(self):
    # Wrap two FunctionFeaturizers in one MultipleFeaturizer, fit/featurize a
    # copy of the test frame, and check the length of the result.
    square = FunctionFeaturizer(expressions=["x ** 2"])
    exp_and_inverse = FunctionFeaturizer(expressions=["exp(x)", "1 / x"])
    combined = MultipleFeaturizer([square, exp_and_inverse])

    featurized = combined.fit_featurize_dataframe(
        self.test_df, ['a', 'b', 'c'], inplace=False)

    self.assertEqual(len(featurized), 11)
def __init__(self, pbar=False):
    """Set up the regressor and the composition featurizers.

    Args:
        pbar: stored on the instance as ``self.pbar``.
    """
    self.regressor = RandomForestRegressor(
        n_estimators=500, n_jobs=-1, verbose=3)
    self.stc = StrToComposition()
    # Magpie-preset element properties followed by element fractions.
    self.featurizer = MultipleFeaturizer([
        ElementProperty.from_preset("magpie"),
        ElementFraction(),
    ])
    self.pbar = pbar
def test_multiindex_in_multifeaturizer(self):
    # Make sure multiplefeaturizer returns the correct sub-featurizer
    # multiindex keys.
    # Test both iteration over entries and over featurizers.
    for iter_entries in [True, False]:
        mf = MultipleFeaturizer([self.multi, self.single],
                                iterate_over_entries=iter_entries)

        df_1lvl = pd.DataFrame({'x': [1, 2, 3]})
        df_2lvl = pd.DataFrame({'x': [1, 2, 3]})
        df_2lvl.columns = pd.MultiIndex.from_product(
            (["Custom"], df_2lvl.columns.values))
        df_3lvl = pd.DataFrame({'x': [1, 2, 3]})
        df_3lvl.columns = pd.MultiIndex.from_product(
            (["Custom"], ["Custom2"], df_3lvl.columns.values))

        # If input dataframe has flat column index.
        # The returned frame is captured rather than relying on the input
        # being modified in place, so the assertions hold whether or not
        # featurize_dataframe mutates its argument.
        df_1lvl = mf.featurize_dataframe(df_1lvl, 'x', multiindex=True)
        self.assertEqual(df_1lvl[("Input Data", "x")].iloc[0], 1)
        self.assertEqual(
            df_1lvl[("MultipleFeatureFeaturizer", "w")].iloc[0], 0)
        self.assertEqual(df_1lvl[("SingleFeaturizer", "y")].iloc[0], 2)

        # If input dataframe has 2-lvl column index
        df_2lvl = mf.featurize_dataframe(
            df_2lvl, ("Custom", 'x'), multiindex=True)
        self.assertEqual(df_2lvl[("Custom", "x")].iloc[0], 1)
        self.assertEqual(
            df_2lvl[("MultipleFeatureFeaturizer", "w")].iloc[0], 0)
        self.assertEqual(df_2lvl[("SingleFeaturizer", "y")].iloc[0], 2)

        # If input dataframe has 2+ lvl column index
        with self.assertRaises(IndexError):
            _ = self.multi.featurize_dataframe(
                df_3lvl, ("Custom", "Custom2", 'x'), multiindex=True)
def test_multi_featurizer(self):
    # Two FunctionFeaturizers combined; the fitted/featurized copy of the
    # test frame is expected to have length 11.
    members = [
        FunctionFeaturizer(expressions=["x ** 2"]),
        FunctionFeaturizer(expressions=["exp(x)", "1 / x"]),
    ]
    combined = MultipleFeaturizer(members)
    result = combined.fit_featurize_dataframe(
        self.test_df, ['a', 'b', 'c'], inplace=False)
    self.assertEqual(len(result), 11)
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    The radial distribution function column is expanded into one scalar
    column per distance bin (first 50 bins), and the categorical symmetry
    columns are converted to integers.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying structure featurizers...")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds()
    ]

    featurizer = MultipleFeaturizer(
        [feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(
        df, "structure", multiindex=True, ignore_errors=True)
    # Flatten the (featurizer, label) multiindex into "featurizer|label".
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_col = "RadialDistributionFunction|radial distribution function"

    # Take the distance grid from the first row *by position*: the frame's
    # index is not guaranteed to contain the label 0.
    dist = df[rdf_col].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        # Bind i as a default so the lambda does not depend on the loop
        # variable's later values.
        df[_rdf_key] = df[rdf_col].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_col, axis=1)

    _crystal_system = {
        "cubic": 1,
        "tetragonal": 2,
        # Historical misspelling kept for backwards compatibility; the
        # correctly spelled key is required so that orthorhombic entries
        # are not mapped to NaN.
        "orthorombic": 3,
        "orthorhombic": 3,
        "hexagonal": 4,
        "trigonal": 5,
        "monoclinic": 6,
        "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(int)

    return clean_df(df)
def test_multiple(self):
    # Exercise MultipleFeaturizer in both iteration modes
    # (over entries and over featurizers).
    for by_entry in (True, False):
        combined = MultipleFeaturizer([self.single, self.multi],
                                      iterate_over_entries=by_entry)
        frame = self.make_test_data()

        self.assertArrayAlmostEqual([2, 0, 3], combined.featurize(1))
        self.assertArrayEqual(['A'], combined.citations())

        authors = combined.implementors()
        for expected_author in ('Us', 'Them'):
            self.assertIn(expected_author, authors)
        self.assertEqual(2, len(authors))

        # Ensure BaseFeaturizer operation without overriden
        # featurize_dataframe: no warnings may be emitted.
        with warnings.catch_warnings(record=True) as caught:
            frame = combined.featurize_dataframe(frame, 'x')
            self.assertEqual(len(caught), 0)
        self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
        self.assertArrayAlmostEqual(frame['w'], [0, 1, 2])
        self.assertArrayAlmostEqual(frame['z'], [3, 4, 5])

        # Add a featurizer that yields a matrix-valued feature.
        matrix_feat = MatrixFeaturizer()
        combined = MultipleFeaturizer(
            [self.single, self.multi, matrix_feat])
        frame = self.make_test_data()
        with warnings.catch_warnings(record=True) as caught:
            frame = combined.featurize_dataframe(frame, 'x')
            self.assertEqual(len(caught), 0)

        self.assertArrayAlmostEqual(frame['representation'][0],
                                    [[1.0, 0.0], [0.0, 1.0]])
def __init__(self):
    """Build the composition featurizers and the formula parser."""
    composition_featurizers = [
        cf.ElementProperty.from_preset(preset_name="magpie"),
        cf.Stoichiometry(),
        cf.ValenceOrbital(props=['frac']),
        cf.IonProperty(fast=True),
        cf.BandCenter(),
        cf.ElementFraction(),
    ]
    self.feature_calculators = MultipleFeaturizer(composition_featurizers)
    self.str2composition = StrToComposition()
def test_caching(self):
    """Test whether MultiFeaturizer properly caches"""

    def simple_cubic(symbol):
        # One-atom cell with a 3.52 cubic lattice.
        return Structure(
            [[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]],
            [symbol], [[0, 0, 0]])

    def check_cache(hits, misses):
        info = _get_all_nearest_neighbors.cache_info
        self.assertEqual(hits, info().hits)
        self.assertEqual(misses, info().misses)

    # have to iterate over entries to enable caching
    presets = ("LocalPropertyDifference_ward-prb-2017",
               "CoordinationNumber_ward-prb-2017")
    feat = MultipleFeaturizer(
        [SiteStatsFingerprint.from_preset(name) for name in presets],
        iterate_over_entries=True)

    # Reset the cache before tests
    _get_all_nearest_neighbors.cache_clear()

    # Create a dataframe with two SC structures in it
    data = pd.DataFrame(
        {'strcs': [simple_cubic("Al"), simple_cubic("Ni")]})

    # Call featurize on both, check the number of cache misses/hits
    feat.featurize(data['strcs'][0])
    feat.featurize(data['strcs'][1])
    check_cache(2, 2)

    # Verify the number of cache misses, it should be the same as before
    feat.set_n_jobs(1)
    _get_all_nearest_neighbors.cache_clear()
    feat.featurize_dataframe(data, 'strcs')
    check_cache(2, 2)
def test_ignore_errors(self):
    # Check MultipleFeaturizer's error handling.
    # Iterate through many tests: single/parallel, returning errors or not,
    # multiindex or not, and iteration over entries/featurizers.
    switch = [True, False]
    for use_multiindex, return_errs, n_jobs, by_entry in product(
            switch, switch, [1, 2], switch):
        mf = MultipleFeaturizer([self.multi, self.single],
                                iterate_over_entries=by_entry)

        # Make some test data that will cause errors
        data = pd.DataFrame({'x': ['a', 2, 3]})

        # Set the number of threads
        mf.set_n_jobs(n_jobs)

        # Make sure it completes successfully
        rows = mf.featurize_many(
            data['x'], ignore_errors=True, return_errors=return_errs)
        expected_width = 5 if return_errs else 3
        self.assertEqual(expected_width, len(rows[0]))

        # Make sure it works with featurize dataframe
        frame = mf.featurize_dataframe(
            data, 'x', ignore_errors=True, return_errors=return_errs,
            multiindex=use_multiindex)
        expected_columns = 6 if return_errs else 4
        self.assertEqual(expected_columns, len(frame.columns))

        # Special test for returning errors (only should work when
        # returning errors). Only the single index case is tested,
        # for simplicity.
        if return_errs and not use_multiindex:
            self.assertIn(
                'TypeError',
                frame.iloc[0]['SingleFeaturizer Exceptions'])

        # Make sure it throws an error
        with self.assertRaises(TypeError):
            mf.featurize_many([['a'], [1], [2]])
class FeatureGenerator:
    """
    A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)
        self.str2composition = StrToComposition()

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
        Generate features from a dataframe with a "formula" column that
        contains chemical formulas of the compositions.

        Rows containing missing values after the formula conversion are
        dropped before featurization. An "NComp" column holding
        ``len(composition)`` is added to the result.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        complete_rows = with_composition.dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition',
            ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)
        return featurized
def featurize_structures(self, featurizer=None, **kwargs):
    """
    Featurizes the hypothetical structures available from
    hypo_structures method. Hypothetical structures for which
    featurization fails are removed and valid structures are
    made available as valid_structures

    Args:
        featurizer (Featurizer): A MatMiner Featurizer.
            Defaults to MultipleFeaturizer with PRB Ward
            Voronoi descriptors.
        **kwargs (dict): kwargs passed to featurize_many
            method of featurizer.

    Returns:
        (pandas.DataFrame): features

    """
    # Note the redundancy here is for pandas to work
    if self.hypo_structures is None:
        warnings.warn("No structures available. Generating structures.")
        self.get_structures()

    print("Generating features")

    if not featurizer:
        featurizer = MultipleFeaturizer([
            SiteStatsFingerprint.from_preset(
                "CoordinationNumber_ward-prb-2017"),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                "LocalPropertyDifference_ward-prb-2017"),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset("magpie")),
            StructureComposition(ValenceOrbital(props=['frac'])),
            StructureComposition(IonProperty(fast=True))
        ])

    raw_features = featurizer.featurize_many(
        self.hypo_structures['structure'],
        ignore_errors=True, **kwargs)

    # Per-structure bookkeeping columns.
    species_counts = [len(s.composition.elements)
                      for s in self.hypo_structures['structure']]
    formulas = [s.composition.formula
                for s in self.hypo_structures['structure']]

    self._features_df = pd.DataFrame.from_records(
        raw_features, columns=featurizer.feature_labels())
    self._features_df.index = self.hypo_structures.index
    self._features_df['N_species'] = species_counts
    self._features_df['Composition'] = formulas
    self._features_df['structure'] = self.hypo_structures['structure']

    # Keep only rows without any missing value, columns sorted by name.
    self.features = self._features_df.dropna(axis=0, how='any')
    self.features = self.features.reindex(
        sorted(self.features.columns), axis=1)

    self._valid_structure_labels = list(self.features.index)
    self.valid_structures = self.hypo_structures.loc[
        self._valid_structure_labels]

    n_ok = self.features.shape[0]
    n_total = self._features_df.shape[0]
    print("{} out of {} structures were successfully featurized.".format(
        n_ok, n_total))

    return self.features
def test_multifeatures(self):
    # Make a test dataset with two input variables
    data = self.make_test_data()
    data['x2'] = [4, 5, 6]

    # Create a second featurizer
    class MultiArgs2(SingleFeaturizerMultiArgs):
        def featurize(self, *x):
            # Making a 2D array to test whether MutliFeaturizer
            # can handle featurizers that have both 1D vectors with
            # singleton dimensions (e.g., shape==(4,1)) and those
            # without (e.g., shape==(4,))
            return [super(MultiArgs2, self).featurize(*x)]

        def feature_labels(self):
            return ['y2']

    multiargs2 = MultiArgs2()

    # Create featurizer
    multi_f = MultipleFeaturizer([self.multiargs, multiargs2])
    multi_f.set_n_jobs(1)

    # Test featurize with multiple arguments
    features = multi_f.featurize(0, 2)
    self.assertArrayAlmostEqual([2, 2], features)

    # Test dataframe
    data = multi_f.featurize_dataframe(data, ['x', 'x2'])
    # assertEqual, not the deprecated assertEquals alias (removed from
    # unittest in Python 3.12).
    self.assertEqual(['y', 'y2'], multi_f.feature_labels())
    self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                data[['y', 'y2']])
def _fit_apply_featurizers(self, df: pd.DataFrame, featurizers: Iterable[BaseFeaturizer], column: str, fit_to_df: bool = True) -> pd.DataFrame:
    """ Optionally fit every featurizer to one column of the dataframe,
    then apply them all together as a single MultipleFeaturizer.

    Arguments:
        df: The DataFrame to featurize.
        featurizers: The list of matminer featurizers to fit and apply
            to the DataFrame.
        column: The name of the column to apply the featurizers to.
        fit_to_df: Whether or not to fit the featurizers to the
            input dataframe. If not true, it will be assumed that
            any featurizers that required fitting have already been
            fitted.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    #LOG.info("Applying featurizers {} to column {}".format(featurizers, column))
    members = (
        [feat.fit(df[column]) for feat in featurizers]
        if fit_to_df
        else featurizers
    )
    combined = MultipleFeaturizer(members)

    # Only override the job count when one was configured on this object.
    n_jobs = self._n_jobs
    if n_jobs is not None:
        combined.set_n_jobs(n_jobs)

    return combined.featurize_dataframe(
        df, column, multiindex=True, ignore_errors=True)
def test_caching(self):
    """Test whether MultiFeaturizer properly caches"""

    # have to iterate over entries to enable caching
    feat = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        SiteStatsFingerprint.from_preset(
            "CoordinationNumber_ward-prb-2017")
    ], iterate_over_entries=True)

    # Reset the cache before the first measurement: the cache is shared
    # state, so hits/misses left over from other tests would otherwise
    # make the counts asserted below depend on test execution order.
    _get_all_nearest_neighbors.cache_clear()

    # Create a dataframe with two SC structures in it
    data = pd.DataFrame({'strcs': [
        Structure([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]],
                  ["Al"], [[0, 0, 0]]),
        Structure([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]],
                  ["Ni"], [[0, 0, 0]]),
    ]})

    # Call featurize on both, check the number of cache misses/hits
    feat.featurize(data['strcs'][0])
    feat.featurize(data['strcs'][1])
    self.assertEqual(2, _get_all_nearest_neighbors.cache_info().hits)
    self.assertEqual(2, _get_all_nearest_neighbors.cache_info().misses)

    # Verify the number of cache misses, it should be the same as before
    feat.set_n_jobs(1)
    _get_all_nearest_neighbors.cache_clear()
    feat.featurize_dataframe(data, 'strcs')

    self.assertEqual(2, _get_all_nearest_neighbors.cache_info().hits)
    self.assertEqual(2, _get_all_nearest_neighbors.cache_info().misses)
def test_ignore_errors(self):
    # Check MultipleFeaturizer's error handling.
    # Iterate through many tests: single/parallel, returning errors or not,
    # multiindex or not, and iteration over entries/featurizers.
    flags = [True, False]
    combos = product(flags, flags, [1, 2], flags)
    for as_multiindex, with_errors, threads, over_entries in combos:
        mf = MultipleFeaturizer([self.multi, self.single],
                                iterate_over_entries=over_entries)

        # Make some test data that will cause errors
        data = pd.DataFrame({'x': ['a', 2, 3]})

        # Set the number of threads
        mf.set_n_jobs(threads)

        # Make sure it completes successfully
        many = mf.featurize_many(
            data['x'], ignore_errors=True, return_errors=with_errors)
        self.assertEqual(5 if with_errors else 3, len(many[0]))

        # Make sure it works with featurize dataframe
        featurized = mf.featurize_dataframe(
            data, 'x', ignore_errors=True, return_errors=with_errors,
            multiindex=as_multiindex)
        self.assertEqual(6 if with_errors else 4,
                         len(featurized.columns))

        # Special test for returning errors (only should work when
        # returning errors). Only the single index case is tested,
        # for simplicity.
        if with_errors and not as_multiindex:
            first_row = featurized.iloc[0]
            self.assertIn('TypeError',
                          first_row['SingleFeaturizer Exceptions'])

        # Make sure it throws an error
        with self.assertRaises(TypeError):
            mf.featurize_many([['a'], [1], [2]])
def test_multifeatures(self):
    # Make a test dataset with two input variables
    data = self.make_test_data()
    data['x2'] = [4, 5, 6]

    multiargs2 = MultiArgs2()

    # Create featurizer
    multi_f = MultipleFeaturizer([self.multiargs, multiargs2])

    # Test featurize with multiple arguments
    features = multi_f.featurize(0, 2)
    self.assertArrayAlmostEqual([2, 2], features)

    # Test dataframe
    data = multi_f.featurize_dataframe(data, ['x', 'x2'])
    # assertEqual, not the deprecated assertEquals alias (removed from
    # unittest in Python 3.12).
    self.assertEqual(['y', 'y2'], multi_f.feature_labels())
    self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                data[['y', 'y2']])
def featurizer(self):
    """Return the featurizer (with the suitable cutoff)"""
    # Read the cutoff once; it is passed to the first two featurizers.
    radius = self.cutoff
    fingerprint = CrystalNNFingerprint.from_preset(
        "ops", search_cutoff=radius)
    local_stats = LocalPropertyStatsNew.from_preset(
        "interpretable", cutoff=radius)
    symmetry_functions = GaussianSymmFunc()
    return MultipleFeaturizer(
        [fingerprint, local_stats, symmetry_functions])
def test_multitype_multifeat(self):
    """Test Multifeaturizer when a featurizer returns a non-numeric type"""
    expected_dtypes = ['int64', 'object', 'int64']

    # test both iteration over entries and featurizers
    for by_entry in (True, False):
        # Make the featurizer (single job)
        combined = MultipleFeaturizer(
            [SingleFeaturizer(), MultiTypeFeaturizer()],
            iterate_over_entries=by_entry)
        combined.set_n_jobs(1)

        # Make the test data and add the feature columns
        frame = combined.featurize_dataframe(self.make_test_data(), 'x')

        # Make sure the types are as expected
        feature_columns = combined.feature_labels()
        observed = frame[feature_columns].dtypes.astype(str).tolist()
        self.assertArrayEqual(expected_dtypes, observed)
        self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Featurize one structure and return its features keyed by label.

    Args:
        structure: the structure to featurize.
        mode: ``'all'`` selects the full featurizer list; any other value
            selects the reduced list.

    Returns:
        dict mapping each feature label to its value, plus a ``'volume'``
        entry taken from ``structure.volume``.
    """
    if mode != 'all':
        # Calculate only those which do not need a Voronoi tesselation
        components = [
            DensityFeatures(),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    else:
        components = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]

    featurizer = MultipleFeaturizer(components)
    values = featurizer.featurize(structure)

    properties = dict(zip(featurizer.feature_labels(), values))
    properties['volume'] = structure.volume
    return properties
def test_multifeatures_multiargs(self):
    second = MultiArgs2()
    expected_rows = [[5, 5], [7, 7], [9, 9]]

    # test iterating over both entries and featurizers
    for by_entry in (True, False):
        # Make a test dataset with two input variables
        frame = self.make_test_data()
        frame['x2'] = [4, 5, 6]

        # Create featurizer
        combined = MultipleFeaturizer([self.multiargs, second],
                                      iterate_over_entries=by_entry)

        # Test featurize with multiple arguments
        self.assertArrayAlmostEqual([2, 2], combined.featurize(0, 2))

        # Test dataframe
        frame = combined.featurize_dataframe(frame, ['x', 'x2'])
        self.assertEqual(['y', 'y2'], combined.feature_labels())
        self.assertArrayAlmostEqual(expected_rows, frame[['y', 'y2']])

        # Test with multiindex
        frame = combined.featurize_dataframe(
            frame, ['x', 'x2'], multiindex=True)
        first_key = ("SingleFeaturizerMultiArgs", "y")
        second_key = ("MultiArgs2", "y2")
        self.assertIn(second_key, frame.columns)
        self.assertIn(first_key, frame.columns)
        self.assertArrayAlmostEqual(
            expected_rows, frame[[first_key, second_key]])
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with composition
    features from matminer.

    Currently applies the set of all matminer composition features.

    A "composition" column is derived from each structure, featurized,
    then converted to an oxidation-state composition and featurized
    again. Column multiindexes are flattened to "featurizer|label"
    names, the HOMO/LUMO categorical columns are encoded as integers,
    and infinities / NaNs are replaced with 0.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying composition featurizers...")
    # Work on a copy so the caller's frame is not modified.
    df = df.copy()
    df['composition'] = df['structure'].apply(lambda s: s.composition)
    featurizer = MultipleFeaturizer([ElementProperty.from_preset("magpie"),
                                     AtomicOrbitals(),
                                     BandCenter(),
                                     # ElectronAffinity(), - This descriptor was not used in the paper preset
                                     Stoichiometry(),
                                     ValenceOrbital(),
                                     IonProperty(),
                                     ElementFraction(),
                                     TMetalFraction(),
                                     # CohesiveEnergy(), - This descriptor was not used in the paper preset
                                     Miedema(),
                                     YangSolidSolution(),
                                     AtomicPackingEfficiency(),
                                     ])

    df = featurizer.featurize_dataframe(df, "composition", multiindex=True, ignore_errors=True)
    # Flatten the 2-level column index into "level0|level1" strings; the
    # original input columns therefore become "Input Data|<name>".
    df.columns = df.columns.map('|'.join).str.strip('|')

    ox_featurizer = MultipleFeaturizer([OxidationStates(),
                                        ElectronegativityDiff()
                                        ])

    # Adds the "composition_oxid" column consumed by ox_featurizer below.
    df = CompositionToOxidComposition().featurize_dataframe(df, "Input Data|composition")

    df = ox_featurizer.featurize_dataframe(df, "composition_oxid", multiindex=True, ignore_errors=True)
    # Blank out the top level label "Input Data" so that, after the join
    # and strip below, previously flattened names keep their form.
    df = df.rename(columns={'Input Data': ''})
    df.columns = df.columns.map('|'.join).str.strip('|')

    # Integer encoding of the orbital character labels.
    _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

    df['AtomicOrbitals|HOMO_character'] = df['AtomicOrbitals|HOMO_character'].map(_orbitals)
    df['AtomicOrbitals|LUMO_character'] = df['AtomicOrbitals|LUMO_character'].map(_orbitals)

    # Element symbols become atomic numbers; non-string entries become -1.
    df['AtomicOrbitals|HOMO_element'] = df['AtomicOrbitals|HOMO_element'].apply(
        lambda x: -1 if not isinstance(x, str) else Element(x).Z
    )
    df['AtomicOrbitals|LUMO_element'] = df['AtomicOrbitals|LUMO_element'].apply(
        lambda x: -1 if not isinstance(x, str) else Element(x).Z
    )

    df = df.replace([np.inf, -np.inf, np.nan], 0)

    return clean_df(df)
def __init__(self, structure: Structure, outpath: Union[str, Path]):
    """Generates features for a structure

    Args:
        structure (Structure): Pymatgen Structure object
        outpath (Union[str, Path]): path to which the features will be
            dumped. If it is a non-empty path that does not exist yet,
            the directory (including missing parents) is created.
            ``None`` or ``""`` means no directory is created and the
            output name is relative to the current directory.

    Returns:

    """
    featurizelogger = logging.getLogger("Featurize")
    featurizelogger.setLevel(logging.INFO)
    logging.basicConfig(
        format="%(filename)s: %(message)s",
        level=logging.INFO,
    )

    self.outpath = outpath
    if ((outpath != "") and (outpath is not None)
            and (not os.path.exists(self.outpath))):
        # makedirs handles nested paths, and exist_ok avoids failing if
        # another process creates the directory between check and call.
        os.makedirs(self.outpath, exist_ok=True)
    self.logger = featurizelogger
    self.path = None
    self.structure = structure
    self.metal_sites = []
    self.metal_indices = []
    self.features = []

    # os.path.join rejects None, although None is accepted above; fall
    # back to an empty prefix in that case.
    out_dir = "" if self.outpath is None else self.outpath
    if self.path is not None:
        self.outname = os.path.join(
            out_dir, "".join([Path(self.path).stem, ".pkl"]))
    else:
        self.outname = os.path.join(
            out_dir,
            "".join([self.structure.formula.replace(" ", "_"), ".pkl"]),
        )

    self.featurizer = MultipleFeaturizer([
        CrystalNNFingerprint.from_preset("ops"),
        LocalPropertyStatsNew.from_preset("interpretable"),
        GaussianSymmFunc(),
    ])
def similarity(_parents, target):
    """Score each parent structure against the target structure.

    Both are featurized with the same MultipleFeaturizer, scaled with a
    pickled scaler, and the pairwise distance between each parent and the
    target is mapped onto a 0..1 grid (101 points) by picking the grid
    point whose pickled quantile value is closest to that distance.

    Args:
        _parents: structures passed to ``featurize_many``.
        target: structure passed to ``featurize``.

    Returns:
        numpy.ndarray: one value per parent; parents for which any
        feature was null are set to -1.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task

        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        pass

    x_target = pd.DataFrame.from_records([featurizer.featurize(target)],
                                         columns=featurizer.feature_labels())
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=featurizer.feature_labels(),
    )

    # Remember which parents failed featurization before the nulls are
    # overwritten with a placeholder value.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    x_parent.fillna(100000, inplace=True)

    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    # NOTE(security): pickle.load executes arbitrary code from the file;
    # these files must come from a trusted location.
    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
    # target as the last row in the same way.
    X = scaler.transform(pd.concat([x_parent, x_target]))

    D = [pairwise_distances(np.array([row, X[-1]]))[0, 1] for row in X[:-1]]

    # The grid is loop-invariant, so build it once.
    grid = np.linspace(0, 1, 101)
    _res = np.array([grid[np.abs(quantiles - d).argmin()] for d in D])
    _res[nulls] = -1
    return _res
def __init__(self, structure, outpath):
    """Set up logging, the output location and the featurizer.

    Args:
        structure: object whose ``formula`` attribute is used to build
            the output file name.
        outpath (str): path to which the features will be dumped; it is
            created with ``os.mkdir`` when non-empty and not existing.

    Returns:

    """
    log = logging.getLogger('Featurize')
    log.setLevel(logging.INFO)
    logging.basicConfig(
        format='%(filename)s: %(message)s',
        level=logging.INFO,
    )

    self.outpath = outpath
    needs_directory = outpath != '' and not os.path.exists(self.outpath)
    if needs_directory:
        os.mkdir(self.outpath)

    self.logger = log
    self.path = None
    self.structure = structure
    self.metal_sites = []
    self.metal_indices = []
    self.features = []

    # Output file stem: the source path's stem if one is set, otherwise
    # the formula with spaces replaced by underscores.
    if self.path is None:
        stem = self.structure.formula.replace(' ', '_')
    else:
        stem = Path(self.path).stem
    self.outname = os.path.join(self.outpath, ''.join([stem, '.pkl']))

    self.featurizer = MultipleFeaturizer([
        CrystalNNFingerprint.from_preset('ops'),
        LocalPropertyStatsNew.from_preset('interpretable'),
        GaussianSymmFunc(),
    ])
def test_multiple(self):
    multi_f = MultipleFeaturizer([self.single, self.multi])
    data = self.make_test_data()

    self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

    self.assertArrayEqual(['A'], multi_f.citations())

    implementors = multi_f.implementors()
    self.assertIn('Us', implementors)
    self.assertIn('Them', implementors)
    # assertEqual, not the deprecated assertEquals alias (removed from
    # unittest in Python 3.12).
    self.assertEqual(2, len(implementors))

    # Use the returned frame so the checks below do not depend on
    # featurize_dataframe modifying its input in place.
    data = multi_f.featurize_dataframe(data, 'x')
    self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
    self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
    self.assertArrayAlmostEqual(data['z'], [3, 4, 5])
class FeatureGenerator:
    """
    A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)
        self.str2composition = StrToComposition()

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False,
                 drop_mode=True):
        """
        Generate features from a dataframe with a "formula" column that
        contains chemical formulas of the compositions.

        df : a dataframe with a column named "formula"
        ignore_errors : ignore errors when generating features
        drop_mode : drop the columns whose name starts with "Magpie" and
            contains "mode" (properties generated by the mode
            aggregation function)
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        complete_rows = with_composition.dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition',
            ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)

        if not drop_mode:
            return featurized

        mode_columns = []
        for name in featurized.columns:
            if "mode" in name and name.startswith("Magpie"):
                mode_columns.append(name)
        return featurized.drop(columns=mode_columns)
class RFEstimator(BaseTesterEstimator):
    """Random forest regressor fed with composition features."""

    def __init__(self, pbar=False):
        self.regressor = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, verbose=3)
        self.stc = StrToComposition()
        # Magpie-preset element properties followed by element fractions.
        self.featurizer = MultipleFeaturizer([
            ElementProperty.from_preset("magpie"),
            ElementFraction(),
        ])
        self.pbar = pbar

    def _generate_features(self, x):
        # StrToComposition returns one list per entry; keep its first item.
        converted = self.stc.featurize_many(x, pbar=self.pbar)
        compositions = [row[0] for row in converted]
        return np.asarray(
            self.featurizer.featurize_many(compositions, pbar=self.pbar))

    def fit(self, x, y):
        self.regressor.fit(self._generate_features(x), y)

    def predict(self, x):
        return self.regressor.predict(self._generate_features(x))
def test_multiple(self):
    multi_f = MultipleFeaturizer([self.single, self.multi])
    data = self.make_test_data()

    self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

    self.assertArrayEqual(['A'], multi_f.citations())

    implementors = multi_f.implementors()
    self.assertIn('Us', implementors)
    self.assertIn('Them', implementors)
    # assertEqual, not the deprecated assertEquals alias (removed from
    # unittest in Python 3.12).
    self.assertEqual(2, len(implementors))

    # Ensure BaseFeaturizer operation without overriden featurize_dataframe
    with warnings.catch_warnings(record=True) as w:
        # Use the returned frame so the checks below do not depend on
        # featurize_dataframe modifying its input in place.
        data = multi_f.featurize_dataframe(data, 'x')
        self.assertEqual(len(w), 0)
    self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
    self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
    self.assertArrayAlmostEqual(data['z'], [3, 4, 5])
def test_multitype_multifeat(self):
    """Test Multifeaturizer when a featurizer returns a non-numeric type"""
    # Make the featurizer (single job)
    combined = MultipleFeaturizer(
        [SingleFeaturizer(), MultiTypeFeaturizer()])
    combined.set_n_jobs(1)

    # Make the test data and add the feature columns
    frame = combined.featurize_dataframe(self.make_test_data(), 'x')

    # Make sure the types are as expected
    feature_columns = combined.feature_labels()
    observed = frame[feature_columns].dtypes.astype(str).tolist()
    self.assertArrayEqual(['int64', 'object', 'int64'], observed)
    self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
# Get only the minimum energy structure at each composition data['composition'] = data['structure'].apply(lambda x: x.composition) data['integer_formula'] = data['composition'].apply( lambda x: x.get_integer_formula_and_factor()[0]) data.sort_values('e_above_hull', ascending=True, inplace=True) data.drop_duplicates('integer_formula', keep='first', inplace=True) print('Reduced dataset to {} unique compositions.'.format(len(data))) data.reset_index(inplace=True, drop=True) # Create the featurizer, which will take the composition as input featurizer = MultipleFeaturizer([ cf.Stoichiometry(), cf.ElementProperty.from_preset('magpie'), cf.ValenceOrbital(props=['frac']), cf.IonProperty(fast=True) ]) # Compute the features featurizer.set_n_jobs(1) X = featurizer.featurize_many(data['composition']) # Make the model model = Pipeline([('imputer', Imputer()), ('model', RandomForestRegressor())]) model.fit(X, data['formation_energy_per_atom']) print('Trained a RandomForest model') # Save the model, featurizer, and data using pickle with open('model.pkl', 'wb') as fp: pkl.dump(model, fp)
# Module-level featurizer combining structure-level featurizers, per-site
# fingerprints wrapped in SiteStatsFingerprint, and composition featurizers
# wrapped in StructureComposition.
featurizer = MultipleFeaturizer([
    GlobalSymmetryFeatures(),
    ElectronicRadialDistributionFunction(cutoff=7.5),
    SiteStatsFingerprint(AGNIFingerprints(directions=(None, 'x', 'y'))),
    SiteStatsFingerprint(OPSiteFingerprint()),
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    SiteStatsFingerprint(GaussianSymmFunc()),
    SiteStatsFingerprint(EwaldSiteEnergy(accuracy=3)),
    DensityFeatures(),
    SiteStatsFingerprint(
        GeneralizedRadialDistributionFunction.from_preset('gaussian')),
    # Local property differences computed from an explicit list of Magpie
    # properties.
    SiteStatsFingerprint(
        LocalPropertyDifference(
            data_source=MagpieData(),
            properties=[
                "Number", "MendeleevNumber", "AtomicWeight", "MeltingT",
                "Column", "Row", "CovalentRadius", "Electronegativity",
                "NsValence", "NpValence", "NdValence", "NfValence",
                "NValence", "NsUnfilled", "NpUnfilled", "NdUnfilled",
                "NfUnfilled", "NUnfilled", "GSvolume_pa", "GSbandgap",
                "GSmagmom"
            ])),
    SiteStatsFingerprint(SiteElementalProperty.from_preset("seko-prb-2017")),
    EwaldEnergy(),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    # Composition featurizers applied through StructureComposition.
    StructureComposition(ElementProperty.from_preset('magpie')),
    StructureComposition(AtomicOrbitals()),
    StructureComposition(BandCenter()),
    StructureComposition(ElectronegativityDiff()),
    StructureComposition(ElectronAffinity()),
    StructureComposition(Stoichiometry()),
    StructureComposition(ValenceOrbital()),
    StructureComposition(IonProperty()),
    StructureComposition(Miedema()),
    StructureComposition(YangSolidSolution())
])
def load_data_zT():
    """Load featurized compositions and their responses, using a CSV cache.

    The function first tries to read the feature and response CSV files from
    the results directory. If they are missing, it queries Citrination,
    featurizes the chemical compositions, keeps the rows where all three
    responses are present, squares the Seebeck column, and writes both
    tables to the results directory.

    Relies on names defined elsewhere in the file: ``setResDir``,
    ``file_features``, ``file_responses``, ``prefix``, ``pifs2df`` and
    ``get_compostion``. The Citrination API key is read from the
    ``CITRINATION_API_KEY`` environment variable.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where
        ``X_all`` and ``Y_all`` are the feature and response arrays,
        ``sign`` is an array with one +1/-1 entry per response column
        (presumably the optimisation direction; confirm with the caller),
        and ``keys_response`` lists the response column names.

    Raises:
        KeyError: If the cache is missing and ``CITRINATION_API_KEY`` is
            not set in the environment.
    """
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared',
        'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1  # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first column of each cached CSV is the index written by to_csv
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        # Raw response columns, as named in the downloaded records
        keys_original = [
            'Seebeck coefficient',
            'Electrical resistivity',
            'Thermal conductivity'
        ]

        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search

        # Dataset to query
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))

        query_result = search_client.pif_search(system_query)
        print(" Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]

        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)

        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]
        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print(" Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only rows where every response is present
        index_valid_all = df_data[keys_original].dropna().index.values
        X_all = X[index_valid_all, :]
        Y_all = df_data[keys_original].iloc[index_valid_all].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print(" Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all,
                     columns=keys_response).to_csv(results_dir +
                                                   file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix
class GetFeatures:  # pylint:disable=too-many-instance-attributes
    """Featurize every metal site of one structure.

    Each metal site is passed to a ``MultipleFeaturizer`` built from
    ``CrystalNNFingerprint``, ``LocalPropertyStatsNew`` and
    ``GaussianSymmFunc``; the results are collected in ``self.features``.
    """

    def __init__(self, structure: "Structure",
                 outpath: Union[str, Path, None]):
        """Set up logging, the output location and the featurizer.

        Args:
            structure (Structure): Pymatgen Structure object
            outpath (Union[str, Path, None]): directory to which the features
                will be dumped. ``None`` or ``""`` means no directory is
                created and the output file name is relative.
        """
        featurizelogger = logging.getLogger("Featurize")
        featurizelogger.setLevel(logging.INFO)
        logging.basicConfig(
            format="%(filename)s: %(message)s",
            level=logging.INFO,
        )

        # Normalise None to "" so the os.path.join calls below (and in
        # from_file) do not fail with a TypeError.
        self.outpath = "" if outpath is None else outpath
        if self.outpath != "" and not os.path.exists(self.outpath):
            # exist_ok tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(self.outpath, exist_ok=True)

        self.logger = featurizelogger
        self.path = None
        self.structure = structure
        self.metal_sites = []
        self.metal_indices = []
        self.features = []

        # No source path is known here (from_file sets path and outname
        # afterwards), so the pickle is named after the formula.
        self.outname = os.path.join(
            self.outpath,
            "".join([self.structure.formula.replace(" ", "_"), ".pkl"]),
        )

        self.featurizer = MultipleFeaturizer([
            CrystalNNFingerprint.from_preset("ops"),
            LocalPropertyStatsNew.from_preset("interpretable"),
            GaussianSymmFunc(),
        ])

    @classmethod
    def from_file(cls, structurepath: Union[str, Path],
                  outpath: Union[str, Path]) -> "GetFeatures":
        """Construct a featurizer class from path to structure and an output path

        Args:
            structurepath (Union[str, Path]): Path to structure file
            outpath (Union[str, Path]): Path to which the outputs should be written.

        Returns:
            GetFeatures: Instance whose output file is named after the stem
                of ``structurepath``

        Raises:
            ValueError: If the structure file could not be read
        """
        s = cls._read_safe(structurepath)
        featureclass = cls(s, outpath)
        featureclass.path = structurepath
        featureclass.outname = os.path.join(
            featureclass.outpath,
            "".join([Path(featureclass.path).stem, ".pkl"]))
        return featureclass

    @classmethod
    def from_string(cls, structurestring: str,
                    outpath: Union[str, Path]) -> "GetFeatures":
        """Constructor for the webapp, using a string of a structure file, e.g., a CIF

        Args:
            structurestring (str): File content of a CIF as string
            outpath (Union[str, Path]): Path to which the output should be written.

        Raises:
            ValueError: In case the CIF could not be parsed

        Returns:
            GetFeatures: Instance built from the first parsed structure
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                cp = CifParser.from_string(structurestring)
                s = cp.get_structures()[0]
            except Exception as execp:
                raise ValueError("Pymatgen could not parse ciffile") from execp
            else:
                return cls(s, outpath)

    @staticmethod
    def _read_safe(path: Union[str, Path]):
        """Read a structure file, failing early if it cannot be loaded

        Args:
            path (Union[str, Path]): Path to the structure file

        Returns:
            The structure produced by ``AseAtomsAdaptor.get_structure``

        Raises:
            ValueError: If reading or converting the file fails
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                atoms = read(path)
                structure = AseAtomsAdaptor.get_structure(
                    atoms)  # ase parser is a bit more robust
                return structure
            except Exception as execpt:  # pylint: disable=broad-except
                raise ValueError("Could not read structure") from execpt

    def _get_metal_sites(self):
        """Stores all metal sites of structure (and their indices) to list

        The lists are rebuilt from scratch so repeated calls do not
        accumulate duplicate sites.
        """
        self.metal_sites = []
        self.metal_indices = []
        for idx, site in enumerate(self.structure):
            if site.species.elements[0].is_metal:
                self.metal_sites.append(site)
                self.metal_indices.append(idx)

    def _get_feature_vectors(self, site):
        """Runs matminer on one site

        Args:
            site: value forwarded as second argument to
                ``self.featurizer.featurize`` (callers in this class pass
                the index of the site in the structure)
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X = self.featurizer.featurize(self.structure, site)
        return X

    def _dump_features(self):
        """Dumps all the features into one pickle file"""
        with open(self.outname, "wb") as filehandle:
            pickle.dump(list(self.features), filehandle)

    def _append_site_features(self):
        """Featurizes the stored metal sites, appending one dict per site"""
        self.logger.info("iterating over {} metal sites".format(
            len(self.metal_sites)))
        for site_index, metal_site in zip(self.metal_indices,
                                          self.metal_sites):
            self.features.append({
                "metal": metal_site.species_string,
                "feature": self._get_feature_vectors(site_index),
                "coords": metal_site.coords,
            })

    def return_features(self) -> List[dict]:
        """Runs featurization and returns a list of dictionaries

        Errors during featurization are logged, not raised; features
        collected before the error are still returned.

        Returns:
            List[dict]: List of dictionaries with the keys "metal",
                "feature" and "coords", one per metal site
        """
        self._get_metal_sites()
        # Start from a fresh list so a second call does not return duplicates
        self.features = []
        try:
            self._append_site_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error("Could not featurize because of {}".format(e))
        return self.features

    def _run_featurization(self):
        """Deprecated: featurizes all metal sites and pickles the result"""
        warnings.warn(
            "This method is deprecated, and will be removed in a future release",
            DeprecationWarning,
        )
        self._get_metal_sites()
        self.features = []
        try:
            self._append_site_features()
            self._dump_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error("could not featurize {} because of {}".format(
                self.path, e))
class GetFeatures:
    """Featurize every metal site of one structure.

    Each metal site is passed to a ``MultipleFeaturizer`` built from
    ``CrystalNNFingerprint``, ``LocalPropertyStatsNew`` and
    ``GaussianSymmFunc``; the results are collected in ``self.features``.
    """

    def __init__(self, structure, outpath):
        """Set up logging, the output location and the featurizer.

        Args:
            structure: structure object; it is iterated over its sites and
                its ``formula`` attribute names the output file
            outpath (str): path to which the features will be dumped.
                ``None`` or ``''`` means no directory is created.
        """
        featurizelogger = logging.getLogger('Featurize')
        featurizelogger.setLevel(logging.INFO)
        logging.basicConfig(
            format='%(filename)s: %(message)s',
            level=logging.INFO,
        )

        # Normalise None to '' so the os.path.join calls below (and in
        # from_file) do not fail with a TypeError.
        self.outpath = '' if outpath is None else outpath
        if self.outpath != '' and not os.path.exists(self.outpath):
            # exist_ok tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(self.outpath, exist_ok=True)

        self.logger = featurizelogger
        self.path = None
        self.structure = structure
        self.metal_sites = []
        self.metal_indices = []
        self.features = []

        # No source path is known here (from_file sets path and outname
        # afterwards), so the pickle is named after the formula.
        self.outname = os.path.join(
            self.outpath,
            ''.join([self.structure.formula.replace(' ', '_'), '.pkl']),
        )

        self.featurizer = MultipleFeaturizer([
            CrystalNNFingerprint.from_preset('ops'),
            LocalPropertyStatsNew.from_preset('interpretable'),
            GaussianSymmFunc(),
        ])

    @classmethod
    def from_file(cls, structurepath, outpath):
        """Construct a featurizer class from path to structure and an output path

        The output file is named after the stem of ``structurepath``.

        Raises:
            ValueError: If the structure file could not be read
        """
        s = cls.read_safe(structurepath)
        featureclass = cls(s, outpath)
        featureclass.path = structurepath
        featureclass.outname = os.path.join(
            featureclass.outpath,
            ''.join([Path(featureclass.path).stem, '.pkl']))
        return featureclass

    @classmethod
    def from_string(cls, structurestring, outpath):
        """Constructor for the webapp: builds the instance from CIF file content

        Raises:
            ValueError: If the CIF string could not be parsed
        """
        from pymatgen.io.cif import CifParser

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                cp = CifParser.from_string(structurestring)
                s = cp.get_structures()[0]
            except Exception as exc:
                # Chain the cause so the parser error stays visible
                raise ValueError('Pymatgen could not parse ciffile') from exc
            else:
                return cls(s, outpath)

    @staticmethod
    def read_safe(path):
        """Read a structure file, failing early if it cannot be loaded

        Returns:
            The structure produced by ``AseAtomsAdaptor.get_structure``

        Raises:
            ValueError: If reading or converting the file fails
        """
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                atoms = read(path)
                structure = AseAtomsAdaptor.get_structure(
                    atoms)  # ase parser is a bit more robust
                return structure
            except Exception as exc:  # pylint: disable=broad-except
                raise ValueError('Could not read structure') from exc

    def get_metal_sites(self):
        """Stores all metal sites of structure (and their indices) to list

        The lists are rebuilt from scratch so repeated calls do not
        accumulate duplicate sites.
        """
        self.metal_sites = []
        self.metal_indices = []
        for idx, site in enumerate(self.structure):
            if site.species.elements[0].is_metal:
                self.metal_sites.append(site)
                self.metal_indices.append(idx)

    def get_feature_vectors(self, site):
        """Runs matminer on one site

        ``site`` is forwarded as second argument to
        ``self.featurizer.featurize``; callers in this class pass the index
        of the site in the structure.
        """
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            X = self.featurizer.featurize(self.structure, site)
        return X

    def dump_features(self):
        """Dumps all the features into one pickle file"""
        with open(self.outname, 'wb') as filehandle:
            pickle.dump(list(self.features), filehandle)

    def _append_site_features(self):
        """Featurizes the stored metal sites, appending one dict per site"""
        self.logger.info('iterating over {} metal sites'.format(
            len(self.metal_sites)))
        for site_index, metal_site in zip(self.metal_indices,
                                          self.metal_sites):
            self.features.append({
                'metal': metal_site.species_string,
                'feature': self.get_feature_vectors(site_index),
                'coords': metal_site.coords,
            })

    def return_features(self):
        """Runs featurization and returns the list of per-site feature dicts

        Errors during featurization are logged, not raised; features
        collected before the error are still returned.

        Returns:
            list: dictionaries with the keys 'metal', 'feature' and
                'coords', one per metal site
        """
        self.get_metal_sites()
        # Start from a fresh list so a second call does not return duplicates
        self.features = []
        try:
            self._append_site_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error('could not featurize because of {}'.format(e))
        return self.features

    def run_featurization(self):
        """Featurizes all metal sites and pickles the result; errors are logged"""
        self.get_metal_sites()
        self.features = []
        try:
            self._append_site_features()
            self.dump_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error('could not featurize {} because of {}'.format(
                self.path, e))
def test_multiple(self):
    """Check MultipleFeaturizer output, metadata and dataframe featurization.

    The dataframe returned by ``featurize_dataframe`` is what gets
    inspected, so the assertions do not depend on the call modifying its
    input in place.
    """
    # test iterating over both entries and featurizers
    for iter_entries in [True, False]:
        multi_f = MultipleFeaturizer([self.single, self.multi],
                                     iterate_over_entries=iter_entries)
        data = self.make_test_data()

        self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

        self.assertArrayEqual(['A'], multi_f.citations())

        implementors = multi_f.implementors()
        self.assertIn('Us', implementors)
        self.assertIn('Them', implementors)
        self.assertEqual(2, len(implementors))

        # Ensure BaseFeaturizer operation without overridden featurize_dataframe
        with warnings.catch_warnings(record=True) as w:
            data = multi_f.featurize_dataframe(data, 'x')
            self.assertEqual(len(w), 0)
        self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
        self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
        self.assertArrayAlmostEqual(data['z'], [3, 4, 5])

    # A featurizer whose feature is a matrix must also work inside a
    # MultipleFeaturizer
    f = MatrixFeaturizer()
    multi_f = MultipleFeaturizer([self.single, self.multi, f])
    data = self.make_test_data()
    with warnings.catch_warnings(record=True) as w:
        data = multi_f.featurize_dataframe(data, 'x')
        self.assertEqual(len(w), 0)

    self.assertArrayAlmostEqual(data['representation'][0],
                                [[1.0, 0.0], [0.0, 1.0]])