def test_conversion_multiindex(self):
    """Check where StrToComposition writes its output when multiindex=True.

    Covers a flat input frame, a two-level input frame with the default and
    a custom target column, conversion with ``target_col_id=None`` and
    ``overwrite_data=True``, and the same conversion with error reporting
    switched on.
    """
    formulas = {'comp_str': ["Fe2", "MnO2"]}
    expected = [Composition("Fe2"), Composition("MnO2")]
    source_key = ("custom", "comp_str")

    def two_level_frame():
        # Fresh frame whose only column is relabelled ("custom", "comp_str").
        frame = DataFrame(data=formulas)
        frame.columns = MultiIndex.from_product(
            (["custom"], frame.columns.values))
        return frame

    # Flat columns in, default target column out.
    flat = DataFrame(data=formulas)
    flat = StrToComposition().featurize_dataframe(
        flat, 'comp_str', multiindex=True)
    self.assertEqual(
        flat[("StrToComposition", "composition")].tolist(), expected)

    # Two-level columns in, default target column out.
    nested = StrToComposition().featurize_dataframe(
        two_level_frame(), source_key, multiindex=True)
    self.assertEqual(
        nested[("StrToComposition", "composition")].tolist(), expected)

    # Two-level columns in, custom target column name.
    converter = StrToComposition(target_col_id='test')
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True)
    self.assertEqual(
        nested[("StrToComposition", "test")].tolist(), expected)

    # if two level multiindex provided as target, it should be written there
    # here we test converting multiindex in place
    converter = StrToComposition(target_col_id=None, overwrite_data=True)
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True)
    converted = nested[source_key]
    self.assertEqual(converted[converted.columns[3]].tolist(), expected)

    # Try inplace multiindex conversion with return errors
    converter = StrToComposition(target_col_id=None, overwrite_data=True)
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True,
        return_errors=True, ignore_errors=True)
    error_column = nested[("custom", "StrToComposition Exceptions")]
    self.assertTrue(all(error_column.isnull()))
def has_polymorphs(self):
    """Report whether the task's raw data holds repeated compositions.

    The input column is converted to reduced compositions (``reduce=True``)
    and the number of distinct values is compared with the number of rows.

    Returns:
        (bool) True if at least two rows share a composition, i.e. the
            data contains polymorphs; False otherwise.

    Raises:
        ValueError: If the metadata input type is neither "composition"
            nor "structure".
    """
    checker_key = "pmg_composition"
    self._check_is_loaded()

    input_type = self.metadata.input_type
    if input_type == "composition":
        column = "composition"
        converter = StrToComposition(target_col_id=checker_key, reduce=True)
    elif input_type == "structure":
        column = "structure"
        converter = StructureToComposition(
            target_col_id=checker_key, reduce=True)
    else:
        raise ValueError(
            "Cannot check for polymorphs without input type in "
            "(structure, composition)!")

    featurized = converter.featurize_dataframe(self.df, column)
    comps = featurized[checker_key].values

    # Any duplicate collapses in the set, making it shorter than the column.
    return len(set(comps)) != len(comps)
def test_conversion_multiindex(self):
    """Check where StrToComposition writes its output when multiindex=True.

    Covers a flat input frame, a two-level input frame with the default and
    a custom target column, overwriting the source column
    (``target_col_id=None``, ``overwrite_data=True``, ``inplace=False``),
    and the overwrite case with error reporting switched on.
    """
    formulas = {'comp_str': ["Fe2", "MnO2"]}
    expected = [Composition("Fe2"), Composition("MnO2")]
    source_key = ("custom", "comp_str")

    def two_level_frame():
        # Fresh frame whose only column is relabelled ("custom", "comp_str").
        frame = DataFrame(data=formulas)
        frame.columns = MultiIndex.from_product(
            (["custom"], frame.columns.values))
        return frame

    # Flat columns in, default target column out.
    flat = DataFrame(data=formulas)
    flat = StrToComposition().featurize_dataframe(
        flat, 'comp_str', multiindex=True)
    self.assertEqual(
        flat[("StrToComposition", "composition")].tolist(), expected)

    # Two-level columns in, default target column out.
    nested = StrToComposition().featurize_dataframe(
        two_level_frame(), source_key, multiindex=True)
    self.assertEqual(
        nested[("StrToComposition", "composition")].tolist(), expected)

    # Two-level columns in, custom target column name.
    converter = StrToComposition(target_col_id='test')
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True)
    self.assertEqual(
        nested[("StrToComposition", "test")].tolist(), expected)

    # if two level multiindex provided as target, it should be written there
    # here we test converting multiindex in place
    converter = StrToComposition(target_col_id=None, overwrite_data=True)
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True, inplace=False)
    self.assertEqual(nested[source_key].tolist(), expected)

    # Try inplace multiindex conversion with return errors
    converter = StrToComposition(target_col_id=None, overwrite_data=True)
    nested = converter.featurize_dataframe(
        two_level_frame(), source_key, multiindex=True,
        return_errors=True, ignore_errors=True)
    error_column = nested[("custom", "StrToComposition Exceptions")]
    self.assertTrue(all(error_column.isnull()))
class FeatureGenerator:
    """
        A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        # Composition featurizers that are applied together in generate().
        featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(featurizers)
        self.str2composition = StrToComposition()

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
            Generate features from a dataframe with a "formula" column that
            contains chemical formulas of the compositions.

            df : a dataframe with a column named "formula"
            ignore_errors : forwarded to both featurization calls

            Returns the featurized dataframe with an extra "NComp" column
            holding len() of each entry in the "composition" column.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)

        # Rows containing any missing value are discarded before featurizing.
        complete_rows = with_composition.dropna()

        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)
        return featurized
def test_conversion_overwrite(self):
    """Check StrToComposition when the target column equals the source column.

    With ``overwrite_data=False`` a ValueError is expected for both
    ``inplace`` settings; with ``overwrite_data=True`` the frame must end
    up with the single column "comp_str".
    """
    # Test with overwrite
    df = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

    guarded = StrToComposition(target_col_id='comp_str', overwrite_data=False)
    for inplace_flag in (True, False):
        with self.assertRaises(ValueError):
            guarded.featurize_dataframe(df, 'comp_str', inplace=inplace_flag)

    overwriting = StrToComposition(target_col_id='comp_str',
                                   overwrite_data=True)

    # inplace=True: the copy handed in is the frame that gets inspected.
    dfres_ipt = df.copy()
    overwriting.featurize_dataframe(dfres_ipt, 'comp_str', inplace=True)
    self.assertListEqual(dfres_ipt.columns.tolist(), ["comp_str"])

    # inplace=False: the returned frame is the one that gets inspected.
    dfres_ipf = overwriting.featurize_dataframe(df, 'comp_str', inplace=False)
    self.assertListEqual(dfres_ipf.columns.tolist(), ["comp_str"])
def test_str_to_composition(self):
    """Check plain and reduced string-to-Composition conversion."""
    frame = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

    # Default settings: result is read back from the "composition" column.
    frame = StrToComposition().featurize_dataframe(frame, 'comp_str')
    plain_expected = [Composition("Fe2"), Composition("MnO2")]
    self.assertEqual(frame["composition"].tolist(), plain_expected)

    # reduce=True with a custom target column: "Fe2" is expected as "Fe".
    reducer = StrToComposition(reduce=True, target_col_id='composition_red')
    frame = reducer.featurize_dataframe(frame, 'comp_str')
    reduced_expected = [Composition("Fe"), Composition("MnO2")]
    self.assertEqual(frame["composition_red"].tolist(), reduced_expected)
class FeatureGenerator:
    """
        A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        # Composition featurizers that are applied together in generate().
        featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(featurizers)
        self.str2composition = StrToComposition()

    def generate(self,
                 df: pd.DataFrame,
                 ignore_errors: bool = False,
                 drop_mode=True):
        """
            Generate features from a dataframe with a "formula" column that
            contains chemical formulas of the compositions.

            df : a dataframe with a column named "formula"
            ignore_errors : forwarded to both featurization calls
            drop_mode : when true, drop every column whose name contains
                "mode" and starts with "Magpie" (the properties produced by
                the mode aggregation function)

            Returns the featurized dataframe with an extra "NComp" column
            holding len() of each entry in the "composition" column.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)

        # Rows containing any missing value are discarded before featurizing.
        complete_rows = with_composition.dropna()

        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)

        if not drop_mode:
            return featurized

        mode_columns = [
            name for name in featurized.columns
            if "mode" in name and name.startswith("Magpie")
        ]
        return featurized.drop(columns=mode_columns)
def _tidy_column(self, df, featurizer_type):
    """
    Various conversions to homogenize columns for featurization input.
    For example, take a column of compositions and ensure they are decorated
    with oxidation states, are not strings, etc.

    The type of the column is decided from its FIRST entry only. For the
    composition column, strings and dicts are converted to Composition
    objects and, if ``self.guess_oxistates`` is set, oxidation states are
    guessed. For any other column, dicts are converted with DictToObject.

    Side effect: if guessing composition oxidation states fails, the class
    names of ``CompositionFeaturizers().need_oxi`` are appended to
    ``self.exclude``.

    Args:
        df (pandas.DataFrame)
        featurizer_type: The key defining the featurizer input. For example,
            composition featurizers should have featurizer_type of
            "composition".

    Returns:
        df (pandas.DataFrame): DataFrame with featurizer_type column
            ready for featurization.

    Raises:
        ValueError: If a non-composition column holds strings.
    """
    # todo: Make the following conversions more robust (no [0] type checking)
    # Only the first row is inspected; the rest of the column is assumed to
    # have the same type.
    type_tester = df[featurizer_type].iloc[0]

    if featurizer_type == self.composition_col:
        # Convert formulas to composition objects
        if isinstance(type_tester, str):
            self.logger.info(
                self._log_prefix +
                "Compositions detected as strings. Attempting "
                "conversion to Composition objects...")
            # The source column is overwritten with the converted objects;
            # conversion errors are ignored (ignore_errors=True).
            stc = StrToComposition(overwrite_data=True,
                                   target_col_id=featurizer_type)
            df = stc.featurize_dataframe(df, featurizer_type,
                                         multiindex=self.multiindex,
                                         ignore_errors=True,
                                         inplace=False)

        elif isinstance(type_tester, dict):
            self.logger.info(self._log_prefix +
                             "Compositions detected as dicts. Attempting "
                             "conversion to Composition objects...")
            # Note: this assigns into the caller's dataframe directly.
            df[featurizer_type] = [
                Composition.from_dict(d) for d in df[featurizer_type]
            ]

        # Convert non-oxidstate containing comps to oxidstate comps
        if self.guess_oxistates:
            self.logger.info(
                self._log_prefix +
                "Guessing oxidation states of compositions, as"
                " they were not present in input.")
            # NOTE(review): the meaning of the negative max_sites value is
            # defined by the featurizer, not visible here - confirm.
            cto = CompositionToOxidComposition(
                target_col_id=featurizer_type, overwrite_data=True,
                return_original_on_error=True, max_sites=-50)
            try:
                df = cto.featurize_dataframe(df, featurizer_type,
                                             multiindex=self.multiindex,
                                             inplace=False)
            except Exception as e:
                # Best effort: on any failure, log it and exclude the
                # featurizers listed in CompositionFeaturizers().need_oxi.
                self.logger.info(self._log_prefix +
                                 "Could not decorate oxidation states due "
                                 "to {}. Excluding featurizers based on "
                                 "composition oxistates".format(e))
                classes_require_oxi = [
                    c.__class__.__name__ for c in
                    CompositionFeaturizers().need_oxi
                ]
                self.exclude.extend(classes_require_oxi)

    else:
        # Convert structure/bs/dos dicts to objects (robust already)
        if isinstance(type_tester, (dict, str)):
            # Unlike the other log calls, the prefix is capitalized here.
            self.logger.info(self._log_prefix.capitalize() +
                             "{} detected as string or dict. Attempting "
                             "conversion to {} objects..."
                             "".format(featurizer_type, featurizer_type))
            # Strings are logged above but cannot be converted.
            if isinstance(type_tester, str):
                raise ValueError("{} column is type {}. Cannot convert."
                                 "".format(featurizer_type,
                                           type(type_tester)))
            dto = DictToObject(overwrite_data=True,
                               target_col_id=featurizer_type)
            df = dto.featurize_dataframe(df, featurizer_type, inplace=False)

            # NOTE(review): indentation was not recoverable from the
            # reviewed text; this step is placed inside the dict-conversion
            # branch - confirm against the original file.
            # Decorate with oxidstates
            if featurizer_type == self.structure_col and \
                    self.guess_oxistates:
                self.logger.info(
                    self._log_prefix +
                    "Guessing oxidation states of structures if they were "
                    "not present in input.")
                sto = StructureToOxidStructure(
                    target_col_id=featurizer_type, overwrite_data=True,
                    return_original_on_error=True, max_sites=-50)
                try:
                    df = sto.featurize_dataframe(
                        df, featurizer_type, multiindex=self.multiindex,
                        inplace=False)
                except Exception as e:
                    # Failure is only logged; unlike the composition
                    # branch, nothing is added to self.exclude.
                    self.logger.info(
                        self._log_prefix +
                        "Could not decorate oxidation states on structures "
                        "due to {}.".format(e))
    return df
# Bare string literal with no runtime effect; it lists column names -
# presumably the columns of df at this point (verify against the loader).
""" ['_id', 'material_id', 'formula', 'nsites', 'space_group', 'volume', 'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt', 'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor', 'elastic_tensor', 'elastic_tensor_original', 'cif', 'kpoint_density', 'poscar'] """

# Remove the listed columns from df (df is defined before this chunk).
unwanted_columns = ['_id', 'material_id', 'nsites', 'volume',
                    'cif', 'kpoint_density', 'poscar']
df = df.drop(unwanted_columns, axis=1)

# Step 1: featurize the 'formula' column with StrToComposition; the next
# step reads its result from the 'composition' column.
from matminer.featurizers.conversions import StrToComposition
sc_feat = StrToComposition()
df = sc_feat.featurize_dataframe(df, col_id='formula')

# Step 2: element-property features from the 'magpie' preset, computed from
# the 'composition' column.
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name='magpie')
df = ep_feat.featurize_dataframe(df, col_id='composition')

# Step 3: featurize 'composition' with CompositionToOxidComposition; the
# next step reads its result from the 'composition_oxid' column.
from matminer.featurizers.conversions import CompositionToOxidComposition
co_feat = CompositionToOxidComposition()
df = co_feat.featurize_dataframe(df, col_id='composition')

# Step 4: oxidation-state features computed from 'composition_oxid'.
from matminer.featurizers.composition import OxidationStates
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, col_id='composition_oxid')