def featurize_site(self, df): """Applies the preset site featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ aliases = { "GeneralizedRadialDistributionFunction": "GeneralizedRDF", "AGNIFingerprints": "AGNIFingerPrint", "BondOrientationalParameter": "BondOrientationParameter", "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc", } df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] return clean_df(df)
def featurize_structure(self, df): """Applies the preset structural featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_structure(df) dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][ "distances" ][:50] for i, d in enumerate(dist): _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format( d ) df[_rdf_key] = df[ "RadialDistributionFunction|radial distribution function" ].apply(lambda x: x["distribution"][i]) df = df.drop("RadialDistributionFunction|radial distribution function", axis=1) _crystal_system = { "cubic": 1, "tetragonal": 2, "orthorombic": 3, "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7, } def _int_map(x): if x == np.nan: return 0 elif x: return 1 else: return 0 df["GlobalSymmetryFeatures|crystal_system"] = df[ "GlobalSymmetryFeatures|crystal_system" ].map(_crystal_system) df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ "GlobalSymmetryFeatures|is_centrosymmetric" ].map(_int_map) return clean_df(df)
def featurize_bandstructure(self, df): """Applies the preset band structure featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_bandstructure(df) def _int_map(x): if str(x) == "False": return 0 elif str(x) == "True": return 1 df["BandFeaturizer|is_gap_direct"] = df[ "BandFeaturizer|is_gap_direct"].map(_int_map) df = df.drop(["bandstructure"], axis=1) return clean_df(df)
def featurize_composition(self, df): """Applies the preset composition featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_composition(df) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df["AtomicOrbitals|HOMO_character"] = df[ "AtomicOrbitals|HOMO_character"].map(_orbitals) df["AtomicOrbitals|LUMO_character"] = df[ "AtomicOrbitals|LUMO_character"].map(_orbitals) df["AtomicOrbitals|HOMO_element"] = df[ "AtomicOrbitals|HOMO_element"].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z) df["AtomicOrbitals|LUMO_element"] = df[ "AtomicOrbitals|LUMO_element"].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z) return clean_df(df)
def featurize_dos(self, df): """Applies the presetdos featurizers to the input dataframe, renames some fields and cleans the output dataframe. """ df = super().featurize_dos(df) hotencodeColumns = [ "DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1" ] one_hot = pd.get_dummies(df[hotencodeColumns]) df = df.drop(hotencodeColumns, axis=1).join(one_hot) _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} df["DOSFeaturizer|vbm_character_1"] = df[ "DOSFeaturizer|vbm_character_1"].map(_orbitals) df["DOSFeaturizer|cbm_character_1"] = df[ "DOSFeaturizer|cbm_character_1"].map(_orbitals) # Splitting one feature into several floating features # e.g. number;number;number into three columns splitColumns = [ "DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1" ] for column in splitColumns: try: newColumns = df[column].str.split(";", n=2, expand=True) for i in range(0, 3): df[column + "_" + str(i)] = np.array(newColumns[i]).astype( np.float) except: continue df = df.drop(splitColumns, axis=1) df = df.drop(["dos"], axis=1) return clean_df(df)