def test_coulomb_matrix(self):
    """Exercise CoulombMatrix in flattened mode and in full-matrix mode.

    The flattened featurizer must refuse to featurize before being fitted,
    and after fitting must produce the two "coulomb matrix eig" columns
    checked below for the diamond and NaCl fixtures. The full matrix is
    checked on an acetylene molecule, with and without diagonal elements.
    """
    # flat
    cm = CoulombMatrix(flatten=True)
    df = pd.DataFrame({"s": [self.diamond, self.nacl]})
    with self.assertRaises(NotFittedError):
        df = cm.featurize_dataframe(df, "s")
    df = cm.fit_featurize_dataframe(df, "s")
    labels = cm.feature_labels()
    self.assertListEqual(labels, ["coulomb matrix eig 0", "coulomb matrix eig 1"])
    self.assertArrayAlmostEqual(df[labels].iloc[0], [49.169453, 24.546758], decimal=5)
    self.assertArrayAlmostEqual(df[labels].iloc[1], [153.774731, 452.894322], decimal=5)

    # matrix
    species = ["C", "C", "H", "H"]
    coords = [[0, 0, 0], [0, 0, 1.203], [0, 0, -1.06], [0, 0, 2.263]]
    acetylene = Molecule(species, coords)
    morig = CoulombMatrix(flatten=False).featurize(acetylene)
    # Entry [2][0] previously read 2.9368896127, which disagreed with its
    # mirror [0][2] and with the equivalent C-H pair [1][3]/[3][1]; both C-H
    # bonds are 1.06 long in `coords`, so all four entries must match.
    mtarget = [
        [36.858, 15.835391290, 2.995098235, 1.402827813],
        [15.835391290, 36.858, 1.4028278132103624, 2.9950982],
        [2.995098235, 1.402827813, 0.5, 0.159279959],
        [1.4028278132, 2.995098235, 0.159279959, 0.5],
    ]
    # NOTE: int() truncates, so this passes for any norm strictly below 1.
    self.assertAlmostEqual(int(np.linalg.norm(morig - np.array(mtarget))), 0)

    # With diag_elems=False every diagonal entry must be zero.
    m = CoulombMatrix(diag_elems=False, flatten=False).featurize(acetylene)[0]
    for idx in range(4):
        self.assertAlmostEqual(m[idx][idx], 0.0)
def __init__(self, threshold, min_entries=10, metric='minkowski', k=1, n_jobs=1):
    """Set up the metric and its Coulomb-matrix feature pipeline.

    Args:
        threshold (float): Largest distance at which a prediction still counts as "trustable"
        min_entries (int): Number of training entries required before the surrogate can be evaluated
        metric (string): Name of the distance metric
        k (int): How many nearest neighbors to take into account
        n_jobs (int): Thread count used for the distance computation
    """
    super().__init__()

    # Build the featurizer -> scaler pipeline
    # TODO (wardlt): This code is duplicated in the inference engine. Maybe we should let "featurizer" be a param
    coulomb = CoulombMatrix(flatten=True)
    coulomb.set_n_jobs(1)
    steps = [('featurizer', coulomb), ('scaler', RobustScaler())]
    self.cm = Pipeline(steps)

    # Store the remaining settings; the neighbor model starts out unset
    self.threshold, self.min_entries = threshold, min_entries
    self.metric, self.k, self.n_jobs = metric, k, n_jobs
    self.nn_ = None
def __init__(self, n_neighbors: int = 5):
    """Build a Coulomb-matrix -> robust-scaler -> k-NN regression pipeline.

    Args:
        n_neighbors (int): How many neighboring points the NN model uses
    """
    featurizer = CoulombMatrix(flatten=True)
    featurizer.set_n_jobs(1)
    steps = [
        ('featurizer', featurizer),
        ('scaler', RobustScaler()),
        ('model', KNeighborsRegressor(n_neighbors)),
    ]
    super().__init__(Pipeline(steps))
def test_coulomb_matrix(self):
    """Check the Coulomb matrix of acetylene with and without diagonal elements."""
    species = ["C", "C", "H", "H"]
    coords = [[0, 0, 0], [0, 0, 1.203], [0, 0, -1.06], [0, 0, 2.263]]
    acetylene = Molecule(species, coords)
    morig = CoulombMatrix().featurize(acetylene, diag_elems=True)
    # Entry [2][0] previously read 5.55, which disagreed with its mirror
    # [0][2] and with the equivalent C-H pair [1][3]/[3][1]. From `coords`,
    # Z_C * Z_H / 1.06 = 6 / 1.06 = 5.66 for both C-H bonds.
    mtarget = [
        [36.858, 29.925, 5.66, 2.651],
        [29.925, 36.858, 2.651, 5.66],
        [5.66, 2.651, 0.5, 0.301],
        [2.651, 5.66, 0.301, 0.5],
    ]
    # NOTE: int() truncates, so this passes for any norm strictly below 1.
    self.assertAlmostEqual(int(np.linalg.norm(morig - np.array(mtarget))), 0)

    # Featurizing without diag_elems must leave the diagonal at zero.
    m = CoulombMatrix().featurize(acetylene)
    for idx in range(4):
        self.assertAlmostEqual(m[idx][idx], 0.0)
def test_coulomb_matrix(self):
    """Check the Coulomb matrix of acetylene with and without diagonal elements."""
    species = ["C", "C", "H", "H"]
    coords = [[0, 0, 0], [0, 0, 1.203], [0, 0, -1.06], [0, 0, 2.263]]
    acetylene = Molecule(species, coords)
    morig = CoulombMatrix().featurize(acetylene)
    # Entry [2][0] previously read 2.9368896127, which disagreed with its
    # mirror [0][2] and with the equivalent C-H pair [1][3]/[3][1]; both C-H
    # bonds are 1.06 long in `coords`, so all four entries must match.
    mtarget = [
        [36.858, 15.835391290, 2.995098235, 1.402827813],
        [15.835391290, 36.858, 1.4028278132103624, 2.9950982],
        [2.995098235, 1.402827813, 0.5, 0.159279959],
        [1.4028278132, 2.995098235, 0.159279959, 0.5],
    ]
    # NOTE: int() truncates, so this passes for any norm strictly below 1.
    self.assertAlmostEqual(int(np.linalg.norm(morig - np.array(mtarget))), 0)

    # The first positional argument is False here; the diagonal must be zero.
    m = CoulombMatrix(False).featurize(acetylene)[0]
    for idx in range(4):
        self.assertAlmostEqual(m[idx][idx], 0.0)
def test_coulomb_matrix(self):
    """Exercise CoulombMatrix in flattened mode and in full-matrix mode.

    The flattened featurizer must refuse to featurize before being fitted,
    and after fitting must produce the two "coulomb matrix eig" columns
    checked below for the diamond and NaCl fixtures. The full matrix is
    checked on an acetylene molecule, with and without diagonal elements.
    """
    # flat
    cm = CoulombMatrix(flatten=True)
    df = pd.DataFrame({"s": [self.diamond, self.nacl]})
    with self.assertRaises(NotFittedError):
        df = cm.featurize_dataframe(df, "s")
    df = cm.fit_featurize_dataframe(df, "s")
    labels = cm.feature_labels()
    self.assertListEqual(labels, ["coulomb matrix eig 0", "coulomb matrix eig 1"])
    self.assertArrayAlmostEqual(df[labels].iloc[0], [49.169453, 24.546758], decimal=5)
    self.assertArrayAlmostEqual(df[labels].iloc[1], [153.774731, 452.894322], decimal=5)

    # matrix
    species = ["C", "C", "H", "H"]
    coords = [[0, 0, 0], [0, 0, 1.203], [0, 0, -1.06], [0, 0, 2.263]]
    acetylene = Molecule(species, coords)
    morig = CoulombMatrix(flatten=False).featurize(acetylene)
    # Entry [2][0] previously read 2.9368896127, which disagreed with its
    # mirror [0][2] and with the equivalent C-H pair [1][3]/[3][1]; both C-H
    # bonds are 1.06 long in `coords`, so all four entries must match.
    mtarget = [
        [36.858, 15.835391290, 2.995098235, 1.402827813],
        [15.835391290, 36.858, 1.4028278132103624, 2.9950982],
        [2.995098235, 1.402827813, 0.5, 0.159279959],
        [1.4028278132, 2.995098235, 0.159279959, 0.5],
    ]
    # NOTE: int() truncates, so this passes for any norm strictly below 1.
    self.assertAlmostEqual(int(np.linalg.norm(morig - np.array(mtarget))), 0)

    # With diag_elems=False every diagonal entry must be zero.
    m = CoulombMatrix(diag_elems=False, flatten=False).featurize(acetylene)[0]
    for idx in range(4):
        self.assertAlmostEqual(m[idx][idx], 0.0)
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    After featurization the radial distribution function column is expanded
    into one column per distance bin (first 50 bins), the crystal system is
    encoded as an integer and the centrosymmetry flag as 0/1.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying structure featurizers...")

    df = df.copy()

    structure_features = [
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        BagofBonds()
    ]

    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Collapse the (featurizer, feature) MultiIndex into "featurizer|feature".
    df.columns = df.columns.map('|'.join).str.strip('|')

    # Use positional access for the first row: a label lookup (`[0]`) breaks
    # whenever the caller's index is not zero-based.
    dist = df["RadialDistributionFunction|radial distribution function"].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        df[_rdf_key] = df["RadialDistributionFunction|radial distribution function"].apply(lambda x: x['distribution'][i])

    df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

    _crystal_system = {
        "cubic": 1,
        "tetragonal": 2,
        # The correct spelling is what needs to match; the historical
        # misspelling is kept as an alias so existing inputs still map.
        "orthorhombic": 3,
        "orthorombic": 3,
        "hexagonal": 4,
        "trigonal": 5,
        "monoclinic": 6,
        "triclinic": 7
    }

    def _int_map(x):
        # Rows that failed featurization (ignore_errors=True above) may hold
        # NaN, which int() cannot convert; treat them as 0.
        if isinstance(x, (float, np.floating)) and np.isnan(x):
            return 0
        return int(x)

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

    return clean_df(df)
def test_bob(self):
    """Exercise BagofBonds: single fit, padded multi-structure fit, and errors."""
    # Test a single fit and featurization
    scm = SineCoulombMatrix(flatten=False)
    bob = BagofBonds(coulomb_matrix=scm, token=' - ')
    bob.fit([self.ni3al])
    truth1 = [
        235.74041833262768, 1486.4464890775491, 1486.4464890775491,
        1486.4464890775491, 38.69353092306119, 38.69353092306119,
        38.69353092306119, 38.69353092306119, 38.69353092306119,
        38.69353092306119, 83.33991275736257, 83.33991275736257,
        83.33991275736257, 83.33991275736257, 83.33991275736257,
        83.33991275736257
    ]
    truth1_labels = [
        'Al site #0', 'Ni site #0', 'Ni site #1', 'Ni site #2',
        'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
        'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
        'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
        'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5'
    ]
    self.assertArrayAlmostEqual(bob.featurize(self.ni3al), truth1)
    self.assertEqual(bob.feature_labels(), truth1_labels)

    # Test padding from fitting and dataframe featurization
    bob.coulomb_matrix = CoulombMatrix(flatten=False)
    bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
    df = pd.DataFrame({'structures': [self.cscl]})
    df = bob.featurize_dataframe(df, 'structures')
    self.assertEqual(len(df.columns.values), 25)
    self.assertAlmostEqual(df['Cs+ site #0'][0], 7513.468312122532)
    self.assertAlmostEqual(df['Al site #0'][0], 0.0)
    self.assertAlmostEqual(df['Cs+ - Cl- bond #1'][0], 135.74726437398044, 3)
    self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

    # Test error handling for bad fits or null fits
    bob = BagofBonds(CoulombMatrix(flatten=False))
    self.assertRaises(NotFittedError, bob.featurize, self.nacl)
    bob.fit([self.ni3al, self.diamond])
    # A stray trailing line-continuation backslash used to follow this call.
    self.assertRaises(ValueError, bob.featurize, self.nacl)
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning materials
    properties for small datasets' by Pierre-Paul De Breuck, Geoffroy Hautier
    & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        HOMO/LUMO orbital characters are encoded as s=1, p=2, d=3, f=4 and
        HOMO/LUMO elements as atomic numbers (-1 when the value is not a
        string). Infinite and NaN values are replaced by 0 before cleaning.

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one column
        per distance bin (first 50 bins, taken from the first row), the
        crystal system is encoded as an integer and the centrosymmetry flag
        as 0/1 (NaN counts as 0).

        """
        df = super().featurize_structure(df)

        dist = df[
            "RadialDistributionFunction|radial distribution function"].iloc[0][
                'distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"].apply(
                    lambda x: x['distribution'][i])

        df = df.drop("RadialDistributionFunction|radial distribution function",
                     axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # The correct spelling is what needs to match; the historical
            # misspelling is kept as an alias. NOTE(review): orthorhombic
            # rows previously mapped to NaN, so models trained on the old
            # encoding will see a different value for them.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            # NaN never compares equal to anything (not even itself), so an
            # `x == np.nan` test can never fire and NaN, being truthy, used
            # to be encoded as 1. Detect NaN explicitly instead.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped before cleaning.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset combining matminer composition, structure, site,
    DOS and band-structure featurizers, with post-processing that turns the
    categorical outputs into numeric columns.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    # NOTE(review): spelled `oxid_...` (no trailing "e"); confirm this is the
    # attribute name the base class actually reads.
    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )
    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )
    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of jobs on the instance.

        Args:
            n_jobs: value saved as `self._n_jobs`.
        """
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        HOMO/LUMO orbital characters are encoded as s=1, p=2, d=3, f=4 and
        HOMO/LUMO elements as atomic numbers (-1 when the value is not a
        string).
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one column
        per distance bin (first 50 bins, taken from the first row), the
        crystal system is encoded as an integer and the centrosymmetry flag
        as 0/1 (NaN counts as 0).
        """
        df = super().featurize_structure(df)

        dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][
            "distances"
        ][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"
            ].apply(lambda x: x["distribution"][i])

        df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # The correct spelling is what needs to match; the historical
            # misspelling is kept as an alias. NOTE(review): orthorhombic
            # rows previously mapped to NaN, so models trained on the old
            # encoding will see a different value for them.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # NaN never compares equal to anything (not even itself), so an
            # `x == np.nan` test can never fire and NaN, being truthy, used
            # to be encoded as 1. Detect NaN explicitly instead.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset DOS featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The VBM/CBM specie columns are one-hot encoded, the VBM/CBM orbital
        characters are encoded as s=1, p=2, d=3, f=4, and the two
        ";"-separated location columns are split into three float columns
        each. The raw "dos" column is dropped.
        """
        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(0, 3):
                    # Builtin `float` replaces the `np.float` alias, which was
                    # removed in NumPy 1.24; the resulting AttributeError was
                    # swallowed below, so no split column was ever created.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: skip columns that are missing, are not strings,
                # or do not hold three numeric parts.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The direct-gap flag is encoded as 0/1; the raw "bandstructure" column
        is dropped.
        """
        df = super().featurize_bandstructure(df)

        def _int_map(x):
            # Compare string forms; any value other than "False"/"True"
            # falls through and yields None.
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped before cleaning.
        """

        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
import numpy as np
from matminer.featurizers.structure import CoulombMatrix

# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
cen_structures = np.load('centrosymmetric_insulators.npy', allow_pickle=True)

# Build the Coulomb-matrix representation of every centrosymmetric structure
cm = CoulombMatrix()
cen_cm = [cm.featurize(item) for item in cen_structures]
np.save('centrosymmetric_cm_representation.npy', cen_cm)

# Same treatment for the non-centrosymmetric set, reusing the featurizer
non_cen_structures = np.load('non_centrosymmetric_insulators.npy', allow_pickle=True)
non_cen_cm = [cm.featurize(item) for item in non_cen_structures]
np.save('non_centrosymmetric_cm_representation.npy', non_cen_cm)
structlist = []
namelist = []
structs = []
namecolumns = ['structure']
for i in CIFfiles:
    # Converts CIF to pymatgen structure object. Each file is parsed once
    # and the same object is used for both the dataframe row and the fit list
    # (it used to be parsed twice per file).
    parsed_structure = Structure.from_file(directoryname + i)
    structlist.append([parsed_structure])
    namelist.append(os.path.splitext(i)[0])  # Collects all the structure names
    structs.append(parsed_structure)

# Creates Pandas dataframe with data being a list of structures and the row name being the structure name
dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns)

# These two featurizers are fitted on the full structure list before use.
p = PartialRadialDistributionFunction()
p.fit(np.asarray(structs))

c = CoulombMatrix()
c.fit(np.asarray(structs))

erdf = ElectronicRadialDistributionFunction()
erdf.cutoff = 10  # longest diagonal of lattice...I picked a number semi-arbitrarily

# Featurizes the structures
featurizer = MultipleFeaturizer([
    ElementProperty.from_preset('magpie'),
    OxidationStates(),
    AtomicOrbitals(),
    BandCenter(),
    ElectronegativityDiff(),
    DensityFeatures(),
    RadialDistributionFunction(),
    p,
    c,
    erdf
])