def split(self, X, y=None, groups=None):
    """Yield leave-one-out splits that exclude compositionally-close entries.

    Every entry of ``X`` is converted to a ``Composition`` and then to an
    element-fraction vector. For each entry in turn, the training set is
    made of all entries that are *not* within ``self.dist_threshold`` of
    that entry according to a ``NearestNeighbors`` model built with
    ``self.nn_kwargs``; the test set is the entry itself.

    Args:
        X: Entries accepted by ``Composition`` (one per sample).
        y: Unused by this method.
        groups: Unused by this method.

    Yields:
        (train_inds, test_inds): array of training indices and a
            single-element list holding the index of the held-out entry.
    """
    # Element-fraction vector for every entry
    compositions = [Composition(entry) for entry in X]
    fraction_vectors = ElementFraction().featurize_many(compositions, pbar=False)

    # Neighbor lookup fitted on all of the entries
    lookup = NearestNeighbors(**self.nn_kwargs)
    lookup.fit(fraction_vectors)

    every_index = np.arange(0, len(X), 1)

    for test_index, test_vector in enumerate(fraction_vectors):
        # The query is a single point, so exactly one neighbor array comes back
        (within_radius,) = lookup.radius_neighbors(
            [test_vector], self.dist_threshold, return_distance=False
        )

        # Train on everything that is not inside the exclusion radius
        yield np.setdiff1d(every_index, within_radius), [test_index]
def __init__(self, pbar=False):
    """Create the regressor and the composition featurization tools.

    Args:
        pbar: Stored unchanged on the instance as ``self.pbar``.
    """
    # Random forest with 500 trees, using all cores and verbose logging
    self.regressor = RandomForestRegressor(
        n_estimators=500,
        n_jobs=-1,
        verbose=3,
    )

    # Converter from strings to composition objects
    self.stc = StrToComposition()

    # Magpie elemental-property statistics followed by element fractions
    self.featurizer = MultipleFeaturizer(
        [ElementProperty.from_preset("magpie"), ElementFraction()]
    )

    self.pbar = pbar
def featurize(self, comp):
    """
    Get elemental property attributes

    The output is the concatenation of three groups of features, in this
    order: element fractions, one statistic per entry of
    ``self._element_property_feature_labels``, and valence-orbital
    statistics.

    Args:
        comp: Pymatgen composition object

    Returns:
        all_attributes: Specified property statistics of features
    """
    # First 103 features are element fractions, taken from ElementFraction
    fraction_part = ElementFraction().featurize(comp)

    # Next 9 features are statistics on elemental properties. Each label
    # reads "<stat> <property name>": the text before the first space is
    # the statistic, everything after it is the property to look up.
    elements, fractions = zip(*comp.element_composition.items())
    property_part = []
    for label in self._element_property_feature_labels:
        stat, _, attr = label.partition(" ")
        values = [
            self.data_source.get_elemental_property(element, attr)
            for element in elements
        ]
        property_part.append(self.pstats.calc_stat(values, stat, fractions))

    # Final 8 features are statistics on valence orbitals, taken from ValenceOrbital
    orbital_featurizer = ValenceOrbital(
        orbitals=("s", "p", "d", "f"), props=("avg", "frac")
    )
    orbital_part = orbital_featurizer.featurize(comp)

    return fraction_part + property_part + orbital_part
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with composition
    features from matminer.

    Two passes are made: one over the plain composition and one over the
    oxidation-state-decorated composition. Multi-index column names are
    flattened to ``"<featurizer>|<feature>"``, the categorical
    ``AtomicOrbitals`` columns are encoded as numbers, and infinities/NaNs
    are replaced by 0 before the frame is handed to ``clean_df``.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying composition featurizers...")

    df = df.copy()
    df['composition'] = df['structure'].apply(lambda structure: structure.composition)

    # ElectronAffinity and CohesiveEnergy are deliberately left out:
    # these descriptors were not used in the paper preset.
    composition_featurizers = [
        ElementProperty.from_preset("magpie"),
        AtomicOrbitals(),
        BandCenter(),
        Stoichiometry(),
        ValenceOrbital(),
        IonProperty(),
        ElementFraction(),
        TMetalFraction(),
        Miedema(),
        YangSolidSolution(),
        AtomicPackingEfficiency(),
    ]
    df = MultipleFeaturizer(composition_featurizers).featurize_dataframe(
        df, "composition", multiindex=True, ignore_errors=True
    )
    # Flatten the (featurizer, feature) multi-index into "featurizer|feature"
    df.columns = df.columns.map('|'.join).str.strip('|')

    # Second pass on the oxidation-state-decorated composition
    oxidation_featurizers = MultipleFeaturizer(
        [OxidationStates(), ElectronegativityDiff()]
    )
    df = CompositionToOxidComposition().featurize_dataframe(
        df, "Input Data|composition"
    )
    df = oxidation_featurizers.featurize_dataframe(
        df, "composition_oxid", multiindex=True, ignore_errors=True
    )
    df = df.rename(columns={'Input Data': ''})
    df.columns = df.columns.map('|'.join).str.strip('|')

    # Encode the orbital character letters as integers
    orbital_codes = {"s": 1, "p": 2, "d": 3, "f": 4}
    for column in ('AtomicOrbitals|HOMO_character',
                   'AtomicOrbitals|LUMO_character'):
        df[column] = df[column].map(orbital_codes)

    def _atomic_number(symbol):
        # Anything that is not an element symbol string is encoded as -1
        if isinstance(symbol, str):
            return Element(symbol).Z
        return -1

    # Encode the element symbols as atomic numbers
    for column in ('AtomicOrbitals|HOMO_element',
                   'AtomicOrbitals|LUMO_element'):
        df[column] = df[column].apply(_atomic_number)

    df = df.replace([np.inf, -np.inf, np.nan], 0)

    return clean_df(df)
def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6):
    """
    Initialize the featurizer

    Args:
        threshold (float): Threshold to use for determining whether
            a cluster is efficiently packed.
        n_nearest ({int}): Number of nearest clusters to use when
            considering features
        max_types (int): Maximum number of atom types to consider when
            looking for efficient clusters. The process for finding
            efficient clusters is very expensive for large numbers of types
    """
    # User-facing options
    self.threshold = threshold
    self.n_nearest = n_nearest
    self.max_types = max_types

    # Converts a composition into a vector of element fractions; the
    # vector length is measured once by featurizing a dummy composition
    self._el_frac = ElementFraction()
    self._n_elems = len(self._el_frac.featurize(Composition('H')))

    # Source of the elemental radii
    self._data_source = MagpieData()

    # Ideal radius ratio keyed by number of neighbors (3 through 24)
    self.ideal_ratio = {
        3: 0.154701,
        4: 0.224745,
        5: 0.361654,
        6: 0.414214,
        7: 0.518145,
        8: 0.616517,
        9: 0.709914,
        10: 0.798907,
        11: 0.884003,
        12: 0.902113,
        13: 0.976006,
        14: 1.04733,
        15: 1.11632,
        16: 1.18318,
        17: 1.2481,
        18: 1.31123,
        19: 1.37271,
        20: 1.43267,
        21: 1.49119,
        22: 1.5484,
        23: 1.60436,
        24: 1.65915,
    }
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning
    materials properties for small datasets' by Pierre-Paul De Breuck,
    Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The featurizer classes are imported inside the class body, so they are
    only loaded when this preset is defined, and the tuples of instances
    below are class attributes.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The orbital characters (s/p/d/f) are encoded as 1-4, the HOMO/LUMO
        element symbols are encoded as atomic numbers (-1 when the value is
        not a string), and infinities/NaNs are replaced by 0.

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one scalar
        column per distance bin (first 50 bins, taken from the first row),
        the crystal system name is encoded as an integer and the
        centrosymmetry flag as 0/1.

        """
        df = super().featurize_structure(df)

        rdf_column = "RadialDistributionFunction|radial distribution function"

        # The bin positions are read from the first row only
        dist = df[rdf_column].iloc[0]['distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "{}|d_{:.2f}".format(rdf_column, d)
            # Bind `i` at definition time so the lambda never depends on the
            # loop variable's later value
            df[_rdf_key] = df[rdf_column].apply(
                lambda x, i=i: x['distribution'][i])

        df = df.drop(rdf_column, axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # The standard spelling is "orthorhombic"; with only the misspelt
            # key such entries were mapped to NaN. The historical key is kept
            # so inputs using it still map to 3.
            # NOTE(review): confirm the exact string GlobalSymmetryFeatures
            # emits for this crystal system.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            # NaN never compares equal to anything (not even itself), so
            # `x == np.nan` is always False and NaN, being truthy, used to be
            # encoded as 1. Detect it explicitly instead.
            if isinstance(x, float) and np.isnan(x):
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped before cleaning.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        # Keep only the columns that hold at least one non-zero value
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
def test_ape(self):
    """Exercise AtomicPackingEfficiency on the Cu-Zr system.

    The statements must stay in this order: the cache hit/miss counters
    checked below depend on how many lookup tools were requested so far.
    """
    packing = AtomicPackingEfficiency()
    fraction = ElementFraction()
    fraction.set_n_jobs(1)
    cu_zr = [Element('Cu'), Element('Zr')]

    # Test the APE calculation routines
    for expected_ratio, n_neighbors in ((1.11632, 15),
                                        (0.154701, 2),
                                        (1.65915, 27)):
        self.assertAlmostEqual(expected_ratio,
                               packing.get_ideal_radius_ratio(n_neighbors))

    for expected_size, radius_ratio in ((15, 1.116), (3, 0.1), (24, 2)):
        self.assertAlmostEqual(
            expected_size,
            packing.find_ideal_cluster_size(radius_ratio)[0])

    # Test the nearest neighbor lookup tool
    lookup = packing.create_cluster_lookup_tool(cu_zr)

    # Check that the table gets the correct structures
    expected_clusters = [
        Composition(formula)
        for formula in ('CuZr10', 'Cu6Zr6', 'Cu8Zr5', 'Cu13Zr1',
                        'Cu3Zr12', 'Cu8Zr8', 'Cu12Zr5', 'Cu17Zr')
    ]
    distances, _ = lookup.kneighbors(
        fraction.featurize_many(expected_clusters), n_neighbors=1)
    self.assertArrayAlmostEqual([[0]] * 8, distances)
    self.assertEqual(8, lookup._fit_X.shape[0])

    # Swap the order of the clusters, make sure it gets the same list
    lookup_swapped = packing.create_cluster_lookup_tool(
        [Element('Zr'), Element('Cu')])
    self.assertArrayAlmostEqual(sorted(lookup._fit_X.tolist()),
                                sorted(lookup_swapped._fit_X.tolist()))

    # Make sure we had a cache hit
    cache_state = packing._create_cluster_lookup_tool.cache_info()
    self.assertEqual(1, cache_state.misses)
    self.assertEqual(1, cache_state.hits)

    # Change the tolerance, see if it changes the results properly
    packing.threshold = 0.002
    lookup = packing.create_cluster_lookup_tool(cu_zr)
    self.assertEqual(2, lookup._fit_X.shape[0])
    tight_clusters = [Composition('CuZr10'), Composition('Cu3Zr12')]
    distances, _ = lookup.kneighbors(
        fraction.featurize_many(tight_clusters), n_neighbors=1)
    self.assertArrayAlmostEqual([[0]] * 2, distances)

    # Make sure we had a cache miss
    cache_state = packing._create_cluster_lookup_tool.cache_info()
    self.assertEqual(2, cache_state.misses)
    self.assertEqual(1, cache_state.hits)

    # Compute the distances from Cu50Zr50
    cluster_distances = packing.compute_nearest_cluster_distance(
        Composition('CuZr'))
    self.assertArrayAlmostEqual([0.424264, 0.667602, 0.800561],
                                cluster_distances, decimal=6)

    # Compute the optimal APE for Cu50Zr50
    simultaneous = packing.compute_simultaneous_packing_efficiency(
        Composition('Cu50Zr50'))
    self.assertArrayAlmostEqual([0.000233857, 0.003508794], simultaneous)

    # Test the dataframe calculator
    frame = pd.DataFrame({'comp': [Composition('CuZr')]})
    frame = packing.featurize_dataframe(frame, 'comp')
    self.assertEqual(6, len(frame.columns))
    self.assertIn('dist from 5 clusters |APE| < 0.002', frame.columns)
    self.assertAlmostEqual(0.003508794,
                           frame['mean abs simul. packing efficiency'][0])

    # Make sure it works with composition that do not match any efficient clusters
    unmatched = packing.compute_nearest_cluster_distance(Composition('Al'))
    self.assertArrayAlmostEqual([1] * 3, unmatched)
def test_fraction(self):
    """Check the O and Fe fractions computed for the first row of ``self.df``."""
    fractions = ElementFraction().featurize_dataframe(
        self.df, col_id="composition"
    )
    for symbol, expected in (("O", 0.6), ("Fe", 0.4)):
        self.assertEqual(fractions[symbol][0], expected)
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset covering composition, structure, site, density of
    states and band structure featurizers from matminer.

    The featurizer classes are imported inside the class body and the tuples
    of instances below are class attributes. Each ``featurize_*`` method
    delegates to the parent class and then turns the non-numeric outputs
    into numbers before calling ``clean_df``.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),  # Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of jobs on ``self._n_jobs``.

        Args:
            n_jobs: Value saved as ``self._n_jobs``; not interpreted here.
        """
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The orbital characters (s/p/d/f) are encoded as 1-4 and the
        HOMO/LUMO element symbols as atomic numbers (-1 when the value is
        not a string).

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one scalar
        column per distance bin (first 50 bins, taken from the first row),
        the crystal system name is encoded as an integer and the
        centrosymmetry flag as 0/1.

        """
        df = super().featurize_structure(df)

        rdf_column = "RadialDistributionFunction|radial distribution function"

        # The bin positions are read from the first row only
        dist = df[rdf_column].iloc[0]["distances"][:50]
        for i, d in enumerate(dist):
            _rdf_key = "{}|d_{:.2f}".format(rdf_column, d)
            # Bind `i` at definition time so the lambda never depends on the
            # loop variable's later value
            df[_rdf_key] = df[rdf_column].apply(
                lambda x, i=i: x["distribution"][i]
            )

        df = df.drop(rdf_column, axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # The standard spelling is "orthorhombic"; with only the misspelt
            # key such entries were mapped to NaN. The historical key is kept
            # so inputs using it still map to 3.
            # NOTE(review): confirm the exact string GlobalSymmetryFeatures
            # emits for this crystal system.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            # NaN never compares equal to anything (not even itself), so
            # `x == np.nan` is always False and NaN, being truthy, used to be
            # encoded as 1. Detect it explicitly instead.
            if isinstance(x, float) and np.isnan(x):
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset dos featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The VBM/CBM specie columns are one-hot encoded, the orbital
        characters are encoded as 1-4, and each ``"a;b;c"`` location string
        is split into three float columns suffixed ``_0``, ``_1``, ``_2``.
        The original location columns and the ``"dos"`` column are dropped.

        """
        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)
        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(0, 3):
                    # `np.float` no longer exists in current NumPy; it was an
                    # alias of the builtin `float`. With the old bare `except`
                    # the resulting AttributeError was swallowed and these
                    # columns were silently never created.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (AttributeError, KeyError, TypeError, ValueError):
                # Best effort: skip a column that is missing, does not hold
                # strings, or cannot be parsed as three floats.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        ``BandFeaturizer|is_gap_direct`` is encoded as 1/0 and the
        ``"bandstructure"`` column is dropped.

        """
        df = super().featurize_bandstructure(df)

        def _int_map(x):
            # Compare on the string form so that both Python and NumPy
            # booleans are recognised. Any other value falls through and
            # returns None.
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped before cleaning.

        """

        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        # Keep only the columns that hold at least one non-zero value
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
def test_ape(self):
    """Exercise AtomicPackingEfficiency on the Cu-Zr system.

    The statements must stay in this order: the cache hit/miss counters
    checked below depend on how many lookup tools were requested so far.
    """
    f = AtomicPackingEfficiency()
    ef = ElementFraction()
    ef.set_n_jobs(1)

    # Test the APE calculation routines
    self.assertAlmostEqual(1.11632, f.get_ideal_radius_ratio(15))
    self.assertAlmostEqual(0.154701, f.get_ideal_radius_ratio(2))
    self.assertAlmostEqual(1.65915, f.get_ideal_radius_ratio(27))
    self.assertAlmostEqual(15, f.find_ideal_cluster_size(1.116)[0])
    self.assertAlmostEqual(3, f.find_ideal_cluster_size(0.1)[0])
    self.assertAlmostEqual(24, f.find_ideal_cluster_size(2)[0])

    # Test the nearest neighbor lookup tool
    nn_lookup = f.create_cluster_lookup_tool([Element('Cu'), Element('Zr')])

    # Check that the table gets the correct structures
    stable_clusters = [Composition('CuZr10'), Composition('Cu6Zr6'),
                       Composition('Cu8Zr5'), Composition('Cu13Zr1'),
                       Composition('Cu3Zr12'), Composition('Cu8Zr8'),
                       Composition('Cu12Zr5'), Composition('Cu17Zr')]
    ds, _ = nn_lookup.kneighbors(
        ef.featurize_many(stable_clusters),
        n_neighbors=1)
    self.assertArrayAlmostEqual([[0]]*8, ds)
    self.assertEqual(8, nn_lookup._fit_X.shape[0])

    # Swap the order of the clusters, make sure it gets the same list
    nn_lookup_swapped = f.create_cluster_lookup_tool([Element('Zr'),
                                                      Element('Cu')])
    self.assertArrayAlmostEqual(sorted(nn_lookup._fit_X.tolist()),
                                sorted(nn_lookup_swapped._fit_X.tolist()))

    # Make sure we had a cache hit
    self.assertEqual(1, f._create_cluster_lookup_tool.cache_info().misses)
    self.assertEqual(1, f._create_cluster_lookup_tool.cache_info().hits)

    # Change the tolerance, see if it changes the results properly
    f.threshold = 0.002
    nn_lookup = f.create_cluster_lookup_tool([Element('Cu'), Element('Zr')])
    self.assertEqual(2, nn_lookup._fit_X.shape[0])
    ds, _ = nn_lookup.kneighbors(
        ef.featurize_many([Composition('CuZr10'), Composition('Cu3Zr12')]),
        n_neighbors=1)
    self.assertArrayAlmostEqual([[0]]*2, ds)

    # Make sure we had a cache miss
    self.assertEqual(2, f._create_cluster_lookup_tool.cache_info().misses)
    self.assertEqual(1, f._create_cluster_lookup_tool.cache_info().hits)

    # Compute the distances from Cu50Zr50
    mean_dists = f.compute_nearest_cluster_distance(Composition('CuZr'))
    self.assertArrayAlmostEqual([0.424264, 0.667602, 0.800561], mean_dists,
                                decimal=6)

    # Compute the optimal APE for Cu50Zr50
    self.assertArrayAlmostEqual([0.000233857, 0.003508794],
                                f.compute_simultaneous_packing_efficiency(
                                    Composition('Cu50Zr50')
                                ))

    # Test the dataframe calculator. Use the returned frame rather than
    # relying on `df` being modified in place: the return value carries the
    # new columns whether or not the featurizer mutates its input.
    df = pd.DataFrame({'comp': [Composition('CuZr')]})
    df = f.featurize_dataframe(df, 'comp')
    self.assertEqual(6, len(df.columns))
    self.assertIn('dist from 5 clusters |APE| < 0.002', df.columns)
    self.assertAlmostEqual(0.003508794,
                           df['mean abs simul. packing efficiency'][0])

    # Make sure it works with composition that do not match any efficient clusters
    feat = f.compute_nearest_cluster_distance(Composition('Al'))
    self.assertArrayAlmostEqual([1]*3, feat)
class AtomicPackingEfficiency(BaseFeaturizer):
    """
    Packing efficiency based on a geometric theory of the amorphous packing
    of hard spheres.

    This featurizer computes two different kinds of the features. The first
    relate to the distance between a composition and the composition of
    the clusters of atoms expected to be efficiently packed based on a
    theory from
    `Laws et al.<http://www.nature.com/doifinder/10.1038/ncomms9123>`_.
    The second corresponds to the packing efficiency of a system if all atoms
    in the alloy are simultaneously as efficiently-packed as possible.

    The packing efficiency in these models is based on the Atomic Packing
    Efficiency (APE), which measures the difference between the ratio of
    the radii of the central atom to its neighbors and the ideal ratio
    of a cluster with the same number of atoms that has optimal packing
    efficiency. If the difference between the ratios is too large, the APE is
    positive. If the difference is too small, the APE is negative.
    In this implementation the APE is computed as ``1 - ideal / actual``,
    so an ideally packed cluster has an APE of 0.

    Features:
        dist from {k} clusters |APE| < {thr} - The distance between an
            alloy composition and the k clusters that have a packing
            efficiency below thr from ideal
        mean simul. packing efficiency - Mean packing efficiency of all atoms.
            The packing efficiency is measured with respect to ideal (0)
        mean abs simul. packing efficiency - Mean absolute value of the
            packing efficiencies. Closer to zero is more efficiently
            packed

    References:
        [1] K.J. Laws, D.B. Miracle, M. Ferry, A predictive structural model
        for bulk metallic glasses, Nat. Commun. 6 (2015) 8123.
        doi:10.1038/ncomms9123.
    """

    def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6):
        """
        Initialize the featurizer

        Args:
            threshold (float): Threshold to use for determining whether
                a cluster is efficiently packed.
            n_nearest ([int]): Number of nearest clusters to use when
                considering features
            max_types (int): Maximum number of atom types to consider when
                looking for efficient clusters. The process for finding
                efficient clusters is very expensive for large numbers of
                types
        """

        # Store the options
        self.threshold = threshold
        self.n_nearest = n_nearest
        self.max_types = max_types

        # Tool to convert composition objects to fractions as a vector
        self._el_frac = ElementFraction()

        # Get the number of elements in the output of `_el_frac`
        self._n_elems = len(self._el_frac.featurize(Composition('H')))

        # Tool for looking up radii
        self._data_source = MagpieData()

        # Lookup table of ideal radius ratios, keyed by number of neighbors
        self.ideal_ratio = dict(
            [(3, 0.154701), (4, 0.224745), (5, 0.361654), (6, 0.414214),
             (7, 0.518145), (8, 0.616517), (9, 0.709914), (10, 0.798907),
             (11, 0.884003), (12, 0.902113), (13, 0.976006), (14, 1.04733),
             (15, 1.11632), (16, 1.18318), (17, 1.2481), (18, 1.31123),
             (19, 1.37271), (20, 1.43267), (21, 1.49119), (22, 1.5484),
             (23, 1.60436), (24, 1.65915)])

    # `__hash__` and `__eq__` make instances usable as part of the
    # `lru_cache` key of `_create_cluster_lookup_tool`, where `self` is one
    # of the cached arguments. Instances with equal parameters compare equal
    # and therefore share cache entries.
    def __hash__(self):
        return hash(self.threshold)

    def __eq__(self, other):
        if isinstance(other, AtomicPackingEfficiency):
            return self.get_params() == other.get_params()
        # Let Python fall back to its default handling for foreign types
        # instead of implicitly returning None
        return NotImplemented

    def featurize(self, comp):
        """Compute all features for a composition.

        Args:
            comp (Composition): Composition to be featurized
        Returns:
            ([float]) The two simultaneous packing efficiency values followed
            by one cluster distance per entry of ``n_nearest``
        """
        return list(self.compute_simultaneous_packing_efficiency(comp)) + \
            self.compute_nearest_cluster_distance(comp)

    def feature_labels(self):
        """Labels of the features, in the order returned by `featurize`."""
        return ['mean simul. packing efficiency',
                'mean abs simul. packing efficiency'] + [
            f"dist from {k} clusters |APE| < {self.threshold:.3f}"
            for k in self.n_nearest]

    def citations(self):
        """BibTeX entry of the paper describing the underlying theory."""
        return ["@article{Laws2015,"
                "author = {Laws, K. J. and Miracle, D. B. and Ferry, M.},"
                "doi = {10.1038/ncomms9123},"
                "journal = {Nature Communications},"
                "pages = {8123},"
                "title = {{A predictive structural model for bulk metallic glasses}},"
                "url = {http://www.nature.com/doifinder/10.1038/ncomms9123},"
                "volume = {6},"
                "year = {2015}}"]

    def implementors(self):
        """Names of the people who implemented this featurizer."""
        return ['Logan Ward']

    def compute_simultaneous_packing_efficiency(self, comp):
        """Compute the packing efficiency of the system when the neighbor
        shell of each atom has the same composition as the alloy. When this
        criterion is satisfied, it is possible for every atom in this system
        to be simultaneously as efficiently-packed as possible.

        Args:
            comp (Composition): Composition to be assessed
        Returns
            (float) Average APE of all atoms
            (float) Average deviation of the APE of each atom from ideal (0)
        """

        # Compute the average atomic radius of the system
        elements, fractions = zip(*comp.element_composition.items())
        radii = self._data_source.get_elemental_properties(elements,
                                                           'MiracleRadius')
        mean_radius = PropertyStats.mean(radii, fractions)

        # Compute the APE for each cluster
        best_ape = [
            self.find_ideal_cluster_size(r / mean_radius)[1] for r in radii
        ]

        # Return the averages
        return PropertyStats.mean(best_ape, fractions), \
            PropertyStats.mean(np.abs(best_ape), fractions)

    def compute_nearest_cluster_distance(self, comp):
        """Compute the distance between a composition and the nearest
        efficiently-packed clusters.

        Measures the mean :math:`L_2` distance between the alloy composition
        and the :math:`k`-nearest clusters whose Atomic Packing Efficiency
        is within the user-specified threshold of ideal (0). :math:`k` is any
        of the numbers defined in the "n_nearest" parameter of this class.

        If there are fewer than `k` efficient clusters in the system, we use
        the maximum distance between any two compositions (1) for the
        unmatched neighbors.

        Args:
            comp (Composition): Composition of material to evaluate
        Return:
            [float] Average distances
        """

        # Get the most common elements
        elems, _ = zip(*sorted(comp.element_composition.items(),
                               key=lambda x: x[1], reverse=True))

        # Get the cluster lookup tool using the most common elements
        cluster_lookup = self.create_cluster_lookup_tool(
            elems[:self.max_types]
        )

        # Compute the composition vector
        comp_vec = self._el_frac.featurize(comp)

        # Compute the distances
        means = []
        for k in self.n_nearest:
            # Get the nearest clusters
            if cluster_lookup is None:
                dists = (np.array([]),)
                to_lookup = 0
            else:
                to_lookup = min(cluster_lookup._fit_X.shape[0], k)
                dists, _ = cluster_lookup.kneighbors([comp_vec], to_lookup)

            # Pad the list with 1's
            dists = dists[0].tolist() + [1]*(k - to_lookup)

            # Compute the average
            means.append(np.mean(dists))

        return means

    def create_cluster_lookup_tool(self, elements):
        """
        Get the compositions of efficiently-packed clusters in a certain system
        of elements

        The elements are de-duplicated and sorted before the cached
        implementation is called, so the order in which they are passed
        does not affect the cache lookup.

        Args:
            elements ([Element]): Elements in system
        Return:
            (NearNeighbors): Tool to find nearby clusters in this system. None
                if there are no efficiently-packed clusters for this
                combination of elements
        """
        return self._create_cluster_lookup_tool(tuple(sorted(set(elements))))

    # NOTE: the cache is keyed on `self` as well as `elements` and it holds
    # a reference to every instance it has seen.
    @lru_cache()
    def _create_cluster_lookup_tool(self, elements):
        """
        Cached version of `create_cluster_lookup_tool`. Assumes that the
        elements are passed as sorted tuple with no duplicates

        Args:
            elements ([Element]): Elements in system
        Return:
            (NearNeighbors): Tool to find nearby clusters in this system. If
            there are no clusters, this class returns None
        """

        # Get the radii
        radii = self._data_source.get_elemental_properties(elements,
                                                           "MiracleRadius")

        # Get the maximum and minimum cluster sizes
        max_size = self.find_ideal_cluster_size(max(radii) / min(radii))[0]
        min_size = self.find_ideal_cluster_size(min(radii) / max(radii))[0]

        # Prepare a list to hold all possible clusters
        eff_clusters = []

        # Loop through all possible neighbor shells
        for size in range(min_size, max_size + 1):
            # Get the ideal radius ratio for a cluster of this size
            ideal_ratio = self.get_ideal_radius_ratio(size)

            # Get the mean radii and compositions of all possible
            #  combinations of elements in the neighbor shell
            s_radii = itertools.combinations_with_replacement(radii, size)
            s_elems = itertools.combinations_with_replacement(elements, size)

            # Put the results in arrays for fast indexing
            mean_radii = np.array(list(s_radii)).mean(axis=1)
            s_elems = np.array(list(s_elems))

            # For each type of central atom, determine which have an APE
            #  within `self.threshold` of 0
            for center_radius, center_elem in zip(radii, elements):
                # Compute the APE of each cluster
                ape = 1 - np.divide(ideal_ratio, np.divide(center_radius,
                                                           mean_radii))

                # Get those which are within the threshold of 0
                #  and add their composition to the list of OK elements
                for hit in s_elems[np.abs(ape) < self.threshold]:
                    eff_clusters.append([center_elem] + hit.tolist())

        # Compute the composition vectors for all of the efficient clusters.
        # The vector position of an element is its atomic number minus one.
        comps = np.zeros((len(eff_clusters), self._n_elems))
        for i, elems in enumerate(eff_clusters):
            for elem in elems:
                comps[i, elem.Z - 1] += 1
        comps = np.divide(comps, comps.sum(axis=1)[:, None])

        # Return tool to quickly determine distance from efficient clusters
        #  NearNeighbors requires at least 1 entry, so we return None if
        #   there are no nearby clusters
        return NearestNeighbors().fit(comps) if len(comps) > 0 else None

    def find_ideal_cluster_size(self, radius_ratio):
        """
        Get the optimal cluster size for a certain radius ratio

        Finds the number of nearest neighbors :math:`n` that minimizes
        :math:`|1 - rp(n)/r|`, where :math:`rp(n)` is the ideal radius
        ratio for a certain :math:`n` and :math:`r` is the actual ratio.

        Args:
            radius_ratio (float): :math:`r / r_{neighbor}`
        Returns:
            (int) number of neighboring atoms that will be the most
                efficiently packed.
            (float) Optimal APE
        """

        # Loop through cluster sizes from 3 to 24
        best_ape = np.inf
        best_n = None
        for n in range(3, 25):
            # Compute APE, check if it is the best
            ape = 1 - self.get_ideal_radius_ratio(n) / radius_ratio
            if abs(ape) < abs(best_ape):
                best_ape = ape
                best_n = n

            # If the APE is negative, this is either the best APE or
            #  we have already passed it
            if ape < 0:
                return best_n, best_ape

        return best_n, best_ape

    def get_ideal_radius_ratio(self, n_neighbors):
        """Compute the ideal ratio between the central atom and neighboring
        atoms for a cluster with a certain number of nearest neighbors.

        Based on work by `Miracle, Lord, and Ranganathan
        <https://www.jstage.jst.go.jp/article/matertrans/47/7/47_7_1737/_article/-char/en>`_.

        Args:
            n_neighbors (int): Number of atoms in 1st NN shell
        Return:
            (float) ideal radius ratio :math:`r / r_{neighbor}`
        """

        # NN must be in [3, 24]; values outside are clamped to that range
        n = max(3, min(n_neighbors, 24))

        return self.ideal_ratio[n]
def feature_labels(self):
    """Return the labels of all features.

    The order is: element fractions, then the labels held in
    ``self._element_property_feature_labels``, then the valence orbital
    labels.
    """
    # There are more features than just element fractions, so the element
    # symbols get a " fraction" suffix for clarity
    labels = [
        symbol + " fraction"
        for symbol in ElementFraction().feature_labels()
    ]
    orbital_labels = ValenceOrbital().feature_labels()

    labels = labels + self._element_property_feature_labels
    return labels + orbital_labels