def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Featurize one structure with matminer and return labelled values.

    Args:
        structure: Structure to featurize.
        mode: ``'all'`` selects the full featurizer set. Any other value
            selects the reduced set that does not need a Voronoi
            tessellation.

    Returns:
        dict mapping every featurizer label to its computed value, plus a
        ``'volume'`` entry holding ``structure.volume``.
    """
    if mode == 'all':
        structural_part = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
        ]
    else:
        # Only the featurizers that do not need a Voronoi tessellation.
        structural_part = [DensityFeatures()]

    # Composition-derived featurizers close the list in both modes.
    composition_part = [
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset('magpie')),
        StructureComposition(ValenceOrbital(props=['frac'])),
    ]
    featurizer = MultipleFeaturizer(structural_part + composition_part)

    values = featurizer.featurize(structure)
    matminer_dict = dict(zip(featurizer.feature_labels(), values))
    matminer_dict['volume'] = structure.volume
    return matminer_dict
def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Build composition-based features for the formulas in ``df_input``.

    Args:
        df_input: Frame whose ``"Compound"`` column is parsed into
            compositions.
        **kwargs: Forwarded to the ``CompositionToOxidComposition``
            constructor.

    Returns:
        The featurized frame restricted to rows whose
        ``"minimum oxidation state"`` is not 0.
    """
    # Parse the "Compound" column into a "composition" column.
    str_to_comp = StrToComposition()
    df_comp = str_to_comp.featurize_dataframe(df_input, col_id="Compound")

    # Elemental-property features (magpie preset), written into df_comp.
    magpie = ElementProperty.from_preset(preset_name="magpie")
    magpie.featurize_dataframe(df_comp, col_id="composition", inplace=True)

    # Guessed oxidation states go into "composition_oxid"; ignore_errors
    # covers the failures caused by non-integer stoichiometries.
    oxid_converter = CompositionToOxidComposition(
        return_original_on_error=True, **kwargs)
    oxid_converter.featurize_dataframe(
        df_comp, "composition", ignore_errors=True, inplace=True)

    # Correct the guessed oxidation states.
    df_comp = correct_comp_oxid(df_comp)

    # Features derived from the oxidation states.
    OxidationStates().featurize_dataframe(
        df_comp, "composition_oxid", ignore_errors=True, inplace=True)

    # Drop compounds whose predicted minimum oxidation state is 0.
    keep = df_comp["minimum oxidation state"] != 0
    return df_comp[keep]
def test_elem_matminer(self):
    """Check selected "matminer" preset ElementProperty values for row 0."""
    featurizer = ElementProperty.from_preset("matminer")
    df_elem = featurizer.featurize_dataframe(self.df, col_id="composition")

    self.assertAlmostEqual(
        df_elem["minimum melting_point"][0], 54.8, places=1)
    self.assertTrue(math.isnan(df_elem["maximum bulk_modulus"][0]))

    # Remaining columns are all compared to one decimal place.
    one_decimal_checks = (
        ("range X", 1.61),
        ("mean X", 2.796),
        ("maximum block", 3),
    )
    for column, expected in one_decimal_checks:
        self.assertAlmostEqual(df_elem[column][0], expected, places=1)
def test_elem_deml(self):
    """Check the "deml" preset atom_num statistics for row 0."""
    featurizer = ElementProperty.from_preset("deml")
    df_elem_deml = featurizer.featurize_dataframe(
        self.df, col_id="composition")

    # (column, expected, places); places=None means unittest's default.
    checks = (
        ("minimum atom_num", 8, None),
        ("maximum atom_num", 26, None),
        ("range atom_num", 18, None),
        ("mean atom_num", 15.2, None),
        ("std_dev atom_num", 12.7279, 4),
    )
    for column, expected, places in checks:
        self.assertAlmostEqual(
            df_elem_deml[column][0], expected, places=places)
def featurize_structures(self, featurizer=None, **kwargs):
    """
    Compute features for the hypothetical structures and keep only the
    rows that featurized without missing values.

    If ``self.hypo_structures`` is None, structures are first generated
    with ``self.get_structures()``. Rows containing any NaN are dropped
    from ``self.features``; the matching rows of ``self.hypo_structures``
    are stored as ``self.valid_structures``.

    Args:
        featurizer (Featurizer): A MatMiner Featurizer. When not given, a
            MultipleFeaturizer with the PRB Ward Voronoi descriptors is
            used.
        **kwargs (dict): kwargs passed to the featurize_many method of the
            featurizer.

    Returns:
        (pandas.DataFrame): features
    """
    # Note the redundancy here is for pandas to work
    if self.hypo_structures is None:
        warnings.warn("No structures available. Generating structures.")
        self.get_structures()

    print("Generating features")

    if not featurizer:
        featurizer = MultipleFeaturizer([
            SiteStatsFingerprint.from_preset(
                "CoordinationNumber_ward-prb-2017"),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                "LocalPropertyDifference_ward-prb-2017"),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset("magpie")),
            StructureComposition(ValenceOrbital(props=['frac'])),
            StructureComposition(IonProperty(fast=True)),
        ])

    features = featurizer.featurize_many(
        self.hypo_structures['structure'], ignore_errors=True, **kwargs)

    # Per-structure bookkeeping columns.
    compositions = [s.composition
                    for s in self.hypo_structures['structure']]
    n_species = [len(c.elements) for c in compositions]
    formula = [c.formula for c in compositions]

    self._features_df = pd.DataFrame.from_records(
        features, columns=featurizer.feature_labels())
    self._features_df.index = self.hypo_structures.index
    self._features_df['N_species'] = n_species
    self._features_df['Composition'] = formula
    self._features_df['structure'] = self.hypo_structures['structure']

    # Keep only fully featurized rows, then order columns alphabetically.
    self.features = self._features_df.dropna(axis=0, how='any')
    ordered_columns = sorted(self.features.columns)
    self.features = self.features.reindex(ordered_columns, axis=1)

    self._valid_structure_labels = list(self.features.index)
    self.valid_structures = self.hypo_structures.loc[
        self._valid_structure_labels]

    n_valid = self.features.shape[0]
    n_total = self._features_df.shape[0]
    print("{} out of {} structures were successfully featurized.".format(
        n_valid, n_total))
    return self.features
def _featurize(self, composition: "pymatgen.Composition"):
    """
    Calculate chemical fingerprint from crystal composition.

    Parameters
    ----------
    composition: pymatgen.Composition object
        Composition object.

    Returns
    -------
    feats: np.ndarray
        Vector of properties and statistics derived from chemical
        stoichiometry. Some values may be NaN. An empty array is returned
        when the featurizer raises for this composition.

    Raises
    ------
    ValueError
        If matminer is not installed.
    """
    try:
        from matminer.featurizers.composition import ElementProperty
    except ModuleNotFoundError as err:
        # ValueError is kept so existing callers keep working; the
        # original import failure is attached as the cause.
        raise ValueError(
            "This class requires matminer to be installed.") from err

    ep = ElementProperty.from_preset(self.data_source)
    try:
        feats = ep.featurize(composition)
    except Exception:
        # Best effort: a composition that cannot be featurized yields an
        # empty vector. KeyboardInterrupt/SystemExit are no longer
        # swallowed, unlike with a bare ``except``.
        feats = []
    return np.array(feats)
def __init__(self, pbar=False):
    """Create the regressor and the composition featurizers.

    Args:
        pbar (bool): Stored unchanged as ``self.pbar``.
    """
    self.regressor = RandomForestRegressor(
        n_estimators=500,
        n_jobs=-1,
        verbose=3,
    )
    self.stc = StrToComposition()
    # Magpie element properties followed by element fractions.
    composition_featurizers = [
        ElementProperty.from_preset("magpie"),
        ElementFraction(),
    ]
    self.featurizer = MultipleFeaturizer(composition_featurizers)
    self.pbar = pbar
def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize chemical formulas with matminer

    Parse the formula strings in one column into compositions and compute
    the features of a matminer ElementProperty preset for them.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Column in df holding the chemical formula,
            given as a string
        append (bool): Return the features next to the original columns
            (True) or the features alone (False)?
        preset_name (string): Matminer featurization preset

    Kwargs:
        ignore_errors (bool): Passed to both featurization steps; set to
            True to return NaN's for invalid formulae instead of raising.

        Remaining keyword arguments are forwarded to the preset's
        featurize_dataframe call.

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S., Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M., Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I., Jain, A., Matminer: An open source toolkit for materials data mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Build the preset featurizer first so a bad preset fails early
    featurizer = ElementProperty.from_preset(preset_name=preset_name)

    ## Formula strings -> "composition" column
    df_formula = df[[var_formula]]
    df_res = StrToComposition().featurize_dataframe(
        df_formula,
        var_formula,
        ignore_errors=ignore_errors,
    )

    ## Compositions -> preset features
    df_res = featurizer.featurize_dataframe(
        df_res,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )

    ## Keep only the generated feature columns
    helper_columns = [var_formula, "composition"]
    df_res.drop(columns=helper_columns, inplace=True)

    if not append:
        return df_res
    return concat((df, df_res), axis=1)
def _featurize(self, comp):
    """
    Calculate chemical fingerprint from crystal composition.

    Parameters
    ----------
    comp : str
        Reduced formula of crystal.

    Returns
    -------
    feats: np.ndarray
        Vector of properties and statistics derived from chemical
        stoichiometry. Some values may be NaN. An empty array is returned
        when the featurizer raises for this composition.
    """
    # Import from the defining module: the root-level
    # ``from pymatgen import Composition`` is not available in recent
    # pymatgen releases, while this path works on old and new versions.
    from pymatgen.core.composition import Composition
    from matminer.featurizers.composition import ElementProperty

    # Get pymatgen Composition object
    c = Composition(comp)

    ep = ElementProperty.from_preset(self.data_source)
    try:
        feats = ep.featurize(c)
    except Exception:
        # Best effort: a composition that cannot be featurized yields an
        # empty vector. KeyboardInterrupt/SystemExit are no longer
        # swallowed, unlike with a bare ``except``.
        feats = []
    return np.array(feats)
def test_elem_matminer(self):
    """Check selected "matminer" preset ElementProperty values for row 0."""
    preset = ElementProperty.from_preset("matminer")
    df_elem = preset.featurize_dataframe(self.df, col_id="composition")

    min_melting = df_elem["minimum melting_point"][0]
    self.assertAlmostEqual(min_melting, 54.8, places=1)

    # No bulk modulus value is expected for this entry.
    max_bulk_modulus = df_elem["maximum bulk_modulus"][0]
    self.assertTrue(math.isnan(max_bulk_modulus))

    # Electronegativity statistics, compared to one decimal place.
    for column, expected in (("range X", 1.61), ("mean X", 2.796)):
        self.assertAlmostEqual(df_elem[column][0], expected, places=1)
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add matminer composition features to a DataFrame of structures.

    The input is copied, a `"composition"` column is derived from the
    structures, and composition and oxidation-state featurizers are
    applied to it. Column names are flattened to `"<group>|<feature>"`,
    the HOMO/LUMO character and element columns are converted to numbers,
    and inf/NaN values are replaced by 0.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, passed through
            `clean_df`.
    """
    logging.info("Applying composition featurizers...")

    def _flatten(columns):
        # Join the column levels with "|" and trim separators left over
        # from empty levels.
        return columns.map('|'.join).str.strip('|')

    def _atomic_number(symbol):
        # Entries that are not element symbols (non-strings) become -1.
        return Element(symbol).Z if isinstance(symbol, str) else -1

    df = df.copy()
    df['composition'] = df['structure'].apply(lambda s: s.composition)

    composition_featurizers = [
        ElementProperty.from_preset("magpie"),
        AtomicOrbitals(),
        BandCenter(),
        # ElectronAffinity(), - This descriptor was not used in the paper preset
        Stoichiometry(),
        ValenceOrbital(),
        IonProperty(),
        ElementFraction(),
        TMetalFraction(),
        # CohesiveEnergy(), - This descriptor was not used in the paper preset
        Miedema(),
        YangSolidSolution(),
        AtomicPackingEfficiency(),
    ]
    featurizer = MultipleFeaturizer(composition_featurizers)
    df = featurizer.featurize_dataframe(
        df, "composition", multiindex=True, ignore_errors=True)
    df.columns = _flatten(df.columns)

    # Oxidation-state features need the oxidation-decorated composition.
    ox_featurizer = MultipleFeaturizer([
        OxidationStates(),
        ElectronegativityDiff(),
    ])
    df = CompositionToOxidComposition().featurize_dataframe(
        df, "Input Data|composition")
    df = ox_featurizer.featurize_dataframe(
        df, "composition_oxid", multiindex=True, ignore_errors=True)
    # Blank the 'Input Data' label so it vanishes when flattening.
    df = df.rename(columns={'Input Data': ''})
    df.columns = _flatten(df.columns)

    # Encode orbital characters as integers.
    _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
    for column in ('AtomicOrbitals|HOMO_character',
                   'AtomicOrbitals|LUMO_character'):
        df[column] = df[column].map(_orbitals)

    # Encode element symbols as atomic numbers.
    for column in ('AtomicOrbitals|HOMO_element',
                   'AtomicOrbitals|LUMO_element'):
        df[column] = df[column].apply(_atomic_number)

    df = df.replace([np.inf, -np.inf, np.nan], 0)
    return clean_df(df)
def __init__(self,
             cation_site=None,
             site_ox_lim=None,
             site_base_ox=None,
             ordered_formulas=False,
             A_site_occupancy=1,
             anions=None):
    """Store the site configuration and build the matminer featurizers.

    Args:
        cation_site: Site assignment for cations. Required unless
            ``ordered_formulas`` is True.
        site_ox_lim (dict): Two-element limits per site, keyed by 'A',
            'B' and 'X' (presumably oxidation-state bounds; confirm
            against usage). Defaults to
            ``{'A': [0, 10], 'B': [0, 10], 'X': [-10, 0]}``.
        site_base_ox (dict): One value per site, keyed by 'A', 'B' and
            'X' (presumably base oxidation states; confirm against
            usage). Defaults to ``{'A': 2, 'B': 4, 'X': -2}``.
        ordered_formulas (bool): Whether formulas are ordered, so that
            site assignments can be determined without ``cation_site``.
        A_site_occupancy: Stored as ``self.A_site_occupancy``.
        anions: Stored as ``self.anions``.

    Raises:
        ValueError: If ``cation_site`` is None and ``ordered_formulas``
            is False.
    """
    if cation_site is None and ordered_formulas is False:
        raise ValueError(
            'Either cation sites must be assigned, or formulas must be ordered. Otherwise site assignments can not be determined'
        )

    # Build the defaults per call: dict defaults in the signature would
    # be shared by every instance, so mutating one instance's limits
    # would silently change them for all later instances.
    if site_ox_lim is None:
        site_ox_lim = {'A': [0, 10], 'B': [0, 10], 'X': [-10, 0]}
    if site_base_ox is None:
        site_base_ox = {'A': 2, 'B': 4, 'X': -2}

    self.cation_site = cation_site
    self.site_ox_lim = site_ox_lim
    self.site_base_ox = site_base_ox
    self.ordered_formulas = ordered_formulas
    self.A_site_occupancy = A_site_occupancy
    self.anions = anions

    # matminer featurizers
    self.ValenceOrbital = ValenceOrbital()
    self.AtomicOrbitals = AtomicOrbitalsMod()
    self.CohesiveEnergy = CohesiveEnergy()

    # custom ElementProperty featurizer
    elemental_properties = [
        'BoilingT', 'MeltingT', 'BulkModulus', 'ShearModulus', 'Row',
        'Column', 'Number', 'MendeleevNumber', 'SpaceGroupNumber',
        'Density', 'MolarVolume', 'FusionEnthalpy', 'HeatVaporization',
        'NsUnfilled', 'NpUnfilled', 'NdUnfilled', 'NfUnfilled',
        'Polarizability', 'ThermalConductivity'
    ]
    self.ElementProperty = ElementProperty(
        data_source='magpie',
        features=elemental_properties,
        stats=["mean", "std_dev", "range"])

    self.check_matminer_featurizers()
    self.featurize_options = {}
def test_elem(self):
    """Check the "magpie" preset atomic-number statistics for row 0."""
    magpie = ElementProperty.from_preset("magpie")
    df_elem = magpie.featurize_dataframe(self.df, col_id="composition")

    expected_values = (
        ("minimum Number", 8),
        ("maximum Number", 26),
        ("range Number", 18),
        ("mean Number", 15.2),
        ("avg_dev Number", 8.64),
        ("mode Number", 8),
    )
    for column, expected in expected_values:
        self.assertAlmostEqual(df_elem[column][0], expected)
def test_elem_megnet_el(self):
    """Check two "megnet_el" embedding maxima on rows 0 and 1."""
    ep = ElementProperty.from_preset("megnet_el")
    df_elem = ep.featurize_dataframe(self.df, col_id="composition")

    # Both rows are expected to share the same maximum per embedding.
    expected_maxima = (
        ("MEGNetElementData maximum embedding 1", 0.127333),
        ("MEGNetElementData maximum embedding 11", 0.160505),
    )
    for column, expected in expected_maxima:
        for row in (0, 1):
            self.assertAlmostEqual(
                df_elem[column].iloc[row], expected, places=6)

    self.assertTrue(ep.citations())
def magpie_feature(formula):
    """Return the "magpie" preset features of a single formula.

    Args:
        formula: Chemical formula, converted with ``str_to_composition``.

    Returns:
        numpy array holding the first row of the featurized frame without
        its first column (the formula itself).
    """
    df = pd.DataFrame([formula], columns=["formula"])
    df["composition"] = df["formula"].transform(str_to_composition)

    ep_feat = ElementProperty.from_preset(preset_name="magpie")
    df = ep_feat.featurize_dataframe(df, col_id="composition")

    # The composition objects are helpers only, not features.
    df.drop(labels=["composition"], axis=1, inplace=True)

    feature_row = df.iloc[0, 1:]
    return feature_row.to_numpy()
def Magpie(formulas):
    """Return the "magpie" preset features for one or more formulas.

    Args:
        formulas: A single formula string or a collection of formulas.

    Returns:
        float32 numpy array with one row per formula.
    """
    # Accept a bare string as a one-element batch.
    if isinstance(formulas, str):
        formulas = [formulas]

    ep_feat = ElementProperty.from_preset(preset_name="magpie")

    df = pd.DataFrame({"formula": formulas})
    df["composition"] = df["formula"].transform(str_to_composition)
    df = ep_feat.featurize_dataframe(df, col_id="composition")

    # Only the numeric feature columns are returned.
    df.drop(labels=["composition", "formula"], axis=1, inplace=True)
    feature_matrix = np.array(df)
    return feature_matrix.astype(np.float32)
def test_fere_corr(self):
    """Check the "deml" FERE correction statistics for row 0."""
    featurizer = ElementProperty(
        features=["FERE correction"],
        stats=["minimum", "maximum", "range", "mean", "std_dev"],
        data_source="deml",
    )
    df_fere_corr = featurizer.featurize_dataframe(
        self.df, col_id="composition")

    expected_values = (
        ("minimum FERE correction", -0.15213431610903),
        ("maximum FERE correction", 0.23),
        ("range FERE correction", 0.382134316),
        ("mean FERE correction", 0.077146274),
        ("std_dev FERE correction", 0.270209766),
    )
    for column, expected in expected_values:
        self.assertAlmostEqual(df_fere_corr[column][0], expected)
def test_elem_matscholar_el(self):
    """Check two "matscholar_el" embedding statistics on rows 0 and 1."""
    featurizer = ElementProperty.from_preset("matscholar_el")
    df_elem = featurizer.featurize_dataframe(self.df, col_id="composition")

    # (column, row position, expected value)
    checks = (
        ("range matscholar_el_149", 0, 0.06827970966696739),
        ("range matscholar_el_149", 1, 0.06827970966696739),
        ("mean matscholar_el_18", 0, -0.020534400502219795),
        ("mean matscholar_el_18", 1, -0.02483355056028813),
    )
    for column, row, expected in checks:
        self.assertAlmostEqual(df_elem[column].iloc[row], expected)
def __init__(self, radius_type='ionic_radius', normalize_formula=False):
    """Store the options and build the matminer featurizers.

    Args:
        radius_type: Stored as ``self.radius_type``.
        normalize_formula: Stored as ``self.normalize_formula``.
    """
    self.radius_type = radius_type
    self.normalize_formula = normalize_formula

    # Stock matminer featurizers.
    self.ValenceOrbital = ValenceOrbital()
    self.AtomicOrbitals = AtomicOrbitalsMod()
    self.CohesiveEnergy = CohesiveEnergy()
    self.BandCenter = BandCenter()
    self.ValenceOrbitalEnergy = ValenceOrbitalEnergy()

    # Custom ElementProperty featurizer: magpie data, selected properties,
    # mean and standard deviation only.
    elemental_properties = [
        'BoilingT',
        'MeltingT',
        'BulkModulus',
        'ShearModulus',
        'Row',
        'Column',
        'Number',
        'MendeleevNumber',
        'SpaceGroupNumber',
        'Density',
        'MolarVolume',
        'FusionEnthalpy',
        'HeatVaporization',
        'Polarizability',
        'ThermalConductivity',
    ]
    self.ElementProperty = ElementProperty(
        data_source='magpie',
        features=elemental_properties,
        stats=["mean", "std_dev"],
    )

    # Check the matminer featurizers.
    self.check_matminer_featurizers()
def __init__(self, data_source: str = 'matminer'):
    """
    Parameters
    ----------
    data_source: str of "matminer", "magpie" or "deml" (default "matminer")
        Source for element property data.

    Raises
    ------
    ImportError
        If matminer is not installed.
    """
    try:
        from matminer.featurizers.composition import ElementProperty
    except ModuleNotFoundError as err:
        # Chain the original failure so the traceback shows which module
        # was actually missing.
        raise ImportError(
            "This class requires matminer to be installed.") from err

    self.data_source = data_source
    self.ep_featurizer = ElementProperty.from_preset(self.data_source)
def similarity(_parents, target):
    """Score each parent structure by its feature distance to the target.

    All structures are featurized, scaled with the pickled scaler, and
    the distance between each parent and the target is mapped onto a
    101-point grid over [0, 1] by locating the closest entry of the
    pickled quantiles.

    Args:
        _parents: Structures to compare against ``target``.
        target: Structure every parent is compared with.

    Returns:
        numpy.ndarray: One score per parent. Parents with any missing
        feature value get -1.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task
        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        pass

    x_target = pd.DataFrame.from_records(
        [featurizer.featurize(target)],
        columns=featurizer.feature_labels())
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=featurizer.feature_labels(),
    )

    # Remember which parents failed to featurize before masking their
    # NaNs with a large placeholder that the scaler can digest.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    x_parent.fillna(100000, inplace=True)

    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    # NOTE(review): pickle.load can execute arbitrary code. This assumes
    # the files under settings.rxn_files are trusted application assets;
    # confirm they can never be user-supplied.
    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat with default
    # arguments stacks the rows the same way (target last).
    X = scaler.transform(pd.concat([x_parent, x_target]))
    target_row = X[-1]
    D = [
        pairwise_distances(np.array([row, target_row]))[0, 1]
        for row in X[:-1]
    ]

    # The grid is loop-invariant, so it is built once.
    grid = np.linspace(0, 1, 101)
    _res = np.array([grid[np.abs(quantiles - d).argmin()] for d in D])
    _res[nulls] = -1
    return _res
def __init__(self, materials, descriptors, **kwargs):
    """
    Set up site, statistics and composition descriptors for materials.

    Site-based descriptors are coordination numbers computed with
    different near-neighbor finding approaches; statistics are run on an
    order parameter-based site fingerprint, which is useful as a
    structure fingerprint built from local coordination information.
    The composition descriptor is the Magpie element property vector.

    Args:
        materials (Store): Store of materials documents.
        descriptors (Store): Store of composition, site, and structure
                             descriptor data such as tetrahedral order
                             parameter or fraction of being 8-fold
                             coordinated.
        **kwargs: Forwarded to the parent class constructor.
    """
    self.materials = materials
    self.descriptors = descriptors

    # Site descriptors: an unweighted and a weighted coordination number
    # for every targeted near-neighbor class.
    self.sds = {}
    for nn in nn_target_classes:
        nn_ = getattr(local_env, nn)
        self.sds["cn_{}".format(nn)] = CoordinationNumber(
            nn_(), use_weights="none")
        self.sds["cn_wt_{}".format(nn)] = CoordinationNumber(
            nn_(), use_weights="sum")

    # Snapshot the coordination-number keys before the fingerprint,
    # which is reported under "statistics", is added.
    self.all_output_pieces = {"site_descriptors": list(self.sds.keys())}
    self.sds["csf"] = CrystalNNFingerprint.from_preset(
        "ops", distance_cutoffs=None, x_diff_weight=None)
    self.all_output_pieces["statistics"] = ["csf"]

    # Composition descriptors.
    self.cds = {"magpie": ElementProperty.from_preset("magpie")}
    self.all_output_pieces["composition_descriptors"] = ["magpie"]

    self.all_output_pieces["meta"] = ["atomate"]

    super().__init__(
        source=materials,
        target=descriptors,
        ufn=self.calc,
        projection=["structure"],
        **kwargs,
    )
def test_elem_deml(self):
    """Check "deml" preset atom_num and magn_moment statistics for row 0."""
    featurizer = ElementProperty.from_preset("deml")
    df_elem_deml = featurizer.featurize_dataframe(
        self.df, col_id="composition")

    atom_num_checks = (
        ("minimum atom_num", 8),
        ("maximum atom_num", 26),
        ("range atom_num", 18),
        ("mean atom_num", 15.2),
        ("std_dev atom_num", 8.81816307),
    )
    # Charge dependent property
    magn_moment_checks = (
        ("minimum magn_moment", 0),
        ("maximum magn_moment", 5.2),
        ("range magn_moment", 5.2),
        ("mean magn_moment", 2.08),
        ("std_dev magn_moment", 2.547469332),
    )
    for column, expected in atom_num_checks + magn_moment_checks:
        self.assertAlmostEqual(df_elem_deml[column][0], expected)
def test_composition_features(self):
    """StructureComposition must mirror the wrapped composition featurizer."""
    comp = ElementProperty.from_preset("magpie")
    f = StructureComposition(featurizer=comp)

    # Fitting only has to run without raising.
    f.fit([self.nacl, self.diamond])

    # Featurizing the structure equals featurizing its composition.
    from_structure = f.featurize(self.nacl)
    from_composition = comp.featurize(self.nacl.composition)
    self.assertArrayAlmostEqual(from_composition, from_structure)

    # Citations and implementors come from the wrapped featurizer.
    self.assertEqual(comp.citations(), f.citations())
    self.assertEqual(comp.implementors(), f.implementors())
def __init__(self, materials, descriptors, mat_query=None, **kwargs):
    """
    Set up site, statistics and composition descriptors for materials.

    Site-based descriptors are coordination numbers computed with
    different near-neighbor finding approaches; statistics are run on an
    order parameter-based site fingerprint, which is useful as a
    structure fingerprint built from local coordination information.
    The composition descriptor is the Magpie element property vector.

    Args:
        materials (Store): Store of materials documents.
        descriptors (Store): Store of composition, site, and structure
                             descriptor data such as tetrahedral order
                             parameter or fraction of being 8-fold
                             coordinated.
        mat_query (dict): dictionary to limit materials to be analyzed.
                          An empty dict is stored when not given.
        **kwargs: Forwarded to the parent class constructor.
    """
    self.materials = materials
    self.descriptors = descriptors
    self.mat_query = mat_query if mat_query else {}

    # Site descriptors: an unweighted and a weighted coordination number
    # for every targeted near-neighbor class.
    self.sds = {}
    for nn in nn_target_classes:
        nn_ = getattr(pymatgen.analysis.local_env, nn)
        self.sds['cn_{}'.format(nn)] = CoordinationNumber(
            nn_(), use_weights='none')
        self.sds['cn_wt_{}'.format(nn)] = CoordinationNumber(
            nn_(), use_weights='sum')

    # Snapshot the coordination-number keys before the fingerprint,
    # which is reported under 'statistics', is added.
    self.all_output_pieces = {'site_descriptors': list(self.sds.keys())}
    self.sds['csf'] = CrystalNNFingerprint.from_preset(
        'ops', distance_cutoffs=None, x_diff_weight=None)
    self.all_output_pieces['statistics'] = ['csf']

    # Composition descriptors.
    self.cds = {"magpie": ElementProperty.from_preset('magpie')}
    self.all_output_pieces['composition_descriptors'] = ['magpie']

    self.all_output_pieces['meta'] = ['atomate']

    super().__init__(sources=[materials], targets=[descriptors], **kwargs)
def generate_data(name):
    """Build the feature table for the entries listed in ``name``.

    Reads ``name`` and ``gaps.csv``, adds a ``gaps`` column and a binary
    ``is_daoti`` column (1 where the gap is 0, else 0; presumably
    "daoti" means conductor, confirm with the author), then applies the
    composition and oxidation-state featurizers. The summary statistics
    are written to ``general_look_jie.csv`` and the final table to
    ``2d_vector_plus.csv``.

    Args:
        name: Path of the input CSV, e.g. 'test_plus_gaps.csv'. Its first
            column is used as the index and it needs a ``full_formula``
            column.

    Returns:
        None. The results are written to disk.
    """
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0  # placeholder for rows without a matching gap
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    # Sequential merge: walk df_gap in order and advance through df each
    # time the prefixed id matches the current df row. This relies on
    # both files listing the entries in the same order.
    n_rows = len(df.index)
    i = 0
    for j, gap_id in enumerate(df_gap.index):
        if i >= n_rows:
            # Every df row is matched; without this guard df.index[i]
            # raised IndexError whenever df_gap had further rows.
            break
        if 'mp-' + str(gap_id) == df.index[i]:
            df.iloc[i, -1] = df_gap.iloc[j, 0]
            i = i + 1
    print("合并完毕")

    # Classification label built the same positional way: column -2 is
    # 'gaps' and column -1 is 'is_daoti', provided neither column already
    # existed in the CSV. The removed ``.ix`` indexer is replaced by
    # ``.iloc``, which it fell back to here.
    df['is_daoti'] = -2
    df.iloc[:, -1] = (df.iloc[:, -2] == 0).astype('int64').values
    print("分类feature建立完成")

    # Use describe to get an overall picture of the data; inspecting it
    # showed nothing anomalous.
    print(df.describe())
    df.describe().to_csv('general_look_jie.csv')

    # Convert the formula into a composition ...
    df = StrToComposition().featurize_dataframe(
        df, 'full_formula', ignore_errors=True)
    print(df.head())

    # ... and use the composition column as featurization input.
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition', ignore_errors=True)
    print(df.head())

    # Add the oxidation-state related features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')

    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
def generate_data():
    """Featurize the elastic-tensor dataset, saving a CSV after each stage.

    Loads the data with ``load_elastic_tensor``, drops the unneeded
    columns, then adds composition, element-property, oxidation-state and
    density features in turn. Progress is printed and every intermediate
    table is written to its own CSV file.
    """
    frame = load_elastic_tensor()
    frame.to_csv('原始elastic数据.csv')
    print(frame.columns)

    # Columns that are not needed downstream.
    unwanted_columns = [
        'volume',
        'nsites',
        'compliance_tensor',
        'elastic_tensor',
        'elastic_tensor_original',
        'K_Voigt',
        'G_Voigt',
        'K_Reuss',
        'G_Reuss',
    ]
    frame = frame.drop(unwanted_columns, axis=1)
    print(frame.head())
    frame.to_csv('扔掉不需要的部分.csv')

    # Use describe to get an overall picture of the data; inspecting it
    # showed nothing anomalous.
    summary = frame.describe()
    print(summary)
    frame.describe().to_csv('general_look.csv')

    str_to_comp = StrToComposition()
    frame = str_to_comp.featurize_dataframe(frame, 'formula')
    print(frame.head())
    frame.to_csv('引入composition.csv')

    # Next, a featurizer that adds a series of element-property features,
    # taking the composition column as its input.
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    frame = ep_feat.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(ep_feat.citations())
    frame.to_csv('将composition特征化后.csv')

    # Bring in further featurizers: oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    frame = oxid_converter.featurize_dataframe(frame, 'composition')
    os_feat = OxidationStates()
    frame = os_feat.featurize_dataframe(frame, col_id='composition_oxid')
    print(frame.head())
    frame.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are many others, for
    # example structure-based ones.
    df_feat = DensityFeatures()
    frame = df_feat.featurize_dataframe(frame, 'structure')
    print(frame.head())
    frame.to_csv('引入结构中的密度.csv')
    print(df_feat.feature_labels())
def test_featurizers():
    """Run the composition featurizer chain on ``test.csv``.

    Reads ``test.csv``, adds composition, element-property and
    oxidation-state features, prints the intermediate tables and writes
    the final table to ``after_test.csv``.
    """
    frame = pd.read_csv('test.csv', index_col=[0])

    str_to_comp = StrToComposition()
    frame = str_to_comp.featurize_dataframe(frame, 'formula')
    print(frame.head())

    # Next, a featurizer that adds a series of element-property features,
    # taking the composition column as its input.
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    frame = ep_feat.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(ep_feat.citations())

    # Bring in further featurizers: oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    frame = oxid_converter.featurize_dataframe(frame, 'composition')
    os_feat = OxidationStates()
    frame = os_feat.featurize_dataframe(frame, col_id='composition_oxid')
    print(frame.head())

    frame.to_csv('after_test.csv')
def test_exclude_by_users(self):
    """
    Excluded featurizers must not contribute any feature columns.
    """
    target = "K_VRH"
    exclude = ["ElementProperty"]
    df = copy.copy(self.test_df.iloc[:self.limit])

    # Labels the excluded featurizer would have produced.
    ep_feats = ElementProperty.from_preset("matminer").feature_labels()

    # Test to make sure excluded does not show up
    af = AutoFeaturizer(exclude=exclude, preset="fast")
    af.fit(df, target)
    df = af.fit_transform(df, target)

    self.assertTrue(af.auto_featurizer)
    self.assertIn("ElementProperty", af.exclude)
    leaked = any(label in df.columns for label in ep_feats)
    self.assertFalse(leaked)
def test_elem_megnet_el(self):
    """Check two "megnet_el" embedding maxima on rows 0 and 1."""
    ep = ElementProperty.from_preset("megnet_el")
    df_elem = ep.featurize_dataframe(self.df, col_id="composition")

    # (column, row position, expected value), compared to 6 places.
    checks = [
        ("MEGNetElementData maximum embedding 1", 0, 0.127333),
        ("MEGNetElementData maximum embedding 1", 1, 0.127333),
        ("MEGNetElementData maximum embedding 11", 0, 0.160505),
        ("MEGNetElementData maximum embedding 11", 1, 0.160505),
    ]
    for column, row, expected in checks:
        actual = df_elem[column].iloc[row]
        self.assertAlmostEqual(actual, expected, places=6)

    self.assertTrue(ep.citations())
def __init__(self, normalize_formula=False):
    """Store the option and build the matminer featurizers.

    Args:
        normalize_formula: Stored as ``self.normalize_formula``.
    """
    self.normalize_formula = normalize_formula

    # ValenceOrbital is deliberately not created: valence counts etc. are
    # covered by ElementProperty.from_preset('magpie').
    self.AtomicOrbitals = AtomicOrbitalsMod()
    self.CohesiveEnergy = CohesiveEnergy()
    self.BandCenter = BandCenter()
    self.ValenceOrbitalEnergy = ValenceOrbitalEnergy()

    # ElementProperty featurizer: the magpie preset extended with
    # additional properties.
    extra_properties = [
        'BoilingT',
        'BulkModulus',
        'ShearModulus',
        'Density',
        'MolarVolume',
        'FusionEnthalpy',
        'HeatVaporization',
        'Polarizability',
        'ThermalConductivity',
    ]
    self.ElementProperty = ElementProperty.from_preset('magpie')
    self.ElementProperty.features += extra_properties
    # The preset's stats are left untouched, although range, min and max
    # are irrelevant inside the ternary.

    # Check the matminer featurizers.
    self.check_matminer_featurizers()
def test_featurizers_by_users(self):
    """User-supplied featurizers must be the only ones applied."""
    target = "K_VRH"
    df = copy.copy(self.test_df.iloc[:self.limit])

    dn = DensityFeatures()
    gsf = GlobalSymmetryFeatures()
    af = AutoFeaturizer(featurizers={"structure": [dn, gsf]})
    df = af.fit_transform(df, target)

    # Ensure that the featurizers are not set automatically, metaselection
    # is not used, exclude is None and featurizers not passed by the users
    # are not used.
    self.assertFalse(af.auto_featurizer)
    self.assertTrue(af.exclude == [])
    structure_featurizers = af.featurizers["structure"]
    self.assertIn(dn, structure_featurizers)
    self.assertIn(gsf, af.featurizers["structure"])

    # ElementProperty was not requested, so none of its labels may appear.
    ep_feats = ElementProperty.from_preset("matminer").feature_labels()
    unexpected = any(label in df.columns for label in ep_feats)
    self.assertFalse(unexpected)