def has_polymorphs(self):
    """Report whether several samples share the same reduced composition.

    The task's input column is converted to reduced pymatgen compositions;
    polymorphs are present when the number of distinct compositions is
    smaller than the number of samples.

    Returns:
        (bool) True if the raw data contains polymorphs.

    Raises:
        ValueError: If the task's input type is neither "structure" nor
            "composition".
    """
    self._check_is_loaded()
    checker_key = "pmg_composition"
    input_type = self.metadata.input_type

    # Pick the converter that matches the input column; the column that
    # gets featurized has the same name as the input type.
    if input_type == "composition":
        converter = StrToComposition(target_col_id=checker_key, reduce=True)
    elif input_type == "structure":
        converter = StructureToComposition(target_col_id=checker_key,
                                           reduce=True)
    else:
        raise ValueError(
            "Cannot check for polymorphs without input type in "
            "(structure, composition)!")

    comps = converter.featurize_dataframe(self.df, input_type)[checker_key].values
    return len(set(comps)) != len(comps)
def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize chemical formulae with matminer

    Converts the formula strings in one column to compositions and computes
    element-property features for them using a matminer preset.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Column in df holding the chemical formula,
            given as a string
        preset_name (string): Matminer featurization preset
        append (bool): Append results to original columns?

    Kwargs:
        ignore_errors (bool): Do not throw an error while parsing formulae;
            set to True to return NaN's for invalid formulae.
        Remaining keyword arguments are forwarded to the element-property
        featurizer's featurize_dataframe call.

    Returns:
        DataFrame: The feature columns, preceded by the original columns
        when append is True.

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S.,
        Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M.,
        Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I.,
        Jain, A., Matminer: An open source toolkit for materials data
        mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Parse the formula column into composition objects
    compositions = StrToComposition().featurize_dataframe(
        df[[var_formula]],
        var_formula,
        ignore_errors=ignore_errors,
    )

    ## Compute the preset's element-property features
    property_featurizer = ElementProperty.from_preset(preset_name=preset_name)
    features = property_featurizer.featurize_dataframe(
        compositions,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )

    ## Keep only the generated feature columns
    features = features.drop(columns=[var_formula, "composition"])

    ## Concatenate as necessary
    if not append:
        return features
    return concat((df, features), axis=1)
def __init__(self, pbar=False):
    """Create the regressor and the composition featurizers.

    Args:
        pbar (bool): Stored on the instance as ``self.pbar``.
    """
    self.pbar = pbar
    # Formula string -> composition converter.
    self.stc = StrToComposition()
    # Magpie element properties combined with element fractions.
    self.featurizer = MultipleFeaturizer(
        [ElementProperty.from_preset("magpie"), ElementFraction()])
    self.regressor = RandomForestRegressor(
        n_estimators=500, n_jobs=-1, verbose=3)
def __init__(self):
    """Create the formula converter and the combined composition featurizer."""
    self.str2composition = StrToComposition()

    # Individual composition featurizers, applied together as one.
    calculators = [
        cf.ElementProperty.from_preset(preset_name="magpie"),
        cf.Stoichiometry(),
        cf.ValenceOrbital(props=['frac']),
        cf.IonProperty(fast=True),
        cf.BandCenter(),
        cf.ElementFraction(),
    ]
    self.feature_calculators = MultipleFeaturizer(calculators)
def generate(fake_df, ignore_errors=False):
    """Featurize one formula and return the resulting one-row table.

    Args:
        fake_df: The value placed in the single ``full_formula`` cell
            (presumably a formula string -- confirm against callers).
        ignore_errors (bool): Forwarded to both featurize_dataframe calls.

    Returns:
        DataFrame with the composition, the generated features and an
        ``NComp`` column holding ``len`` of each composition.
    """
    table = pd.DataFrame(np.array([fake_df]), columns=['full_formula'])

    # Parse the formula; rows that failed to parse are dropped.
    with_composition = StrToComposition().featurize_dataframe(
        table, "full_formula", ignore_errors=ignore_errors).dropna()

    featurized = feature_calculators.featurize_dataframe(
        with_composition, col_id='composition', ignore_errors=ignore_errors)
    featurized["NComp"] = featurized["composition"].apply(len)
    return featurized
def test_str_to_composition(self):
    """StrToComposition parses formula strings, with and without reduction."""
    frame = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

    # Default settings write the parsed formulas to "composition".
    frame = StrToComposition().featurize_dataframe(frame, 'comp_str')
    parsed = frame["composition"].tolist()
    self.assertEqual(parsed, [Composition("Fe2"), Composition("MnO2")])

    # With reduce=True and a custom target column, Fe2 is expected as Fe.
    reducer = StrToComposition(reduce=True, target_col_id='composition_red')
    frame = reducer.featurize_dataframe(frame, 'comp_str')
    reduced = frame["composition_red"].tolist()
    self.assertEqual(reduced, [Composition("Fe"), Composition("MnO2")])
def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return a Pandas DataFrame with all compositional features.

    Args:
        df_input: Table with a "Compound" column of formula strings.
        **kwargs: Forwarded to the CompositionToOxidComposition constructor.

    Returns:
        The featurized table without the rows whose
        "minimum oxidation state" equals 0.
    """
    # build the "composition" column from the formula strings
    features = StrToComposition().featurize_dataframe(df_input,
                                                      col_id="Compound")

    # add features based on elemental properties
    magpie = ElementProperty.from_preset(preset_name="magpie")
    magpie.featurize_dataframe(features, col_id="composition", inplace=True)

    # add the "composition_oxid" column based on guessed oxidation states;
    # errors from non-integer stoichiometries are ignored
    oxid_converter = CompositionToOxidComposition(
        return_original_on_error=True, **kwargs)
    oxid_converter.featurize_dataframe(
        features, "composition", ignore_errors=True, inplace=True)

    # correct oxidation states
    features = correct_comp_oxid(features)

    # add features based on oxidation states
    OxidationStates().featurize_dataframe(
        features, "composition_oxid", ignore_errors=True, inplace=True)

    # remove compounds with predicted oxidation states of 0
    nonzero_state = features["minimum oxidation state"] != 0
    return features[nonzero_state]
class FeatureGenerator:
    """
    A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        """Create the formula converter and the combined featurizer."""
        self.str2composition = StrToComposition()
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
        Generate features from a dataframe with a "formula" column that
        contains the chemical formulas of the compositions.

        df : a dataframe with a column named formula
        ignore_errors : forwarded to both featurization steps

        Returns the featurized dataframe with an extra "NComp" column
        holding ``len`` of each composition.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        # Rows containing missing values are discarded before featurizing.
        complete_rows = with_composition.dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)
        return featurized
def test_conversion_overwrite(self):
    """Writing the result onto the source column requires overwrite_data."""
    frame = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

    # Without overwrite permission both modes must refuse to clobber the
    # source column.
    guarded = StrToComposition(target_col_id='comp_str', overwrite_data=False)
    for inplace in (True, False):
        with self.assertRaises(ValueError):
            frame = guarded.featurize_dataframe(frame, 'comp_str',
                                                inplace=inplace)

    # With overwrite permission the frame keeps its single column.
    overwriting = StrToComposition(target_col_id='comp_str',
                                   overwrite_data=True)

    in_place_result = frame.copy()
    overwriting.featurize_dataframe(in_place_result, 'comp_str', inplace=True)
    self.assertListEqual(in_place_result.columns.tolist(), ["comp_str"])

    copied_result = overwriting.featurize_dataframe(frame, 'comp_str',
                                                    inplace=False)
    self.assertListEqual(copied_result.columns.tolist(), ["comp_str"])
def test_featurizers():
    """Run the composition featurizers on test.csv and save the result.

    Reads 'test.csv', adds composition, Magpie element-property and
    oxidation-state features, printing the head of the table after each
    step, and writes the final table to 'after_test.csv'.
    """
    table = pd.read_csv('test.csv', index_col=[0])
    table = StrToComposition().featurize_dataframe(table, 'formula')
    print(table.head())

    # Next, one featurizer adds a whole series of descriptors; the
    # "composition" column is its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    table = magpie.featurize_dataframe(table, col_id='composition')
    print(table.head())
    print(magpie.citations())

    # Bring in the oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    table = oxid_converter.featurize_dataframe(table, 'composition')
    oxidation = OxidationStates()
    table = oxidation.featurize_dataframe(table, col_id='composition_oxid')
    print(table.head())

    table.to_csv('after_test.csv')
def generate_data(name):
    """Build the feature table for the materials listed in ``name``.

    Reads the CSV ``name`` (first column as index), adds a ``gaps`` column
    filled from "gaps.csv", adds an ``is_daoti`` flag (1 where the gap is
    exactly 0, else 0), then appends composition, Magpie element-property and
    oxidation-state features. A summary is written to
    'general_look_jie.csv' and the final table to '2d_vector_plus.csv'.

    Args:
        name (str): Path of the input CSV; it must provide a
            ``full_formula`` column.
    """
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0  # placeholder for rows without a matching gap
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    # Single forward pass over both tables: a gap is copied whenever
    # 'mp-<gap id>' equals the id of the current row of df, which then
    # advances. The pass stops once every row of df has been matched, so
    # df.index is never read past its end.
    n_rows = len(df.index)
    i = 0
    for j in range(len(df_gap.index)):
        if i >= n_rows:
            break
        str_s = 'mp-' + str(df_gap.index[j])
        if str_s == df.index[i]:
            df.iloc[i, -1] = df_gap.iloc[j, 0]
            i = i + 1
    print("合并完毕")

    # Classification label. The last column at this point is 'gaps'; the
    # comparison replaces the removed DataFrame.ix element-wise loop.
    df['is_daoti'] = (df.iloc[:, -1] == 0).astype('int64')
    print("分类feature建立完成")

    # Use describe() to get an overall picture of the data first.
    print(df.describe())
    df.describe().to_csv('general_look_jie.csv')

    # Convert the formula to a composition and featurize it; the
    # "composition" column is the input of the element-property featurizer.
    df = StrToComposition().featurize_dataframe(
        df, 'full_formula', ignore_errors=True)
    print(df.head())
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition', ignore_errors=True)
    print(df.head())

    # Bring in the oxidation-state related features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')

    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
class FeatureGenerator:
    """
    A wrapper class to generate multiple types of elemental features
    """

    def __init__(self):
        """Create the formula converter and the combined featurizer."""
        self.str2composition = StrToComposition()
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)

    def generate(self,
                 df: pd.DataFrame,
                 ignore_errors: bool = False,
                 drop_mode=True):
        """
        Generate features from a dataframe with a "formula" column that
        contains the chemical formulas of the compositions.

        df : a dataframe with a column named formula
        ignore_errors : ignore errors when generating features
        drop_mode : drop the properties generated by the mode aggregation
            function, i.e. columns that contain "mode" and start with
            "Magpie"

        Returns the featurized dataframe with an extra "NComp" column
        holding ``len`` of each composition.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        # Rows containing missing values are discarded before featurizing.
        complete_rows = with_composition.dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)

        if not drop_mode:
            return featurized

        mode_columns = []
        for column in featurized.columns:
            if "mode" in column and column.startswith("Magpie"):
                mode_columns.append(column)
        return featurized.drop(columns=mode_columns)
class RFEstimator(BaseTesterEstimator):
    """Random forest regressor fed with composition-derived features."""

    def __init__(self, pbar=False):
        """Create the regressor and the featurizers.

        Args:
            pbar (bool): Passed as ``pbar`` to every featurize_many call.
        """
        self.pbar = pbar
        self.stc = StrToComposition()
        self.featurizer = MultipleFeaturizer(
            [ElementProperty.from_preset("magpie"), ElementFraction()])
        self.regressor = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, verbose=3)

    def _generate_features(self, x):
        """Convert the inputs ``x`` to compositions and featurize them.

        Returns:
            numpy array built from the featurizer's output rows.
        """
        compositions = []
        # Each conversion result is a sequence whose first item is the
        # composition.
        for converted in self.stc.featurize_many(x, pbar=self.pbar):
            compositions.append(converted[0])
        rows = self.featurizer.featurize_many(compositions, pbar=self.pbar)
        return np.asarray(rows)

    def fit(self, x, y):
        """Fit the regressor on features generated from ``x``."""
        self.regressor.fit(self._generate_features(x), y)

    def predict(self, x):
        """Return the regressor's predictions for ``x``."""
        return self.regressor.predict(self._generate_features(x))
import matminer
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.io import store_dataframe_as_json
from matminer.utils.io import load_dataframe_from_json
from matminer.figrecipes.plot import PlotlyFig

''' #Block 1 - Loading and filtering the experimental dataframe '''
df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element: parse the 'Ion' column and
# keep only the rows whose ion consists entirely of the selected element.
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(df, 'Ion')
select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug: describe() has to be called; formatting the bare attribute printed
# the bound method instead of the summary.
print("Remaining samples: {}".format(fdf.describe()))

# The composition of the ion is no longer needed; the column name is reused
# below for the composition of the reduced formula.
fdf = fdf.drop(['composition'], axis=1)

## Initial conversion to matminer objects
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')
from matminer.featurizers.conversions import CompositionToOxidComposition
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')
import numpy as np
import pandas as pd
import pickle

''' #Block 1 - Loading dataframe '''
# arbitrary inputs - Li must be excluded to ensure consistency
# Each entry is an [id, reduced formula] pair.
data = [['mp-1025496', 'Nb1 Se2'], ['mp-977563', 'Nb1 Ir2'], ['mp-864631', 'Nb1 Rh2'], ['mp-3368', 'Nb3 O8']]
fdf = pd.DataFrame(data, columns=['Id', 'Reduced Formula'])

## Initial conversion to matminer objects
# Parse the formula strings, then featurize the resulting 'composition'
# column with the oxidation-state converter.
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')
from matminer.featurizers.conversions import CompositionToOxidComposition
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')
print("The initial dataset has {}".format(fdf.shape))
print(fdf.head())

''' Block 2 - Featurization '''
# # -- start F1
# Element-property featurizer configured from the 'magpie' preset.
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name='magpie')
def __init__(self, filepath, dataset, init_samples):
    """Load the raw data and build (or reload) the initial sample set.

    For the 'bandgap' dataset the initial set is cached in
    './ALSearch_init_<init_samples>.csv'. When that file is missing,
    ``init_samples`` rows are drawn with replacement (fixed seed), rows whose
    largest per-element count in the composition string exceeds 20 are
    removed, every remaining formula is encoded as a 128-value latent vector
    (columns V0..V127), and the table is written to the cache file with the
    columns renamed to id, composition, Eg, V0..V127. Otherwise the cache
    file is read back.

    Afterwards rows duplicated in every non-label column are dropped, rows
    with missing values are dropped, and ``self.sorted_df`` holds the table
    sorted by the label column.

    Args:
        filepath (str): CSV providing the columns material_id,
            pretty_formula and band_gap.
        dataset (str): Dataset name; only 'bandgap' is handled here.
        init_samples (int): Number of rows to sample for the initial set.
    """
    self.filepath = filepath
    self.df = pd.read_csv(
        self.filepath,
        usecols=['material_id', 'pretty_formula', 'band_gap'])
    self.dataset = dataset
    self.init_samples = init_samples
    self.init_filename = './ALSearch_init_' + str(init_samples) + '.csv'

    # Compare by value: an identity test against a string literal only
    # succeeds when the caller's string happens to be the same object.
    if dataset == 'bandgap':
        if not os.path.exists(self.init_filename):
            # small examples for debugging
            self.df = self.df.sample(n=self.init_samples,
                                     replace=True,
                                     random_state=42)
            added_columns_name = ['V' + str(i) for i in range(128)]

            # create composition column
            df_comp = StrToComposition(
                target_col_id='composition').featurize_dataframe(
                    self.df, 'pretty_formula')

            # largest atom count per row: the digits of every
            # whitespace-separated token of the composition string
            max_atom_num = [
                max(int(re.sub(r"\D", "", item)) for item in comp_str.split())
                for comp_str in df_comp['composition'].astype(str)
            ]

            # remove rows whose max atom number is above 20
            self.df['max_atom_num'] = max_atom_num
            self.df = self.df[self.df['max_atom_num'] < 21]
            self.df = self.df.drop(['max_atom_num'], axis=1)

            # convert formula to latent vector
            data = []
            for formula in self.df['pretty_formula']:
                print(formula)
                onehot_matrix = formula2onehot_matrix(formula, l=20)
                lat_vec = get_latent_space(onehot_matrix)
                lat_list = lat_vec.tolist()
                data.append(lat_list[0])
                print(formula + 'has been converted into latent vector~')

            # both frames get a fresh 0..n-1 index so the column-wise
            # concatenation lines the rows up by position
            df_added = pd.DataFrame(data, columns=added_columns_name)
            self.df.reset_index(drop=True, inplace=True)
            df_added.reset_index(drop=True, inplace=True)
            self.df = pd.concat([self.df, df_added], axis=1)

            # rename columns to eliminate ' '; plain assignment works on
            # every pandas version, unlike set_axis(..., inplace=False)
            self.df.columns = ['id', 'composition', 'Eg'] + added_columns_name
            self.df.to_csv(self.init_filename, index=False, header=True)
        else:
            self.df = pd.read_csv(self.init_filename)
        print('The shape of initial dataset is ' + str(self.df.shape))
        self.label = ['Eg']

    # NOTE(review): self.label is only assigned for the 'bandgap' dataset;
    # any other dataset name fails on the next statement -- confirm intent.
    # drop duplicate values
    self.df = self.df.drop_duplicates(
        subset=[i for i in self.df.columns if i not in self.label],
        keep='first')
    print('The shape of init dataset after dropping duplicates is ' +
          str(self.df.shape))
    self.df = self.df.dropna()

    # sort dataframe by y value
    self.sorted_df = self.df.sort_values(by=self.label)
# Load the elastic-tensor dataset and list its columns.
df = load_elastic_tensor()
print(df.columns)
""" Index(['material_id', 'formula', 'nsites', 'space_group', 'volume', 'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt', 'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor', 'elastic_tensor', 'elastic_tensor_original'], dtype='object') """

# Columns removed before featurization.
unwanted_columns = ["volume", "nsites", "compliance_tensor", "elastic_tensor", "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss"]
df = df.drop(unwanted_columns, axis=1)

# Parse the 'formula' strings into a 'composition' column.
from matminer.featurizers.conversions import StrToComposition
df = StrToComposition().featurize_dataframe(df, 'formula')

# Element-property features from the "magpie" preset, computed from the
# 'composition' column.
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition')

# Oxidation-state features, computed from the 'composition_oxid' column.
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

from matminer.featurizers.structure import DensityFeatures
# Build the one-hot entry table and list its keys.
onehot = build_entry()
print(onehot)
onehot_l = list(onehot.keys())
print(onehot_l)

filepath = './Utils/bandgap-magpie.csv'
df = pd.read_csv(filepath)
#df = df.sample(frac=0.001, replace=True, random_state=1)
print('The shape of current dataset is ' + str(df.shape))

# Column names V0..V127.
added_columns_name = []
for i in range(128):
    added_columns_name.append('V' + str(i))
data = []

# create composition column
df_comp = StrToComposition(target_col_id='composition').featurize_dataframe(df, 'pretty_formula')

# create column with maximum atom number
max_atom_num = []
for st in df_comp[['composition']].astype(str).values:
    atom_list = []
    #print(st[0])
    s = st[0]
    # Keep only the digits of every whitespace-separated token of the
    # composition string and read them as an integer.
    for item in s.split():
        num = re.sub(r"\D", "", item)
        atom_list.append(int(num))
    #print(atom_list)
    max_atom_num.append(max(atom_list))

# update dataframe with max_atom_num
df['max_atom_num'] = max_atom_num
# remove rows whose max atom number above 20
def test_conversion_multiindex(self):
    """StrToComposition works with flat and two-level column indexes."""
    formulas = {'comp_str': ["Fe2", "MnO2"]}
    expected = [Composition("Fe2"), Composition("MnO2")]
    source_col = ("custom", "comp_str")

    def two_level_frame():
        # Fresh frame whose columns sit under a "custom" top level.
        frame = DataFrame(data=formulas)
        frame.columns = MultiIndex.from_product(
            (["custom"], frame.columns.values))
        return frame

    # Flat columns in, multiindex out.
    flat = StrToComposition().featurize_dataframe(
        DataFrame(data=formulas), 'comp_str', multiindex=True)
    self.assertEqual(
        flat[("StrToComposition", "composition")].tolist(), expected)

    # Two-level columns in, default target column.
    default_target = StrToComposition().featurize_dataframe(
        two_level_frame(), source_col, multiindex=True)
    self.assertEqual(
        default_target[("StrToComposition", "composition")].tolist(),
        expected)

    # Two-level columns in, custom target column name.
    custom_target = StrToComposition(target_col_id='test').featurize_dataframe(
        two_level_frame(), source_col, multiindex=True)
    self.assertEqual(
        custom_target[("StrToComposition", "test")].tolist(), expected)

    # if two level multiindex provided as target, it should be written there
    # here we test converting multiindex in place
    overwriting = StrToComposition(target_col_id=None, overwrite_data=True)
    overwritten = overwriting.featurize_dataframe(
        two_level_frame(), source_col, multiindex=True, inplace=False)
    self.assertEqual(overwritten[source_col].tolist(), expected)

    # Try inplace multiindex conversion with return errors
    overwriting = StrToComposition(target_col_id=None, overwrite_data=True)
    with_errors = overwriting.featurize_dataframe(
        two_level_frame(), source_col, multiindex=True,
        return_errors=True, ignore_errors=True)
    error_column = with_errors[("custom", "StrToComposition Exceptions")]
    self.assertTrue(all(error_column.isnull()))
# 得到数据 query_string = 'mdf.source_name:oqmd AND (oqmd.configuration:static OR ' \ 'oqmd.configuration:standard) AND dft.converged:True' if quick_demo: query_string += " AND mdf.scroll_id:<10000" data = mdf.get_data(query_string, unwind_arrays=False) print(data.head()) # 重命名、预处理和筛选,delta_e应该是形成能 data = data[['oqmd.delta_e.value', 'material.composition']] data = data.rename(columns={ 'oqmd.delta_e.value': 'delta_e', 'material.composition': 'composition' }) data = StrToComposition( target_col_id='composition_obj').featurize_dataframe( data, 'composition') data.sort_values('delta_e', ascending=True, inplace=True) print(data.head(3)) for k in ['delta_e']: data[k] = pd.to_numeric(data[k]) original_count = len(data) data = data[~data['delta_e'].isnull()] print('Removed %d/%d entries' % (original_count - len(data), original_count)) original_count = len(data) data['composition'] = data['composition_obj'].apply( lambda x: x.reduced_formula) data.sort_values('delta_e', ascending=True, inplace=True)
def generate_data():
    """Featurize the elastic-tensor dataset, saving a CSV after every step.

    Loads the dataset, drops the unused columns, then adds composition,
    Magpie element-property, oxidation-state and density features. The
    table is printed and written to its own CSV file after each stage.
    """
    table = load_elastic_tensor()
    table.to_csv('原始elastic数据.csv')
    print(table.columns)

    unwanted_columns = [
        'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
        'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss', 'G_Reuss'
    ]
    table = table.drop(unwanted_columns, axis=1)
    print(table.head())
    table.to_csv('扔掉不需要的部分.csv')

    # Use describe() to get an overall picture of the data first; nothing
    # unusual was found when inspecting it.
    summary = table.describe()
    print(summary)
    table.describe().to_csv('general_look.csv')

    table = StrToComposition().featurize_dataframe(table, 'formula')
    print(table.head())
    table.to_csv('引入composition.csv')

    # Next, one featurizer adds a whole series of descriptors; the
    # "composition" column is its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    table = magpie.featurize_dataframe(table, col_id='composition')
    print(table.head())
    print(magpie.citations())
    table.to_csv('将composition特征化后.csv')

    # Bring in the oxidation-state related features.
    oxid_converter = CompositionToOxidComposition()
    table = oxid_converter.featurize_dataframe(table, 'composition')
    oxidation = OxidationStates()
    table = oxidation.featurize_dataframe(table, col_id='composition_oxid')
    print(table.head())
    table.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are many others, for
    # example structure-based ones.
    density = DensityFeatures()
    table = density.featurize_dataframe(table, 'structure')
    print(table.head())
    table.to_csv('引入结构中的密度.csv')
    print(density.feature_labels())
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm
import pandas as pd

# Widen the pandas display limits for printed tables.
# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Load the dataset and keep only the formula (renamed to "composition") and
# the "gfa" column.
df = load_dataset("glass_ternary_landolt")
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]

# Replace every composition string by its reduced formula; the parsed
# objects only live in the temporary "composition_obj" column.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(df, "composition")
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"]=="ZrTi9"]) # should be False in final dataframe also!!
# print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also!
# print(df["gfa"].value_counts()) # proportion is about 5000 GFA 2054 no GFA
# raise ValueError

# Compare the number of rows with the number of distinct compositions.
unique = df["composition"].unique()
print(len(df))
print(len(unique))

# Accumulators, empty at this point.
problem_compositions = []
new_df_dict = {"composition": [], "gfa": []}
df['elastic_tensor_original'][i] = np.array(df['elastic_tensor_original'][i]['data']) """ ['_id', 'material_id', 'formula', 'nsites', 'space_group', 'volume', 'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt', 'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor', 'elastic_tensor', 'elastic_tensor_original', 'cif', 'kpoint_density', 'poscar'] """ unwanted_columns = ['_id', 'material_id', 'nsites', 'volume', 'cif', 'kpoint_density', 'poscar'] df = df.drop(unwanted_columns, axis=1) from matminer.featurizers.conversions import StrToComposition sc_feat = StrToComposition() df = sc_feat.featurize_dataframe(df, col_id='formula') from matminer.featurizers.composition import ElementProperty ep_feat = ElementProperty.from_preset(preset_name='magpie') df = ep_feat.featurize_dataframe(df, col_id='composition') from matminer.featurizers.conversions import CompositionToOxidComposition co_feat = CompositionToOxidComposition() df = co_feat.featurize_dataframe(df, col_id='composition') from matminer.featurizers.composition import OxidationStates os_feat = OxidationStates()
def AddFeatures(df):
    """Add matminer compositional and structural features to ``df``.

    The "formula" column is parsed into compositions, from which the Magpie
    element-property, oxidation-state and further composition features are
    computed; afterwards the structure featurizers are applied to the
    "structure" column. Every featurizer after the oxidation-state step runs
    with ignore_errors=True.

    Args:
        df: Table with "formula" and "structure" columns.

    Returns:
        The table with all feature columns appended.
    """
    # Add features by Matminer
    from matminer.featurizers.conversions import (
        CompositionToOxidComposition, StrToComposition)
    from matminer.featurizers.composition import (
        BandCenter, CohesiveEnergy, ElectronAffinity, ElementProperty,
        Miedema, OxidationStates, TMetalFraction, ValenceOrbital,
        YangSolidSolution)

    df = StrToComposition().featurize_dataframe(df, "formula")

    # input the "composition" column to the featurizer
    magpie = ElementProperty.from_preset(preset_name="magpie")
    df = magpie.featurize_dataframe(df, col_id="composition")

    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
    df = OxidationStates().featurize_dataframe(df, "composition_oxid")

    # Remaining composition featurizers, built one at a time in this order.
    composition_featurizers = (
        ElectronAffinity,
        BandCenter,
        CohesiveEnergy,
        Miedema,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    for build in composition_featurizers:
        df = build().featurize_dataframe(
            df, "composition_oxid", ignore_errors=True)

    # This is the border between compositional features and structural
    # features. Comment out everything from here to the return statement to
    # use only compositional features.
    from matminer.featurizers.structure import (
        ChemicalOrdering, DensityFeatures, GlobalSymmetryFeatures,
        MaximumPackingEfficiency, MinimumRelativeDistances,
        SiteStatsFingerprint, StructuralComplexity, StructuralHeterogeneity)
    from matminer.featurizers.site import (
        AverageBondAngle, AverageBondLength, BondOrientationalParameter,
        CoordinationNumber)
    from pymatgen.analysis.local_env import CrystalNN

    # Site featurizers are wrapped in SiteStatsFingerprint; the lambdas
    # delay construction until the featurizer is about to be used.
    structure_featurizers = (
        GlobalSymmetryFeatures,
        StructuralComplexity,
        ChemicalOrdering,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        StructuralHeterogeneity,
        lambda: SiteStatsFingerprint(
            AverageBondLength(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(
            AverageBondAngle(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(BondOrientationalParameter()),
        lambda: SiteStatsFingerprint(CoordinationNumber()),
        DensityFeatures,
    )
    for build in structure_featurizers:
        df = build().featurize_dataframe(df, "structure", ignore_errors=True)

    return df
# df = mpdr.get_dataframe({"elasticity": {"$exists": True}, "elasticity.warnings": []},
#                         ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'])

# Query every entry that has a bulk modulus, together with the listed
# properties.
criteria = {'elasticity.K_VRH': {'$ne': None}}
properties = ['pretty_formula', 'spacegroup.symbol', 'elasticity.K_VRH', 'elasticity.G_VRH',
              'formation_energy_per_atom', 'band_gap', 'e_above_hull', 'density', 'volume', 'nsites']
df = mpr.get_dataframe(criteria=criteria, properties=properties)

# Merge with the locally stored table on the columns both frames share, then
# restore material_id as the index.
df1 = pd.read_csv(r'D:\FYP_files\database\data_after_processing\huizong\huizong.csv')
df = df.reset_index()
df = pd.merge(df, df1)
df = df.set_index("material_id")

# Keep entries with a positive bulk modulus and an energy above hull
# below 0.1.
df = df[df['elasticity.K_VRH'] > 0]
df = df[df['e_above_hull'] < 0.1]

# Volume per atom.
df['vpa'] = df['volume'] / df['nsites']

# Poisson ratio (3K - 2G) / (6K + 2G), computed column-wise. Unlike a
# row-wise apply this also works when the filters left no rows.
bulk = df["elasticity.K_VRH"]
shear = df["elasticity.G_VRH"]
df['poisson_ratio'] = (3 * bulk - 2 * shear) / (6 * bulk + 2 * shear)

from matminer.featurizers.conversions import StrToComposition
df = StrToComposition().featurize_dataframe(df, "pretty_formula")

from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")  # input the "composition" column to the featurizer

from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

# Mean and standard deviation of the listed element descriptors, taken
# from the pymatgen data source.
dataset = PymatgenData()
descriptors = ['row', 'group', 'atomic_mass', 'atomic_radius', 'boiling_point', 'melting_point', 'X']
stats = ["mean", "std_dev"]
ep = ElementProperty(data_source=dataset, features=descriptors, stats=stats)
df = ep.featurize_dataframe(df, "composition")
#Remove NaN values
# Collect the values of every elasticity entry as one list per material.
for entry in data_2['elasticity']:
    values_list.append(list(entry.values()))

# Transpose: for every property name gather its value from each material,
# using the name's position in tensor_list as the position in the row.
for prop in tensor_list:
    prop_value = list()
    for materials_val_list in values_list:
        prop_value.append(materials_val_list[tensor_list.index(prop)])
    new_cols_val.append(prop_value)

# Store every gathered list as a column named after its property.
for prop_name in tensor_list:
    data_2[prop_name] = new_cols_val[tensor_list.index(prop_name)]

# prepare for featurization
from matminer.featurizers.conversions import StrToComposition
data_3 = StrToComposition().featurize_dataframe(data_2, "pretty_formula")
#data_3.columns

# In[9]:
# Saving this intermediate dataset before defining training data and targets
import numpy as np
np.savez_compressed("heusler_all.npz", data=data_3)

# In[ ]:
# Featurization
# This part is done with reference to the matminer examples
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
def __init__(self):
    """Build the band-gap feature table and write it to CSV.

    Steps, in order:

    1. Read ``./Utils/bandgap-magpie.csv`` and keep the first row for each
       ``pretty_formula``.
    2. Draw a fixed-seed sample of the rows (``frac=0.0001``, with
       replacement).
    3. Parse each formula into a composition and keep only rows whose
       largest per-element count is below 9.
    4. Convert every remaining formula to a latent vector stored under the
       columns ``V0`` .. ``V127``.
    5. Drop ``material_id`` and ``max_atom_num``, relabel all columns by
       position, and write ``bandgap_df_new_114.csv``.

    The resulting frame is kept on ``self.df``.
    """
    self.filepath = './Utils/bandgap-magpie.csv'
    self.df = pd.read_csv(self.filepath)

    # drop duplicate values
    print('The shape of whole dataset before dropping duplicates is ' +
          str(self.df.shape))
    self.df = self.df.drop_duplicates(subset=['pretty_formula'], keep='first')
    print('The shape of whole dataset after dropping duplicates is ' +
          str(self.df.shape))
    self.df = self.df.sample(frac=0.0001, replace=True, random_state=1)

    # names of the latent-vector columns: V0 .. V127
    added_columns_name = ['V' + str(i) for i in range(128)]

    # create composition column
    df_comp = StrToComposition(
        target_col_id='composition').featurize_dataframe(
            self.df, 'pretty_formula')

    # create column with maximum atom number
    # Each whitespace-separated token of the composition's string form is
    # reduced to its digits and read as an int.
    # NOTE(review): a fractional amount such as "0.5" would be read as 5.
    max_atom_num = []
    for comp_text in df_comp['composition'].astype(str):
        counts = [int(re.sub(r"\D", "", token)) for token in comp_text.split()]
        max_atom_num.append(max(counts))

    # update dataframe with max_atom_num
    self.df['max_atom_num'] = max_atom_num
    # remove rows whose max atom number above 8
    self.df = self.df[self.df['max_atom_num'] < 9]

    # convert formula to latent vector
    # NOTE(review): formula2onehot_matrix / get_latent_space are defined
    # outside this method; only the first row of each result is kept.
    data = []
    for formula in self.df['pretty_formula']:
        print(formula)
        onehot_matrix = formula2onehot_matrix(formula, l=8)
        lat_vec = get_latent_space(onehot_matrix)
        lat_list = lat_vec.tolist()
        data.append(lat_list[0])
        print(formula + 'has been converted into latent vector~')

    # Both frames get a fresh 0..n-1 index so that the column-wise concat
    # lines rows up by position.
    df_added = pd.DataFrame(data, columns=added_columns_name)
    self.df.reset_index(drop=True, inplace=True)
    df_added.reset_index(drop=True, inplace=True)
    self.df = pd.concat([self.df, df_added], axis=1)

    # remove helper columns that must not reach the output file
    column_to_remove = ['material_id', 'max_atom_num']
    self.df = self.df.drop(column_to_remove, axis=1)

    # rename columns to eliminate ' '
    # The names are generated instead of hand-listed: formula, target, then
    # six statistics for each Magpie property, then the latent columns.
    # NOTE(review): the relabelling is positional - it assumes the CSV's
    # remaining columns already come in exactly this order.
    magpie_properties = (
        'Number', 'MendeleevNumber', 'AtomicWeight', 'MeltingT', 'Column',
        'Row', 'CovalentRadius', 'Electronegativity', 'NsValence',
        'NpValence', 'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
        'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
        'GSvolume_pa', 'GSbandgap', 'GSmagmom', 'SpaceGroupNumber',
    )
    magpie_stats = ('minimum', 'maximum', 'range', 'mean', 'avg_dev', 'mode')
    column_rename = ['pretty_formula', 'band_gap']
    column_rename += [
        'MagpieData_' + stat + '_' + prop
        for prop in magpie_properties
        for stat in magpie_stats
    ]
    column_rename += added_columns_name

    # Assign the labels directly: set_axis(..., inplace=False) raises a
    # TypeError on pandas >= 2.0, where the `inplace` keyword was removed.
    self.df.columns = column_rename
    self.df.to_csv(r'bandgap_df_new_114.csv', index=False, header=True)
def _tidy_column(self, df, featurizer_type): """ Various conversions to homogenize columns for featurization input. For example, take a column of compositions and ensure they are decorated with oxidation states, are not strings, etc. Args: df (pandas.DataFrame) featurizer_type: The key defining the featurizer input. For example, composition featurizers should have featurizer_type of "composition". Returns: df (pandas.DataFrame): DataFrame with featurizer_type column ready for featurization. """ # todo: Make the following conversions more robust (no [0] type checking) type_tester = df[featurizer_type].iloc[0] if featurizer_type == self.composition_col: # Convert formulas to composition objects if isinstance(type_tester, str): self.logger.info( self._log_prefix + "Compositions detected as strings. Attempting " "conversion to Composition objects...") stc = StrToComposition(overwrite_data=True, target_col_id=featurizer_type) df = stc.featurize_dataframe(df, featurizer_type, multiindex=self.multiindex, ignore_errors=True, inplace=False) elif isinstance(type_tester, dict): self.logger.info(self._log_prefix + "Compositions detected as dicts. Attempting " "conversion to Composition objects...") df[featurizer_type] = [ Composition.from_dict(d) for d in df[featurizer_type] ] # Convert non-oxidstate containing comps to oxidstate comps if self.guess_oxistates: self.logger.info( self._log_prefix + "Guessing oxidation states of compositions, as" " they were not present in input.") cto = CompositionToOxidComposition( target_col_id=featurizer_type, overwrite_data=True, return_original_on_error=True, max_sites=-50) try: df = cto.featurize_dataframe(df, featurizer_type, multiindex=self.multiindex, inplace=False) except Exception as e: self.logger.info(self._log_prefix + "Could not decorate oxidation states due " "to {}. 
Excluding featurizers based on " "composition oxistates".format(e)) classes_require_oxi = [ c.__class__.__name__ for c in CompositionFeaturizers().need_oxi ] self.exclude.extend(classes_require_oxi) else: # Convert structure/bs/dos dicts to objects (robust already) if isinstance(type_tester, (dict, str)): self.logger.info(self._log_prefix.capitalize() + "{} detected as string or dict. Attempting " "conversion to {} objects..." "".format(featurizer_type, featurizer_type)) if isinstance(type_tester, str): raise ValueError("{} column is type {}. Cannot convert." "".format(featurizer_type, type(type_tester))) dto = DictToObject(overwrite_data=True, target_col_id=featurizer_type) df = dto.featurize_dataframe(df, featurizer_type, inplace=False) # Decorate with oxidstates if featurizer_type == self.structure_col and \ self.guess_oxistates: self.logger.info( self._log_prefix + "Guessing oxidation states of structures if they were " "not present in input.") sto = StructureToOxidStructure( target_col_id=featurizer_type, overwrite_data=True, return_original_on_error=True, max_sites=-50) try: df = sto.featurize_dataframe( df, featurizer_type, multiindex=self.multiindex, inplace=False) except Exception as e: self.logger.info( self._log_prefix + "Could not decorate oxidation states on structures " "due to {}.".format(e)) return df
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import StrToComposition
import numpy as np
import pandas as pd
import csv
import os
import itertools
from pymatgen import Composition
from pymatgen.core.periodic_table import Element

# Read in dataset
filepath = "pifs.csv"
glass_data = pd.read_csv(filepath)

# Make the compositions of the glasses data into pymatgen objects to match the data from OQMD
# Convert compositions to pymatgen objects.
comps = StrToComposition().featurize_dataframe(
    glass_data, "formula", ignore_errors=True)["composition"]

# Loop through all compositions and record, for each one, the element with
# the largest amount (the first such element wins a tie).
majority = []
for c in comps:
    print(c)
    max_comp = -1
    main_element = ""
    # ignore_errors=True leaves NaN (not a Composition) wherever a formula
    # could not be parsed; calling .items() on it would raise. Record an
    # empty majority element for those rows so `majority` stays aligned
    # with `comps`.
    if isinstance(c, Composition):
        for element, amount in c.items():
            if amount > max_comp:
                max_comp = amount
                main_element = element
    majority.append(str(main_element))
json.loads( urlopen("http://aflowlib.duke.edu/search/API/?" + MATCHBOOK + ",$paging(0)").read().decode("utf-8")))['compound'] matrix = pd.DataFrame([metal] * len(substrate)) Tsplit = [float(i) for i in Trange.split('-')] if Tsplit[2] != 0: Tlist = np.arange(Tsplit[0], Tsplit[1], Tsplit[2]).tolist() sys_cond_0 = pd.concat([matrix, substrate], axis=1) sys_cond_0['Temp'] = pd.DataFrame([Tsplit[0]] * len(substrate)) sys_cond_0.columns = ['Metal', 'Substrate', 'Temp'] metal_matminer = pd.DataFrame([metal], columns=['Metal']) metal_matminer = StrToComposition(target_col_id='Me_comp').featurize_dataframe( metal_matminer, 'Metal') data_Me = magpie.featurize_dataframe(metal_matminer, col_id="Me_comp", ignore_errors=True) metal_features = pd.DataFrame(data_Me.values.tolist() * len(substrate), columns=data_Me.columns) feature_Me = metal_features.filter(like='mean') feature_Me = feature_Me.drop(columns=['MagpieData mean NfUnfilled']) feature_Me.columns = ['Me_' + j for j in feature_Me.columns] sys_cond_0 = StrToComposition(target_col_id='Sub_comp').featurize_dataframe( sys_cond_0, 'Substrate') data_Sub = magpie.featurize_dataframe(sys_cond_0, col_id="Sub_comp", ignore_errors=True) feature_Sub = data_Sub.filter(like='mean')