def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
    """
    Tests featurize_dos and featurize_bandstructure.

    Args:
        refresh_df_init (bool): for developers, if the test needs to be
            updated set to True. Otherwise set to False to make the final
            test independent of MPRester and faster.
        limit (int): the maximum final number of entries.

    Returns (None):
    """
    target = "color"
    df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
    pickle_path = os.path.join(TEST_DIR, df_bsdos_pickled)
    if refresh_df_init:
        # Re-download the reference entry and refresh the cached pickle.
        mpdr = MPDataRetrieval()
        df = mpdr.get_dataframe(
            criteria={"material_id": "mp-149"},
            properties=[
                "pretty_formula",
                "dos",
                "bandstructure",
                "bandstructure_uniform",
            ],
        )
        df.to_pickle(pickle_path)
    else:
        df = pd.read_pickle(pickle_path)
    df = df.dropna(axis=0)
    # Swap the names: the uniform band structure becomes "bandstructure" and
    # the original "bandstructure" column is kept as "line bandstructure".
    df = df.rename(
        columns={
            "bandstructure_uniform": "bandstructure",
            "bandstructure": "line bandstructure",
        }
    )
    df[target] = [["red"]]
    n_cols_init = df.shape[1]

    featurizer = AutoFeaturizer(
        preset="express", ignore_errors=False, multiindex=False
    )
    df = featurizer.fit_transform(df, target)

    # sanity checks
    # NOTE: this used to be assertTrue(len(df), limit), which treats `limit`
    # as the failure message and therefore never checked it.
    self.assertLessEqual(len(df), limit)
    self.assertGreater(len(df.columns), n_cols_init)

    # DOSFeaturizer:
    self.assertEqual(df["cbm_character_1"][0], "p")

    # DopingFermi:
    self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

    # Hybridization:
    self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
    self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
    self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

    # BandFeaturizer:
    self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
    self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

    # BranchPointEnergy:
    self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Plot experimental band gaps (Citrine) against DFT-computed band gaps
    (materialsproject.org) in an xy scatter plot, followed by a residual plot.

    For each chemical formula the computed gap is taken from the entry with
    the lowest energy above hull (the most stable structure).

    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't
            have a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)

    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """
    expt_col = 'Expt. gap'
    mp_col = 'MP computed gap'

    # --- experimental band gaps from Citrine ---
    citrine = CitrineDataRetrieval(api_key=citrine_api_key)
    expt = citrine.get_dataframe(
        prop='band gap',
        data_type='experimental',
        show_columns=['chemicalFormula', 'Band gap'],
        max_results=limit)
    expt = expt.rename(
        columns={'chemicalFormula': 'Formula', 'Band gap': expt_col})
    # "p1" in 'In1p1' is not recognized in Composition
    expt = expt[expt['Formula'] != 'In1p1']
    # null band gaps cause problem when plotting residuals
    expt = expt.dropna()

    def _reduced_formula(formula):
        return Composition(formula).get_reduced_formula_and_factor()[0]

    expt['Formula'] = expt['Formula'].transform(_reduced_formula)

    # --- computed band gaps from the Materials Project ---
    formulas = list(expt['Formula'].values)
    computed = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': formulas}},
        properties=['pretty_formula', 'material_id', 'band_gap',
                    'e_above_hull'],
        index_mpid=False)
    computed = computed.rename(
        columns={'pretty_formula': 'Formula',
                 'band_gap': mp_col,
                 'material_id': 'mpid'})

    # keep, per formula, the row with the lowest e_above_hull
    most_stable_rows = computed.groupby("Formula")["e_above_hull"].idxmin()
    stable = computed.loc[most_stable_rows]

    merged = expt.merge(stable, on='Formula')
    merged = merged.drop('e_above_hull', axis=1)
    df_final = merged.set_index('mpid')

    pf = PlotlyFig(df_final,
                   x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap, with a dashed black y = x line
    parity_series = [(expt_col, mp_col), ([0, 12], [0, 12])]
    pf.xy(parity_series,
          lines=[{}, {'color': 'black', 'dash': 'dash'}],
          labels=df_final.index,
          modes=['markers', 'lines'],
          names=['Computed vs. expt.', 'Expt. gap'])

    # residual:
    residuals = df_final[mp_col] - df_final[expt_col].astype(float)
    pf.set_arguments(x_title='Experimental band gap (eV)',
                     y_title='Residual (Computed - Expt.) Band Gap (eV)',
                     filename='band_gap_residuals')
    pf.xy((expt_col, residuals), labels=df_final.index)
class MPDataRetrievalTest(unittest.TestCase):
    """Checks that MPDataRetrieval can fetch a single entry as a dataframe."""

    def setUp(self):
        # `mapi_key` is a module-level name defined outside this class.
        self.mpdr = MPDataRetrieval(mapi_key)

    def test_get_data(self):
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure"])
        # Without an assertion the test only proved that no exception was
        # raised; a single material id must yield exactly one row.
        self.assertEqual(len(df["structure"]), 1)
class MPDataRetrievalTest(unittest.TestCase):
    """Fetches one entry through MPDataRetrieval, skipping without a key."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        # Guard clause: nothing can be queried when no API key is configured.
        if not self.mpdr.mprester.api_key:
            raise SkipTest(
                "Skipped MPDataRetrieval test; no MAPI_KEY detected")

        query = {"material_id": "mp-23"}
        wanted = ["structure"]
        df = self.mpdr.get_dataframe(criteria=query, properties=wanted)
        self.assertEqual(len(df["structure"]), 1)
class MPDataRetrievalTest(unittest.TestCase):
    """Checks the object types MPDataRetrieval returns for one entry."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        df = self.mpdr.get_dataframe(
            criteria={"material_id": "mp-23"},
            properties=["structure", "bandstructure",
                        "bandstructure_uniform", "dos"])
        self.assertEqual(len(df["structure"]), 1)

        # .iloc[0] selects the first row by position regardless of the index
        # labels; a bare integer key on a label-indexed Series relies on a
        # positional fallback that pandas has deprecated.
        bandstructure = df["bandstructure"].iloc[0]
        self.assertEqual(bandstructure.get_band_gap()["energy"], 0)

        # assertIsInstance reports the offending object and type on failure,
        # unlike assertTrue(isinstance(...)).
        self.assertIsInstance(bandstructure, BandStructureSymmLine)
        self.assertIsInstance(df["bandstructure_uniform"].iloc[0],
                              BandStructure)
        self.assertIsInstance(df["dos"].iloc[0], CompleteDos)
""" from pymatgen import MPRester from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd import numpy as np from tqdm import tqdm pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) chunksize = 1000 mpdr = MPDataRetrieval() mpr = MPRester() def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): yield l[i:i + n] df = mpdr.get_dataframe(criteria={ "e_above_hull": { "$lt": 0.150 }, "formation_energy_per_atom": { "$lt": 0.150
- elasticity_G_VRH - elasticity_log10(G_VRH) From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd import numpy as np pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) mpdr = MPDataRetrieval() df = mpdr.get_dataframe( criteria={ "e_above_hull": { "$lt": 0.150 }, "formation_energy_per_atom": { "$lt": 0.150 }, "elasticity": { "$exists": 1, "$ne": None }, }, # "elements": },
def setUp(self):
    """Store an MPDataRetrieval client built with ``mapi_key`` on ``self.mpdr``."""
    # `mapi_key` is not defined in this method; it comes from an enclosing scope.
    self.mpdr = MPDataRetrieval(mapi_key)
def query_data(pname, api_key, path=''):
    """Download a property plus crystal structures and pickle the result.

    Args:
        pname (str): name of the Materials Project property to query. Only
            entries where the property exists and ``<pname>.warnings`` is
            null are requested.
        api_key (str): API key passed to MPDataRetrieval.
        path (str): directory in which ``<pname>.pkl`` is written. The empty
            default means the current working directory.

    Returns:
        pandas.DataFrame: the property columns ('pretty_formula', pname,
        'e_above_hull') joined column-wise with a 'structure' column.
    """
    import os  # local import: only needed to build the output filename

    mpdr = MPDataRetrieval(api_key)

    # query properties
    props = mpdr.get_dataframe(
        criteria={pname: {"$exists": True},
                  "{}.warnings".format(pname): None},
        properties=['pretty_formula', pname, 'e_above_hull'])
    print("There are {} entries satisfying criteria".format(
        props[pname].count()))

    # Load crystal structures in chunks of ids (presumably to keep each
    # request small -- the original comment was truncated).
    chunk_size = 1000
    mp_ids = props.index.tolist()
    frames = [
        mpdr.get_dataframe(
            {"material_id": {"$in": mp_ids[start:start + chunk_size]}},
            ['structure'])
        for start in range(0, len(mp_ids), chunk_size)
    ]
    # DataFrame.append was removed in pandas 2.0; a single concat is also
    # linear instead of quadratic in the number of chunks.
    if frames:
        structures = pd.concat(frames)
    else:
        structures = pd.DataFrame(columns=['structure'])

    data = pd.concat([props, structures], axis=1)

    # os.path.join keeps an empty `path` relative; the previous
    # '%s/%s.pkl' formatting produced '/<pname>.pkl' (filesystem root).
    fname = os.path.join(path, '%s.pkl' % pname)
    data.to_pickle(fname)
    print('Saved file to ', fname)
    return data


def filter_data(df, elems, pname, pmin=None, pmax=None, stab=None):
    """Filter entries by chemistry, property range and stability.

    Args:
        df (pandas.DataFrame): must contain 'pretty_formula', `pname` and
            (when `stab` is given) 'e_above_hull' columns.
        elems (iterable of str): element symbols that must ALL appear in
            'pretty_formula'. An empty iterable keeps every row.
        pname (str): name of the property column.
        pmin (float or None): keep rows with ``df[pname] >= pmin``.
        pmax (float or None): keep rows with ``df[pname] <= pmax``.
        stab (float or None): keep rows with ``e_above_hull <= stab``.

    Returns:
        pandas.DataFrame: the filtered rows; `df` itself is not modified.
    """
    import re  # local import: only needed for the element pattern

    print('# entries before filters: ', len(df))

    # filter by chemistry
    keep = np.ones(len(df), dtype=bool)
    for symbol in elems:
        # The negative lookahead stops "C" from matching "Ca"/"Co"/"Cl":
        # an element symbol is never followed by another lowercase letter.
        pattern = re.escape(symbol) + r'(?![a-z])'
        hits = df['pretty_formula'].str.contains(pattern, regex=True,
                                                 na=False)
        keep &= hits.values.astype(bool)
    df = df[keep]
    print('# entries after chemistry: ', len(df))

    # filter by property values; `is not None` so that 0 is a valid bound
    if pmin is not None:
        df = df[df[pname] >= pmin]
    if pmax is not None:
        df = df[df[pname] <= pmax]
    print('# entries after property: ', len(df))

    # filter by stability; stab=0 legitimately means "on the hull only"
    if stab is not None:
        df = df[df['e_above_hull'] <= stab]
    print('# entries after stability: ', len(df))
    return df


def get_xy(df, elems, pname, pmin, pmax, stab):
    """Build the feature matrix and target vector from a featurized frame.

    Rows containing NaNs are dropped, the remaining rows are filtered with
    `filter_data`, and both outputs are ordered by the sorted index.

    Returns:
        tuple: (x, y) where x is a DataFrame without the non-input columns
        and y is a numpy array of ``df[pname]``.
    """
    # filter NaNs and entries based on criteria
    df = df.dropna()
    df = filter_data(df, elems, pname, pmin=pmin, pmax=pmax, stab=stab)

    # exclude non-input columns
    exclude = ['pretty_formula', pname, 'e_above_hull', 'structure',
               'composition', 'composition_oxid',
               'radial distribution function']

    # Sort once so x and y are guaranteed to be row-aligned.
    df = df.sort_index()
    # errors='ignore': some of these columns only exist when the matching
    # featurizer was run (e.g. the radial distribution function is optional).
    x = df.drop(exclude, axis=1, errors='ignore')
    y = df[pname].values
    return x, y


def fit_forest(x, y, lbl='Full'):
    """Grid-search and fit a RandomForestRegressor on an 80/20 split.

    Prints the best CV score, the best parameters, the test score and the
    train/test mean absolute error normalized by the mean target value.

    Args:
        x: feature matrix.
        y: target values.
        lbl (str): label used in the printed error lines.

    Returns:
        RandomForestRegressor: forest refitted on the training split with
        the best parameters found.
    """
    # split data
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # grid-search optimal parameters
    rf = RandomForestRegressor()
    param_grid = {
        'n_estimators': [10, 25, 50, 100, 250],
        # None means "use all features", which is what 'auto' meant for
        # regressors; 'auto' itself is rejected by recent scikit-learn.
        'max_features': [None, 'sqrt', 'log2'],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 5],
    }
    grid = GridSearchCV(rf, param_grid, n_jobs=-1, cv=5)
    grid.fit(x_train, y_train)
    print(grid.best_score_)
    print(grid.best_params_)
    print(grid.score(x_test, y_test))

    # use optimal parameters
    rf.set_params(**grid.best_params_)
    rf.fit(x_train, y_train)

    # mean absolute error relative to the mean target value
    err_train = np.mean(abs(rf.predict(x_train) - y_train)) / np.mean(y_train)
    print('%s RF, train error: %.3f' % (lbl, err_train))
    err_test = np.mean(abs(rf.predict(x_test) - y_test)) / np.mean(y_test)
    print('%s RF, test error : %.3f' % (lbl, err_test))
    return rf


def fit_model(x, y, show_flag=False):
    """Fit a full random forest, prune weak features and refit.

    Features whose importance is below half the median importance are
    dropped before the second fit.

    Args:
        x (pandas.DataFrame): feature matrix.
        y: target values.
        show_flag (bool): if True, show an importance bar chart and a
            parity plot with matplotlib.

    Returns:
        RandomForestRegressor: the forest fitted on the pruned features.
    """
    # fit RF using all variables
    print('Fitting full random forest...')
    rf = fit_forest(x, y, lbl='Full')

    # variable importances, most important first
    nvar = 10
    imp = rf.feature_importances_
    order = np.argsort(imp)[::-1]
    top_names = x.columns.values[order][0:nvar]
    print('%d most important variables:' % nvar)
    print(top_names)

    # prune variables
    thr = 0.5 * np.median(imp)
    exclude = list(x.columns.values[imp < thr])
    x_sel = x.drop(exclude, axis=1)

    # fit RF using important variables
    print('\nFitting pruned random forest...')
    rf = fit_forest(x_sel, y, lbl='Pruned')
    print('%d pruned variables:' % len(x_sel.columns))
    print(x_sel.columns.values)

    if show_flag:
        # importance chart (importances come from the FULL forest)
        plt.subplot(121)
        plt.bar(x=top_names, height=imp[order][0:nvar],
                color=(0.3, 0.3, 0.9))
        plt.xticks(top_names, top_names, rotation='vertical')
        plt.xlabel('Variables')
        plt.ylabel('Importance')

        # parity plot
        ax = plt.subplot(122)
        ax.set_aspect(1)
        # color= instead of c=: a bare RGB tuple passed as `c` is ambiguous
        # with a sequence of three scalar values.
        plt.scatter(y, rf.predict(x_sel), marker='s', alpha=.25,
                    color=(0.9, 0.3, 0.3))
        plt.plot(np.arange(np.max(y)), c='gray')
        plt.xlabel('Ground truth')
        plt.ylabel('RF prediction')
        plt.subplots_adjust(bottom=0.25, top=0.75)
        plt.draw()
        plt.show()
    return rf


def _weighted_average(elem, fracs, getter):
    """Return sum(getter(el) * fraction), or NaN when a value is not numeric."""
    try:
        return sum(getter(el) * fr for el, fr in zip(elem, fracs))
    except TypeError:
        return float('nan')


def add_atom_feats(df):
    """Add composition-averaged elemental descriptors to `df` in place.

    Each descriptor is the atomic-fraction-weighted sum of an elemental
    property over the composition parsed from 'pretty_formula'. A descriptor
    is NaN for a row when the multiplication raises TypeError.

    Returns:
        tuple: (df, feat_labels) -- the same frame with the new columns and
        the list of the column names that were added.
    """
    descriptors = [
        ('avg row', lambda el: el.row),
        ('avg column', lambda el: el.group),
        ('avg num', lambda el: el.number),
        ('avg el-neg', lambda el: el.X),
        ('avg atom mass', lambda el: el.data['Atomic mass']),
        ('avg atom radius', lambda el: el.data['Atomic radius']),
        ('avg ionic radius', lambda el: el.average_ionic_radius),
    ]
    feat_labels = [label for label, _ in descriptors]
    values = {label: [] for label in feat_labels}

    # loop through entries
    for formula in df['pretty_formula']:
        comp = Composition(formula)
        elem, fracs = zip(*comp.fractional_composition.items())
        for label, getter in descriptors:
            values[label].append(_weighted_average(elem, fracs, getter))

    for label in feat_labels:
        df[label] = pd.Series(values[label], index=df.index)
    return df, feat_labels


def add_cs_features(df, rdf_flag=False):
    """Add composition and structure features computed by featurizers.

    Adds 'composition' and 'composition_oxid' columns, converts the
    'structure' column with `dict_to_object`, then applies ValenceOrbital,
    OxidationStates and DensityFeatures.

    Args:
        df (pandas.DataFrame): needs 'pretty_formula' and 'structure'.
        rdf_flag (bool): also apply RadialDistributionFunction
            (cutoff=15.0, bin_size=0.2) to the structures.

    Returns:
        pandas.DataFrame: the featurized frame.
    """
    df["composition"] = str_to_composition(df["pretty_formula"])
    df["composition_oxid"] = composition_to_oxidcomposition(df["composition"])
    df["structure"] = dict_to_object(df["structure"])

    steps = [
        (ValenceOrbital(), "composition"),
        (OxidationStates(), "composition_oxid"),
        # structure features
        (DensityFeatures(), "structure"),
    ]
    if rdf_flag:
        steps.append(
            (RadialDistributionFunction(cutoff=15.0, bin_size=0.2),
             "structure"))

    for feat, column in steps:
        df = feat.featurize_dataframe(df, column)
    return df
""" This file makes the following benchmarking datasets: - castelli From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd # pd.set_option('display.height', 1000) pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) mpdr = MPDataRetrieval() df = load_dataset("castelli_perovskites") df = df[["structure", "e_form"]] df = df.reset_index(drop=True) print(df) df.to_pickle("castelli.pickle.gz")
def featurize_by_material_id(
        material_ids: np.array,
        featurizerObject: featurizer.extendedMODFeaturizer,
        MAPI_KEY: str,
        writeToFile: bool = True) -> pd.DataFrame:
    """
    Download Materials Project entries portion by portion and featurize them.

    For every portion of ids the band structure, DOS and structure objects
    are downloaded and passed to ``featurizerObject.featurize``. A failing
    portion is retried until it succeeds.

    Arguments:
        material_ids: ids to query; they are matched against the
            ``task_id`` field and written back as the ``material_id`` column.
        featurizerObject: object whose ``featurize(df)`` returns a
            ``(timing_dataframe, featurized_dataframe)`` pair.
        MAPI_KEY: API key passed to ``MPDataRetrieval``.
        writeToFile: if True, the accumulated results are written after
            every portion to ``data/raw/featurizer/featurized.pkl`` and
            ``data/raw/featurizer/timing.csv`` under the directory two levels
            above the one containing this file.

    Returns:
        The featurized DataFrame of all portions concatenated.
    """

    def apply_featurizers(criterion, properties, mpdr, featurizerObject):
        """Download one portion, featurize it and record the download time."""
        LOG.info("Downloading dos and bandstructure objects..")
        timeDownloadStart = time.time()
        df_portion = mpdr.get_dataframe(criteria=criterion,
                                        properties=properties)
        timeDownloadEnd = time.time()
        LOG.info(df_portion)
        df_time, df_portion = featurizerObject.featurize(df_portion)
        df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]
        return df_time, df_portion

    properties = [
        "material_id", "full_formula", "bandstructure", "dos", "structure"
    ]

    mpdr = MPDataRetrieval(MAPI_KEY)
    steps = 1  # number of ids per portion

    df = pd.DataFrame({})
    df_timers = pd.DataFrame({})

    # Slicing past the end just yields a shorter final portion, so one loop
    # covers both the full portions and any leftover.
    for i in tqdm(range(0, len(material_ids), steps)):
        portion_ids = material_ids[i:i + steps]
        LOG.info(list(portion_ids))
        criteria = {"task_id": {"$in": list(portion_ids)}}

        # Retry until the portion comes back. Only Exception is caught so
        # that Ctrl-C / SystemExit can still stop the loop; the traceback is
        # logged so a permanently failing portion is visible.
        while True:
            try:
                df_time, df_portion = apply_featurizers(
                    criteria, properties, mpdr, featurizerObject)
                break
            except Exception:
                LOG.info("Except - try again.", exc_info=True)

        # Add ID to recognize afterwards
        df_portion["material_id"] = portion_ids

        df = pd.concat([df, df_portion])
        df_timers = pd.concat([df_timers, df_time])

        LOG.info("CURRENT SHAPE:{}".format(df.shape))
        if writeToFile:
            out_dir = (Path(__file__).resolve().parents[2] / "data" / "raw" /
                       "featurizer")
            df.to_pickle(out_dir / "featurized.pkl")
            df_timers.to_csv(out_dir / "timing.csv")

    return df
# Print parameters.
print("REMOVE UNSTABLE ENTRIES:", FILTER)
print("USE FABER DATASET:", FABER)
print("USE TERNARY OXIDE DATASET:", not FABER)
print("NUMBER OF JOBS:", NJOBS)
print("DEBUG MODE:", args.debug)

# Set up dataset
if FABER:
    df = load_flla()
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    # Chemical-system wildcard: any two elements plus oxygen.
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = ['structure', 'nsites', 'formation_energy_per_atom',
                  'e_above_hull']
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. Iterating over the column
    # avoids integer lookups on a frame that is indexed by labels.
    df['structure'] = [Structure.from_dict(s) for s in df['structure']]
    # Filter the dataset if it consists of ternary oxides
    df = df[df['e_above_hull'] < 0.1]
    df = df[df['nsites'] <= 30]
Regenerating from the newest Materials Project calculations """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval from pymatgen import Element import pandas as pd import numpy as np # pd.set_option('display.height', 1000) pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) mpdr = MPDataRetrieval() # df = load_dataset("dielectric_constant") df = mpdr.get_dataframe( criteria={"has": "diel"}, properties=[ "material_id", "diel.n", "formation_energy_per_atom", "e_above_hull", "structure", ], index_mpid=False, ) df = df[(df["e_above_hull"] < 0.150)
import numpy as np import pandas as pd from ast import literal_eval from tqdm import tqdm from pydash import py_ from matminer.data_retrieval.retrieve_MP import MPDataRetrieval mpdr = MPDataRetrieval(api_key='3AdDSGEqlThTHVeu') def Retrieve_data(bg_lower, bg_upper, raw_name): """ Downloads data from the MPD Parameters ---------- bg_lower : Int or float Lower bound of bandgap for the materials to be collected bg_upper : Int or float Upper bound of bandgap for the materials to be collected raw_name : Str Desired file name for raw data """ properties = ['material_id', 'xrd.Cu', 'band_gap', 'efermi'] criteria = { "band_gap": { '$gt': bg_lower, '$lt': bg_upper }, "efermi": { '$exists': True
def setUp(self):
    """Store an MPDataRetrieval client built with no arguments on ``self.mpdr``."""
    self.mpdr = MPDataRetrieval()
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval from pymatgen.electronic_structure.plotter import BSDOSPlotter from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval from matminer.data_retrieval.retrieve_MDF import MDFDataRetrieval mpdr = MPDataRetrieval() df = mpdr.get_dataframe(criteria={"nelements": 1}, properties=['density', 'pretty_formula']) print("There are {} entries on MP with 1 element".format( df['density'].count())) print(df.head()) df = mpdr.get_dataframe({"band_gap": { "$gt": 4.0 }}, ['pretty_formula', 'band_gap']) print("There are {} entries on MP with a band gap larger than 4.0".format( df['band_gap'].count())) df.to_csv('gt4.csv') df = mpdr.get_dataframe( { "elasticity": { "$exists": True }, "elasticity.warnings": [] }, ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']) print("There are {} elastic entries on MP with no warnings".format( df['elasticity.K_VRH'].count())) df = mpdr.get_dataframe( criteria={ "elasticity": { "$exists": True
import os

import matminer
import pymatgen
import pandas
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

# The API key is read from the environment instead of being committed to the
# source. When the variable is unset, None is passed on to MPDataRetrieval.
# NOTE(review): confirm PMG_MAPI_KEY is the variable name used in deployment.
df_mp = MPDataRetrieval(os.environ.get("PMG_MAPI_KEY")).get_dataframe(
    criteria={"task_id": {"$in": ["mp-22862"]}},
    properties=["structure"])
print(type(df_mp))  # pandas dataframe
print(df_mp.iloc[0])
# outputs the following:
# structure [[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...
# Name: mp-22862, dtype: object

# Go one level deeper: first row, first column, selected by position.
structure = df_mp.iloc[0, 0]
print(structure)

# Per the original note, this is a pymatgen Structure object,
# which is a data format we can create ourselves.
print(type(structure))
warnings.filterwarnings('ignore')

import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Set pandas view options
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from automatminer import MatPipe
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

# Set your MP API key in the environment rather than committing it to the
# source. When the variable is unset, None is passed on to MPDataRetrieval.
# NOTE(review): confirm PMG_MAPI_KEY is the variable name used in deployment.
api_key = os.environ.get("PMG_MAPI_KEY")
mpr = MPDataRetrieval(api_key)

# Entries that have elasticity data and an empty warnings list.
df = mpr.get_dataframe(
    {
        "elasticity": {"$exists": True},
        "elasticity.warnings": [],
    },
    ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'])

# Filter out unstable entries and negative bulk moduli
df = df[df['elasticity.K_VRH'] > 0]
import numpy as np import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec plt.rcParams["figure.figsize"] = [6, 7] font = {'family': 'Avenir', 'weight': 'normal', 'size': 26} math_font = 'stixsans' plt.rc('font', **font) plt.rcParams['mathtext.fontset'] = math_font plt.rcParams['axes.labelsize'] = font['size'] plt.rcParams['xtick.labelsize'] = font['size'] - 2 plt.rcParams['ytick.labelsize'] = font['size'] - 2 plt.rcParams['legend.fontsize'] = font['size'] - 2.5 mat_api_key = 'YourPymatgenAPI' mpdr = MPDataRetrieval(mat_api_key) df_terqua = mpdr.get_dataframe(criteria={ 'nsites': { '$lt': 41 }, 'e_above_hull': { '$lt': 0.08 }, 'nelements': { '$gt': 2, '$lt': 5 }, }, properties=[ 'material_id',
#!/usr/bin/env python # coding: utf-8 # In[1]: # UROP Phase 1 Data Retrieval and partial preprocessing # all data is from material project database. from matminer.datasets import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval mpdr = MPDataRetrieval(api_key="KcDv6qi5w4rUZSlt") d = load_dataset("heusler_magnetic") heusler_formula = d['formula'] # In[ ]: import pandas as pd import time query_time = list() false_list = list() true_list = list() data = list() heusler_matrix = pd.DataFrame() start_time = time.time() for name in heusler_formula: t_1 = time.time() data_got = mpdr.get_data( criteria=name, properties=['pretty_formula', 'structure', 'elasticity']) t_2 = time.time()
def generate_mp(max_nsites=None, properties=None, write_to_csv=False,
                write_to_compressed_json=True):
    """
    Grabs all mp materials. This will return two csv/json.gz files:
        * mp_nostruct: All MP materials, not including structures
        * mp_all: All MP materials, including structures

    Args:
        max_nsites (int): The maximum number of sites to include in the
            query. If None, 1..100 sites are queried one by one, followed by
            a single query for everything above 100 sites.
        properties (iterable of strings): list of properties supported by
            MPDataRetrieval
        write_to_csv (bool): whether to write resulting dataframe to csv
        write_to_compressed_json (bool): whether to write resulting
            dataframe to json.gz file

    Returns (pandas.DataFrame):
        retrieved/generated data. When either write flag is set, the
        returned frame has the structure columns removed (this matches the
        previous behaviour); otherwise the full frame is returned.
    """
    # Set default properties if None and ensure is a list
    if properties is None:
        properties = ['pretty_formula', 'e_above_hull', 'band_gap',
                      'total_magnetization', 'elasticity.elastic_anisotropy',
                      'elasticity.K_VRH', 'elasticity.G_VRH', 'structure',
                      'energy', 'energy_per_atom',
                      'formation_energy_per_atom']
    elif not isinstance(properties, list):
        properties = list(properties)

    # Pick columns to drop structure data from
    drop_cols = [col_name
                 for col_name in ("structure", "initial_structure")
                 if col_name in properties]

    mpdr = MPDataRetrieval()
    if max_nsites is not None:
        sites_list = list(range(1, max_nsites + 1))
    else:
        sites_list = list(range(1, 101)) + [{"$gt": 100}]

    # Collect the per-query frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.
    responses = []
    n_entries = 0
    for site_specifier in tqdm(sites_list, desc="Querying Materials Project"):
        # While loop to repeat queries if server request fails
        while True:
            try:
                site_response = mpdr.get_dataframe(
                    criteria={"nsites": site_specifier},
                    properties=properties, index_mpid=True
                )
                break
            except MPRestError:
                tqdm.write("Error querying materials project, "
                           "trying again after 5 sec")
                sleep(5)

        responses.append(site_response)
        n_entries += len(site_response)
        tqdm.write("DataFrame with {} entries created".format(n_entries))

    df = pd.concat(responses) if responses else pd.DataFrame()

    if not (write_to_csv or write_to_compressed_json):
        return df

    # Keep the full and the structure-free frames separate. Previously the
    # csv branch dropped the structure columns in place, so with both flags
    # set "mp_all.json.gz" was written without structures.
    df_nostruct = df.drop(drop_cols, axis=1)

    # Write data out to file if user so chooses
    if write_to_csv:
        df.to_csv("mp_all.csv")
        df_nostruct.to_csv("mp_nostruct.csv")

    if write_to_compressed_json:
        store_dataframe_as_json(df, "mp_all.json.gz", compression="gz")
        store_dataframe_as_json(df_nostruct, "mp_nostruct.json.gz",
                                compression="gz")

    return df_nostruct
def data_query(mp_api_key, max_elms=3, min_elms=3, max_sites=20,
               include_te=False):
    """
    The function queries data from Materials Project.

    Parameters
    ----------
    mp_api_key : str
        The API key for Materials Project.
    max_elms : int, optional
        Maximum number of components/elements for crystals to be queried.
        The default is 3.
    min_elms : int, optional
        Minimum number of components/elements for crystals to be queried.
        The default is 3.
    max_sites : int, optional
        Maximum number of sites for crystals to be queried.
        The default is 20.
    include_te : bool, optional
        If True, the thermoelectric properties read from
        'data/thermoelectric_prop.csv' are concatenated column-wise for the
        compounds present in both tables, and the 'Seebeck' column is
        replaced by its absolute value. The default is False.

    Returns
    -------
    dataframe : pandas dataframe
        Dataframe returned by MPDataRetrieval, with an extra integer 'ind'
        column numbering the rows from 0.

    """
    mpdr = MPDataRetrieval(mp_api_key)

    # Specify query criteria in MongoDB style
    query_criteria = {
        'e_above_hull': {'$lte': 0.08},  # eV/atom
        'nelements': {'$gte': min_elms, '$lte': max_elms},
        'nsites': {'$lte': max_sites},
    }
    # Specify properties to be queried, properties available are at
    # https://github.com/materialsproject/mapidoc/tree/master/materials
    query_properties = [
        'material_id',
        'formation_energy_per_atom',
        'band_gap',
        'pretty_formula',
        'e_above_hull',
        'elements',
        'cif',
        'spacegroup.number',
    ]
    # Obtain queried dataframe containing CIFs and ground-state property labels
    dataframe = mpdr.get_dataframe(
        criteria=query_criteria,
        properties=query_properties,
    )
    # Positional row number (previously assigned a second, identical time
    # inside the include_te branch).
    dataframe['ind'] = np.arange(len(dataframe))

    if include_te:
        # Read thermoelectric properties from
        # https://datadryad.org/stash/dataset/doi:10.5061/dryad.gn001
        te = pd.read_csv('data/thermoelectric_prop.csv', index_col=0)
        te = te.dropna()
        # Get compound index that has both ground-state and thermoelectric
        # properties
        ind = dataframe.index.intersection(te.index)
        # Concatenate thermoelectric properties to corresponding compounds
        dataframe = pd.concat([dataframe, te.loc[ind, :]], axis=1)
        dataframe['Seebeck'] = dataframe['Seebeck'].apply(np.abs)

    return dataframe
import os import subprocess import tempfile import nbformat import unittest from matminer.data_retrieval.retrieve_MP import MPDataRetrieval module_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'notebooks') citrine_key = os.environ.get("CITRINE_KEY") mpds_key = os.environ.get("MPDS_KEY") mp_key = MPDataRetrieval().mprester.api_key class NotebookExampleTest(unittest.TestCase): def test_intro_predicting_bulk_modulus(self): path = os.path.join(module_dir, "intro_predicting_bulk_modulus.ipynb") _notebook_run(path) @unittest.skipIf(citrine_key is None, "CITRINE_KEY env variable not set.") def test_experiment_vs_computed_bandgap(self): path = os.path.join(module_dir, "experiment_vs_computed_bandgap.ipynb") _notebook_run(path) @unittest.skipIf(not all([citrine_key, mpds_key, mp_key]), "data retrieval keys not set") def test_get_data(self):
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Set pandas view options
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from automatminer import MatPipe
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.data import PymatgenData
from pymatgen import Composition

# Set your MP API key in the environment rather than committing it to the
# source. When the variable is unset, None is passed on to MPDataRetrieval.
# NOTE(review): confirm PMG_MAPI_KEY is the variable name used in deployment.
api_key = os.environ.get("PMG_MAPI_KEY")
mpr = MPDataRetrieval(api_key)
# Second name kept for code that refers to `mpdr`; one client is enough.
mpdr = mpr

criteria = {'elasticity.K_VRH': {'$ne': None}}
properties = ['pretty_formula', 'spacegroup.symbol', 'elasticity.K_VRH',
              'elasticity.G_VRH', 'formation_energy_per_atom', 'band_gap',
              'e_above_hull', 'density', 'volume', 'nsites']
df = mpr.get_dataframe(criteria=criteria, properties=properties)

# Merge with the locally prepared table on their shared columns, then restore
# the material id as the index.
# NOTE(review): machine-specific absolute path -- consider making it configurable.
df1 = pd.read_csv(
    r'D:\FYP_files\database\data_after_processing\huizong\huizong.csv')
df = df.reset_index()
df = pd.merge(df, df1)
df = df.set_index("material_id")

# Keep only entries with a positive bulk modulus.
df = df[df['elasticity.K_VRH'] > 0]
NJOBS = args.njobs

# Print parameters.
print("REMOVE UNSTABLE ENTRIES:", FILTER)
print("USE FABER DATASET:", FABER)
print("USE TERNARY OXIDE DATASET:", not FABER)
print("NUMBER OF JOBS:", NJOBS)
print("DEBUG MODE:", args.debug)

# Set up dataset
if FABER:
    df = load_dataset("flla")
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    # Chemical-system wildcard: any two elements plus oxygen.
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = [
        'structure', 'nsites', 'formation_energy_per_atom', 'e_above_hull'
    ]
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. Iterating over the column
    # avoids integer lookups on a frame that is indexed by labels.
    df['structure'] = [Structure.from_dict(s) for s in df['structure']]
    # Filter the dataset if it consists of ternary oxides
This file makes the following benchmarking datasets: - phonons From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd # pd.set_option('display.height', 1000) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) mpdr = MPDataRetrieval() df = load_dataset("phonon_dielectric_mp") print(df) mpids = df["mpid"].tolist() dfe = mpdr.get_dataframe( criteria={"material_id": { "$in": mpids }}, properties=["e_above_hull", "formation_energy_per_atom", "material_id"], index_mpid=False) dfe = dfe.rename(columns={"material_id": "mpid"}) df = pd.merge(df, dfe, how='inner')
from pymatgen import MPRester
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd
import numpy as np
from tqdm import tqdm

# Widen pandas' console output so printed frames are not truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

chunksize = 1000

mpdr = MPDataRetrieval()
mpr = MPRester()


def chunks(l, n):
    """Generate consecutive slices of `l`, each holding at most `n` items."""
    yield from (l[start:start + n] for start in range(0, len(l), n))


# Ids and warnings of every entry with a formation energy below 2.5 per atom.
energy_filter = {"formation_energy_per_atom": {"$lt": 2.5}}
df = mpdr.get_dataframe(
    criteria=energy_filter,
    properties=["material_id", "warnings"],
    index_mpid=False,
)
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Plot experimental band gaps (Citrine) against DFT-computed band gaps
    (materialsproject.org) in an xy scatter plot, followed by a residual plot.

    For each chemical formula the computed gap is taken from the entry with
    the lowest energy above hull (the most stable structure).

    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't
            have a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)

    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """
    expt_col = 'Expt. gap'
    mp_col = 'MP computed gap'

    # --- experimental band gaps from Citrine ---
    citrine = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['chemicalFormula', 'Band gap']
    raw = citrine.get_dataframe(
        criteria={'data_type': 'experimental', 'max_results': limit},
        secondary_fields=True,
        properties=['Band gap'])
    expt = raw[cols].rename(
        columns={'chemicalFormula': 'Formula', 'Band gap': expt_col})
    # "p1" in 'In1p1' is not recognized in Composition
    expt = expt[expt['Formula'] != 'In1p1']
    # null band gaps cause problem when plotting residuals
    expt = expt.dropna()

    def _reduced_formula(formula):
        return Composition(formula).get_reduced_formula_and_factor()[0]

    expt['Formula'] = expt['Formula'].transform(_reduced_formula)

    # --- computed band gaps from the Materials Project ---
    formulas = list(expt['Formula'].values)
    computed = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': formulas}},
        properties=['pretty_formula', 'material_id', 'band_gap',
                    'e_above_hull'],
        index_mpid=False)
    computed = computed.rename(
        columns={'pretty_formula': 'Formula',
                 'band_gap': mp_col,
                 'material_id': 'mpid'})

    # keep, per formula, the row with the lowest e_above_hull
    most_stable_rows = computed.groupby("Formula")["e_above_hull"].idxmin()
    stable = computed.loc[most_stable_rows]

    merged = expt.merge(stable, on='Formula')
    merged = merged.drop('e_above_hull', axis=1)
    df_final = merged.set_index('mpid')

    pf = PlotlyFig(df_final,
                   x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap, with a dashed black y = x line
    parity_series = [(expt_col, mp_col), ([0, 12], [0, 12])]
    pf.xy(parity_series,
          lines=[{}, {'color': 'black', 'dash': 'dash'}],
          labels=['Formula', df_final.index],
          modes=['markers', 'lines'],
          names=['Computed vs. expt.', 'Expt. gap'])

    # residual:
    residuals = df_final[mp_col] - df_final[expt_col].astype(float)
    pf.set_arguments(x_title='Experimental band gap (eV)',
                     y_title='Residual (Computed - Expt.) Band Gap (eV)',
                     filename='band_gap_residuals')
    pf.xy((expt_col, residuals), labels=['Formula', df_final.index])