Example #1
0
    def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
        """
        Tests featurize_dos and featurize_bandstructure.

        The input dataframe is either re-downloaded with MPDataRetrieval and
        pickled, or loaded from the pickle stored in TEST_DIR. It is then run
        through AutoFeaturizer (preset "express") and selected output columns
        are compared against reference values.

        Args:
            refresh_df_init (bool): for developers, if the test need to be
                updated set to True. Otherwise set to False to make the final
                test independent of MPRester and faster.
            limit (int): the expected number of entries in the featurized
                dataframe.

        Returns (None):
        """
        target = "color"
        df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
        pickle_path = os.path.join(TEST_DIR, df_bsdos_pickled)
        if refresh_df_init:
            mpdr = MPDataRetrieval()
            df = mpdr.get_dataframe(criteria={"material_id": "mp-149"},
                                    properties=[
                                        "pretty_formula", "dos",
                                        "bandstructure",
                                        "bandstructure_uniform"
                                    ])
            df.to_pickle(pickle_path)
        else:
            df = pd.read_pickle(pickle_path)
        df = df.dropna(axis=0)
        # The uniform band structure takes over the "bandstructure" column;
        # the original "bandstructure" column is kept under a different name.
        df = df.rename(
            columns={
                "bandstructure_uniform": "bandstructure",
                "bandstructure": "line bandstructure"
            })
        df[target] = [["red"]]
        n_cols_init = df.shape[1]

        featurizer = AutoFeaturizer(preset="express",
                                    ignore_errors=False,
                                    multiindex=False)
        df = featurizer.fit_transform(df, target)

        # sanity checks
        # assertEqual, not assertTrue: assertTrue(len(df), limit) would use
        # ``limit`` as the failure message and pass for any non-empty frame.
        self.assertEqual(len(df), limit)
        self.assertGreater(len(df.columns), n_cols_init)

        # DOSFeaturizer:
        self.assertEqual(df["cbm_character_1"][0], "p")

        # DopingFermi:
        self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

        # Hybridization:
        self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
        self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
        self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

        # BandFeaturizer:
        self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
        self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

        # BranchPointEnergy:
        self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
Example #2
0
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Compare experimental band gaps from Citrine with DFT band gaps from the
        Materials Project (materialsproject.org) in an xy scatter plot,
        followed by a residual plot. For each chemical formula the computed
        gap is taken from the entry with the lowest energy above hull
        (the most stable structure).
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        None. Two plots are produced through PlotlyFig (filenames
        'band_gaps' and 'band_gap_residuals').
    """

    def reduced_formula(formula):
        # normalise e.g. a multiple of the formula unit to its reduced form
        return Composition(formula).get_reduced_formula_and_factor()[0]

    # experimental band gaps from Citrine
    citrine = CitrineDataRetrieval(api_key=citrine_api_key)
    expt_raw = citrine.get_dataframe(
        prop='band gap', data_type='experimental',
        show_columns=['chemicalFormula', 'Band gap'], max_results=limit)
    df_ct = expt_raw.rename(
        columns={'chemicalFormula': 'Formula', 'Band gap': 'Expt. gap'})
    # 'In1p1' is dropped because p1 is not recognized in Composition
    df_ct = df_ct[df_ct['Formula'] != 'In1p1']
    # null band gaps cause problems when plotting residuals
    df_ct = df_ct.dropna()
    df_ct['Formula'] = df_ct['Formula'].transform(reduced_formula)

    # computed band gaps from the Materials Project for the same formulas
    mp_raw = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': list(df_ct['Formula'].values)}},
        properties=['pretty_formula', 'material_id', 'band_gap', 'e_above_hull'],
        index_mpid=False)
    df = mp_raw.rename(
        columns={'pretty_formula': 'Formula', 'band_gap': 'MP computed gap',
                 'material_id': 'mpid'})

    # keep, per formula, the row with the smallest energy above hull
    most_stable = df.groupby("Formula")["e_above_hull"].idxmin()
    df_mp = df.loc[most_stable]
    merged = df_ct.merge(df_mp, on='Formula')
    df_final = merged.drop('e_above_hull', axis=1).set_index('mpid')

    pf = PlotlyFig(df_final, x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap, plus a dashed y = x reference line
    parity_series = [
        ('Expt. gap', 'MP computed gap'),
        ([0, 12], [0, 12])
    ]
    pf.xy(parity_series,
          lines=[{}, {'color': 'black', 'dash': 'dash'}],
          labels=df_final.index, modes=['markers', 'lines'],
          names=['Computed vs. expt.', 'Expt. gap'])

    # residual (computed minus experimental) against the experimental gap
    expt_gap = df_final['Expt. gap'].astype(float)
    residuals = df_final['MP computed gap'] - expt_gap
    pf.set_arguments(x_title='Experimental band gap (eV)',
                     y_title='Residual (Computed - Expt.) Band Gap (eV)',
                     filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals), labels=df_final.index)
Example #3
0
class MPDataRetrievalTest(unittest.TestCase):
    """Exercises MPDataRetrieval.get_dataframe with a single-material query."""

    def setUp(self):
        # mapi_key is expected to be defined at module level by the test file.
        self.mpdr = MPDataRetrieval(mapi_key)

    def test_get_data(self):
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure"])
        # Without an assertion the test could only fail on an exception;
        # one material id was requested, so one row is expected back.
        self.assertEqual(len(df["structure"]), 1)
class MPDataRetrievalTest(unittest.TestCase):
    """Exercises MPDataRetrieval.get_dataframe with a single-material query."""

    def setUp(self):
        # mapi_key is expected to be defined at module level by the test file.
        self.mpdr = MPDataRetrieval(mapi_key)

    def test_get_data(self):
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure"])
        # Without an assertion the test could only fail on an exception;
        # one material id was requested, so one row is expected back.
        self.assertEqual(len(df["structure"]), 1)
Example #5
0
class MPDataRetrievalTest(unittest.TestCase):
    """Checks a single-material structure query; skipped without an API key."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        # Guard first: the query cannot run when no API key was picked up.
        if not self.mpdr.mprester.api_key:
            raise SkipTest(
                "Skipped MPDataRetrieval test; no MAPI_KEY detected")

        query = {"material_id": "mp-23"}
        df = self.mpdr.get_dataframe(criteria=query,
                                     properties=["structure"])
        self.assertEqual(len(df["structure"]), 1)
Example #6
0
class MPDataRetrievalTest(unittest.TestCase):
    """Checks the row count and object types returned for material mp-23."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure",
                                                 "bandstructure",
                                                 "bandstructure_uniform",
                                                 "dos"])
        self.assertEqual(len(df["structure"]), 1)
        self.assertEqual(df["bandstructure"][0].get_band_gap()["energy"], 0)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only reports "False is not true".
        self.assertIsInstance(df["bandstructure"][0], BandStructureSymmLine)
        self.assertIsInstance(df["bandstructure_uniform"][0], BandStructure)
        self.assertIsInstance(df["dos"][0], CompleteDos)
Example #7
0
class MPDataRetrievalTest(unittest.TestCase):
    """Checks the row count and object types returned for material mp-23."""

    def setUp(self):
        self.mpdr = MPDataRetrieval()

    def test_get_data(self):
        df = self.mpdr.get_dataframe(criteria={"material_id": "mp-23"},
                                     properties=["structure",
                                                 "bandstructure",
                                                 "bandstructure_uniform",
                                                 "dos"])
        self.assertEqual(len(df["structure"]), 1)
        self.assertEqual(df["bandstructure"][0].get_band_gap()["energy"], 0)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only reports "False is not true".
        self.assertIsInstance(df["bandstructure"][0], BandStructureSymmLine)
        self.assertIsInstance(df["bandstructure_uniform"][0], BandStructure)
        self.assertIsInstance(df["dos"][0], CompleteDos)
Example #8
0
"""

from pymatgen import MPRester
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd
import numpy as np
from tqdm import tqdm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

chunksize = 1000

mpdr = MPDataRetrieval()
mpr = MPRester()


def chunks(l, n):
    """Lazily yield consecutive slices of ``l``, each at most ``n`` long.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))


df = mpdr.get_dataframe(criteria={
    "e_above_hull": {
        "$lt": 0.150
    },
    "formation_energy_per_atom": {
        "$lt": 0.150
Example #9
0
    - elasticity_G_VRH
    - elasticity_log10(G_VRH)

From matminer's dataset library.
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

mpdr = MPDataRetrieval()

df = mpdr.get_dataframe(
    criteria={
        "e_above_hull": {
            "$lt": 0.150
        },
        "formation_energy_per_atom": {
            "$lt": 0.150
        },
        "elasticity": {
            "$exists": 1,
            "$ne": None
        },
    },
    # "elements": },
Example #10
0
 def setUp(self):
     """Create the MPDataRetrieval instance under test, passing ``mapi_key``."""
     self.mpdr = MPDataRetrieval(mapi_key)
Example #11
0
def query_data(pname, api_key, path=''):
    """Query Materials Project for a property plus structures and pickle them.

    Args:
        pname (str): name of the property to query. Only entries where the
            property exists and whose ``<pname>.warnings`` field is null
            are requested.
        api_key (str): API key passed to ``MPDataRetrieval``.
        path (str): directory for the output pickle. The default ``''``
            writes ``<pname>.pkl`` into the current working directory.

    Returns:
        pandas.DataFrame: the property columns ('pretty_formula', ``pname``,
        'e_above_hull') joined column-wise with a 'structure' column.
    """
    import os  # local import; only needed to build the output path

    mpdr = MPDataRetrieval(api_key)

    # query properties
    # (an extra criterion such as "elements": {"$all": ["Li", "Fe", "O"]}
    # can be added here to restrict the chemistry)
    props = mpdr.get_dataframe(
        criteria={pname: {"$exists": True},
                  "{}.warnings".format(pname): None},
        properties=['pretty_formula', pname, 'e_above_hull'])
    print("There are {} entries satisfying criteria".format(props[pname].count()))

    # Structures are fetched in batches of ids rather than in one request.
    chunk_size = 1000
    mp_ids = props.index.tolist()
    sublists = [mp_ids[i:i + chunk_size]
                for i in range(0, len(mp_ids), chunk_size)]

    # query structures; collect the batches and concatenate once
    # (DataFrame.append no longer exists in pandas >= 2.0)
    frames = [
        mpdr.get_dataframe({"material_id": {"$in": sublist}}, ['structure'])
        for sublist in sublists
    ]
    if frames:
        structures = pd.concat(frames)
    else:
        structures = pd.DataFrame(columns=['structure'])

    data = pd.concat([props, structures], axis=1)

    # os.path.join keeps an empty ``path`` relative; the previous
    # '%s/%s.pkl' formatting produced '/<pname>.pkl' (filesystem root).
    fname = os.path.join(path, '%s.pkl' % pname)

    data.to_pickle(fname)
    print('Saved file to ', fname)

    return data

def filter_data(df, elems, pname, pmin=None, pmax=None, stab=None):
    '''Filter data by chemistry, property range and stability.

    Args:
        df (pandas.DataFrame): must contain 'pretty_formula', ``pname`` and
            (when ``stab`` is given) 'e_above_hull' columns.
        elems (iterable of str): patterns that must ALL occur in
            'pretty_formula' for a row to be kept.
        pname (str): column the ``pmin``/``pmax`` bounds apply to.
        pmin: inclusive lower bound on ``df[pname]``; ``None`` disables it.
        pmax: inclusive upper bound on ``df[pname]``; ``None`` disables it.
        stab: inclusive upper bound on 'e_above_hull'; ``None`` disables it.

    Returns:
        pandas.DataFrame: the rows that pass every enabled filter.
    '''
    print('# entries before filters: ', len(df))

    # filter by chemistry
    # NOTE(review): this is a substring match, so 'C' also matches 'Ca' or
    # 'Cl' -- confirm whether whole-symbol matching is intended.
    keep = np.ones(len(df), dtype=bool)
    for item in elems:
        matches = df['pretty_formula'].str.contains(item, na=False)
        keep &= np.asarray(matches, dtype=bool)
    df = df[keep]
    print('# entries after chemistry: ', len(df))

    # filter by property values
    # ``is not None`` rather than truthiness, so a bound of 0 is applied.
    if pmin is not None:
        df = df[df[pname] >= pmin]
    if pmax is not None:
        df = df[df[pname] <= pmax]
    print('# entries after property: ', len(df))

    # filter by stability (stab=0 keeps only entries with e_above_hull <= 0)
    if stab is not None:
        df = df[df['e_above_hull'] <= stab]
    print('# entries after stability: ', len(df))

    return df

def get_xy(df, elems, pname, pmin, pmax, stab):
    '''Get the feature matrix x and target vector y from the data.

    Rows containing NaNs are dropped, the remaining rows are passed through
    ``filter_data`` and sorted by index. ``x`` is the frame without the
    non-input columns; ``y`` holds the values of ``df[pname]`` in the same
    row order.

    Returns:
        tuple: ``(x, y)`` with ``x`` a pandas.DataFrame and ``y`` an array.
    '''
    # filter NaNs and entries based on criteria
    df = df.dropna()
    df = filter_data(df, elems, pname, pmin=pmin, pmax=pmax, stab=stab)

    # exclude non-input columns
    exclude = ['pretty_formula', pname, 'e_above_hull', 'structure',
               'composition', 'composition_oxid',
               'radial distribution function']

    # Sort once so x and y are guaranteed to share the same row order.
    df = df.sort_index()

    # errors='ignore': some of these columns are optional (e.g. the RDF
    # column only exists when it was featurized), and a plain drop raises
    # KeyError for any label that is not present.
    x = df.drop(exclude, axis=1, errors='ignore')
    y = df[pname].values

    return x, y

def fit_forest(x, y, lbl='Full'):
    """Grid-search a random forest regressor and report its errors.

    The data is split 80/20 (fixed ``random_state=42``), hyper-parameters are
    chosen by 5-fold cross-validated grid search on the training split, and a
    forest with the best parameters is then refit on the training split.

    Args:
        x: feature matrix.
        y: target values.
        lbl (str): label used as a prefix in the printed error lines.

    Returns:
        RandomForestRegressor: the forest fit with the best parameters.
    """
    # split data
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # grid-search optimal parameters
    rf = RandomForestRegressor()
    param_grid = {
        'n_estimators': [10, 25, 50, 100, 250],
        # 1.0 (all features) is what 'auto' meant for regressors; the
        # 'auto' string itself is rejected by current scikit-learn.
        'max_features': [1.0, 'sqrt', 'log2'],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 5],
    }
    grid = GridSearchCV(rf, param_grid, n_jobs=-1, cv=5)
    grid.fit(x_train, y_train)

    print(grid.best_score_)
    print(grid.best_params_)
    print(grid.score(x_test, y_test))

    # use optimal parameters
    rf.set_params(**grid.best_params_)
    rf.fit(x_train, y_train)

    y_hat_train = rf.predict(x_train)
    y_hat_test = rf.predict(x_test)

    # Reported errors are mean absolute errors divided by the mean target,
    # i.e. relative rather than absolute quantities.
    mae_train = np.mean(abs(y_hat_train - y_train)) / np.mean(y_train)
    print('%s RF, train error: %.3f' % (lbl, mae_train))

    mae_test = np.mean(abs(y_hat_test - y_test)) / np.mean(y_test)
    print('%s RF, test error : %.3f' % (lbl, mae_test))

    return rf

def fit_model(x, y, show_flag=False):
    """Fit a full random forest, prune weak features, and refit.

    Features whose importance in the full forest is below half the median
    importance are dropped before the second fit. With ``show_flag`` set, a
    bar chart of the top importances and a parity plot of the pruned model
    are shown.

    Returns:
        The forest fit on the pruned feature set.
    """
    nvar = 10  # number of top features to print / plot

    # first pass: every column of x
    print('Fitting full random forest...')
    full_forest = fit_forest(x, y, lbl='Full')

    # rank features from most to least important
    imp = full_forest.feature_importances_
    ranking = np.argsort(imp)[::-1]
    top_names = x.columns.values[ranking][0:nvar]
    print('%d most important variables:' % nvar)
    print(top_names)

    # drop everything below half the median importance
    weak = imp < 0.5 * np.median(imp)
    x_sel = x.drop(list(x.columns.values[weak]), axis=1)

    # second pass: surviving columns only
    print('\nFitting pruned random forest...')
    rf = fit_forest(x_sel, y, lbl='Pruned')

    print('%d pruned variables:' % len(x_sel.columns))
    print(x_sel.columns.values)

    if not show_flag:
        return rf

    # left panel: importances of the top features (from the full forest)
    plt.subplot(121)
    plt.bar(x=top_names, height=imp[ranking][0:nvar], color=(0.3, 0.3, 0.9))
    plt.xticks(top_names, top_names, rotation='vertical')
    plt.xlabel('Variables')
    plt.ylabel('Importance')

    # right panel: parity plot of the pruned forest
    parity_ax = plt.subplot(122)
    parity_ax.set_aspect(1)

    predictions = rf.predict(x_sel)
    plt.scatter(y, predictions, marker='s', alpha=.25, c=(0.9, 0.3, 0.3))
    plt.plot(np.arange(np.max(y)), c='gray')
    plt.xlabel('Ground truth')
    plt.ylabel('RF prediction')

    plt.subplots_adjust(bottom=0.25, top=0.75)
    plt.draw()
    plt.show()

    return rf

def _weighted_average(elems, fracs, getter):
    """Return sum(getter(el) * frac), or NaN when a value is not numeric."""
    try:
        return sum([getter(el) * fr for (el, fr) in zip(elems, fracs)])
    except TypeError:
        # Raised when an element has no usable value for this property.
        return float('nan')


def add_atom_feats(df):
    """Add composition-weighted elemental averages as new columns.

    For every row the 'pretty_formula' string is parsed into a Composition
    and each elemental property is averaged using the fractional composition
    as weights. A property that cannot be averaged for a row is stored as
    NaN.

    Args:
        df (pandas.DataFrame): must contain a 'pretty_formula' column; the
            frame is modified in place.

    Returns:
        tuple: ``(df, feat_labels)`` -- the same frame with the added
        columns and the list of added column names.
    """
    # (column label, how to read the property from an element), in the
    # order the columns are added and reported.
    feature_getters = [
        ('avg row', lambda el: el.row),
        ('avg column', lambda el: el.group),
        ('avg num', lambda el: el.number),
        ('avg el-neg', lambda el: el.X),
        ('avg atom mass', lambda el: el.data['Atomic mass']),
        ('avg atom radius', lambda el: el.data['Atomic radius']),
        ('avg ionic radius', lambda el: el.average_ionic_radius),
    ]
    feat_labels = [label for label, _ in feature_getters]
    values = {label: [] for label in feat_labels}

    # loop through entries
    for _, row in df.iterrows():
        comp = Composition(row['pretty_formula'])
        elem, fracs = zip(*comp.fractional_composition.items())
        for label, getter in feature_getters:
            values[label].append(_weighted_average(elem, fracs, getter))

    for label in feat_labels:
        df[label] = pd.Series(values[label], index=df.index)

    return df, feat_labels

def add_cs_features(df, rdf_flag=False):
    """Append composition- and structure-based feature columns to ``df``.

    Builds 'composition' and 'composition_oxid' columns from
    'pretty_formula', converts the 'structure' column with
    ``dict_to_object``, then runs the ValenceOrbital, OxidationStates and
    DensityFeatures featurizers. The radial distribution function is only
    computed when ``rdf_flag`` is true.

    Returns:
        The featurized dataframe.
    """
    # conversion columns the featurizers operate on
    formulas = df["pretty_formula"]
    df["composition"] = str_to_composition(formulas)
    df["composition_oxid"] = composition_to_oxidcomposition(df["composition"])
    df["structure"] = dict_to_object(df["structure"])

    # (featurizer class, input column), applied in this order
    pipeline = (
        (ValenceOrbital, "composition"),
        (OxidationStates, "composition_oxid"),
        (DensityFeatures, "structure"),
    )
    for featurizer_cls, column in pipeline:
        df = featurizer_cls().featurize_dataframe(df, column)

    if rdf_flag:
        radial = RadialDistributionFunction(cutoff=15.0, bin_size=0.2)
        df = radial.featurize_dataframe(df, "structure")

    return df
Example #12
0
"""
This file makes the following benchmarking datasets:
    - castelli

From matminer's dataset library.
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

import pandas as pd

# pd.set_option('display.height', 1000)
# Widen the pandas console output so the printed frame is not truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

mpdr = MPDataRetrieval()

# Keep only the structure and formation-energy columns, renumbered from 0.
df = load_dataset("castelli_perovskites")[["structure", "e_form"]]
df = df.reset_index(drop=True)

print(df)
df.to_pickle("castelli.pickle.gz")
def featurize_by_material_id(
        material_ids: np.array,
        featurizerObject: featurizer.extendedMODFeaturizer,
        MAPI_KEY: str,
        writeToFile: bool = True) -> pd.DataFrame:
    """ Download Materials Project data for each id and featurize it.

    The ids are processed in portions of ``steps`` entries. Each portion is
    downloaded with ``MPDataRetrieval`` and passed to
    ``featurizerObject.featurize``; a failing full portion is retried until
    it succeeds.

    Arguments:
        material_ids: ids to query; matched against the ``task_id`` field.
        featurizerObject: object whose ``featurize(df)`` returns a
            ``(timing_dataframe, featurized_dataframe)`` pair.
        MAPI_KEY: API key passed to ``MPDataRetrieval``.
        writeToFile: when True, the accumulated results are written to
            ``data/raw/featurizer/`` (relative to the directory two levels
            above this file's directory) after every portion.
    Returns:
        The featurized DataFrame.
    """
    def apply_featurizers(criterion, properties, mpdr, featurizerObject):
        LOG.info("Downloading dos and bandstructure objects..")

        timeDownloadStart = time.time()
        df_portion = mpdr.get_dataframe(criteria=criterion,
                                        properties=properties)
        timeDownloadEnd = time.time()

        LOG.info(df_portion)
        df_time, df_portion = featurizerObject.featurize(df_portion)
        df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]

        return df_time, df_portion

    def write_results(df, df_timers):
        # Single place for the output location used by both code paths.
        out_dir = (Path(__file__).resolve().parents[2] / "data" / "raw" /
                   "featurizer")
        df.to_pickle(out_dir / "featurized.pkl")
        df_timers.to_csv(out_dir / "timing.csv")

    properties = [
        "material_id", "full_formula", "bandstructure", "dos", "structure"
    ]

    mpdr = MPDataRetrieval(MAPI_KEY)

    steps = 1
    leftover = len(material_ids) % steps

    df = pd.DataFrame({})
    df_timers = pd.DataFrame({})

    for i in tqdm(range(0, len(material_ids), steps)):
        if i + steps > len(material_ids):
            # Incomplete final portion; handled after the loop.
            continue

        portion_ids = material_ids[i:i + steps]
        LOG.info(list(portion_ids))
        criteria = {"task_id": {"$in": list(portion_ids)}}

        # Retry until the portion is returned. ``except Exception`` (not a
        # bare ``except``) so Ctrl-C / SystemExit can still stop the loop.
        while True:
            try:
                df_time, df_portion = apply_featurizers(
                    criteria, properties, mpdr, featurizerObject)
                break
            except Exception:
                LOG.info("Except - try again.")

        # Add ID to recognize afterwards
        df_portion["material_id"] = portion_ids

        df = pd.concat([df, df_portion])
        df_timers = pd.concat([df_timers, df_time])

        LOG.info("CURRENT SHAPE:{}".format(df.shape))
        if writeToFile:
            write_results(df, df_timers)

    if leftover:
        # ``i`` still holds the start index of the incomplete last portion.
        portion_ids = material_ids[i:i + leftover]
        LOG.info(list(portion_ids))
        criteria = {"task_id": {"$in": list(portion_ids)}}
        df_time, df_portion = apply_featurizers(criteria, properties, mpdr,
                                                featurizerObject)
        df_portion["material_id"] = portion_ids

        df = pd.concat([df, df_portion])
        df_timers = pd.concat([df_timers, df_time])
        if writeToFile:
            write_results(df, df_timers)

    return df
# Print parameters.
print("REMOVE UNSTABLE ENTRIES:", FILTER)
print("USE FABER DATASET:", FABER)
print("USE TERNARY OXIDE DATASET:", not FABER)
print("NUMBER OF JOBS:", NJOBS)
print("DEBUG MODE:", args.debug)


# Set up dataset
if FABER:
    df = load_flla()
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = ['structure', 'nsites', 'formation_energy_per_atom', 'e_above_hull']
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. Series.map keeps the index
    # and avoids integer lookups (series[i]) on a non-integer index, which
    # rely on a deprecated positional fallback.
    df['structure'] = df['structure'].map(Structure.from_dict)
    # Keep only entries close to the hull and with at most 30 sites
    df = df[df['e_above_hull'] < 0.1]
    df = df[df['nsites'] <= 30]
Example #15
0
Regenerating from the newest Materials Project calculations
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from pymatgen import Element

import pandas as pd
import numpy as np

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

mpdr = MPDataRetrieval()

# df = load_dataset("dielectric_constant")

df = mpdr.get_dataframe(
    criteria={"has": "diel"},
    properties=[
        "material_id",
        "diel.n",
        "formation_energy_per_atom",
        "e_above_hull",
        "structure",
    ],
    index_mpid=False,
)
df = df[(df["e_above_hull"] < 0.150)
Example #16
0
import numpy as np
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
from pydash import py_
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
mpdr = MPDataRetrieval(api_key='3AdDSGEqlThTHVeu')


def Retrieve_data(bg_lower, bg_upper, raw_name):
    """
    Downloads data from the MPD

    Parameters
    ----------
    bg_lower : Int or float
      Lower bound of bandgap for the materials to be collected
    bg_upper : Int or float
      Upper bound of bandgap for the materials to be collected
    raw_name : Str
      Desired file name for raw data
    """

    properties = ['material_id', 'xrd.Cu', 'band_gap', 'efermi']
    criteria = {
        "band_gap": {
            '$gt': bg_lower,
            '$lt': bg_upper
        },
        "efermi": {
            '$exists': True
Example #17
0
 def setUp(self):
     """Create the MPDataRetrieval instance under test with default arguments."""
     self.mpdr = MPDataRetrieval()
Example #18
0
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from pymatgen.electronic_structure.plotter import BSDOSPlotter
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.data_retrieval.retrieve_MDF import MDFDataRetrieval

mpdr = MPDataRetrieval()

df = mpdr.get_dataframe(criteria={"nelements": 1},
                        properties=['density', 'pretty_formula'])
print("There are {} entries on MP with 1 element".format(
    df['density'].count()))
print(df.head())
df = mpdr.get_dataframe({"band_gap": {
    "$gt": 4.0
}}, ['pretty_formula', 'band_gap'])
print("There are {} entries on MP with a band gap larger than 4.0".format(
    df['band_gap'].count()))
df.to_csv('gt4.csv')
df = mpdr.get_dataframe(
    {
        "elasticity": {
            "$exists": True
        },
        "elasticity.warnings": []
    }, ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'])
print("There are {} elastic entries on MP with no warnings".format(
    df['elasticity.K_VRH'].count()))
df = mpdr.get_dataframe(
    criteria={
        "elasticity": {
            "$exists": True
Example #19
0
import matminer
import pymatgen
import pandas

from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

# NOTE(review): an API key is hard-coded here and therefore committed with
# the source. Load it from the environment or local configuration instead
# and rotate this key.
df_mp = MPDataRetrieval("y6hicvzKBaLRWuG8").get_dataframe(
    criteria={"task_id": {
        "$in": ["mp-22862"]
    }}, properties=["structure"])
print(type(df_mp))  # pandas dataframe
print(
    df_mp.iloc[0]
)  # outputs the following: structure    [[0. 0. 0.] Na, [2.32362417 1.64305041 4.02463...
#                       Name: mp-22862, dtype: object
# The row above is still a pandas Series, so go one level deeper in df_mp.
# iloc[0, 0] addresses the cell by position directly; integer indexing on
# the row Series (iloc[0][0]) relies on a deprecated positional fallback.
first_structure = df_mp.iloc[0, 0]
print(first_structure)
# Inspect the type of the stored object (expected to be a pymatgen Structure)
print(type(first_structure))
# this is a data format we can create
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Set pandas view options
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.model_selection import train_test_split
from automatminer import MatPipe
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
# mpr=MPDataRetrieval()
# mpdr=MPDataRetrieval()
api_key = 'x3NlvC67Z9tPykwGz'
# Set your MP API key here.
mpr = MPDataRetrieval(api_key)
# api_key = None   # Set your MP API key here.
# mpr = MPDataRetrieval(api_key)
df = mpr.get_dataframe(
    {
        "elasticity": {
            "$exists": True
        },
        "elasticity.warnings": []
    }, ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH'])
#/ criteria = {'elasticity.K_VRH': {'$ne': None}}
#/ properties = ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']
# get the data
# df=mpr.get_dataframe(criteria=criteria, properties=properties)
# Filter out unstable entries and negative bulk moduli
df = df[df['elasticity.K_VRH'] > 0]
Example #21
0
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

plt.rcParams["figure.figsize"] = [6, 7]
font = {'family': 'Avenir', 'weight': 'normal', 'size': 26}
math_font = 'stixsans'
plt.rc('font', **font)
plt.rcParams['mathtext.fontset'] = math_font
plt.rcParams['axes.labelsize'] = font['size']
plt.rcParams['xtick.labelsize'] = font['size'] - 2
plt.rcParams['ytick.labelsize'] = font['size'] - 2
plt.rcParams['legend.fontsize'] = font['size'] - 2.5

mat_api_key = 'YourPymatgenAPI'
mpdr = MPDataRetrieval(mat_api_key)

df_terqua = mpdr.get_dataframe(criteria={
    'nsites': {
        '$lt': 41
    },
    'e_above_hull': {
        '$lt': 0.08
    },
    'nelements': {
        '$gt': 2,
        '$lt': 5
    },
},
                               properties=[
                                   'material_id',
Example #22
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# UROP Phase 1 Data Retrieval and partial preprocessing
# all data is from material project database.

from matminer.datasets import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
mpdr = MPDataRetrieval(api_key="KcDv6qi5w4rUZSlt")
d = load_dataset("heusler_magnetic")
heusler_formula = d['formula']

# In[ ]:

import pandas as pd
import time

query_time = list()
false_list = list()
true_list = list()
data = list()
heusler_matrix = pd.DataFrame()
start_time = time.time()
for name in heusler_formula:
    t_1 = time.time()
    data_got = mpdr.get_data(
        criteria=name,
        properties=['pretty_formula', 'structure', 'elasticity'])
    t_2 = time.time()
Example #23
0
def generate_mp(max_nsites=None, properties=None, write_to_csv=False,
                write_to_compressed_json=True):
    """
    Grabs all mp materials. This will return two csv/json.gz files:
        * mp_nostruct: All MP materials, not including structures
        * mp_all: All MP materials, including structures

    The query is issued once per number of sites so each request stays
    small; a request that fails with MPRestError is repeated after 5 s.

    Args:
        max_nsites (int): The maximum number of sites to include in the query.
            If None, sites 1..100 are queried individually followed by one
            query for everything above 100 sites.

        properties (iterable of strings): list of properties supported by
            MPDataRetrieval

        write_to_csv (bool): whether to write resulting dataframe to csv

        write_to_compressed_json (bool): whether to write resulting
            dataframe to json.gz file

    Returns (pandas.DataFrame):
        retrieved/generated data. When any file was written, the returned
        frame is the one without the structure columns; otherwise it is the
        full frame.
    """

    # Set default properties if None and ensure is a list
    if properties is None:
        properties = ['pretty_formula', 'e_above_hull', 'band_gap',
                      'total_magnetization', 'elasticity.elastic_anisotropy',
                      'elasticity.K_VRH', 'elasticity.G_VRH', 'structure',
                      'energy', 'energy_per_atom', 'formation_energy_per_atom']
    elif not isinstance(properties, list):
        properties = list(properties)

    # Pick columns to drop structure data from
    drop_cols = [col_name
                 for col_name in ("structure", "initial_structure")
                 if col_name in properties]

    mpdr = MPDataRetrieval()
    if max_nsites is not None:
        sites_list = list(range(1, max_nsites + 1))
    else:
        sites_list = list(range(1, 101)) + [{"$gt": 100}]

    responses = []
    for site_specifier in tqdm(sites_list, desc="Querying Materials Project"):
        # While loop to repeat queries if server request fails
        while True:
            try:
                site_response = mpdr.get_dataframe(
                    criteria={"nsites": site_specifier},
                    properties=properties, index_mpid=True
                )
                break

            except MPRestError:
                tqdm.write("Error querying materials project, "
                           "trying again after 5 sec")
                sleep(5)

        responses.append(site_response)

    # One concat instead of repeated DataFrame.append (removed in
    # pandas 2.0, and quadratic in the number of responses).
    df = pd.concat(responses) if responses else pd.DataFrame()

    tqdm.write("DataFrame with {} entries created".format(len(df)))

    if not (write_to_csv or write_to_compressed_json):
        return df

    # Build the structure-free view once, without touching ``df``. Dropping
    # in place in the csv branch used to strip the structures from
    # "mp_all.json.gz" and made the second drop raise KeyError when both
    # output formats were requested.
    df_nostruct = df.drop(drop_cols, axis=1)

    # Write data out to file if user so chooses
    if write_to_csv:
        df.to_csv("mp_all.csv")
        df_nostruct.to_csv("mp_nostruct.csv")

    if write_to_compressed_json:
        store_dataframe_as_json(df, "mp_all.json.gz", compression="gz")
        store_dataframe_as_json(df_nostruct, "mp_nostruct.json.gz",
                                compression="gz")

    return df_nostruct
 def setUp(self):
     """Create the MPDataRetrieval instance under test, passing ``mapi_key``."""
     self.mpdr = MPDataRetrieval(mapi_key)
Example #25
0
 def setUp(self):
     """Create the MPDataRetrieval instance under test with default arguments."""
     self.mpdr = MPDataRetrieval()
Example #26
0
File: data.py Project: PV-Lab/FTCP
def data_query(mp_api_key,
               max_elms=3,
               min_elms=3,
               max_sites=20,
               include_te=False):
    """
    Query crystal data from Materials Project.

    Only entries with an energy above hull of at most 0.08 eV/atom are
    requested, restricted by the element-count and site-count bounds below.

    Parameters
    ----------
    mp_api_key : str
        The API key for Materials Project.
    max_elms : int, optional
        Maximum number of components/elements for crystals to be queried.
        The default is 3.
    min_elms : int, optional
        Minimum number of components/elements for crystals to be queried.
        The default is 3.
    max_sites : int, optional
        Maximum number of sites for crystals to be queried.
        The default is 20.
    include_te : bool, optional
        If True, thermoelectric properties are read from
        'data/thermoelectric_prop.csv', joined onto the queried dataframe
        by index, and the 'Seebeck' column is replaced by its absolute
        value. The default is False.

    Returns
    -------
    dataframe : pandas dataframe
        Dataframe returned by MPDataRetrieval with an added 'ind' column
        holding the row position, plus the thermoelectric columns when
        ``include_te`` is True.

    """
    mpdr = MPDataRetrieval(mp_api_key)
    # Specify query criteria in MongoDB style
    query_criteria = {
        'e_above_hull': {
            '$lte': 0.08
        },  # eV/atom
        'nelements': {
            '$gte': min_elms,
            '$lte': max_elms
        },
        'nsites': {
            '$lte': max_sites
        },
    }
    # Specify properties to be queried; the available properties are listed at
    # https://github.com/materialsproject/mapidoc/tree/master/materials
    query_properties = [
        'material_id', 'formation_energy_per_atom', 'band_gap',
        'pretty_formula', 'e_above_hull', 'elements', 'cif',
        'spacegroup.number'
    ]
    # Obtain queried dataframe containing CIFs and ground-state property labels
    dataframe = mpdr.get_dataframe(
        criteria=query_criteria,
        properties=query_properties,
    )
    # Positional row counter; assigned once, before any optional join, so it
    # reflects the order in which the query returned the entries.
    dataframe['ind'] = np.arange(len(dataframe))

    if include_te:
        # Read thermoelectric properties from
        # https://datadryad.org/stash/dataset/doi:10.5061/dryad.gn001
        te = pd.read_csv('data/thermoelectric_prop.csv', index_col=0)
        te = te.dropna()
        # Get compound index that has both ground-state and thermoelectric
        # properties
        ind = dataframe.index.intersection(te.index)
        # Concatenate thermoelectric properties to corresponding compounds;
        # only rows of `te` present in the query result are attached.
        dataframe = pd.concat([dataframe, te.loc[ind, :]], axis=1)
        dataframe['Seebeck'] = dataframe['Seebeck'].apply(np.abs)

    return dataframe
Example #27
0
import os
import subprocess
import tempfile

import nbformat
import unittest

from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

# Path to the 'notebooks' directory one level above this file's directory.
module_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
                          'notebooks')

# Credentials used by the skip conditions below. The Citrine and MPDS keys are
# read from environment variables (None when unset); the Materials Project key
# is taken from the MPRester held by a default-constructed MPDataRetrieval.
citrine_key = os.environ.get("CITRINE_KEY")
mpds_key = os.environ.get("MPDS_KEY")
mp_key = MPDataRetrieval().mprester.api_key


class NotebookExampleTest(unittest.TestCase):
    def test_intro_predicting_bulk_modulus(self):
        """Pass the intro bulk-modulus notebook to ``_notebook_run``."""
        notebook = "intro_predicting_bulk_modulus.ipynb"
        _notebook_run(os.path.join(module_dir, notebook))

    @unittest.skipIf(citrine_key is None, "CITRINE_KEY env variable not set.")
    def test_experiment_vs_computed_bandgap(self):
        """Pass the experiment-vs-computed band gap notebook to ``_notebook_run``.

        Skipped when ``citrine_key`` is None.
        """
        notebook = "experiment_vs_computed_bandgap.ipynb"
        _notebook_run(os.path.join(module_dir, notebook))

    @unittest.skipIf(not all([citrine_key, mpds_key, mp_key]),
                     "data retrieval keys not set")
    def test_get_data(self):
import warnings
# Silence every warning for the remainder of the script.
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Set pandas view options
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.model_selection import train_test_split
from automatminer import MatPipe
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.data import PymatgenData
from pymatgen import Composition
# Client created without an explicit key; `mpr` is rebound below once
# `api_key` is defined.
mpr=MPDataRetrieval()

# NOTE(review): the API key is hard-coded in source; consider loading it from
# an environment variable or the pymatgen configuration instead.
api_key = 'x3NlvC67Z9tPykwGz'   
# Set your MP API key here. 
mpr = MPDataRetrieval(api_key)  
# NOTE(review): second client with the same key; in the visible part of the
# script it is only referenced by the commented-out query below.
mpdr=MPDataRetrieval(api_key)
# df = mpdr.get_dataframe({"elasticity": {"$exists": True}, "elasticity.warnings": []},
                        # ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']) 
# Request every entry whose elasticity.K_VRH is not null.
criteria = {'elasticity.K_VRH': {'$ne': None}}
properties = ['pretty_formula', 'spacegroup.symbol', 'elasticity.K_VRH', 'elasticity.G_VRH','formation_energy_per_atom', 'band_gap',
              'e_above_hull', 'density', 'volume', 'nsites']
df = mpr.get_dataframe(criteria=criteria, properties=properties)
# Additional data loaded from a machine-specific absolute path.
df1=pd.read_csv(r'D:\FYP_files\database\data_after_processing\huizong\huizong.csv')
# Move the index into a regular column so it can take part in the merge
# (presumably the material_id index, given the set_index call below — confirm).
df=df.reset_index()
# No `on=` is given, so pandas joins on the columns the two frames share.
df=pd.merge(df,df1)
df=df.set_index("material_id")
# Keep only rows with a strictly positive elasticity.K_VRH.
df = df[df['elasticity.K_VRH'] > 0]
NJOBS = args.njobs

# Print parameters.
print("REMOVE UNSTABLE ENTRIES:", FILTER)
print("USE FABER DATASET:", FABER)
print("USE TERNARY OXIDE DATASET:", not FABER)
print("NUMBER OF JOBS:", NJOBS)
print("DEBUG MODE:", args.debug)

# Set up dataset: either the packaged "flla" dataset or a fresh Materials
# Project query, depending on FABER.
if FABER:
    df = load_dataset("flla")
else:
    # Initialize data retrieval class
    from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
    mpr = MPDataRetrieval()
    criteria = "*-*-O"
    # Choose list of properties to retrieve
    properties = [
        'structure', 'nsites', 'formation_energy_per_atom', 'e_above_hull'
    ]
    # Get the dataframe with the matching structure from the Materials Project
    df = mpr.get_dataframe(criteria=criteria, properties=properties)
    # Create the formation_energy feature for the SCM regression, since the SCM
    # model learns formation energy per unit cell rather than per atom.
    df['formation_energy'] = df['formation_energy_per_atom'] * df['nsites']
    # Structures are retrieved as dictionaries but can easily be converted to
    # pymatgen.core.Structure objects as shown. The column is iterated
    # directly rather than indexed with integer positions, which would rely
    # on pandas' positional fallback for a label-indexed Series.
    df['structure'] = pd.Series(
        [Structure.from_dict(struct) for struct in df['structure']],
        df.index)
    # Filter the dataset if it consists of ternary oxides
Example #30
0
This file makes the following benchmarking datasets:
    - phonons

From matminer's dataset library.
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval

import pandas as pd
# Raise pandas' display limits for the print() below.
# pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

mpdr = MPDataRetrieval()

df = load_dataset("phonon_dielectric_mp")

print(df)

# Look up stability data for exactly the materials present in the dataset.
mpids = df["mpid"].tolist()
dfe = mpdr.get_dataframe(
    criteria={"material_id": {
        "$in": mpids
    }},
    properties=["e_above_hull", "formation_energy_per_atom", "material_id"],
    index_mpid=False)
# Rename so the id column carries the same name as in the dataset frame.
dfe = dfe.rename(columns={"material_id": "mpid"})

# Inner join; no `on=` is given, so pandas joins on the columns both frames
# share.
df = pd.merge(df, dfe, how='inner')
Example #31
0
from pymatgen import MPRester
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd
import numpy as np
from tqdm import tqdm


# Raise pandas' display limits for printed frames.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# NOTE(review): presumably the batch size handed to chunks() further down the
# script — not used in the visible part; confirm.
chunksize = 1000

# Both clients are created without explicit arguments.
mpdr = MPDataRetrieval()
mpr = MPRester()


def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]


# Fetch material_id and warnings for every entry whose formation energy per
# atom is below 2.5. index_mpid=False presumably keeps material_id as a
# regular column rather than the index — confirm against matminer docs.
df = mpdr.get_dataframe(
    criteria={"formation_energy_per_atom": {"$lt": 2.5}},
    properties=["material_id", "warnings"],
    index_mpid=False,
)
Example #32
0
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Pulls experimental band gaps from Citrine (w/o dataset limitations) and
        evaluate the DFT computed band gaps (data from materialsproject.org)
        in xy scatter plot. To compare the right values, we pick the computed
        band gaps calculated for a chemical formula that has the lowest energy
        above hull (the most stable structure). A second plot shows the
        residual (computed minus experimental) against the experimental gap.
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit); passed
            to Citrine as 'max_results'.
    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """

    # pull experimental band gaps from Citrine
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['chemicalFormula', 'Band gap']
    df_ct = cdr.get_dataframe(criteria={'data_type':'experimental',
                                        'max_results':limit},
                              secondary_fields=True,
                              properties=['Band gap'])
    # keep only the formula and band gap columns, renamed for plotting
    df_ct = df_ct[cols].rename(columns={'chemicalFormula': 'Formula',
                                        'Band gap': 'Expt. gap'})
    df_ct = df_ct[df_ct['Formula'] != 'In1p1'] # p1 not recognized in Composition
    df_ct = df_ct.dropna() # null band gaps cause problem when plotting residuals
    # reduce each formula so it can be matched against MP's 'pretty_formula'
    df_ct['Formula'] = df_ct['Formula'].transform(
        lambda x: Composition(x).get_reduced_formula_and_factor()[0])

    # pull computational band gaps from the Materials Project
    df = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': list(df_ct['Formula'].values)}},
        properties=['pretty_formula', 'material_id', 'band_gap', 'e_above_hull'],
        index_mpid=False).rename(
        columns={'pretty_formula': 'Formula', 'band_gap': 'MP computed gap',
                 'material_id': 'mpid'})


    # pick the most stable structure: for each formula keep the row with the
    # smallest e_above_hull
    df_mp = df.loc[df.groupby("Formula")["e_above_hull"].idxmin()]
    # join experimental and computed gaps on formula, then index by mpid
    df_final = df_ct.merge(df_mp, on='Formula').drop(
                                    'e_above_hull', axis=1).set_index('mpid')
    pf = PlotlyFig(df_final, x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap: the first pair is the data (drawn
    # as markers), the second pair is the y = x line from 0 to 12 (drawn as
    # a black dashed line)
    pf.xy([
        ('Expt. gap', 'MP computed gap'),
        ([0, 12], [0, 12])
    ],
        lines=[{}, {'color': 'black', 'dash': 'dash'}],
        labels=['Formula', df_final.index],
        modes=['markers', 'lines'],
        names=['Computed vs. expt.', 'Expt. gap'])

    # residual: experimental gaps are cast to float before subtracting
    residuals = df_final['MP computed gap']-df_final['Expt. gap'].astype(float)
    # reuse the same figure object with new axis titles and output filename
    pf.set_arguments(x_title='Experimental band gap (eV)',
                    y_title='Residual (Computed - Expt.) Band Gap (eV)',
                    filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals),
          labels = ['Formula', df_final.index])