Exemple #1
0
    def has_polymorphs(self):
        """Report whether the loaded task data repeats any composition.

        Each row is converted to a composition object (with ``reduce=True``)
        from either the "composition" or the "structure" column, chosen by
        ``self.metadata.input_type``. The data is said to contain polymorphs
        when the number of distinct compositions is smaller than the number
        of rows.

        Returns:
            (bool) True if at least one composition occurs more than once.

        Raises:
            ValueError: If the input type is neither "composition" nor
                "structure".
        """
        target = "pmg_composition"
        self._check_is_loaded()

        # Pick the converter and the column it reads from.
        if self.metadata.input_type == "composition":
            converter = StrToComposition(target_col_id=target, reduce=True)
            source_col = "composition"
        elif self.metadata.input_type == "structure":
            converter = StructureToComposition(target_col_id=target,
                                               reduce=True)
            source_col = "structure"
        else:
            raise ValueError(
                "Cannot check for polymorphs without input type in "
                "(structure, composition)!")

        featurized = converter.featurize_dataframe(self.df, source_col)
        comps = featurized[target].values

        # A duplicate collapses in the set, making it shorter than the array.
        return len(set(comps)) != len(comps)
Exemple #2
0
def tran_feat_composition(
    df,
    var_formula="FORMULA",
    preset_name="magpie",
    append=True,
    ignore_errors=True,
    **kwargs,
):
    r"""Featurize chemical formulae with matminer

    Parse a column of formula strings into compositions and compute
    element-property features for them using a matminer preset.

    Args:
        df (DataFrame): Data to featurize
        var_formula (string): Name of the column in df holding the chemical
            formula, given as a string
        preset_name (string): Matminer ElementProperty preset to use
        append (bool): If True, return the original columns with the
            features concatenated; if False, return the features only

    Kwargs:
        ignore_errors (bool): Passed to both featurization steps; set to
            True to return NaN's for invalid formulae instead of raising.
        **kwargs: Forwarded to the ElementProperty featurize_dataframe call.

    Returns:
        DataFrame: Computed features (plus the original columns if append)

    Notes:
        - A pre-processor and wrapper for matminer.featurizers.composition

    References:
        Ward, L., Dunn, A., Faghaninia, A., Zimmermann, N. E. R., Bajaj, S., Wang, Q., Montoya, J. H., Chen, J., Bystrom, K., Dylla, M., Chard, K., Asta, M., Persson, K., Snyder, G. J., Foster, I., Jain, A., Matminer: An open source toolkit for materials data mining. Comput. Mater. Sci. 152, 60-69 (2018).

    Examples:
        >>> import grama as gr
        >>> from grama.tran import tf_feat_composition
        >>> (
        >>>     gr.df_make(FORMULA=["C6H12O6"])
        >>>     >> gr.tf_feat_composition()
        >>> )

    """
    ## Build the element-property featurizer from the requested preset
    element_props = ElementProperty.from_preset(preset_name=preset_name)

    ## Parse the formula strings; only the formula column is passed on
    parsed = StrToComposition().featurize_dataframe(
        df[[var_formula]],
        var_formula,
        ignore_errors=ignore_errors,
    )

    ## Compute features, then discard the two helper columns
    features = element_props.featurize_dataframe(
        parsed,
        col_id="composition",
        ignore_errors=ignore_errors,
        **kwargs,
    )
    features = features.drop(columns=[var_formula, "composition"])

    ## Attach to the input columns when requested
    if not append:
        return features
    return concat((df, features), axis=1)
Exemple #3
0
 def __init__(self, pbar=False):
     """Create the regressor and the formula featurization objects.

     Args:
         pbar (bool): Stored as ``self.pbar``.
     """
     self.pbar = pbar
     # Parser that turns formula strings into composition objects.
     self.stc = StrToComposition()
     # Combined featurizer: magpie element properties + element fractions.
     self.featurizer = MultipleFeaturizer(
         [ElementProperty.from_preset("magpie"), ElementFraction()])
     self.regressor = RandomForestRegressor(
         n_estimators=500, n_jobs=-1, verbose=3)
Exemple #4
0
    def __init__(self):
        """Create the formula parser and the combined composition featurizer."""
        self.str2composition = StrToComposition()

        # Order matters: it fixes the column order of the generated features.
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)
Exemple #5
0
def generate(fake_df, ignore_errors=False):
    """Featurize one formula with the module-level ``feature_calculators``.

    Args:
        fake_df: Value wrapped into a one-column dataframe named
            "full_formula" (presumably a single formula string -- confirm
            against callers).
        ignore_errors (bool): Forwarded to both featurization calls.

    Returns:
        DataFrame with the parsed "composition" column, the computed feature
        columns and an "NComp" column holding ``len(composition)``. Rows
        containing NaN after parsing are dropped before featurization.
    """
    frame = pd.DataFrame(np.array([fake_df]))
    frame.columns = ['full_formula']

    # Parse the formula, discarding rows that could not be parsed.
    parsed = StrToComposition().featurize_dataframe(
        frame, "full_formula", ignore_errors=ignore_errors)
    parsed = parsed.dropna()

    featurized = feature_calculators.featurize_dataframe(
        parsed, col_id='composition', ignore_errors=ignore_errors)
    featurized["NComp"] = featurized["composition"].apply(len)
    return featurized
Exemple #6
0
    def test_str_to_composition(self):
        """StrToComposition parses formula strings, with and without reduce."""
        frame = DataFrame(data={'comp_str': ["Fe2", "MnO2"]})

        # Default settings: result goes to the "composition" column.
        frame = StrToComposition().featurize_dataframe(frame, 'comp_str')
        parsed = frame["composition"].tolist()
        self.assertEqual(parsed, [Composition("Fe2"), Composition("MnO2")])

        # reduce=True with a custom target column: "Fe2" becomes "Fe".
        reducer = StrToComposition(reduce=True,
                                   target_col_id='composition_red')
        frame = reducer.featurize_dataframe(frame, 'comp_str')
        reduced = frame["composition_red"].tolist()
        self.assertEqual(reduced, [Composition("Fe"), Composition("MnO2")])
def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return a DataFrame with all compositional features.

    Args:
        df_input: Data with a "Compound" column of formula strings.
        **kwargs: Forwarded to ``CompositionToOxidComposition``.

    Returns:
        The featurized rows whose "minimum oxidation state" is not 0.
    """
    # Parse the "Compound" strings into the "composition" column.
    df_comp = StrToComposition().featurize_dataframe(df_input,
                                                     col_id="Compound")

    # Elemental-property features (magpie preset), added in place.
    magpie = ElementProperty.from_preset(preset_name="magpie")
    magpie.featurize_dataframe(df_comp, col_id="composition", inplace=True)

    # Guess oxidation states to build "composition_oxid"; errors from
    # non-integer stoichiometries are ignored.
    oxid_converter = CompositionToOxidComposition(
        return_original_on_error=True, **kwargs)
    oxid_converter.featurize_dataframe(df_comp,
                                       "composition",
                                       ignore_errors=True,
                                       inplace=True)

    # Correct the guessed oxidation states.
    df_comp = correct_comp_oxid(df_comp)

    # Oxidation-state features, added in place.
    OxidationStates().featurize_dataframe(df_comp,
                                          "composition_oxid",
                                          ignore_errors=True,
                                          inplace=True)

    # Drop compounds whose predicted minimum oxidation state is 0.
    nonzero_state = df_comp["minimum oxidation state"] != 0
    return df_comp[nonzero_state]
Exemple #8
0
class FeatureGenerator:
    """
        Wrapper that computes several families of elemental features from
        chemical formulas.
    """
    def __init__(self):
        self.str2composition = StrToComposition()

        # Order of the featurizers fixes the order of the output columns.
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
            Compute features for a dataframe whose "formula" column holds
            the chemical formulas of the compositions.

            df : dataframe with a "formula" column
            ignore_errors : forwarded to both featurization steps

            Returns the featurized dataframe with an extra "NComp" column
            (length of each composition). Rows containing NaN after parsing
            are dropped before the features are computed.
        """
        parsed = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        parsed = parsed.dropna()

        featurized = self.feature_calculators.featurize_dataframe(
            parsed, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)
        return featurized
Exemple #9
0
    def test_conversion_overwrite(self):
        """Check StrToComposition when the target column equals the source.

        With ``overwrite_data=False`` both the in-place and the copying call
        are expected to raise ``ValueError``. With ``overwrite_data=True``
        the result is expected to hold a single column named "comp_str".
        """
        # Test with overwrite
        d = {'comp_str': ["Fe2", "MnO2"]}
        df = DataFrame(data=d)

        # Target column == source column, overwriting forbidden.
        stc = StrToComposition(target_col_id='comp_str', overwrite_data=False)
        with self.assertRaises(ValueError):
            df = stc.featurize_dataframe(df, 'comp_str', inplace=True)

        with self.assertRaises(ValueError):
            df = stc.featurize_dataframe(df, 'comp_str', inplace=False)

        # Same target, overwriting allowed.
        stc = StrToComposition(target_col_id='comp_str', overwrite_data=True)

        # In-place variant works on a copy so `df` stays usable below.
        dfres_ipt = df.copy()
        stc.featurize_dataframe(dfres_ipt, 'comp_str', inplace=True)
        self.assertListEqual(dfres_ipt.columns.tolist(), ["comp_str"])

        # Non-in-place variant: check the returned frame.
        dfres_ipf = stc.featurize_dataframe(df, 'comp_str', inplace=False)
        self.assertListEqual(dfres_ipf.columns.tolist(), ["comp_str"])
def test_featurizers():
    """Run the composition and oxidation-state featurizers on 'test.csv'.

    Reads 'test.csv' (first column as index), parses the "formula" column,
    adds magpie element-property features and oxidation-state features,
    printing the head of the table after each stage, and writes the result
    to 'after_test.csv'.
    """
    frame = pd.read_csv('test.csv', index_col=[0])
    frame = StrToComposition().featurize_dataframe(frame, 'formula')
    print(frame.head())

    # Next, use one featurizer to add a set of feature columns; the
    # "composition" column is its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    frame = magpie.featurize_dataframe(frame, col_id='composition')
    print(frame.head())
    print(magpie.citations())

    # Now add a further featurizer: oxidation-state related features.
    frame = CompositionToOxidComposition().featurize_dataframe(
        frame, 'composition')
    oxidation = OxidationStates()
    frame = oxidation.featurize_dataframe(frame, col_id='composition_oxid')
    print(frame.head())
    frame.to_csv('after_test.csv')
Exemple #11
0
def generate_data(name):
    """Build the full feature table for the materials listed in ``name``.

    Reads the CSV file ``name`` (first column as index), adds a "gaps"
    column filled from "gaps.csv", adds a binary "is_daoti" column (1 where
    the gap is 0, else 0), then appends matminer composition and
    oxidation-state features. Writes a summary to 'general_look_jie.csv' and
    the final table to '2d_vector_plus.csv'.

    Args:
        name: Path of the input CSV; its index is compared against
            'mp-<id>' strings built from the index of "gaps.csv", and it
            must contain a "full_formula" column.
    """
    df = pd.read_csv(name, index_col=[0])
    df['gaps'] = -10.0  # placeholder kept for rows without a matching gap
    df_gap = pd.read_csv("gaps.csv", index_col=[0])
    print(df_gap.index)

    # Single forward pass over both tables: a gap entry is consumed only when
    # it matches the current row of df, so this relies on both files listing
    # the materials in the same order.
    n_rows = len(df.index)
    i = 0
    for j in range(len(df_gap.index)):
        if i >= n_rows:
            # Every row of df is matched; indexing df.index[i] would raise.
            break
        if 'mp-' + str(df_gap.index[j]) == df.index[i]:
            df.iloc[i, -1] = df_gap.iloc[j, 0]
            i = i + 1
    print("合并完毕")

    # Build the classification label the same way: the last column becomes
    # 1 where the second-to-last column (the gap) equals 0, otherwise 0.
    # `.ix` was removed from pandas; it resolved these integer keys by
    # position here, so `.iloc` is the direct replacement.
    df['is_daoti'] = -2
    for i in range(len(df.index)):
        df.iloc[i, -1] = 1 if df.iloc[i, -2] == 0 else 0
    print("分类feature建立完成")

    # First use describe() to get an overall picture of the data.
    print(df.describe())
    df.describe().to_csv('general_look_jie.csv')

    # Convert the formula into a composition and featurize it; the
    # "composition" column is the featurizer input.
    df = StrToComposition().featurize_dataframe(
        df, 'full_formula', ignore_errors=True)
    print(df.head())
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition', ignore_errors=True)
    print(df.head())

    # Add the oxidation-state related features.
    df = CompositionToOxidComposition().featurize_dataframe(
        df, col_id='composition')
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')
    new_name = '2d_vector_plus.csv'
    df.to_csv(new_name)
Exemple #12
0
class FeatureGenerator:
    """
        Wrapper that computes several families of elemental features from
        chemical formulas.
    """
    def __init__(self):
        self.str2composition = StrToComposition()

        # Order of the featurizers fixes the order of the output columns.
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)

    def generate(self,
                 df: pd.DataFrame,
                 ignore_errors: bool = False,
                 drop_mode=True):
        """
            Compute features for a dataframe whose "formula" column holds
            the chemical formulas of the compositions.

            df : dataframe with a "formula" column
            ignore_errors : forwarded to both featurization steps
            drop_mode : when true, remove every column whose name starts
                with "Magpie" and contains "mode"

            Returns the featurized dataframe with an extra "NComp" column
            (length of each composition). Rows containing NaN after parsing
            are dropped before the features are computed.
        """
        parsed = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        parsed = parsed.dropna()

        featurized = self.feature_calculators.featurize_dataframe(
            parsed, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)

        if not drop_mode:
            return featurized

        mode_columns = [
            c for c in featurized.columns
            if "mode" in c and c.startswith("Magpie")
        ]
        return featurized.drop(columns=mode_columns)
Exemple #13
0
class RFEstimator(BaseTesterEstimator):
    """Random-forest regressor working on features computed from formulas."""

    def __init__(self, pbar=False):
        """Create the regressor and the formula featurization objects.

        Args:
            pbar (bool): Forwarded as ``pbar`` to the ``featurize_many``
                calls.
        """
        self.pbar = pbar
        self.stc = StrToComposition()
        # Combined featurizer: magpie element properties + element fractions.
        self.featurizer = MultipleFeaturizer(
            [ElementProperty.from_preset("magpie"), ElementFraction()])
        self.regressor = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, verbose=3)

    def _generate_features(self, x):
        """Turn the inputs ``x`` into a feature array."""
        # featurize_many yields one result per input; take its first entry.
        parsed = self.stc.featurize_many(x, pbar=self.pbar)
        compositions = [row[0] for row in parsed]
        rows = self.featurizer.featurize_many(compositions, pbar=self.pbar)
        return np.asarray(rows)

    def fit(self, x, y):
        """Fit the regressor on the features of ``x`` against ``y``."""
        self.regressor.fit(self._generate_features(x), y)

    def predict(self, x):
        """Return the regressor's predictions for the features of ``x``."""
        return self.regressor.predict(self._generate_features(x))
Exemple #14
0
import matminer
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.io import store_dataframe_as_json
from matminer.utils.io import load_dataframe_from_json
from matminer.figrecipes.plot import PlotlyFig
'''
#Block 1 - Loading and filtering the experimental dataframe
'''
df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element: parse the 'Ion' column into
# compositions and keep only rows whose atomic fraction of `select` is 1.
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(df, 'Ion')

select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug: describe must be *called*; formatting the bare attribute printed the
# bound-method repr instead of the summary statistics.
print("Remaining samples: {}".format(fdf.describe()))
# Drop the ion composition so the name is free for the formula composition.
fdf = fdf.drop(['composition'], axis=1)

## Initial conversion to matminer objects
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')

from matminer.featurizers.conversions import CompositionToOxidComposition
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')
Exemple #15
0
import numpy as np
import pandas as pd
import pickle
'''
#Block 1 - Loading dataframe
'''
# arbitrary inputs - Li must be excluded to ensure consistency
# Each entry is [identifier, formula string].
data = [['mp-1025496', 'Nb1 Se2'], ['mp-977563', 'Nb1 Ir2'],
        ['mp-864631', 'Nb1 Rh2'], ['mp-3368', 'Nb3 O8']]

fdf = pd.DataFrame(data, columns=['Id', 'Reduced Formula'])

## Initial conversion to matminer objects
from matminer.featurizers.conversions import StrToComposition

# Parse the formula strings; the result is read back from 'composition' below.
fdf = StrToComposition().featurize_dataframe(fdf, 'Reduced Formula')

from matminer.featurizers.conversions import CompositionToOxidComposition

# Convert the parsed compositions into oxidation-state-decorated ones.
fdf = CompositionToOxidComposition().featurize_dataframe(fdf, 'composition')

print("The initial dataset has {}".format(fdf.shape))
print(fdf.head())
'''
Block 2 - Featurization
'''
#
# -- start F1
from matminer.featurizers.composition import ElementProperty

# Element-property featurizer built from the 'magpie' preset.
ep_feat = ElementProperty.from_preset(preset_name='magpie')
    def __init__(self, filepath, dataset, init_samples):
        """Load the raw data and build (or reload) the initial sample set.

        For the 'bandgap' dataset the initial set is cached in
        './ALSearch_init_<init_samples>.csv'. When that file is missing it is
        built by sampling rows, discarding formulas whose largest atom count
        exceeds 20, and appending a 128-dimensional latent vector per
        formula. Afterwards duplicates and NaN rows are removed and a copy
        sorted by the label is stored in ``self.sorted_df``.

        Args:
            filepath: CSV file providing the 'material_id', 'pretty_formula'
                and 'band_gap' columns.
            dataset: Dataset name; only 'bandgap' triggers the build/reload
                of the initial sample file.
            init_samples: Number of rows sampled (with replacement, fixed
                seed) for the initial set; also part of the cache file name.
        """
        self.filepath = filepath
        self.df = pd.read_csv(
            self.filepath,
            usecols=['material_id', 'pretty_formula', 'band_gap'])
        self.dataset = dataset
        self.init_samples = init_samples
        self.init_filename = './ALSearch_init_' + str(init_samples) + '.csv'
        # Compare by value: `is` tests object identity and is not guaranteed
        # to be true for equal strings.
        if dataset == 'bandgap':
            if os.path.exists(self.init_filename):
                self.df = pd.read_csv(self.init_filename)
            else:
                # small examples for debugging
                self.df = self.df.sample(n=self.init_samples,
                                         replace=True,
                                         random_state=42)
                added_columns_name = ['V' + str(i) for i in range(128)]

                # create composition column
                df_comp = StrToComposition(
                    target_col_id='composition').featurize_dataframe(
                        self.df, 'pretty_formula')

                # Largest atom count per formula: the composition's string
                # form is split on whitespace and every non-digit character
                # of each token is stripped.
                max_atom_num = []
                for comp_str in df_comp['composition'].astype(str):
                    atom_counts = [
                        int(re.sub(r"\D", "", item))
                        for item in comp_str.split()
                    ]
                    max_atom_num.append(max(atom_counts))

                # keep only rows whose max atom number is at most 20
                self.df['max_atom_num'] = max_atom_num
                self.df = self.df[self.df['max_atom_num'] < 21]
                self.df = self.df.drop(['max_atom_num'], axis=1)

                # convert formula to latent vector
                data = []
                for formula in self.df['pretty_formula']:
                    print(formula)
                    onehot_matrix = formula2onehot_matrix(formula, l=20)
                    lat_vec = get_latent_space(onehot_matrix)
                    data.append(lat_vec.tolist()[0])
                    print(formula + 'has been converted into latent vector~')

                # Align on a fresh 0..n-1 index before the column-wise concat
                # (df_added is newly built and already has that index).
                df_added = pd.DataFrame(data, columns=added_columns_name)
                self.df = self.df.reset_index(drop=True)
                self.df = pd.concat([self.df, df_added], axis=1)

                # Rename the columns to eliminate ' '. Plain assignment avoids
                # `set_axis(..., inplace=...)`, which newer pandas rejects.
                self.df.columns = ['id', 'composition', 'Eg'
                                   ] + added_columns_name

                self.df.to_csv(self.init_filename, index=False, header=True)
        print('The shape of initial dataset is ' + str(self.df.shape))
        self.label = ['Eg']

        # drop rows that duplicate every non-label column
        feature_columns = [
            col for col in self.df.columns if col not in self.label
        ]
        self.df = self.df.drop_duplicates(subset=feature_columns,
                                          keep='first')
        print('The shape of init dataset after dropping duplicates is ' +
              str(self.df.shape))

        self.df = self.df.dropna()

        # sort dataframe by y value
        self.sorted_df = self.df.sort_values(by=self.label)
# Load the elastic-tensor dataset and show which columns it provides.
df = load_elastic_tensor()
print(df.columns)
"""
Index(['material_id', 'formula', 'nsites', 'space_group', 'volume',
       'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor',
       'elastic_tensor', 'elastic_tensor_original'],
      dtype='object')
"""
# Columns removed before featurization.
unwanted_columns = ["volume", "nsites", "compliance_tensor", "elastic_tensor",
                    "elastic_tensor_original", "K_Voigt", "G_Voigt", "K_Reuss", "G_Reuss"]
df = df.drop(unwanted_columns, axis=1)

from matminer.featurizers.conversions import StrToComposition

# Parse the 'formula' strings; the result is read from 'composition' below.
df = StrToComposition().featurize_dataframe(df, 'formula')

from matminer.featurizers.composition import ElementProperty

# Element-property features (magpie preset) computed from 'composition'.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

# Oxidation-state-decorated compositions, read from 'composition_oxid' below.
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")

# Oxidation-state features computed from 'composition_oxid'.
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

from matminer.featurizers.structure import DensityFeatures
# NOTE(review): build_entry() is defined elsewhere; it is used here as a
# mapping whose keys are listed -- confirm what the keys represent.
onehot = build_entry()
print(onehot)
onehot_l = list(onehot.keys())
print(onehot_l)

filepath = './Utils/bandgap-magpie.csv'
df = pd.read_csv(filepath)
#df = df.sample(frac=0.001, replace=True, random_state=1)
print('The shape of current dataset is ' + str(df.shape))

# Names 'V0' ... 'V127' for the columns to be added later.
added_columns_name = []
for i in range(128):
    added_columns_name.append('V' + str(i))
data = []
# create composition column
df_comp = StrToComposition(target_col_id='composition').featurize_dataframe(
    df, 'pretty_formula')
# create column with maximum atom number
max_atom_num = []
for st in df_comp[['composition']].astype(str).values:
    atom_list = []
    #print(st[0])
    # st is a one-element row; s is the composition rendered as a string.
    s = st[0]
    for item in s.split():
        # Strip every non-digit character, leaving the token's digits.
        num = re.sub(r"\D", "", item)
        atom_list.append(int(num))
    #print(atom_list)
    max_atom_num.append(max(atom_list))

# update dataframe with max_atom_num
df['max_atom_num'] = max_atom_num
# remove rows whose max atom number above 20
Exemple #19
0
    def test_conversion_multiindex(self):
        """Check StrToComposition with ``multiindex=True``.

        Covers a single-level input frame, a two-level input frame with the
        default and a custom target column, overwriting the source column
        (``target_col_id=None``), and the exceptions column produced when
        ``return_errors=True``.
        """
        d = {'comp_str': ["Fe2", "MnO2"]}

        # Single-level columns: result is read from
        # ("StrToComposition", "composition").
        df_1lvl = DataFrame(data=d)

        df_1lvl = StrToComposition().featurize_dataframe(
            df_1lvl, 'comp_str', multiindex=True)
        self.assertEqual(df_1lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # Two-level columns, default target column.
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        df_2lvl = StrToComposition().featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # Two-level columns, custom target column name.
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id='test')
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "test")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # if two level multiindex provided as target, it should be written there
        # here we test converting multiindex in place
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)

        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True, inplace=False)
        self.assertEqual(df_2lvl[("custom", "comp_str")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # Try inplace multiindex conversion with return errors
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True,
            return_errors=True, ignore_errors=True)

        # Both formulas are valid, so every exceptions entry should be null.
        self.assertTrue(
            all(df_2lvl[("custom", "StrToComposition Exceptions")].isnull()))
Exemple #20
0
    # 得到数据
    query_string = 'mdf.source_name:oqmd AND (oqmd.configuration:static OR ' \
                   'oqmd.configuration:standard) AND dft.converged:True'
    if quick_demo:
        query_string += " AND mdf.scroll_id:<10000"

    data = mdf.get_data(query_string, unwind_arrays=False)
    print(data.head())
    # 重命名、预处理和筛选,delta_e应该是形成能
    data = data[['oqmd.delta_e.value', 'material.composition']]
    data = data.rename(columns={
        'oqmd.delta_e.value': 'delta_e',
        'material.composition': 'composition'
    })
    data = StrToComposition(
        target_col_id='composition_obj').featurize_dataframe(
            data, 'composition')
    data.sort_values('delta_e', ascending=True, inplace=True)
    print(data.head(3))
    for k in ['delta_e']:
        data[k] = pd.to_numeric(data[k])

    original_count = len(data)
    data = data[~data['delta_e'].isnull()]
    print('Removed %d/%d entries' %
          (original_count - len(data), original_count))

    original_count = len(data)
    data['composition'] = data['composition_obj'].apply(
        lambda x: x.reduced_formula)
    data.sort_values('delta_e', ascending=True, inplace=True)
def generate_data():
    """Featurize the elastic-tensor dataset, saving a CSV after every stage.

    Loads the dataset, drops the unused columns, then adds in turn the
    parsed composition, magpie element-property features, oxidation-state
    features and structure density features. The head of the table is
    printed after each stage.
    """
    table = load_elastic_tensor()
    table.to_csv('原始elastic数据.csv')
    print(table.columns)

    table = table.drop(
        [
            'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
            'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss',
            'G_Reuss'
        ],
        axis=1)
    print(table.head())
    table.to_csv('扔掉不需要的部分.csv')

    # First use describe() to get an overall picture of the data; inspecting
    # it showed nothing unusual.
    summary = table.describe()
    print(summary)
    summary.to_csv('general_look.csv')

    table = StrToComposition().featurize_dataframe(table, 'formula')
    print(table.head())
    table.to_csv('引入composition.csv')

    # Next, use one featurizer to add a set of feature columns; the
    # "composition" column is its input.
    magpie = ElementProperty.from_preset(preset_name='magpie')
    table = magpie.featurize_dataframe(table, col_id='composition')
    print(table.head())
    print(magpie.citations())
    table.to_csv('将composition特征化后.csv')

    # Now add a further featurizer: oxidation-state related features.
    table = CompositionToOxidComposition().featurize_dataframe(
        table, 'composition')
    oxidation = OxidationStates()
    table = oxidation.featurize_dataframe(table, col_id='composition_oxid')
    print(table.head())
    table.to_csv('引入氧化态之后.csv')

    # Besides composition-based features there are many others, for example
    # structure-based ones.
    density = DensityFeatures()
    table = density.featurize_dataframe(table, 'structure')
    print(table.head())
    table.to_csv('引入结构中的密度.csv')
    print(density.feature_labels())
Exemple #22
0
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm

import pandas as pd

# pd.set_option('display.height', 1000)
# Widen pandas' display limits so printed frames are not truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

df = load_dataset("glass_ternary_landolt")

# Keep only the formula (renamed to "composition") and the "gfa" column.
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]

# Normalise every formula string to its reduced formula: parse it into a
# temporary composition object, read reduced_formula, drop the temporary.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition")
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"]=="ZrTi9"])  # should be False in final dataframe also!!
# print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also!
# print(df["gfa"].value_counts())    # proportion is about 5000 GFA 2054 no GFA
# raise ValueError

# Compare the number of rows with the number of distinct reduced formulas.
unique = df["composition"].unique()
print(len(df))
print(len(unique))

# Accumulators filled by code that follows this excerpt.
problem_compositions = []
new_df_dict = {"composition": [], "gfa": []}
    df['elastic_tensor_original'][i] = np.array(df['elastic_tensor_original'][i]['data'])

"""
['_id', 'material_id', 'formula', 'nsites', 'space_group', 'volume',
       'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor',
       'elastic_tensor', 'elastic_tensor_original', 'cif', 'kpoint_density',
       'poscar']
"""
# Columns removed before featurization.
unwanted_columns = ['_id', 'material_id', 'nsites', 'volume',
                    'cif', 'kpoint_density', 'poscar']
df = df.drop(unwanted_columns, axis=1)

from matminer.featurizers.conversions import StrToComposition

# Parse the 'formula' strings; the result is read from 'composition' below.
sc_feat = StrToComposition()
df = sc_feat.featurize_dataframe(df, col_id='formula')

from matminer.featurizers.composition import ElementProperty

# Element-property features (magpie preset) computed from 'composition'.
ep_feat = ElementProperty.from_preset(preset_name='magpie')
df = ep_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.conversions import CompositionToOxidComposition

# Convert the compositions into oxidation-state-decorated ones.
co_feat = CompositionToOxidComposition()
df = co_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.composition import OxidationStates

# Featurizer created here; it is applied by code after this excerpt.
os_feat = OxidationStates()
def AddFeatures(df):  # Add features by Matminer
    """Add matminer composition and structure features to ``df``.

    ``df`` must provide a "formula" column (formula strings) and a
    "structure" column. The formula is parsed into "composition", converted
    to "composition_oxid", and a fixed sequence of featurizers is applied.
    The first four steps raise on failure; every later step is run with
    ``ignore_errors=True``.

    Returns:
        The dataframe with all feature columns added.
    """
    from matminer.featurizers.conversions import StrToComposition
    from matminer.featurizers.composition import ElementProperty
    from matminer.featurizers.conversions import CompositionToOxidComposition
    from matminer.featurizers.composition import OxidationStates
    from matminer.featurizers.composition import ElectronAffinity
    from matminer.featurizers.composition import BandCenter
    from matminer.featurizers.composition import CohesiveEnergy
    from matminer.featurizers.composition import Miedema
    from matminer.featurizers.composition import TMetalFraction
    from matminer.featurizers.composition import ValenceOrbital
    from matminer.featurizers.composition import YangSolidSolution

    # Strict steps: parsing and the base features must succeed.
    df = StrToComposition().featurize_dataframe(df, "formula")
    magpie = ElementProperty.from_preset(preset_name="magpie")
    # input the "composition" column to the featurizer
    df = magpie.featurize_dataframe(df, col_id="composition")
    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
    df = OxidationStates().featurize_dataframe(df, "composition_oxid")

    # Featurizers fed with "composition_oxid", applied in this order. Each is
    # instantiated only when its turn comes.
    oxid_featurizers = (
        ElectronAffinity,
        BandCenter,
        CohesiveEnergy,
        Miedema,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    for make_featurizer in oxid_featurizers:
        df = make_featurizer().featurize_dataframe(df,
                                                   "composition_oxid",
                                                   ignore_errors=True)

    # This is the border between compositional features and structural
    # features. Remove the section below to use only compositional features.
    from matminer.featurizers.structure import GlobalSymmetryFeatures
    from matminer.featurizers.structure import StructuralComplexity
    from matminer.featurizers.structure import ChemicalOrdering
    from matminer.featurizers.structure import MaximumPackingEfficiency
    from matminer.featurizers.structure import MinimumRelativeDistances
    from matminer.featurizers.structure import StructuralHeterogeneity
    from matminer.featurizers.structure import SiteStatsFingerprint
    from matminer.featurizers.site import AverageBondLength
    from pymatgen.analysis.local_env import CrystalNN
    from matminer.featurizers.site import AverageBondAngle
    from matminer.featurizers.site import BondOrientationalParameter
    from matminer.featurizers.site import CoordinationNumber
    from matminer.featurizers.structure import DensityFeatures

    # Featurizers fed with "structure", applied in this order. Site-level
    # featurizers are wrapped in SiteStatsFingerprint.
    structure_featurizers = (
        GlobalSymmetryFeatures,
        StructuralComplexity,
        ChemicalOrdering,
        MaximumPackingEfficiency,
        MinimumRelativeDistances,
        StructuralHeterogeneity,
        lambda: SiteStatsFingerprint(
            AverageBondLength(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(
            AverageBondAngle(CrystalNN(search_cutoff=20))),
        lambda: SiteStatsFingerprint(BondOrientationalParameter()),
        lambda: SiteStatsFingerprint(CoordinationNumber()),
        DensityFeatures,
    )
    for make_featurizer in structure_featurizers:
        df = make_featurizer().featurize_dataframe(df,
                                                   "structure",
                                                   ignore_errors=True)
    return df
# df = mpdr.get_dataframe({"elasticity": {"$exists": True}, "elasticity.warnings": []},
                        # ['pretty_formula', 'elasticity.K_VRH', 'elasticity.G_VRH']) 
# Query: entries whose elasticity.K_VRH is not None, with the listed properties.
criteria = {'elasticity.K_VRH': {'$ne': None}}
properties = ['pretty_formula', 'spacegroup.symbol', 'elasticity.K_VRH', 'elasticity.G_VRH','formation_energy_per_atom', 'band_gap',
              'e_above_hull', 'density', 'volume', 'nsites']
df = mpr.get_dataframe(criteria=criteria, properties=properties)
# Merge with a locally stored table; no key is given, so pandas joins on the
# columns the two frames share. The index is moved to a column first and
# restored to "material_id" afterwards.
df1=pd.read_csv(r'D:\FYP_files\database\data_after_processing\huizong\huizong.csv')
df=df.reset_index()
df=pd.merge(df,df1)
df=df.set_index("material_id")
# Keep rows with positive K_VRH and e_above_hull below 0.1.
df = df[df['elasticity.K_VRH'] > 0]
df = df[df['e_above_hull'] < 0.1]  
# Volume per site.
df['vpa'] = df['volume']/df['nsites']        
# Poisson ratio computed row-wise as (3K - 2G) / (6K + 2G) from K_VRH and G_VRH.
df['poisson_ratio']=df[["elasticity.K_VRH","elasticity.G_VRH"]].apply(lambda x:(3*x["elasticity.K_VRH"]-2*x["elasticity.G_VRH"])/(6*x["elasticity.K_VRH"]+2*x["elasticity.G_VRH"]),axis=1)
from matminer.featurizers.conversions import StrToComposition
# Parse 'pretty_formula'; the result is read from "composition" below.
df = StrToComposition().featurize_dataframe(df, "pretty_formula")
from matminer.featurizers.composition import ElementProperty
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")  # input the "composition" column to the featurizer
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates
# Oxidation-state-decorated compositions, read from "composition_oxid" below.
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")
# Second element-property pass: mean and std_dev of the listed properties,
# taken from the PymatgenData source.
dataset = PymatgenData()
descriptors = ['row', 'group', 'atomic_mass',
               'atomic_radius', 'boiling_point', 'melting_point', 'X']
stats = ["mean", "std_dev"]
ep = ElementProperty(data_source=dataset, features=descriptors, stats=stats)
df = ep.featurize_dataframe(df, "composition")
#Remove NaN values
Exemple #26
0
# Turn every 'elasticity' entry (a mapping) into a plain list of its values.
for record in data_2['elasticity']:
    values_list.append(list(record.values()))

# Build one list per property: the value found at that property's position
# (its index in tensor_list) in every material's value list.
# NOTE(review): assumes each entry's values are ordered like tensor_list --
# confirm against where tensor_list is defined.
for prop in tensor_list:
    position = tensor_list.index(prop)
    new_cols_val.append([row_values[position] for row_values in values_list])

# Attach the collected lists to the dataframe, one column per property.
for prop_name in tensor_list:
    column_position = tensor_list.index(prop_name)
    data_2[prop_name] = new_cols_val[column_position]

# prepare for featurization
from matminer.featurizers.conversions import StrToComposition
data_3 = StrToComposition().featurize_dataframe(data_2, "pretty_formula")
#data_3.columns

# In[9]:

# Save this intermediate dataset before defining training data and targets
import numpy as np
np.savez_compressed("heusler_all.npz", data=data_3)

# In[ ]:

# Featurization
# This part follows the matminer examples
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
    def __init__(self):
        """Load the band-gap/Magpie table and append latent-vector columns.

        Steps, in order:

        1. Read ``./Utils/bandgap-magpie.csv`` into ``self.df``.
        2. Drop rows with a duplicate ``pretty_formula`` (first one kept).
        3. Draw a random sample of the rows (``frac=0.0001``, with
           replacement, fixed seed).
        4. Drop rows whose ``max_atom_num`` is 9 or more.
        5. Encode each remaining formula with ``formula2onehot_matrix`` and
           ``get_latent_space`` and append the result as columns
           ``V0`` .. ``V127``.
        6. Remove ``material_id`` / ``max_atom_num``, rename all columns
           positionally to underscore-separated names and write the table to
           ``bandgap_df_new_114.csv``.

        Raises:
            ValueError: If the number of columns does not match the number
                of generated column names.
        """
        self.filepath = './Utils/bandgap-magpie.csv'
        self.df = pd.read_csv(self.filepath)

        # drop duplicate values
        print('The shape of whole dataset before dropping duplicates is ' +
              str(self.df.shape))
        self.df = self.df.drop_duplicates(subset=['pretty_formula'],
                                          keep='first')
        print('The shape of whole dataset after dropping duplicates is ' +
              str(self.df.shape))

        self.df = self.df.sample(frac=0.0001, replace=True, random_state=1)

        # Names of the latent-vector columns appended below.
        added_columns_name = ['V' + str(i) for i in range(128)]

        # create composition column
        df_comp = StrToComposition(
            target_col_id='composition').featurize_dataframe(
                self.df, 'pretty_formula')

        # For every row: the largest integer obtained by stripping all
        # non-digit characters from each whitespace-separated token of the
        # composition's string form.
        # NOTE(review): presumably the largest per-element atom count;
        # a fractional amount such as "0.5" would be read as 5 -- confirm
        # that only integer amounts occur.
        max_atom_num = []
        for comp_str in df_comp['composition'].astype(str):
            counts = [int(re.sub(r"\D", "", token))
                      for token in comp_str.split()]
            max_atom_num.append(max(counts))

        # update dataframe with max_atom_num
        self.df['max_atom_num'] = max_atom_num
        # remove rows whose max atom number above 8
        self.df = self.df[self.df['max_atom_num'] < 9]

        # convert formula to latent vector
        data = []
        for formula in self.df['pretty_formula']:
            print(formula)
            onehot_matrix = formula2onehot_matrix(formula, l=8)
            lat_vec = get_latent_space(onehot_matrix)
            data.append(lat_vec.tolist()[0])
            print(formula + 'has been converted into latent vector~')

        # Re-index both frames from 0 so the column-wise concat lines the
        # rows up by position.
        df_added = pd.DataFrame(data, columns=added_columns_name)
        self.df = self.df.reset_index(drop=True)
        df_added = df_added.reset_index(drop=True)
        self.df = pd.concat([self.df, df_added], axis=1)

        # Helper columns are not part of the exported table.
        self.df = self.df.drop(['material_id', 'max_atom_num'], axis=1)

        # rename columns to eliminate ' '
        # The names are assigned by position: formula, target, then six
        # statistics for each Magpie property, then the latent columns.
        magpie_properties = [
            'Number', 'MendeleevNumber', 'AtomicWeight', 'MeltingT',
            'Column', 'Row', 'CovalentRadius', 'Electronegativity',
            'NsValence', 'NpValence', 'NdValence', 'NfValence', 'NValence',
            'NsUnfilled', 'NpUnfilled', 'NdUnfilled', 'NfUnfilled',
            'NUnfilled', 'GSvolume_pa', 'GSbandgap', 'GSmagmom',
            'SpaceGroupNumber',
        ]
        magpie_stats = ['minimum', 'maximum', 'range', 'mean', 'avg_dev',
                        'mode']
        column_rename = ['pretty_formula', 'band_gap']
        column_rename += [
            'MagpieData_{}_{}'.format(stat, prop)
            for prop in magpie_properties
            for stat in magpie_stats
        ]
        column_rename += added_columns_name

        # Plain attribute assignment instead of
        # ``set_axis(..., inplace=False)``: the ``inplace`` keyword was
        # removed from ``DataFrame.set_axis`` in pandas 2.0, while assigning
        # ``.columns`` behaves the same on every pandas version (including
        # the ValueError on a length mismatch).
        self.df.columns = column_rename

        self.df.to_csv(r'bandgap_df_new_114.csv', index=False, header=True)
Exemple #28
0
    def _tidy_column(self, df, featurizer_type):
        """
        Homogenize one input column so that it can be fed to featurizers.

        The type of the column's first entry decides what is done:

        * composition column: strings or dicts are converted to Composition
          objects; afterwards, if ``self.guess_oxistates`` is set, oxidation
          states are guessed. If that guess raises, the featurizers that
          need oxidation states are added to ``self.exclude``.
        * any other column: dict entries are converted to objects; for the
          structure column oxidation states are then guessed when
          ``self.guess_oxistates`` is set. Entries that are neither dict nor
          str are returned untouched.

        Args:
            df (pandas.DataFrame)
            featurizer_type: The key defining the featurizer input. For example,
                composition featurizers should have featurizer_type of
                "composition".

        Returns:
            df (pandas.DataFrame): DataFrame with featurizer_type column
                ready for featurization.

        Raises:
            ValueError: If a non-composition column holds strings.
        """
        # todo: Make the following conversions more robust (no [0] type checking)
        first_entry = df[featurizer_type].iloc[0]

        if featurizer_type != self.composition_col:
            # Structure/bs/dos columns: only dicts (or strings, which are
            # rejected below) need any work.
            if not isinstance(first_entry, (dict, str)):
                return df

            self.logger.info(
                self._log_prefix.capitalize() +
                "{} detected as string or dict. Attempting "
                "conversion to {} objects..."
                "".format(featurizer_type, featurizer_type))
            if isinstance(first_entry, str):
                raise ValueError(
                    "{} column is type {}. Cannot convert."
                    "".format(featurizer_type, type(first_entry)))

            dict_converter = DictToObject(overwrite_data=True,
                                          target_col_id=featurizer_type)
            df = dict_converter.featurize_dataframe(
                df, featurizer_type, inplace=False)

            # Oxidation states are only guessed for the structure column.
            if featurizer_type != self.structure_col or \
                    not self.guess_oxistates:
                return df

            self.logger.info(
                self._log_prefix +
                "Guessing oxidation states of structures if they were "
                "not present in input.")
            structure_decorator = StructureToOxidStructure(
                target_col_id=featurizer_type,
                overwrite_data=True,
                return_original_on_error=True,
                max_sites=-50)
            try:
                df = structure_decorator.featurize_dataframe(
                    df,
                    featurizer_type,
                    multiindex=self.multiindex,
                    inplace=False)
            except Exception as e:
                # Best effort: log and carry on with undecorated structures.
                self.logger.info(
                    self._log_prefix +
                    "Could not decorate oxidation states on structures "
                    "due to {}.".format(e))
            return df

        # ---- composition column ----
        if isinstance(first_entry, str):
            # Formula strings -> Composition objects
            self.logger.info(
                self._log_prefix +
                "Compositions detected as strings. Attempting "
                "conversion to Composition objects...")
            str_converter = StrToComposition(overwrite_data=True,
                                             target_col_id=featurizer_type)
            df = str_converter.featurize_dataframe(
                df,
                featurizer_type,
                multiindex=self.multiindex,
                ignore_errors=True,
                inplace=False)
        elif isinstance(first_entry, dict):
            # Dicts -> Composition objects
            self.logger.info(self._log_prefix +
                             "Compositions detected as dicts. Attempting "
                             "conversion to Composition objects...")
            df[featurizer_type] = list(
                map(Composition.from_dict, df[featurizer_type]))

        if not self.guess_oxistates:
            return df

        # Decorate compositions with guessed oxidation states.
        self.logger.info(
            self._log_prefix +
            "Guessing oxidation states of compositions, as"
            " they were not present in input.")
        composition_decorator = CompositionToOxidComposition(
            target_col_id=featurizer_type,
            overwrite_data=True,
            return_original_on_error=True,
            max_sites=-50)
        try:
            df = composition_decorator.featurize_dataframe(
                df,
                featurizer_type,
                multiindex=self.multiindex,
                inplace=False)
        except Exception as e:
            self.logger.info(self._log_prefix +
                             "Could not decorate oxidation states due "
                             "to {}. Excluding featurizers based on "
                             "composition oxistates".format(e))
            # Without oxidation states, drop every featurizer needing them.
            oxi_dependent_names = [
                c.__class__.__name__
                for c in CompositionFeaturizers().need_oxi
            ]
            self.exclude.extend(oxi_dependent_names)
        return df
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import StrToComposition
import numpy as np
import pandas as pd
import csv
import os
import itertools
from pymatgen import Composition
from pymatgen.core.periodic_table import Element

# Read in dataset
filepath = "pifs.csv"
glass_data = pd.read_csv(filepath)
# Convert the glass formulas to pymatgen Composition objects so that they
# match the data from OQMD.
comps = StrToComposition().featurize_dataframe(
    glass_data, "formula", ignore_errors=True)["composition"]

# For every composition, record the element with the largest amount.
# ``majority`` stays aligned row-for-row with ``comps``.
majority = []
for c in comps:
    print(c)
    max_comp = -1
    main_element = ""
    # The conversion above runs with ignore_errors=True, so a formula that
    # could not be parsed is presumably left as a placeholder (e.g. NaN)
    # rather than a Composition. Such a value has no ``items()``; it is
    # recorded as "" -- the same value an empty composition yields --
    # instead of aborting the whole loop with AttributeError.
    if hasattr(c, "items"):
        for element, amount in c.items():
            # Strict ">" keeps the first element on ties.
            if amount > max_comp:
                max_comp = amount
                main_element = element
    majority.append(str(main_element))
Exemple #30
0
        json.loads(
            urlopen("http://aflowlib.duke.edu/search/API/?" + MATCHBOOK +
                    ",$paging(0)").read().decode("utf-8")))['compound']

# One row holding the metal for every substrate row.
matrix = pd.DataFrame([metal for _ in range(len(substrate))])

# Trange is a '-'-separated string; its first three fields are used as
# start, stop and step of the temperature grid.
Tsplit = list(map(float, Trange.split('-')))
if Tsplit[2] != 0:
    # Tlist is only defined when the step is non-zero.
    Tlist = np.arange(*Tsplit[:3]).tolist()

# System-condition table: metal | substrate | first temperature value.
sys_cond_0 = pd.concat([matrix, substrate], axis=1)
sys_cond_0['Temp'] = pd.DataFrame([Tsplit[0]] * len(substrate))
sys_cond_0.columns = ['Metal', 'Substrate', 'Temp']

# Featurize the metal once, then repeat its feature row for every substrate.
metal_matminer = pd.DataFrame([metal], columns=['Metal'])
metal_to_comp = StrToComposition(target_col_id='Me_comp')
metal_matminer = metal_to_comp.featurize_dataframe(metal_matminer, 'Metal')
data_Me = magpie.featurize_dataframe(
    metal_matminer, col_id="Me_comp", ignore_errors=True)
metal_features = pd.DataFrame(
    data_Me.values.tolist() * len(substrate), columns=data_Me.columns)
# Keep only the columns whose name contains 'mean', minus one of them, and
# prefix the remaining names with 'Me_'.
feature_Me = metal_features.filter(like='mean')
feature_Me = feature_Me.drop(columns=['MagpieData mean NfUnfilled'])
feature_Me.columns = ['Me_' + name for name in feature_Me.columns]

# Same featurization for the substrates.
substrate_to_comp = StrToComposition(target_col_id='Sub_comp')
sys_cond_0 = substrate_to_comp.featurize_dataframe(sys_cond_0, 'Substrate')
data_Sub = magpie.featurize_dataframe(
    sys_cond_0, col_id="Sub_comp", ignore_errors=True)
feature_Sub = data_Sub.filter(like='mean')