def composition_featurizer(df_input: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return a Pandas DataFrame with all compositional features"""

    # generate the "composition" column
    df_comp = StrToComposition().featurize_dataframe(df_input,
                                                     col_id="Compound")
    # generate features based on elemental properites
    ep_featurizer = ElementProperty.from_preset(preset_name="magpie")
    ep_featurizer.featurize_dataframe(df_comp,
                                      col_id="composition",
                                      inplace=True)
    # generate the "composition_oxid" column based on guessed oxidation states
    CompositionToOxidComposition(
        return_original_on_error=True, **kwargs).featurize_dataframe(
            # ignore errors from non-integer stoichiometries
            df_comp,
            "composition",
            ignore_errors=True,
            inplace=True)
    # correct oxidation states
    df_comp = correct_comp_oxid(df_comp)
    # generate features based on oxidation states
    os_featurizer = OxidationStates()
    os_featurizer.featurize_dataframe(df_comp,
                                      "composition_oxid",
                                      ignore_errors=True,
                                      inplace=True)
    # remove compounds with predicted oxidation states of 0
    return df_comp[df_comp["minimum oxidation state"] != 0]
Exemple #2
0
 def test_oxidation_states(self):
     featurizer = OxidationStates.from_preset("deml")
     features = dict(
         zip(featurizer.feature_labels(),
             featurizer.featurize(self.df["composition"][1])))
     self.assertAlmostEqual(4, features["range oxidation state"])
     self.assertAlmostEqual(2, features["maximum oxidation state"])
Exemple #3
0
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with composition
    features from matminer.

    Currently applies the set of all matminer composition features.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """
    logging.info("Applying composition featurizers...")
    df = df.copy()
    df['composition'] = df['structure'].apply(lambda s: s.composition)
    featurizer = MultipleFeaturizer([ElementProperty.from_preset("magpie"),
                                     AtomicOrbitals(),
                                     BandCenter(),
                                     # ElectronAffinity(), - This descriptor was not used in the paper preset
                                     Stoichiometry(),
                                     ValenceOrbital(),
                                     IonProperty(),
                                     ElementFraction(),
                                     TMetalFraction(),
                                     # CohesiveEnergy(), - This descriptor was not used in the paper preset
                                     Miedema(),
                                     YangSolidSolution(),
                                     AtomicPackingEfficiency(),
                                     ])

    df = featurizer.featurize_dataframe(df, "composition", multiindex=True, ignore_errors=True)
    df.columns = df.columns.map('|'.join).str.strip('|')

    ox_featurizer = MultipleFeaturizer([OxidationStates(),
                                        ElectronegativityDiff()
                                        ])

    df = CompositionToOxidComposition().featurize_dataframe(df, "Input Data|composition")

    df = ox_featurizer.featurize_dataframe(df, "composition_oxid", multiindex=True, ignore_errors=True)
    df = df.rename(columns={'Input Data': ''})
    df.columns = df.columns.map('|'.join).str.strip('|')

    _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

    df['AtomicOrbitals|HOMO_character'] = df['AtomicOrbitals|HOMO_character'].map(_orbitals)
    df['AtomicOrbitals|LUMO_character'] = df['AtomicOrbitals|LUMO_character'].map(_orbitals)

    df['AtomicOrbitals|HOMO_element'] = df['AtomicOrbitals|HOMO_element'].apply(
        lambda x: -1 if not isinstance(x, str) else Element(x).Z
    )
    df['AtomicOrbitals|LUMO_element'] = df['AtomicOrbitals|LUMO_element'].apply(
        lambda x: -1 if not isinstance(x, str) else Element(x).Z
    )

    df = df.replace([np.inf, -np.inf, np.nan], 0)

    return clean_df(df)
Exemple #4
0
def generate_data(name):
    #这个函数作用,输入是指定的文件名,输出增加了gaps,is_daoti,以及其他共计145特征的完整向量矩阵
    #name='test_plus_gaps.csv'
    df=pd.read_csv(name,index_col=[0])
    df['gaps']=-10.0   
    df_gap=pd.read_csv("gaps.csv",index_col = [0])
    print(df_gap.index)
    i=0    
    str_s=""
    for j in range(len(df_gap.index)):
        #先打印二者的id
       # print(df.index[i])
        str_s='mp-'+str(df_gap.index[j])
        if(str_s==df.index[i]):
            df.iloc[i,-1]=df_gap.iloc[j,0]
            i=i+1
            #print("确实一样") 
    print("合并完毕")

    #同样的方法我们来建立不同的分类
    df['is_daoti']=-2
    for i in range(len(df.index)):
        if(df.ix[i,-2]==0):
            df.ix[i,-1]=1
        else:
            df.ix[i,-1]=0
    print("分类feature建立完成")   
    
#首先使用describe获得对于数据的整体把握
    print(df.describe())
    df.describe().to_csv('general_look_jie.csv')
#通过观察数据发现并没有什么异常之处
    df=StrToComposition().featurize_dataframe(df,'full_formula',ignore_errors=True)
    print(df.head())   
    #print(df['composition'])
    ep_feat=ElementProperty.from_preset(preset_name='magpie')
    df=ep_feat.featurize_dataframe(df,col_id='composition',ignore_errors=True)#将composition这一列作为特征化的输入
    print(df.head())
    #print(ep_feat.citations())
    #df.to_csv("plus the composition.csv")
    #以上这部分是将formula转化为composition并转化feature

    df=CompositionToOxidComposition().featurize_dataframe(df,col_id='composition')#引入了氧化态的相关特征
    os_feat=OxidationStates()
    df=os_feat.featurize_dataframe(df,col_id='composition_oxid')
    new_name='2d_vector_plus.csv'
    df.to_csv(new_name)
def test_featurizers():
    df = pd.read_csv('test.csv', index_col=[0])
    df = StrToComposition().featurize_dataframe(df, 'formula')
    print(df.head())
    #下一步,我们需要其中一个特征化来增加一系列的特征算符
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition')  #将composition这一列作为特征化的输入
    print(df.head())
    print(ep_feat.citations())
    #df.to_csv('将composition特征化后.csv')

    #开始引入新的特征化算符吧
    df = CompositionToOxidComposition().featurize_dataframe(
        df, 'composition')  #引入了氧化态的相关特征
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')
    print(df.head())
    df.to_csv('after_test.csv')
def generate_data():
    df = load_elastic_tensor()
    df.to_csv('原始elastic数据.csv')
    print(df.columns)

    unwanted_columns = [
        'volume', 'nsites', 'compliance_tensor', 'elastic_tensor',
        'elastic_tensor_original', 'K_Voigt', 'G_Voigt', 'K_Reuss', 'G_Reuss'
    ]
    df = df.drop(unwanted_columns, axis=1)
    print(df.head())
    df.to_csv('扔掉不需要的部分.csv')

    #首先使用describe获得对于数据的整体把握
    print(df.describe())
    df.describe().to_csv('general_look.csv')
    #通过观察数据发现并没有什么异常之处
    df = StrToComposition().featurize_dataframe(df, 'formula')
    print(df.head())
    df.to_csv('引入composition.csv')

    #下一步,我们需要其中一个特征化来增加一系列的特征算符
    ep_feat = ElementProperty.from_preset(preset_name='magpie')
    df = ep_feat.featurize_dataframe(
        df, col_id='composition')  #将composition这一列作为特征化的输入
    print(df.head())
    print(ep_feat.citations())
    df.to_csv('将composition特征化后.csv')

    #开始引入新的特征化算符吧
    df = CompositionToOxidComposition().featurize_dataframe(
        df, 'composition')  #引入了氧化态的相关特征
    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, col_id='composition_oxid')
    print(df.head())
    df.to_csv('引入氧化态之后.csv')

    #其实除了基于composition的特征之外还有很多其他的,比如基于结构的
    df_feat = DensityFeatures()
    df = df_feat.featurize_dataframe(df, 'structure')
    print(df.head())
    df.to_csv('引入结构中的密度.csv')
    print(df_feat.feature_labels())
Exemple #7
0
def add_cs_features(df,rdf_flag=False):

  df["composition"] = str_to_composition(df["pretty_formula"]) 
  df["composition_oxid"] = composition_to_oxidcomposition(df["composition"])
  df["structure"] = dict_to_object(df["structure"]) 

  vo = ValenceOrbital()
  df = vo.featurize_dataframe(df,"composition")

  ox = OxidationStates()
  df = ox.featurize_dataframe(df, "composition_oxid")
  
  # structure features
  den = DensityFeatures()
  df = den.featurize_dataframe(df, "structure")
  
  if rdf_flag:
    rdf = RadialDistributionFunction(cutoff=15.0,bin_size=0.2)
    df = rdf.featurize_dataframe(df, "structure") 
  
  return df
Exemple #8
0
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning
    materials properties for small datasets' by Pierre-Paul De Breuck,
    Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """
        df = super().featurize_structure(df)

        dist = df[
            "RadialDistributionFunction|radial distribution function"].iloc[0][
                'distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"].apply(
                    lambda x: x['distribution'][i])

        df = df.drop("RadialDistributionFunction|radial distribution function",
                     axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            if x == np.nan:
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
Exemple #9
0
# In[ ]:

# Featurization
# This part is done with reference to the matiner examples
from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name="magpie")
data_3 = ep_feat.featurize_dataframe(data_3, col_id="composition")

from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

data_3 = CompositionToOxidComposition().featurize_dataframe(
    data_3, "composition")

os_feat = OxidationStates()
data_3 = os_feat.featurize_dataframe(data_3, "composition_oxid")

from matminer.featurizers.structure import DensityFeatures

df_feat = DensityFeatures()
data_3 = df_feat.featurize_dataframe(data_3, "structure")

unwanted_columns = [
    "elasticity", "material_id", "nsites", "compliance_tensor",
    "elastic_tensor", "elastic_tensor_original", "K_Voigt", "G_Voigt",
    "K_Reuss", "G_Reuss", "warnings"
]
data_4 = data_3.drop(unwanted_columns, axis=1)

# In[ ]:
Exemple #10
0
new_cols = []
for col in fdf.columns:
    if 'MagpieData' in col:
        new_col = col.split('MagpieData ', 1)
        del new_col[0]
        fin_col = ''.join(new_col)
        new_cols.append(fin_col)

cols_dict = dict(zip(magpie_cols, new_cols))
fdf = fdf.rename(columns=cols_dict)
# -- end F1

# -- start F3 --
from matminer.featurizers.composition import OxidationStates

os_feat = OxidationStates()
fdf = os_feat.featurize_dataframe(fdf, 'composition_oxid', ignore_errors=True)
# -- end F3

# -- start F4 --
from matminer.featurizers.composition import AtomicOrbitals

ao_feat = AtomicOrbitals()
fdf = ao_feat.featurize_dataframe(fdf,
                                  col_id='composition',
                                  ignore_errors=True)
# -- end F4

# -- start F5
from matminer.featurizers.composition import BandCenter
Exemple #11
0
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )
    def __init__(self, n_jobs=None):
            self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """
        df = super().featurize_structure(df)

        dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][
            "distances"
        ][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"
            ].apply(lambda x: x["distribution"][i])

        df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            if x == np.nan:
                return 0
            elif x:
                return 1
            else:
                return 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the presetdos featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """

        df = super().featurize_dos(df)


        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1","DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis = 1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
           "DOSFeaturizer|vbm_character_1"
           ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
           "DOSFeaturizer|cbm_character_1"
           ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n = 2, expand = True)
                for i in range(0,3):
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(np.float)
            except:
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """

        df = super().featurize_bandstructure(df)

        def _int_map(x):
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)


        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)


    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.
        """

        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
Exemple #12
0
def predict_log10_eps(
    target: Union[Structure, Composition],
    dielectric_type: str,
    model_type: str,
) -> float:
    """
    :param target: structure or composition to predict dielectric constants
    :param dielectric_type: "el" or "ion"
    :param model_type: "comp" or "comp_st"
    :return: Descriptor vector
    """
    if dielectric_type not in ["el", "ion"]:
        raise ValueError(
            f'Specify dielectric type "el" or "ion"\nInput: {dielectric_type}')
    if model_type not in ["comp", "comp_st"]:
        raise ValueError(
            f'Specify regression_type "comp" or "comp_st"\nInput: {model_type}'
        )

    if model_type == "comp":
        if isinstance(target, Structure):
            comp = target.composition
        else:
            comp = target
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            descriptor = [
                ep.get_from_label("PymatgenData minimum X"),
                ep.get_from_label("PymatgenData range X"),
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData range mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("avg d valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                BandCenter().featurize(comp)[0],
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
            ]
        elif dielectric_type == "ion":
            stoich = ScalarFeaturizer(Stoichiometry(), comp)
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            at_pack_eff = ScalarFeaturizer(AtomicPackingEfficiency(), comp)
            descriptor = [
                stoich.get_from_label("3-norm"),
                stoich.get_from_label("5-norm"),
                ep.get_from_label("PymatgenData mean X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData std_dev group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData maximum atomic_mass"),
                ep.get_from_label("PymatgenData range atomic_mass"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData maximum atomic_radius"),
                ep.get_from_label("PymatgenData range atomic_radius"),
                ep.get_from_label("PymatgenData mean atomic_radius"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData mean mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("minimum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                oxi_state.get_from_label("range oxidation state"),
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
                at_pack_eff.get_from_label("mean simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "mean abs simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "dist from 1 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 3 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 5 clusters |APE| < 0.010"),
            ]
    elif model_type == "comp_st":
        if isinstance(target, Composition):
            raise ValueError(
                'comp_st (Using compositional and structural descriptor) is specified, '
                'but target is composition')
        comp: Composition = target.composition
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        target_orig = deepcopy(target)
        target.add_oxidation_state_by_guess()
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            dist_btw_nn = MinimumRelativeDistances().featurize(target_orig)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                valence.get_from_label("frac d valence electrons"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("mean EN difference"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                density.get_from_label("density"),
                np.mean(dist_btw_nn),
                np.std(dist_btw_nn),
                opsf.get_from_label_func("tetrahedral CN_4", np.max),
                opsf.get_from_label_func("rectangular see-saw-like CN_4",
                                         np.max),
                np.max([
                    EwaldSiteEnergy(accuracy=4).featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.max),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.min),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.std),
                gsf.get_from_label_func("G2_20.0", np.std),
                gsf.get_from_label_func("G2_80.0", np.max),
                gsf.get_from_label_func("G4_0.005_4.0_-1.0", np.mean),
                lpd.get_from_label_func("local difference in NdValence",
                                        np.mean),
                lpd.get_from_label_func("local difference in NValence",
                                        np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in GSmagmom",
                                        np.mean)
            ]
        elif dielectric_type == "ion":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            atomic_orbitals = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            str_het = ScalarFeaturizer(StructuralHeterogeneity(), target)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                TMetalFraction().featurize(comp)[0],
                atomic_orbitals.get_from_label("gap_AO"),
                density.get_from_label("density"),
                density.get_from_label("packing fraction"),
                str_het.get_from_label("mean neighbor distance variation"),
                str_het.get_from_label("avg_dev neighbor distance variation"),
                opsf.get_from_label_func("sgl_bd CN_1", np.mean),
                opsf.get_from_label_func("bent 150 degrees CN_2", np.mean),
                opsf.get_from_label_func("linear CN_2", np.mean),
                opsf.get_from_label_func("trigonal planar CN_3", np.mean),
                opsf.get_from_label_func("pentagonal planar CN_5", np.std),
                opsf.get_from_label_func("octahedral CN_6", np.max),
                opsf.get_from_label_func("octahedral CN_6", np.std),
                opsf.get_from_label_func("q6 CN_12", np.mean),
                np.max([
                    EwaldSiteEnergy(accuracy=4).featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Symmetry_weighted_index_4",
                                            np.std),
                voro_fp.get_from_label_func("Voro_vol_maximum", np.mean),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_area_minimum", np.std),
                voro_fp.get_from_label_func("Voro_area_maximum", np.min),
                voro_fp.get_from_label_func("Voro_dist_std_dev", np.mean),
                gsf.get_from_label_func("G2_80.0", np.min),
                gsf.get_from_label_func("G4_0.005_4.0_1.0", np.std),
                lpd.get_from_label_func("local difference in Number", np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.min),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.max),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.mean),
                lpd.get_from_label_func("local difference in MeltingT",
                                        np.mean),
                lpd.get_from_label_func("local difference in Row", np.max),
                lpd.get_from_label_func(
                    "local difference in Electronegativity", np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NsUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.max),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.min),
                lpd.get_from_label_func("local difference in SpaceGroupNumber",
                                        np.max),
            ]
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}.joblib",
            "rb") as fr:
        model: RandomForestRegressor = joblib.load(fr)
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}_scaler.joblib",
            "rb") as fr:
        scaler: StandardScaler = joblib.load(fr)
    descriptor = scaler.transform([descriptor])
    return model.predict(descriptor)[0]
def AddFeatures(df):  # Add features by Matminer
    from matminer.featurizers.conversions import StrToComposition
    df = StrToComposition().featurize_dataframe(df, "formula")

    from matminer.featurizers.composition import ElementProperty

    ep_feat = ElementProperty.from_preset(preset_name="magpie")
    df = ep_feat.featurize_dataframe(
        df, col_id="composition"
    )  # input the "composition" column to the featurizer

    from matminer.featurizers.conversions import CompositionToOxidComposition
    from matminer.featurizers.composition import OxidationStates

    df = CompositionToOxidComposition().featurize_dataframe(df, "composition")

    os_feat = OxidationStates()
    df = os_feat.featurize_dataframe(df, "composition_oxid")

    from matminer.featurizers.composition import ElectronAffinity

    ea_feat = ElectronAffinity()
    df = ea_feat.featurize_dataframe(df,
                                     "composition_oxid",
                                     ignore_errors=True)

    from matminer.featurizers.composition import BandCenter

    bc_feat = BandCenter()
    df = bc_feat.featurize_dataframe(df,
                                     "composition_oxid",
                                     ignore_errors=True)

    from matminer.featurizers.composition import CohesiveEnergy

    ce_feat = CohesiveEnergy()
    df = ce_feat.featurize_dataframe(df,
                                     "composition_oxid",
                                     ignore_errors=True)

    from matminer.featurizers.composition import Miedema

    m_feat = Miedema()
    df = m_feat.featurize_dataframe(df, "composition_oxid", ignore_errors=True)

    from matminer.featurizers.composition import TMetalFraction

    tmf_feat = TMetalFraction()
    df = tmf_feat.featurize_dataframe(df,
                                      "composition_oxid",
                                      ignore_errors=True)

    from matminer.featurizers.composition import ValenceOrbital

    vo_feat = ValenceOrbital()
    df = vo_feat.featurize_dataframe(df,
                                     "composition_oxid",
                                     ignore_errors=True)

    from matminer.featurizers.composition import YangSolidSolution

    yss_feat = YangSolidSolution()
    df = yss_feat.featurize_dataframe(df,
                                      "composition_oxid",
                                      ignore_errors=True)

    from matminer.featurizers.structure import GlobalSymmetryFeatures

    # This is the border between compositional features and structural features. Comment out the following featurizers to use only compostional features.

    gsf_feat = GlobalSymmetryFeatures()
    df = gsf_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import StructuralComplexity
    sc_feat = StructuralComplexity()
    df = sc_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import ChemicalOrdering
    co_feat = ChemicalOrdering()
    df = co_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import MaximumPackingEfficiency
    mpe_feat = MaximumPackingEfficiency()
    df = mpe_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import MinimumRelativeDistances
    mrd_feat = MinimumRelativeDistances()
    df = mrd_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import StructuralHeterogeneity
    sh_feat = StructuralHeterogeneity()
    df = sh_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import SiteStatsFingerprint

    from matminer.featurizers.site import AverageBondLength
    from pymatgen.analysis.local_env import CrystalNN
    bl_feat = SiteStatsFingerprint(
        AverageBondLength(CrystalNN(search_cutoff=20)))
    df = bl_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.site import AverageBondAngle
    ba_feat = SiteStatsFingerprint(
        AverageBondAngle(CrystalNN(search_cutoff=20)))
    df = ba_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.site import BondOrientationalParameter
    bop_feat = SiteStatsFingerprint(BondOrientationalParameter())
    df = bop_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.site import CoordinationNumber
    cn_feat = SiteStatsFingerprint(CoordinationNumber())
    df = cn_feat.featurize_dataframe(df, "structure", ignore_errors=True)

    from matminer.featurizers.structure import DensityFeatures
    df_feat = DensityFeatures()
    df = df_feat.featurize_dataframe(df, "structure", ignore_errors=True)
    return (df)
 def test_oxidation_states(self):
     featurizer = OxidationStates.from_preset("deml")
     features = dict(zip(featurizer.feature_labels(), featurizer.featurize(self.df["composition"][1])))
     self.assertAlmostEqual(4, features["range oxidation state"])
     self.assertAlmostEqual(2, features["maximum oxidation state"])
sc_feat = StrToComposition()
df = sc_feat.featurize_dataframe(df, col_id='formula')

from matminer.featurizers.composition import ElementProperty

ep_feat = ElementProperty.from_preset(preset_name='magpie')
df = ep_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.conversions import CompositionToOxidComposition

co_feat = CompositionToOxidComposition()
df = co_feat.featurize_dataframe(df, col_id='composition')

from matminer.featurizers.composition import OxidationStates

os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, col_id='composition_oxid')

from matminer.featurizers.structure import DensityFeatures

df_feat = DensityFeatures()
df = df_feat.featurize_dataframe(df, col_id='structure')

"""
formula, structure, elastic_anisotropy, G_Reuss, G_VRH, G_Voigt, K_Reuss, K_VRH, K_Voigt,
poisson_ratio, compliance_tensor, elastic_tensor, elastic_tensor_original, composition
"""

y = df['K_VRH'].values
excluded = ['formula', 'structure', 'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
            'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio', 'compliance_tensor',
    structlist.append([Structure.from_file(directoryname + i)
                       ])  #Converts CIF to pymatgen structure object
    namelist.append(os.path.splitext(i)[0])  #Collects all the structure names
    structs.append(Structure.from_file(directoryname + i))
#Creates Pandas dataframe with data being a list of structures and the row name being the structure name
dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns)

p = PartialRadialDistributionFunction()
p.fit(np.asarray(structs))

c = CoulombMatrix()
c.fit(np.asarray(structs))

erdf = ElectronicRadialDistributionFunction()
erdf.cutoff = 10  #longest diagonal of lattice...I picked a number semi-arbitrarily

#Featurizes the structures
featurizer = MultipleFeaturizer([
    ElementProperty.from_preset('magpie'),
    OxidationStates(),
    AtomicOrbitals(),
    BandCenter(),
    ElectronegativityDiff(),
    DensityFeatures(),
    RadialDistributionFunction(), p, c, erdf
])

r = (featurizer.featurize_many(dftest, ['structure'])
     )  #Featurizes entire Pandas Dataframe
#Yay it runs!