def get_tet_bcc_motif(structure, idx):
    """
    Label a site as bcc-like, tetrahedral, or neither.

    The site is featurized with a default ``OPSiteFingerprint`` and the
    entries labelled 'bcc CN_8' and 'tet CN_4' are compared against a
    fixed threshold of 0.5.  Used to distinguish coordination
    environments in half-Heuslers (helper credited to Nils Zimmermann).

    Args:
        structure (pymatgen Structure): the target structure to evaluate
        idx (index): the site index in the structure
    Returns:
        (str) that describes the site coordination environment:
            'bcc'           if the 'bcc CN_8' entry exceeds 0.5
            'tet'           otherwise, if the 'tet CN_4' entry exceeds 0.5
            'unrecognized'  if neither entry exceeds 0.5
    """
    fingerprinter = OPSiteFingerprint()
    site_ops = fingerprinter.featurize(structure, idx)
    names = fingerprinter.feature_labels()
    tet_pos = names.index('tet CN_4')
    bcc_pos = names.index('bcc CN_8')

    # bcc is checked first, so it wins when both entries exceed 0.5.
    if site_ops[bcc_pos] > 0.5:
        return 'bcc'
    if site_ops[tet_pos] > 0.5:
        return 'tet'
    return 'unrecognized'
Beispiel #2
0
def get_fps(structure, cutoff=10.0, processes=8):
    """
    Compute descriptors for every site of ``structure`` in a process pool.

    Structure-level quantities (``VoronoiFingerprintModified`` and
    ``GaussianSymmFuncModified`` results) are computed once up front; the
    remaining featurizer objects are instantiated once and handed, together
    with the structure and the site, to ``get_all_site_descrs`` for each
    site.

    Args:
        structure: structure to featurize; it is iterated to enumerate
            its sites.
        cutoff (float): cutoff passed to ``VoronoiFingerprintModified``.
        processes (int): number of worker processes in the pool.

    Returns:
        A numpy array built from the per-site results of
        ``get_all_site_descrs`` (``pool.map`` keeps site order), or an
        empty list if an ``AttributeError`` or ``IndexError`` occurred.
    """
    import warnings

    all_descrs = []

    try:
        coordination_number_ = CoordinationNumber.from_preset('VoronoiNN')
        voronoi_fps_ = VoronoiFingerprintModified(
            cutoff=cutoff).featurize_structure(structure)
        crystal_nn_fingerprint_ = CrystalNNFingerprint.from_preset('cn')
        op_site_fingerprint_ = OPSiteFingerprint()
        agni_fingerprints_ = AGNIFingerprints()
        gaussian_symm_func_fps_ = GaussianSymmFuncModified(
        ).featurize_structure(structure)
        pymatgen_data_ = PymatgenData()
        magpie_data_ = MagpieData()

        # One argument bundle per site; get_all_site_descrs unpacks it.
        data_list = [[
            structure, i, site, coordination_number_, voronoi_fps_,
            crystal_nn_fingerprint_, op_site_fingerprint_, agni_fingerprints_,
            gaussian_symm_func_fps_, pymatgen_data_, magpie_data_
        ] for i, site in enumerate(structure)]

        # The context manager releases the worker processes even when
        # map() raises; previously the pool was never closed.
        with multiprocessing.Pool(processes=processes) as pool:
            all_descrs = np.array(pool.map(get_all_site_descrs, data_list))

    except (AttributeError, IndexError) as error:
        # Best-effort contract is kept (callers still get an empty list),
        # but the failure is no longer invisible.
        warnings.warn(
            'get_fps: featurization failed ({!r}); '
            'returning no descriptors'.format(error),
            RuntimeWarning)

    return all_descrs
Beispiel #3
0
    def __init__(self, materials, site_descriptors, query=None, **kwargs):
        """
        Set up a builder that calculates site descriptors for materials.

        For every direct subclass of ``NearNeighbors`` two
        ``CoordinationNumber`` featurizers are registered in ``self.sds``
        (unweighted under ``cn_<tag>``, weighted under ``cn_wt_<tag>``),
        plus an ``OPSiteFingerprint`` under ``opsf``.  ``<tag>`` is the
        class name, or its abbreviation when ``cls_to_abbrev`` has one.

        Args:
            materials (Store): Store of materials documents
            site_descriptors (Store): Store of site-descriptors data such
                as tetrahedral order parameter or percentage of 8-fold
                coordination
            query (dict): dictionary to limit materials to be analyzed
            **kwargs: forwarded unchanged to the parent constructor
        """
        self.materials = materials
        self.site_descriptors = site_descriptors
        self.query = query or {}

        # Set up all targeted site descriptors.
        self.sds = {}
        for nn_cls in NearNeighbors.__subclasses__():
            cls_name = nn_cls.__name__
            finder = getattr(pymatgen.analysis.local_env, cls_name)
            if cls_name in cls_to_abbrev.keys():
                tag = cls_to_abbrev[cls_name]
            else:
                tag = cls_name
            # A fresh near-neighbor finder instance per featurizer.
            for pattern, weighted in (('cn_{}', False), ('cn_wt_{}', True)):
                self.sds[pattern.format(tag)] = CoordinationNumber(
                    finder(), use_weights=weighted)
        self.sds['opsf'] = OPSiteFingerprint()
        #self.sds['csf'] = CrystalSiteFingerprint.from_preset('ops')

        super().__init__(sources=[materials],
                         targets=[site_descriptors],
                         **kwargs)
Beispiel #4
0
    def __init__(self, op_site_fp=None, stats=('mean', 'std_dev', 'minimum',
                                               'maximum'), min_oxi=None,
                 max_oxi=None):
        """
        Configure statistics taken over order-parameter site fingerprints.

        Args:
            op_site_fp: site fingerprint object providing
                ``feature_labels()``; a default ``OPSiteFingerprint`` is
                created when None.
            stats (str or sequence of str): statistic name(s).  A single
                string is wrapped into a one-element tuple.  Names
                containing '_mode' must start with a digit, which is read
                as the number of modes requested.
            min_oxi: stored as ``self.min_oxi`` without further processing.
            max_oxi: stored as ``self.max_oxi`` without further processing.

        Raises:
            ValueError: if a '_mode' statistic does not start with a digit.
        """
        self.op_site_fp = OPSiteFingerprint() if op_site_fp is None \
            else op_site_fp
        self._labels = self.op_site_fp.feature_labels()
        # isinstance also covers str subclasses, which an exact type
        # comparison would mistake for an iterable of characters.
        self.stats = (stats,) if isinstance(stats, str) else stats
        # NOTE: self.nmodes is only defined when a '_mode' statistic is
        # requested, exactly as before.
        if self.stats and '_mode' in ''.join(self.stats):
            self.nmodes = max(
                (int(stat[0]) for stat in self.stats if '_mode' in stat),
                default=0)

        self.min_oxi = min_oxi
        self.max_oxi = max_oxi
Beispiel #5
0
def get_op_stats_vector_diff(s1, s2, max_dr=0.2, ddr=0.01, ddist=0.01):
    """
    Determine the difference vector between two order parameter-statistics
    feature vectors resulting from two input structures.

    The neighbor-finding parameter ``dr`` is scanned in steps of ``ddr``;
    for each value the norm of the feature-vector difference is recorded.
    These norms are histogrammed with bin width ``ddist`` and the smallest
    ``dr`` whose norm lies within ``ddist`` of the most populated bin's
    center is selected.

    Args:
        s1 (Structure): first input structure.
        s2 (Structure): second input structure.
        max_dr (float): maximum neighbor-finding parameter to be tested.
        ddr (float): step size for increasing neighbor-finding parameter.
        ddist (float): bin size for histogramming distances of varying dr.

    Returns: (float, [float]) optimal neighbor-finding parameter
        and difference vector between order
        parameter-statistics feature vectors obtained from the
        two input structures (s1 - s2).
    """
    # Compute OP stats vector distances for varying neigh-find paras.
    dr = []
    dist = []
    delta = []
    nsteps = int(max_dr/ddr) + 1
    for i in range(nsteps):
        dr.append(float(i+1)*ddr)
        opsf = OPStructureFingerprint(op_site_fp=OPSiteFingerprint(dr=dr[i]))
        delta.append(np.array(
            opsf.featurize(s1)) - np.array(opsf.featurize(s2)))
        dist.append(np.linalg.norm(delta[i]))

    # Compute distance histogram, determine peak, and location
    # of smallest dr with peak value.
    nbins = int(max(dist) / ddist) + 1
    # nbins bins need nbins + 1 edges.  With only nbins edges the largest
    # distance could fall outside the histogram, and identical structures
    # (all distances below ddist) produced no bin at all.  The removed
    # `normed` keyword is no longer passed (rejected by NumPy >= 1.24).
    hist, bin_edges = np.histogram(
        dist, bins=[float(i)*ddist for i in range(nbins + 1)],
        weights=None, density=False)
    peak_bin = int(np.argmax(hist))  # first bin holding the maximum count
    dist_peak = 0.5 * (bin_edges[peak_bin] + bin_edges[peak_bin+1])

    # Falls back to the last tested dr if nothing is close to the peak.
    idx = -1
    for i, d in enumerate(dist):
        if abs(d - dist_peak) <= ddist:
            idx = i
            break

    return dr[idx], delta[idx]
Beispiel #6
0
    def __init__(self, materials, site_descriptors, mat_query=None, **kwargs):
        """
        Set up a builder for site-based descriptors and their statistics.

        Site descriptors (e.g., coordination numbers obtained with
        different near-neighbor finding approaches) are calculated for
        materials, and statistics are run on selected descriptor types
        (the 'opsf' and 'csf' site fingerprints).  The latter is useful
        as a definition of a structure fingerprint on the basis of local
        coordination information.

        Args:
            materials (Store): Store of materials documents.
            site_descriptors (Store): Store of site-descriptors data such
                                      as tetrahedral order parameter or
                                      fraction of being 8-fold coordinated.
            mat_query (dict): dictionary to limit materials to be analyzed.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        self.materials = materials
        self.site_descriptors = site_descriptors
        self.mat_query = mat_query or {}

        # Set up all targeted site descriptors: one unweighted ('none')
        # and one weighted ('sum') coordination number per direct
        # NearNeighbors subclass.
        self.sds = {}
        for nn_cls in NearNeighbors.__subclasses__():
            cls_name = nn_cls.__name__
            finder = getattr(pymatgen.analysis.local_env, cls_name)
            self.sds['cn_{}'.format(cls_name)] = CoordinationNumber(
                finder(), use_weights='none')
            self.sds['cn_wt_{}'.format(cls_name)] = CoordinationNumber(
                finder(), use_weights='sum')

        # Snapshot the coordination-number keys before the fingerprints
        # are added; those are listed under 'statistics' instead.
        self.all_output_pieces = {'site_descriptors': list(self.sds.keys())}
        self.sds['opsf'] = OPSiteFingerprint()
        self.sds['csf'] = CrystalSiteFingerprint.from_preset('ops')
        self.all_output_pieces['statistics'] = ['opsf', 'csf']

        super().__init__(sources=[materials],
                         targets=[site_descriptors],
                         **kwargs)
Beispiel #7
0
    def test_op_site_fingerprint(self):
        """Check OPSiteFingerprint labels and selected feature values."""
        opsf = OPSiteFingerprint()
        labels = opsf.feature_labels()
        expected = [
            'sgl_bd CN_1', 'L-shaped CN_2', 'water-like CN_2',
            'bent 120 degrees CN_2', 'bent 150 degrees CN_2',
            'linear CN_2', 'trigonal planar CN_3',
            'trigonal non-coplanar CN_3', 'T-shaped CN_3',
            'square co-planar CN_4', 'tetrahedral CN_4',
            'rectangular see-saw-like CN_4', 'see-saw-like CN_4',
            'trigonal pyramidal CN_4', 'pentagonal planar CN_5',
            'square pyramidal CN_5', 'trigonal bipyramidal CN_5',
            'hexagonal planar CN_6', 'octahedral CN_6',
            'pentagonal pyramidal CN_6', 'hexagonal pyramidal CN_7',
            'pentagonal bipyramidal CN_7', 'body-centered cubic CN_8',
            'hexagonal bipyramidal CN_8', 'q2 CN_9', 'q4 CN_9', 'q6 CN_9',
            'q2 CN_10', 'q4 CN_10', 'q6 CN_10',
            'q2 CN_11', 'q4 CN_11', 'q6 CN_11',
            'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12',
        ]
        # Every label produced must match the expected one at its position.
        for pos, label in enumerate(labels):
            self.assertEqual(label, expected[pos])

        # Simple-cubic site: 37 features, octahedral entry checked.
        ops = opsf.featurize(self.sc, 0)
        self.assertEqual(len(ops), 37)
        oct_pos = opsf.feature_labels().index('octahedral CN_6')
        self.assertAlmostEqual(ops[oct_pos], 0.9995, places=7)

        # CsCl site with default settings: bcc entry checked.
        ops = opsf.featurize(self.cscl, 0)
        bcc_pos = opsf.feature_labels().index('body-centered cubic CN_8')
        self.assertAlmostEqual(ops[bcc_pos], 0.8955, places=7)

        # Same site with dist_exp=0.
        opsf = OPSiteFingerprint(dist_exp=0)
        ops = opsf.featurize(self.cscl, 0)
        bcc_pos = opsf.feature_labels().index('body-centered cubic CN_8')
        self.assertAlmostEqual(ops[bcc_pos], 0.9555, places=7)

        # The following test aims at ensuring the copying of the OP
        # dictionaries works: building a CrystalNNFingerprint must not
        # leak 'wt' labels into a separately built OPSiteFingerprint.
        opsfp = OPSiteFingerprint()
        cnnfp = CrystalNNFingerprint.from_preset('ops')
        wt_count = sum(
            1 for name in opsfp.feature_labels() if name.split()[0] == 'wt')
        self.assertEqual(wt_count, 0)
Beispiel #8
0
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning
    materials properties for small datasets' by Pierre-Paul De Breuck,
    Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The featurizer classes are imported inside the class body, and the
    preset tuples below are instantiated when the class is defined.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        HOMO/LUMO orbital characters are mapped to integers (s=1 ... f=4),
        HOMO/LUMO element symbols are replaced by atomic numbers (-1 when
        the entry is not a string), and inf/-inf/NaN are replaced by 0
        before the dataframe is passed through ``clean_df``.

        Args:
            df: dataframe handed to the parent ``featurize_composition``.

        Returns:
            The result of ``modnet.featurizers.clean_df`` on the converted
            dataframe.
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        The radial distribution function column is expanded into one
        column per distance (first 50 distances of the first row define
        the column names), the crystal system is mapped to an integer
        code and the centrosymmetry flag to 0/1.

        Args:
            df: dataframe handed to the parent ``featurize_structure``.

        Returns:
            The result of ``modnet.featurizers.clean_df`` on the converted
            dataframe.
        """
        df = super().featurize_structure(df)

        dist = df[
            "RadialDistributionFunction|radial distribution function"].iloc[0][
                'distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            # i is bound as a default so the lambda does not depend on the
            # loop variable's later values.
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"].apply(
                    lambda x, i=i: x['distribution'][i])

        df = df.drop("RadialDistributionFunction|radial distribution function",
                     axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # Correct spelling added: only the misspelled key existed, so
            # "orthorhombic" labels were mapped to NaN.  The misspelled key
            # is kept in case some input carries it.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            """Map a truthy flag to 1; falsy values and NaN to 0."""
            # `x == np.nan` is always False, so NaN used to fall through
            # to the truthy branch and was encoded as 1.
            if isinstance(x, float) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns in which every entry equals 0 are dropped before the
        dataframe is passed through ``clean_df``.

        Args:
            df: dataframe handed to the parent ``featurize_site``.

        Returns:
            The result of ``modnet.featurizers.clean_df`` on the filtered
            dataframe.
        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
Beispiel #9
0
    def test_op_site_fingerprint(self):
        """Verify the labels and a few reference values of OPSiteFingerprint."""
        bcc_label = 'body-centered cubic CN_8'

        opsf = OPSiteFingerprint()
        produced = opsf.feature_labels()
        reference = [
            'sgl_bd CN_1',
            'L-shaped CN_2', 'water-like CN_2', 'bent 120 degrees CN_2',
            'bent 150 degrees CN_2', 'linear CN_2',
            'trigonal planar CN_3', 'trigonal non-coplanar CN_3',
            'T-shaped CN_3',
            'square co-planar CN_4', 'tetrahedral CN_4',
            'rectangular see-saw-like CN_4', 'see-saw-like CN_4',
            'trigonal pyramidal CN_4',
            'pentagonal planar CN_5', 'square pyramidal CN_5',
            'trigonal bipyramidal CN_5',
            'hexagonal planar CN_6', 'octahedral CN_6',
            'pentagonal pyramidal CN_6',
            'hexagonal pyramidal CN_7', 'pentagonal bipyramidal CN_7',
            'body-centered cubic CN_8', 'hexagonal bipyramidal CN_8',
            'q2 CN_9', 'q4 CN_9', 'q6 CN_9',
            'q2 CN_10', 'q4 CN_10', 'q6 CN_10',
            'q2 CN_11', 'q4 CN_11', 'q6 CN_11',
            'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12',
        ]
        # Position-wise comparison over the labels actually produced.
        for k, name in enumerate(produced):
            self.assertEqual(name, reference[k])

        # Simple-cubic reference site.
        values = opsf.featurize(self.sc, 0)
        self.assertEqual(len(values), 37)
        self.assertAlmostEqual(
            values[opsf.feature_labels().index('octahedral CN_6')],
            0.9995, places=7)

        # CsCl reference site, default settings.
        values = opsf.featurize(self.cscl, 0)
        self.assertAlmostEqual(
            values[opsf.feature_labels().index(bcc_label)],
            0.8955, places=7)

        # CsCl reference site, dist_exp=0.
        opsf = OPSiteFingerprint(dist_exp=0)
        values = opsf.featurize(self.cscl, 0)
        self.assertAlmostEqual(
            values[opsf.feature_labels().index(bcc_label)],
            0.9555, places=7)

        # The following test aims at ensuring the copying of the OP
        # dictionaries works: no label of a fresh OPSiteFingerprint may
        # start with 'wt' after a CrystalNNFingerprint has been built.
        opsfp = OPSiteFingerprint()
        cnnfp = CrystalNNFingerprint.from_preset('ops')
        wt_labels = [
            name for name in opsfp.feature_labels()
            if name.split()[0] == 'wt'
        ]
        self.assertEqual(len(wt_labels), 0)
Beispiel #10
0
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset combining matminer composition, structure, site,
    DOS and band-structure featurizers.

    The featurizer classes are imported inside the class body, and the
    preset tuples below are instantiated when the class is defined.  Each
    ``featurize_*`` method delegates to the parent implementation and then
    converts non-numeric outputs to numbers before calling ``clean_df``.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store ``n_jobs`` as ``self._n_jobs``.

        NOTE(review): the parent constructor is not called here; confirm
        that ``extendedMODFeaturizer`` needs no further initialisation.
        """
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        HOMO/LUMO orbital characters are mapped to integers (s=1 ... f=4)
        and HOMO/LUMO element symbols are replaced by atomic numbers (-1
        when the entry is not a string).

        Args:
            df: dataframe handed to the parent ``featurize_composition``.

        Returns:
            The result of ``clean_df`` on the converted dataframe.
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        The radial distribution function column is expanded into one
        column per distance (first 50 distances of the first row define
        the column names), the crystal system is mapped to an integer
        code and the centrosymmetry flag to 0/1.

        Args:
            df: dataframe handed to the parent ``featurize_structure``.

        Returns:
            The result of ``clean_df`` on the converted dataframe.
        """
        df = super().featurize_structure(df)

        dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][
            "distances"
        ][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            # i is bound as a default so the lambda does not depend on the
            # loop variable's later values.
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"
            ].apply(lambda x, i=i: x["distribution"][i])

        df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            # Correct spelling added: only the misspelled key existed, so
            # "orthorhombic" labels were mapped to NaN.  The misspelled key
            # is kept in case some input carries it.
            "orthorhombic": 3,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            """Map a truthy flag to 1; falsy values and NaN to 0."""
            # `x == np.nan` is always False, so NaN used to fall through
            # to the truthy branch and was encoded as 1.
            if isinstance(x, float) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset DOS featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        The VBM/CBM species columns are one-hot encoded, the VBM/CBM
        orbital characters are mapped to integers, and each ';'-separated
        location column is split into three float columns suffixed
        ``_0``, ``_1`` and ``_2``.  The original location columns and the
        ``dos`` column are dropped.

        Args:
            df: dataframe handed to the parent ``featurize_dos``.

        Returns:
            The result of ``clean_df`` on the converted dataframe.
        """

        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1","DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis = 1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
           "DOSFeaturizer|vbm_character_1"
           ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
           "DOSFeaturizer|cbm_character_1"
           ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n = 2, expand = True)
                for i in range(0,3):
                    # Builtin float replaces the np.float alias, which was
                    # removed in NumPy 1.24; the resulting AttributeError
                    # was swallowed below, so no column was ever created.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (KeyError, AttributeError, TypeError, ValueError):
                # Best-effort: skip a column that cannot be split or
                # converted, without trapping KeyboardInterrupt/SystemExit
                # as the former bare except did.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe
        and converts non-numeric outputs to numbers.

        The ``is_gap_direct`` flag is mapped to 0/1 and the
        ``bandstructure`` column is dropped.

        Args:
            df: dataframe handed to the parent ``featurize_bandstructure``.

        Returns:
            The result of ``clean_df`` on the converted dataframe.
        """

        df = super().featurize_bandstructure(df)

        def _int_map(x):
            """Map "False"/"True" (by string form) to 0/1, else None."""
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1
            return None

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns in which every entry equals 0 are dropped before the
        dataframe is passed through ``clean_df``.

        Args:
            df: dataframe handed to the parent ``featurize_site``.

        Returns:
            The result of ``clean_df`` on the filtered dataframe.
        """

        # Shortened names passed to the parent implementation.
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
Beispiel #11
0
def predict_log10_eps(
    target: Union[Structure, Composition],
    dielectric_type: str,
    model_type: str,
) -> float:
    """
    Build a hand-picked descriptor vector for ``target`` and feed it to a
    pre-trained regressor loaded from disk.

    The descriptor is assembled from a fixed list of featurizer labels that
    depends on the (``model_type``, ``dielectric_type``) pair. A scaler and a
    model are then loaded from ``<dielectric_type>_<model_type>_scaler.joblib``
    and ``<dielectric_type>_<model_type>.joblib`` located next to this file.

    :param target: structure or composition to predict dielectric constants
    :param dielectric_type: "el" or "ion"
    :param model_type: "comp" or "comp_st"
    :return: first element of the model prediction for the scaled descriptor
        (presumably log10 of the dielectric constant, going by the function
        name -- confirm against the training code)
    :raises ValueError: if ``dielectric_type`` or ``model_type`` is not one of
        the accepted values, or if ``model_type`` is "comp_st" and ``target``
        is a Composition
    """
    if dielectric_type not in ["el", "ion"]:
        raise ValueError(
            f'Specify dielectric type "el" or "ion"\nInput: {dielectric_type}')
    if model_type not in ["comp", "comp_st"]:
        # NOTE(review): the message says "regression_type" although the
        # parameter is called model_type.
        raise ValueError(
            f'Specify regression_type "comp" or "comp_st"\nInput: {model_type}'
        )

    # NOTE(review): the order of entries in each `descriptor` list below
    # presumably has to match the feature order the stored scaler/model were
    # fitted with -- do not reorder without confirming.
    if model_type == "comp":
        # Composition-only model: a Structure is reduced to its composition.
        if isinstance(target, Structure):
            comp = target.composition
        else:
            comp = target
        # Oxidation-state-decorated composition, used by the featurizers that
        # are given comp_oxi below.
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            descriptor = [
                ep.get_from_label("PymatgenData minimum X"),
                ep.get_from_label("PymatgenData range X"),
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData range mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("avg d valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                BandCenter().featurize(comp)[0],
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
            ]
        elif dielectric_type == "ion":
            stoich = ScalarFeaturizer(Stoichiometry(), comp)
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            ion_prop = ScalarFeaturizer(IonProperty(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            oxi_state = ScalarFeaturizer(OxidationStates.from_preset("deml"),
                                         comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            at_pack_eff = ScalarFeaturizer(AtomicPackingEfficiency(), comp)
            descriptor = [
                stoich.get_from_label("3-norm"),
                stoich.get_from_label("5-norm"),
                ep.get_from_label("PymatgenData mean X"),
                ep.get_from_label("PymatgenData mean row"),
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData std_dev group"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev block"),
                ep.get_from_label("PymatgenData maximum atomic_mass"),
                ep.get_from_label("PymatgenData range atomic_mass"),
                ep.get_from_label("PymatgenData mean atomic_mass"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                ep.get_from_label("PymatgenData maximum atomic_radius"),
                ep.get_from_label("PymatgenData range atomic_radius"),
                ep.get_from_label("PymatgenData mean atomic_radius"),
                ep.get_from_label("PymatgenData std_dev atomic_radius"),
                ep.get_from_label("PymatgenData minimum mendeleev_no"),
                ep.get_from_label("PymatgenData mean mendeleev_no"),
                ep.get_from_label("PymatgenData std_dev mendeleev_no"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev thermal_conductivity"),
                ep.get_from_label("PymatgenData mean melting_point"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                valence.get_from_label("avg s valence electrons"),
                valence.get_from_label("frac s valence electrons"),
                valence.get_from_label("frac p valence electrons"),
                valence.get_from_label("frac d valence electrons"),
                ion_prop.get_from_label("avg ionic char"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("minimum EN difference"),
                en_diff.get_from_label("range EN difference"),
                en_diff.get_from_label("mean EN difference"),
                en_diff.get_from_label("std_dev EN difference"),
                oxi_state.get_from_label("range oxidation state"),
                oxi_state.get_from_label("std_dev oxidation state"),
                atomic_orbital.get_from_label("LUMO_energy"),
                atomic_orbital.get_from_label("gap_AO"),
                at_pack_eff.get_from_label("mean simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "mean abs simul. packing efficiency"),
                at_pack_eff.get_from_label(
                    "dist from 1 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 3 clusters |APE| < 0.010"),
                at_pack_eff.get_from_label(
                    "dist from 5 clusters |APE| < 0.010"),
            ]
    elif model_type == "comp_st":
        # Composition + structure model: a bare Composition is not enough.
        if isinstance(target, Composition):
            raise ValueError(
                'comp_st (Using compositional and structural descriptor) is specified, '
                'but target is composition')
        comp: Composition = target.composition
        comp_oxi = comp.add_charges_from_oxi_state_guesses()
        # Snapshot taken before oxidation states are added; it is only used
        # for MinimumRelativeDistances in the "el" branch below.
        target_orig = deepcopy(target)
        # NOTE(review): this call is made on the caller's object, not on a
        # copy, so the structure passed in appears to be modified in place --
        # confirm whether callers rely on or are surprised by that.
        target.add_oxidation_state_by_guess()
        if dielectric_type == "el":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            valence = ScalarFeaturizer(ValenceOrbital(), comp)
            en_diff = ScalarFeaturizer(ElectronegativityDiff(), comp_oxi)
            atomic_orbital = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            dist_btw_nn = MinimumRelativeDistances().featurize(target_orig)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev X"),
                ep.get_from_label("PymatgenData mean block"),
                ep.get_from_label("PymatgenData std_dev atomic_mass"),
                valence.get_from_label("frac d valence electrons"),
                TMetalFraction().featurize(comp)[0],
                en_diff.get_from_label("maximum EN difference"),
                en_diff.get_from_label("mean EN difference"),
                atomic_orbital.get_from_label("HOMO_energy"),
                atomic_orbital.get_from_label("LUMO_energy"),
                density.get_from_label("density"),
                np.mean(dist_btw_nn),
                np.std(dist_btw_nn),
                opsf.get_from_label_func("tetrahedral CN_4", np.max),
                opsf.get_from_label_func("rectangular see-saw-like CN_4",
                                         np.max),
                # Maximum over the per-site Ewald featurizer outputs.
                np.max([
                    EwaldSiteEnergy(accuracy=4).featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.max),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.min),
                voro_fp.get_from_label_func("Voro_dist_minimum", np.std),
                gsf.get_from_label_func("G2_20.0", np.std),
                gsf.get_from_label_func("G2_80.0", np.max),
                gsf.get_from_label_func("G4_0.005_4.0_-1.0", np.mean),
                lpd.get_from_label_func("local difference in NdValence",
                                        np.mean),
                lpd.get_from_label_func("local difference in NValence",
                                        np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in GSmagmom",
                                        np.mean)
            ]
        elif dielectric_type == "ion":
            ep = ScalarFeaturizer(ElementProperty.from_preset("matminer"),
                                  comp)
            atomic_orbitals = ScalarFeaturizer(AtomicOrbitals(), comp)
            density = ScalarFeaturizer(DensityFeatures(), target)
            str_het = ScalarFeaturizer(StructuralHeterogeneity(), target)
            opsf = SiteFeaturizer(OPSiteFingerprint(), target)
            voro_fp = SiteFeaturizer(VoronoiFingerprint(use_symm_weights=True),
                                     target)
            gsf = SiteFeaturizer(GaussianSymmFunc(), target)
            lpd = SiteFeaturizer(
                LocalPropertyDifference.from_preset("ward-prb-2017"), target)
            descriptor = [
                ep.get_from_label("PymatgenData std_dev row"),
                ep.get_from_label("PymatgenData mean thermal_conductivity"),
                ep.get_from_label("PymatgenData std_dev melting_point"),
                TMetalFraction().featurize(comp)[0],
                atomic_orbitals.get_from_label("gap_AO"),
                density.get_from_label("density"),
                density.get_from_label("packing fraction"),
                str_het.get_from_label("mean neighbor distance variation"),
                str_het.get_from_label("avg_dev neighbor distance variation"),
                opsf.get_from_label_func("sgl_bd CN_1", np.mean),
                opsf.get_from_label_func("bent 150 degrees CN_2", np.mean),
                opsf.get_from_label_func("linear CN_2", np.mean),
                opsf.get_from_label_func("trigonal planar CN_3", np.mean),
                opsf.get_from_label_func("pentagonal planar CN_5", np.std),
                opsf.get_from_label_func("octahedral CN_6", np.max),
                opsf.get_from_label_func("octahedral CN_6", np.std),
                opsf.get_from_label_func("q6 CN_12", np.mean),
                # Maximum over the per-site Ewald featurizer outputs.
                np.max([
                    EwaldSiteEnergy(accuracy=4).featurize(target, i)
                    for i in range(target.num_sites)
                ]),
                voro_fp.get_from_label_func("Symmetry_weighted_index_4",
                                            np.std),
                voro_fp.get_from_label_func("Voro_vol_maximum", np.mean),
                voro_fp.get_from_label_func("Voro_area_std_dev", np.mean),
                voro_fp.get_from_label_func("Voro_area_minimum", np.std),
                voro_fp.get_from_label_func("Voro_area_maximum", np.min),
                voro_fp.get_from_label_func("Voro_dist_std_dev", np.mean),
                gsf.get_from_label_func("G2_80.0", np.min),
                gsf.get_from_label_func("G4_0.005_4.0_1.0", np.std),
                lpd.get_from_label_func("local difference in Number", np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.max),
                lpd.get_from_label_func("local difference in MendeleevNumber",
                                        np.min),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.max),
                lpd.get_from_label_func("local difference in AtomicWeight",
                                        np.mean),
                lpd.get_from_label_func("local difference in MeltingT",
                                        np.mean),
                lpd.get_from_label_func("local difference in Row", np.max),
                lpd.get_from_label_func(
                    "local difference in Electronegativity", np.min),
                lpd.get_from_label_func("local difference in NValence",
                                        np.std),
                lpd.get_from_label_func("local difference in NsUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NdUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.max),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.min),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.mean),
                lpd.get_from_label_func("local difference in NUnfilled",
                                        np.std),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.max),
                lpd.get_from_label_func("local difference in GSvolume_pa",
                                        np.min),
                lpd.get_from_label_func("local difference in SpaceGroupNumber",
                                        np.max),
            ]
    # Load the regressor and its matching scaler from the directory that
    # contains this module; file names encode the two selector arguments.
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}.joblib",
            "rb") as fr:
        model: RandomForestRegressor = joblib.load(fr)
    with open(
            f"{os.path.dirname(__file__)}/{dielectric_type}_{model_type}_scaler.joblib",
            "rb") as fr:
        scaler: StandardScaler = joblib.load(fr)
    # The scaler and model are given a single-row batch, hence the wrapping
    # list and the [0] on the prediction.
    descriptor = scaler.transform([descriptor])
    return model.predict(descriptor)[0]
Beispiel #12
0
    def from_preset(preset, **kwargs):
        """
        Create a SiteStatsFingerprint class according to a preset

        Args:
            preset (str) - Name of preset
            kwargs - Options for SiteStatsFingerprint. They are forwarded for
                the CrystalNNFingerprint_*, OPSiteFingerprint and fallback
                coordination-number presets only; the "ward-prb-2017" and
                "dejong2016" presets fix their own ``stats`` and do not
                forward kwargs.

        Returns:
            SiteStatsFingerprint configured for the requested preset.

        Raises:
            ValueError: if the preset name is not one of the explicit presets
                and building a CoordinationNumber preset from it fails. The
                underlying error is attached as ``__cause__``.
        """

        if preset == "CrystalNNFingerprint_cn":
            return SiteStatsFingerprint(
                CrystalNNFingerprint.from_preset("cn", cation_anion=False),
                **kwargs)

        elif preset == "CrystalNNFingerprint_cn_cation_anion":
            return SiteStatsFingerprint(
                CrystalNNFingerprint.from_preset("cn", cation_anion=True),
                **kwargs)

        elif preset == "CrystalNNFingerprint_ops":
            return SiteStatsFingerprint(
                CrystalNNFingerprint.from_preset("ops", cation_anion=False),
                **kwargs)

        elif preset == "CrystalNNFingerprint_ops_cation_anion":
            return SiteStatsFingerprint(
                CrystalNNFingerprint.from_preset("ops", cation_anion=True),
                **kwargs)

        elif preset == "OPSiteFingerprint":
            return SiteStatsFingerprint(OPSiteFingerprint(), **kwargs)

        elif preset == "LocalPropertyDifference_ward-prb-2017":
            return SiteStatsFingerprint(
                LocalPropertyDifference.from_preset("ward-prb-2017"),
                stats=["minimum", "maximum", "range", "mean", "avg_dev"])

        elif preset == "CoordinationNumber_ward-prb-2017":
            return SiteStatsFingerprint(
                CoordinationNumber(nn=VoronoiNN(weight='area'),
                                   use_weights="effective"),
                stats=["minimum", "maximum", "range", "mean", "avg_dev"])

        elif preset == "Composition-dejong2016_AD":
            return SiteStatsFingerprint(
                LocalPropertyDifference(properties=[
                    "Number", "AtomicWeight", "Column", "Row",
                    "CovalentRadius", "Electronegativity"
                ],
                                        signed=False),
                stats=['holder_mean::%d' % d
                       for d in range(0, 4 + 1)] + ['std_dev'],
            )

        elif preset == "Composition-dejong2016_SD":
            return SiteStatsFingerprint(
                LocalPropertyDifference(properties=[
                    "Number", "AtomicWeight", "Column", "Row",
                    "CovalentRadius", "Electronegativity"
                ],
                                        signed=True),
                stats=['holder_mean::%d' % d for d in [1, 2, 4]] + ['std_dev'],
            )

        elif preset == "BondLength-dejong2016":
            return SiteStatsFingerprint(
                AverageBondLength(VoronoiNN()),
                stats=['holder_mean::%d' % d for d in range(-4, 4 + 1)] +
                ['std_dev', 'geom_std_dev'])

        elif preset == "BondAngle-dejong2016":
            return SiteStatsFingerprint(
                AverageBondAngle(VoronoiNN()),
                stats=['holder_mean::%d' % d for d in range(-4, 4 + 1)] +
                ['std_dev', 'geom_std_dev'])

        # TODO: Why assume coordination number? Should this just raise an error? - lw
        # One of the various Coordination Number presets:
        # MinimumVIRENN, MinimumDistanceNN, JmolNN, VoronoiNN, etc.
        try:
            return SiteStatsFingerprint(
                CoordinationNumber.from_preset(preset), **kwargs)
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still propagate, and chain the
            # original error so the real failure is not lost.
            raise ValueError("Unrecognized preset!") from err
Beispiel #13
0
 def test_op_site_fingerprint(self):
     # Checks the label list of a default OPSiteFingerprint, then spot-checks
     # single order-parameter values (truncated/rounded to three decimals)
     # for the self.sc and self.cscl fixtures.
     fingerprint = OPSiteFingerprint()
     labels = fingerprint.feature_labels()
     expected_labels = [
         "sgl_bd CN_1", "bent180 CN_2", "bent45 CN_2", "bent90 CN_2",
         "bent135 CN_2", "tri_plan CN_3", "tet CN_3", "T CN_3",
         "sq_plan CN_4", "sq CN_4", "tet CN_4", "see_saw CN_4",
         "tri_pyr CN_4", "pent_plan CN_5", "sq_pyr CN_5",
         "tri_bipyr CN_5", "oct CN_6", "pent_pyr CN_6", "hex_pyr CN_7",
         "pent_bipyr CN_7", "bcc CN_8", "hex_bipyr CN_8",
         "q2 CN_9", "q4 CN_9", "q6 CN_9",
         "q2 CN_10", "q4 CN_10", "q6 CN_10",
         "q2 CN_11", "q4 CN_11", "q6 CN_11",
         "cuboct CN_12", "q2 CN_12", "q4 CN_12", "q6 CN_12",
     ]
     for position, label in enumerate(labels):
         self.assertEqual(label, expected_labels[position])

     # Site 0 of the self.sc fixture: octahedral parameter.
     values = fingerprint.featurize(self.sc, 0)
     self.assertEqual(len(values), 35)
     oct_index = fingerprint.feature_labels().index('oct CN_6')
     self.assertAlmostEqual(int(1000 * values[oct_index]), 999)

     # Site 0 of the self.cscl fixture: bcc parameter with default settings.
     values = fingerprint.featurize(self.cscl, 0)
     bcc_index = fingerprint.feature_labels().index('bcc CN_8')
     self.assertAlmostEqual(int(1000 * values[bcc_index] + 0.5), 895)

     # Same site with dist_exp=0.
     fingerprint = OPSiteFingerprint(dist_exp=0)
     values = fingerprint.featurize(self.cscl, 0)
     bcc_index = fingerprint.feature_labels().index('bcc CN_8')
     self.assertAlmostEqual(int(1000 * values[bcc_index] + 0.5), 955)
 def test_op_site_fingerprint(self):
     # Checks the label list of a default OPSiteFingerprint, then compares
     # single order-parameter values for the self.sc and self.cscl fixtures
     # against reference numbers to seven decimal places.
     fingerprint = OPSiteFingerprint()
     labels = fingerprint.feature_labels()
     expected_labels = [
         'sgl_bd CN_1', 'L-shaped CN_2', 'water-like CN_2',
         'bent 120 degrees CN_2', 'bent 150 degrees CN_2',
         'linear CN_2', 'trigonal planar CN_3',
         'trigonal non-coplanar CN_3', 'T-shaped CN_3',
         'square co-planar CN_4', 'tetrahedral CN_4',
         'rectangular see-saw-like CN_4', 'see-saw-like CN_4',
         'trigonal pyramidal CN_4', 'pentagonal planar CN_5',
         'square pyramidal CN_5', 'trigonal bipyramidal CN_5',
         'hexagonal planar CN_6', 'octahedral CN_6',
         'pentagonal pyramidal CN_6', 'hexagonal pyramidal CN_7',
         'pentagonal bipyramidal CN_7', 'body-centered cubic CN_8',
         'hexagonal bipyramidal CN_8', 'q2 CN_9', 'q4 CN_9', 'q6 CN_9',
         'q2 CN_10', 'q4 CN_10', 'q6 CN_10',
         'q2 CN_11', 'q4 CN_11', 'q6 CN_11',
         'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12',
     ]
     for position, label in enumerate(labels):
         self.assertEqual(label, expected_labels[position])

     # Site 0 of the self.sc fixture: octahedral parameter.
     values = fingerprint.featurize(self.sc, 0)
     self.assertEqual(len(values), 37)
     oct_index = fingerprint.feature_labels().index('octahedral CN_6')
     self.assertAlmostEqual(values[oct_index], 0.9995, places=7)

     # Site 0 of the self.cscl fixture: bcc parameter with default settings.
     values = fingerprint.featurize(self.cscl, 0)
     bcc_index = fingerprint.feature_labels().index(
         'body-centered cubic CN_8')
     self.assertAlmostEqual(values[bcc_index], 0.8955, places=7)

     # Same site with dist_exp=0.
     fingerprint = OPSiteFingerprint(dist_exp=0)
     values = fingerprint.featurize(self.cscl, 0)
     bcc_index = fingerprint.feature_labels().index(
         'body-centered cubic CN_8')
     self.assertAlmostEqual(values[bcc_index], 0.9555, places=7)
Beispiel #15
0
def featurize_site(df: pd.DataFrame, site_stats=("mean", "std_dev")) -> pd.DataFrame:
    """Add matminer site-based features to a dataframe of structures.

    Each site featurizer in a fixed list is wrapped in a
    `matminer.featurizers.structure.SiteStatsFingerprint` using the
    requested statistics and applied to the structure column. New columns
    are prefixed with the featurizer's (possibly legacy) name, all-zero
    columns are removed, and the result is passed through `clean_df`.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.
        site_stats (Tuple[str]): the matminer site stats to use in the
            `SiteStatsFingerprint` for all features.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """

    logging.info("Applying site featurizers...")

    # Work on a copy and mark every pre-existing column as input data.
    df = df.copy()
    df.columns = ["Input data|" + x for x in df.columns]

    # Prefixes kept for backwards compatibility with pretrained models.
    legacy_prefixes = {
        "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
        "AGNIFingerprints": "AGNIFingerPrint",
        "BondOrientationalParameter": "BondOrientationParameter",
        "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
    }

    site_fingerprints = (
        AGNIFingerprints(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        OPSiteFingerprint(),
        CrystalNNFingerprint.from_preset("ops"),
        VoronoiFingerprint(),
        GaussianSymmFunc(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        LocalPropertyDifference(),
        BondOrientationalParameter(),
        AverageBondLength(VoronoiNN()),
        AverageBondAngle(VoronoiNN())
    )

    for featurizer in site_fingerprints:
        stats_wrapper = SiteStatsFingerprint(featurizer, stats=site_stats)
        df = stats_wrapper.featurize_dataframe(
            df,
            "Input data|structure",
            multiindex=False,
            ignore_errors=True
        )

        class_name = featurizer.__class__.__name__
        prefix = legacy_prefixes.get(class_name, class_name)
        # A prefix that already carries a separator is used verbatim.
        if "|" not in prefix:
            prefix += "|"

        # Only columns without a separator are new and still need a prefix.
        df.columns = [
            col if "|" in col else f"{prefix}{col}" for col in df.columns
        ]

    # Discard columns that are zero everywhere.
    has_nonzero = (df != 0).any(axis=0)
    df = df.loc[:, has_nonzero]

    return clean_df(df)
Beispiel #16
0
def get_op_site_features(s, site_idx):
    """Return the default OPSiteFingerprint features of one site as a list.

    Args:
        s: structure handed to ``OPSiteFingerprint.featurize``.
        site_idx: index of the site to featurize.

    Returns:
        The featurizer output converted with ``.tolist()``.
    """
    return OPSiteFingerprint().featurize(s, site_idx).tolist()
Beispiel #17
0
class OPStructureFingerprint(BaseFeaturizer):
    """
    Calculates all order parameters (OPs) for all sites in a crystal
    structure.
    Args:
        op_site_fp (OPSiteFingerprint): defines the types of order
            parameters to be calculated.
        stats ([str]): list of weighted statistics to compute for each feature.
            If stats is None, for each order parameter, a list is returned that
            contains the calculated parameter for each site in the structure.
            *Note for nth mode, stat must be 'n*_mode'; e.g. stat='2nd_mode'
            (only the first character is read as the mode number).
        min_oxi (int): minimum site oxidation state for inclusion (e.g.,
            zero means metals/cations only)
        max_oxi (int): maximum site oxidation state for inclusion
    """
    def __init__(self, op_site_fp=None, stats=('mean', 'std_dev', 'minimum',
                                               'maximum'), min_oxi=None,
                 max_oxi=None):

        self.op_site_fp = OPSiteFingerprint() if op_site_fp is None \
            else op_site_fp
        self._labels = self.op_site_fp.feature_labels()
        # A single stat name may be passed as a bare string.
        self.stats = (stats,) if isinstance(stats, str) else stats

        # Highest mode order requested by any 'n*_mode' stat; nmodes is only
        # defined when at least one such stat is present.
        mode_orders = [int(stat[0]) for stat in (self.stats or ())
                       if '_mode' in stat]
        if mode_orders:
            self.nmodes = max(0, *mode_orders)

        self.min_oxi = min_oxi
        self.max_oxi = max_oxi

    def featurize(self, s):
        """
        Calculate all sites' local structure order parameters (LSOPs).

        Only sites whose oxidation state lies within
        [min_oxi, max_oxi] are included; a bound that is None is not applied.

        Args:
            s: Pymatgen Structure object.

        Returns:
            If ``stats`` is set: a flat list holding, for every order
            parameter in turn, each requested statistic over the included
            sites (same ordering as ``feature_labels``).
            Otherwise: a list with one entry per order parameter, each being
            the list of that parameter's values over the included sites.
            A value of None coming from the site fingerprint is stored as 0.0.
        """
        opvals = [[] for _ in self._labels]
        for i, site in enumerate(s.sites):
            # oxi_state is only read when a bound is actually configured.
            above_min = (self.min_oxi is None
                         or site.specie.oxi_state >= self.min_oxi)
            below_max = (self.max_oxi is None
                         or site.specie.oxi_state <= self.max_oxi)
            if not (above_min and below_max):
                continue
            for j, opval in enumerate(self.op_site_fp.featurize(s, i)):
                opvals[j].append(0.0 if opval is None else opval)

        if not self.stats:
            return opvals

        wants_modes = any('_mode' in stat for stat in self.stats)
        opstats = []
        for op in opvals:
            # Modes are computed once per order parameter and shared by all
            # 'n*_mode' stats.
            modes = self.n_numerical_modes(op, self.nmodes, 0.01) \
                if wants_modes else None
            for stat in self.stats:
                if '_mode' in stat:
                    opstats.append(modes[int(stat[0]) - 1])
                else:
                    opstats.append(PropertyStats().calc_stat(op, stat))

        return opstats

    def feature_labels(self):
        """
        Returns:
            ([str]): '<stat> <order parameter label>' for every order
            parameter and stat when ``stats`` is set, otherwise the order
            parameter labels of the underlying site fingerprint.
        """
        if self.stats:
            return ['%s %s' % (stat, attr)
                    for attr in self._labels
                    for stat in self.stats]
        return self._labels

    def citations(self):
        return ('@article{zimmermann_jain_2017, title={Applications of order'
                ' parameter feature vectors}, journal={in progress}, author={'
                'Zimmermann, N. E. R. and Jain, A.}, year={2017}}')

    def implementors(self):
        return (['Nils E. R. Zimmermann', 'Alireza Faghaninia', 'Anubhav Jain'])

    @staticmethod
    def n_numerical_modes(data_lst, n=2, dl=0.1):
        """
        Returns the n first modes of a data set that are obtained with
            a finite bin size for the underlying frequency distribution.
        Args:
            data_lst ([float]): data values.
            n (integer): number of most frequent elements to be determined.
            dl (float): bin size of underlying (coarsened) distribution.
        Returns:
            ([float]): first n most frequent entries (or nan if not found).
                For empty input, n NaN values are returned.
        """
        # No data at all (e.g. every site filtered out): nothing to bin.
        if len(data_lst) == 0:
            return [float('NaN') for _ in range(n)]
        # A single distinct value is its own (only) mode.
        if len(set(data_lst)) == 1:
            return [data_lst[0]] + [float('NaN') for _ in range(n-1)]
        hist, bins = np.histogram(data_lst, bins=np.arange(
                min(data_lst), max(data_lst), dl), density=False)
        # Left bin edges of the n fullest bins, most populated first.
        modes = list(bins[np.argsort(hist)[-n:]][::-1])
        return modes + [float('NaN') for _ in range(n-len(modes))]