Example #1
0
 def test_sine_coulomb_matrix(self):
     """Sine Coulomb matrix of the diamond fixture matches reference values."""
     reference = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])

     # Default featurizer: the whole matrix is compared to the reference.
     full_matrix = SineCoulombMatrix().featurize(self.diamond)
     deviation = np.linalg.norm(full_matrix - reference)
     self.assertAlmostEqual(deviation, 0.0, places=4)

     # Featurizer built with a positional False: the diagonal must be zero.
     hollow_matrix = SineCoulombMatrix(False).featurize(self.diamond)[0]
     for site in range(2):
         self.assertEqual(hollow_matrix[site][site], 0)
Example #2
0
    def test_sine_coulomb_matrix(self):
        """Exercise SineCoulombMatrix in flattened and full-matrix modes."""
        # --- flattened output, through the dataframe API ---
        flat_scm = SineCoulombMatrix(flatten=True)
        frame = pd.DataFrame({"s": [self.sc, self.ni3al]})
        # Featurizing before fitting has to be rejected.
        with self.assertRaises(NotFittedError):
            frame = flat_scm.featurize_dataframe(frame, "s")
        frame = flat_scm.fit_featurize_dataframe(frame, "s")
        columns = flat_scm.feature_labels()
        self.assertEqual(columns[0], "sine coulomb matrix eig 0")
        expected_rows = (
            [235.740418, 0.0, 0.0, 0.0],
            [232.578562, 1656.288171, 1403.106576, 1403.106576],
        )
        for row_idx, expected in enumerate(expected_rows):
            self.assertArrayAlmostEqual(
                frame[columns].iloc[row_idx], expected, decimal=5)

        # --- full matrix output ---
        reference = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])
        full_matrix = SineCoulombMatrix(flatten=False).featurize(self.diamond)
        deviation = np.linalg.norm(full_matrix - reference)
        self.assertAlmostEqual(deviation, 0.0, places=4)

        # With diagonal elements switched off the diagonal must be zero.
        hollow_scm = SineCoulombMatrix(diag_elems=False, flatten=False)
        hollow_matrix = hollow_scm.featurize(self.diamond)[0]
        for site in range(2):
            self.assertEqual(hollow_matrix[site][site], 0)
Example #3
0
 def __init__(self,
              coulomb_matrix=SineCoulombMatrix(flatten=False),
              token=' - '):
     """Store the configuration; the fitted state starts out empty.

     Args:
         coulomb_matrix: featurizer object kept on ``self.coulomb_matrix``.
         token (str): string kept on ``self.token``.
     """
     # NOTE(review): the default featurizer is created once, when this
     # function is defined, so every instance relying on the default
     # shares that single object.
     self.bag_lens = self.ordered_bonds = None
     self.token = token
     self.coulomb_matrix = coulomb_matrix
Example #4
0
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    The radial distribution function column is expanded into one scalar
    column per distance bin (first 50 bins of the first row), and the
    categorical symmetry columns are converted to integers.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """

    logging.info("Applying structure featurizers...")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    structure_features = [
         DensityFeatures(),
         GlobalSymmetryFeatures(),
         RadialDistributionFunction(),
         CoulombMatrix(),
         PartialRadialDistributionFunction(),
         SineCoulombMatrix(),
         EwaldEnergy(),
         BondFractions(),
         StructuralHeterogeneity(),
         MaximumPackingEfficiency(),
         ChemicalOrdering(),
         XRDPowderPattern(),
         BagofBonds()
    ]

    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Collapse the (featurizer, feature) column MultiIndex into "a|b" names.
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_column = "RadialDistributionFunction|radial distribution function"
    # Positional access: the frame's index is not guaranteed to hold label 0.
    dist = df[rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        # Bind `i` explicitly so the lambda does not depend on the loop variable.
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_column, axis=1)

    # NOTE(review): the key "orthorombic" looks misspelled; if the featurizer
    # reports "orthorhombic" those rows map to NaN. Left unchanged because
    # downstream models may depend on the current encoding -- confirm.
    _crystal_system = {
        "cubic": 1, "tetragonal": 2, "orthorombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    # Featurization runs with ignore_errors=True, so missing values can occur
    # here; int(nan) would raise, hence they are encoded as 0.
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(
        lambda flag: 0 if pd.isna(flag) else int(flag))

    return clean_df(df)
  def _featurize(self, struct):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html

    Returns
    -------
    features: np.ndarray
      2D sine Coulomb matrix with shape (max_atoms, max_atoms),
      or 1D matrix eigenvalues with shape (max_atoms,).

    """

    # Imported lazily so the optional dependencies are only needed on use.
    from pymatgen.core.structure import Structure
    from matminer.featurizers.structure import SineCoulombMatrix as SCM

    s = Structure.from_dict(struct)

    # Get full N x N SCM; the featurizer wraps it in a one-element sequence
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(s)

    if self.flatten:
      # eig operates on the stacked input, so the eigenvalues come back
      # with a leading length-1 axis; take the single row before padding.
      eigs, _ = np.linalg.eig(sine_mat)
      eigs = eigs[0]
      zeros = np.zeros((self.max_atoms,))
      zeros[:len(eigs)] = eigs
      features = zeros
    else:
      features = pad_array(sine_mat, self.max_atoms)

    features = np.asarray(features)

    return features
    def _featurize(self, struct: "pymatgen.Structure"):
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        struct : pymatgen.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ValueError
          If matminer is not installed.
        """

        try:
            from matminer.featurizers.structure import SineCoulombMatrix as SCM
        except ModuleNotFoundError as err:
            raise ValueError(
                "This class requires matminer to be installed.") from err

        # Get full N x N SCM; the featurizer wraps it in a one-element sequence
        scm = SCM(flatten=False)
        sine_mat = scm.featurize(struct)

        if self.flatten:
            # eig operates on the stacked input, so the eigenvalues come back
            # with a leading length-1 axis; take the single row and pad it
            # into a 1D vector of length max_atoms.
            eigs, _ = np.linalg.eig(sine_mat)
            eigs = eigs[0]
            zeros = np.zeros(self.max_atoms)
            zeros[:len(eigs)] = eigs
            features = zeros
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
Example #7
0
 def test_sine_coulomb_matrix(self):
     """Sine Coulomb matrix of the diamond fixture matches reference values."""
     target = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])

     # Matrix from a default-constructed featurizer versus the reference.
     computed = SineCoulombMatrix().featurize(self.diamond)
     self.assertAlmostEqual(
         np.linalg.norm(computed - target), 0.0, places=4)

     # Matrix from a featurizer built with positional False: zero diagonal.
     no_diag = SineCoulombMatrix(False).featurize(self.diamond)[0]
     for k in (0, 1):
         self.assertEqual(no_diag[k][k], 0)
Example #8
0
    def __init__(self, max_atoms: int = 100, flatten: bool = True):
        """
        Configure the featurizer and build the wrapped matminer object.

        Parameters
        ----------
        max_atoms: int (default 100)
          Maximum number of atoms for any crystal in the dataset. Used to
          pad the Coulomb matrix.
        flatten: bool (default True)
          Return flattened vector of matrix eigenvalues.

        Raises
        ------
        ValueError
          If matminer cannot be imported.
        """
        try:
            from matminer.featurizers.structure import (
                SineCoulombMatrix as MatminerSCM)
        except ModuleNotFoundError:
            raise ValueError("This class requires matminer to be installed.")

        self.max_atoms, self.flatten = max_atoms, flatten
        # The wrapped featurizer is always built with flatten=False.
        self.scm = MatminerSCM(flatten=False)
    def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        datapoint: pymatgen.core.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.
        **kwargs
          Only the deprecated ``struct`` keyword is read; it is used as the
          structure when ``datapoint`` is None.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ImportError
          If the wrapped featurizer has to be created and matminer is not
          installed.
        """
        if 'struct' in kwargs and datapoint is None:
            import warnings

            datapoint = kwargs.get("struct")
            # Warn instead of raising: raising would abort the call and make
            # the fallback assignment above pointless.
            warnings.warn(
                'Struct is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        if self.scm is None:
            # Create the wrapped matminer featurizer lazily on first use.
            try:
                from matminer.featurizers.structure import SineCoulombMatrix as SCM
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires matminer to be installed.") from err
            self.scm = SCM(flatten=False)

        # Get full N x N SCM
        sine_mat = self.scm.featurize(datapoint)

        if self.flatten:
            # The eigenvalues carry a leading axis; row 0 holds the values
            # that are zero-padded up to max_atoms.
            eigs, _ = np.linalg.eig(sine_mat)
            zeros = np.zeros(self.max_atoms)
            zeros[:len(eigs[0])] = eigs[0]
            features = zeros
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
Example #10
0
    def test_bob(self):
        """Check BagofBonds values, labels, padding and error handling."""

        # Test a single fit and featurization
        scm = SineCoulombMatrix(flatten=False)
        bob = BagofBonds(coulomb_matrix=scm, token=' - ')
        bob.fit([self.ni3al])
        truth1 = [
            235.74041833262768, 1486.4464890775491, 1486.4464890775491,
            1486.4464890775491, 38.69353092306119, 38.69353092306119,
            38.69353092306119, 38.69353092306119, 38.69353092306119,
            38.69353092306119, 83.33991275736257, 83.33991275736257,
            83.33991275736257, 83.33991275736257, 83.33991275736257,
            83.33991275736257
        ]
        truth1_labels = [
            'Al site #0', 'Ni site #0', 'Ni site #1', 'Ni site #2',
            'Al - Ni bond #0', 'Al - Ni bond #1', 'Al - Ni bond #2',
            'Al - Ni bond #3', 'Al - Ni bond #4', 'Al - Ni bond #5',
            'Ni - Ni bond #0', 'Ni - Ni bond #1', 'Ni - Ni bond #2',
            'Ni - Ni bond #3', 'Ni - Ni bond #4', 'Ni - Ni bond #5'
        ]
        self.assertArrayAlmostEqual(bob.featurize(self.ni3al), truth1)
        self.assertEqual(bob.feature_labels(), truth1_labels)

        # Test padding from fitting and dataframe featurization
        bob.coulomb_matrix = CoulombMatrix(flatten=False)
        bob.fit([self.ni3al, self.cscl, self.diamond_no_oxi])
        df = pd.DataFrame({'structures': [self.cscl]})
        df = bob.featurize_dataframe(df, 'structures')
        self.assertEqual(len(df.columns.values), 25)
        self.assertAlmostEqual(df['Cs+ site #0'][0], 7513.468312122532)
        self.assertAlmostEqual(df['Al site #0'][0], 0.0)
        self.assertAlmostEqual(df['Cs+ - Cl- bond #1'][0], 135.74726437398044,
                               3)
        self.assertAlmostEqual(df['Al - Ni bond #0'][0], 0.0)

        # Test error handling for bad fits or null fits
        bob = BagofBonds(CoulombMatrix(flatten=False))
        self.assertRaises(NotFittedError, bob.featurize, self.nacl)
        bob.fit([self.ni3al, self.diamond])
        self.assertRaises(ValueError, bob.featurize, self.nacl)
Example #11
0
class DeBreuck2020Featurizer(modnet.featurizers.MODFeaturizer):
    """ Featurizer presets used for the paper 'Machine learning
    materials properties for small datasets' by Pierre-Paul De Breuck,
    Geoffroy Hautier & Gian-Marco Rignanese, arXiv:2004.14766 (2020).

    Uses most of the featurizers implemented by matminer at the time of
    writing with their default hyperparameters and presets.

    The featurizer instances are stored as class-level tuples
    (``composition_featurizers``, ``oxide_composition_featurizers``,
    ``structure_featurizers``, ``site_featurizers``); the ``featurize_*``
    methods post-process the frames returned by the parent class.

    """
    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        # CohesiveEnergy, - This descriptor was not used in the paper preset
        # ElectronAffinity, - This descriptor was not used in the paper preset
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        # BagofBonds, - This descriptor was not used in the paper preset
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        # PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxide_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        # PartialRadialDistributionFunction(),
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
        # BagofBonds(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    def featurize_composition(self, df):
        """ Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Orbital characters (s/p/d/f) are encoded as 1-4, HOMO/LUMO element
        symbols are replaced by their atomic number (-1 for non-string
        entries), and inf/NaN values are replaced by 0.

        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df['AtomicOrbitals|HOMO_character'] = df[
            'AtomicOrbitals|HOMO_character'].map(_orbitals)
        df['AtomicOrbitals|LUMO_character'] = df[
            'AtomicOrbitals|LUMO_character'].map(_orbitals)

        df['AtomicOrbitals|HOMO_element'] = df[
            'AtomicOrbitals|HOMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)
        df['AtomicOrbitals|LUMO_element'] = df[
            'AtomicOrbitals|LUMO_element'].apply(
                lambda x: -1 if not isinstance(x, str) else Element(x).Z)

        df = df.replace([np.inf, -np.inf, np.nan], 0)

        return modnet.featurizers.clean_df(df)

    def featurize_structure(self, df):
        """ Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one column
        per distance bin (first 50 bins of the first row), the crystal system
        is encoded as an integer and the centrosymmetry flag as 0/1.

        """
        df = super().featurize_structure(df)

        dist = df[
            "RadialDistributionFunction|radial distribution function"].iloc[0][
                'distances'][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d)
            # The lambda runs immediately inside apply(), so it sees the
            # current value of `i`.
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"].apply(
                    lambda x: x['distribution'][i])

        df = df.drop("RadialDistributionFunction|radial distribution function",
                     axis=1)

        # NOTE(review): the key "orthorombic" looks misspelled; if the
        # featurizer reports "orthorhombic" those rows map to NaN. Left
        # unchanged because pretrained models may rely on it -- confirm.
        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7
        }

        def _int_map(x):
            """Encode a truthy flag as 1; falsy or missing (NaN) as 0."""
            # NaN never compares equal to anything (not even itself) and is
            # truthy, so it has to be detected explicitly.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"].map(_int_map)

        return modnet.featurizers.clean_df(df)

    def featurize_site(self, df):
        """ Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df)
Example #12
0
    def test_sine_coulomb_matrix(self):
        """Exercise SineCoulombMatrix in flattened and full-matrix modes."""
        # Flattened eigenvalue features, obtained through the dataframe API.
        eig_featurizer = SineCoulombMatrix(flatten=True)
        table = pd.DataFrame({"s": [self.sc, self.ni3al]})
        # An unfitted featurizer must refuse to featurize.
        with self.assertRaises(NotFittedError):
            table = eig_featurizer.featurize_dataframe(table, "s")
        table = eig_featurizer.fit_featurize_dataframe(table, "s")
        names = eig_featurizer.feature_labels()
        self.assertEqual(names[0], "sine coulomb matrix eig 0")
        first_row = [235.740418, 0.0, 0.0, 0.0]
        second_row = [232.578562, 1656.288171, 1403.106576, 1403.106576]
        self.assertArrayAlmostEqual(
            table[names].iloc[0], first_row, decimal=5)
        self.assertArrayAlmostEqual(
            table[names].iloc[1], second_row, decimal=5)

        # Full matrix for the diamond fixture against reference values.
        target = np.array([[36.8581, 6.147068], [6.147068, 36.8581]])
        computed = SineCoulombMatrix(flatten=False).featurize(self.diamond)
        self.assertAlmostEqual(
            np.linalg.norm(computed - target), 0.0, places=4)

        # Without diagonal elements both diagonal entries must be zero.
        no_diag_featurizer = SineCoulombMatrix(diag_elems=False, flatten=False)
        no_diag = no_diag_featurizer.featurize(self.diamond)[0]
        for k in (0, 1):
            self.assertEqual(no_diag[k][k], 0)
Example #13
0
class SineCoulombMatrix(MaterialStructureFeaturizer):
    """
    Sine Coulomb matrix featurizer for periodic crystals.

    The sine Coulomb matrix is a Coulomb-matrix variant for crystals: it
    is identical to the Coulomb matrix, except that the inverse distance
    function is replaced by the inverse of sin**2 of the vector between
    sites which are periodic in the dimensions of the crystal lattice.

    By default the matrix is reduced to a vector of its eigenvalues so
    that it can be fed to ML models directly. All feature vectors must
    have equal length, so the maximum number of atoms (eigenvalues) in
    the input dataset has to be given.

    The optional dependencies pymatgen and matminer are required. The
    featurizer may be useful when crystal structures with 3D coordinates
    are available.

    See [1]_ for more details.

    References
    ----------
    .. [1] Faber et al. Inter. J. Quantum Chem. 115, 16, 2015.

    Examples
    --------
    >>> import pymatgen as mg
    >>> lattice = mg.Lattice.cubic(4.2)
    >>> structure = mg.Structure(lattice, ["Cs", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
    >>> featurizer = SineCoulombMatrix(max_atoms=2)
    >>> features = featurizer.featurize([structure])

    Notes
    -----
    This class requires matminer and Pymatgen to be installed.
    """

    def __init__(self, max_atoms: int = 100, flatten: bool = True):
        """
        Configure the featurizer and build the wrapped matminer object.

        Parameters
        ----------
        max_atoms: int (default 100)
          Maximum number of atoms for any crystal in the dataset. Used to
          pad the Coulomb matrix.
        flatten: bool (default True)
          Return flattened vector of matrix eigenvalues.

        Raises
        ------
        ValueError
          If matminer cannot be imported.
        """
        try:
            from matminer.featurizers.structure import (
                SineCoulombMatrix as MatminerSCM)
        except ModuleNotFoundError:
            raise ValueError("This class requires matminer to be installed.")

        self.max_atoms, self.flatten = max_atoms, flatten
        # The wrapped featurizer is always built with flatten=False.
        self.scm = MatminerSCM(flatten=False)

    def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
        """
        Compute the sine Coulomb features of a single structure.

        Parameters
        ----------
        struct: pymatgen.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).
        """
        raw_matrix = self.scm.featurize(struct)

        if not self.flatten:
            # Matrix mode: padding is delegated to pad_array.
            return np.asarray(pad_array(raw_matrix, self.max_atoms))

        # Eigenvalue mode: row 0 of the eigenvalue output is copied into
        # the front of a zero vector of length max_atoms.
        eigenvalues = np.linalg.eig(raw_matrix)[0]
        padded = np.zeros(self.max_atoms)
        first_row = eigenvalues[0]
        padded[:len(first_row)] = first_row
        return np.asarray(padded)
        sizes = np.array([self.length(row) for row in X])
        y_pred = self.predict(X) / sizes
        y_true = y / sizes
        return sklearn.metrics.r2_score(y_true, y_pred)

    def length(self, vec):
        """Return how many entries of ``vec`` differ from zero."""
        nonzero_mask = vec != 0
        return len(vec[nonzero_mask])


# SCM evaluation
# Script section: featurizes `df` with the sine Coulomb matrix, reduces each
# matrix to its eigenvalues and creates the KRR model. `df`, `NJOBS`, `KrrScm`
# and `SineCoulombMatrix` are expected to be defined earlier in the file.
DIAG = True  # passed as the first positional argument of SineCoulombMatrix
print("DIAG ELEMS", DIAG)

# Featurize dataframe with sine coulomb matrix and time it
start = time.monotonic()
scm = SineCoulombMatrix(DIAG)
# Set the number of jobs for parallelization
scm.set_n_jobs(NJOBS)
df = scm.featurize_dataframe(df, 'structure')
# Take the eigenvalues of the SCMs to form vector descriptors
# (each vector is sorted ascending and then reversed, i.e. descending)
df['sine coulomb matrix'] = pd.Series(
    [np.sort(np.linalg.eigvals(s))[::-1] for s in df['sine coulomb matrix']],
    df.index)
finish = time.monotonic()
# The reported time covers both the featurization and the eigenvalue step.
print("TIME TO FEATURIZE SCM %f SECONDS" % (finish - start))
print()

# Set up KRR model
krr = KrrScm()
# Show which hyperparameter names the model exposes.
print(krr.get_params().keys())
# Script section: loads precomputed property arrays, builds a dataframe of
# structures and featurizes it with matminer's SineCoulombMatrix.
# `centrosymmetric_structures` and `task_ids` are expected to be defined
# earlier in the file.
# NOTE(review): the paths are absolute and user-specific, so this only runs
# on the original author's machine.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files from a trusted source.
centro_elastic_compliance=np.load('/Users/dennistrujillo/Dropbox/mp_share_dt_ag/elasticity_compliance/centro_elasticity.npy',allow_pickle=True)
centro_dielectric_tensor=np.load('/Users/dennistrujillo/Dropbox/mp_share_dt_ag/dielectric_total/centro_diel.npy',allow_pickle=True)

# One row per structure, paired with its id.
data = {'structures': centrosymmetric_structures, 'ids' : task_ids}
df = pd.DataFrame(data)


# SOAP representation (disabled alternative featurizer)
#from matminer.featurizers.structure import SOAP
#soap = SOAP(periodic=True)
#soap=soap.fit(data['structures'])
#labels=soap.feature_labels()
#df = soap.featurize_dataframe(df,'structures')

# Sine Coulomb matrix features: fit on the structures, then add the
# feature columns to the dataframe.
from matminer.featurizers.structure import SineCoulombMatrix
sine_coulomb = SineCoulombMatrix()
sine_coulomb.set_n_jobs(28)  # hard-coded number of parallel jobs
sine_coulomb.fit(centrosymmetric_structures)#data['structures'])
labels=sine_coulomb.feature_labels()
df  = sine_coulomb.featurize_dataframe(df, 'structures')#,ignore_errors=True)

# AGNI fingerprints (disabled alternative featurizer)
#from matminer.featurizers.site import AGNIFingerprints
#agni=AGNIFingerprints(directions=['x','y','z'])
#agni.set_n_jobs(28)
#labels=agni.feature_labels()
#df = agni.featurize(df['structures'],0)
#df  = agni.featurize_dataframe(df, ['structures', 'site'])#,ignore_errors=True)

# get s_vs_ep
# Accumulator list; it is filled by code outside this section.
ec_list=[]
    def score(self, X, y):
        """Return the R^2 score after dividing targets and predictions by
        the number of non-zero entries of the matching row of ``X``."""
        nonzero_counts = np.array(list(map(self.length, X)))
        scaled_prediction = self.predict(X) / nonzero_counts
        scaled_target = y / nonzero_counts
        return sklearn.metrics.r2_score(scaled_target, scaled_prediction)

    def length(self, vec):
        """Return how many entries of ``vec`` differ from zero."""
        nonzero_mask = vec != 0
        return len(vec[nonzero_mask])

# SCM evaluation
# Script section: featurizes `df` with the sine Coulomb matrix, reduces each
# matrix to its eigenvalues and creates the KRR model. `df`, `NJOBS`, `KrrScm`
# and `SineCoulombMatrix` are expected to be defined earlier in the file.
DIAG = True  # passed as the first positional argument of SineCoulombMatrix
print ("DIAG ELEMS", DIAG)

# Featurize dataframe with sine coulomb matrix and time it
start = time.monotonic()
scm = SineCoulombMatrix(DIAG)
# Set the number of jobs for parallelization
scm.set_n_jobs(NJOBS)
df = scm.featurize_dataframe(df, 'structure')
# Take the eigenvalues of the SCMs to form vector descriptors
# (each vector is sorted ascending and then reversed, i.e. descending)
df['sine coulomb matrix'] = pd.Series([np.sort(np.linalg.eigvals(s))[::-1] \
    for s in df['sine coulomb matrix']], df.index)
finish = time.monotonic()
# The reported time covers both the featurization and the eigenvalue step.
print ("TIME TO FEATURIZE SCM %f SECONDS" % (finish-start))
print()

# Set up KRR model
krr = KrrScm()
# Show which hyperparameter names the model exposes.
print(krr.get_params().keys())
# Initialize hyperparameter grid search
Example #17
0
class FUTURE_PROSPECTS_2021(featurizer.extendedMODFeaturizer):
    """Featurizer preset combining matminer composition, structure, site,
    DOS and band structure featurizers.

    The featurizer instances are stored as class-level tuples; the
    ``featurize_*`` methods post-process the frames returned by the parent
    class so that every remaining column is numeric.
    """

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        AtomicPackingEfficiency,
        BandCenter,
        CohesiveEnergy,
        ElectronAffinity,
        ElectronegativityDiff,
        ElementFraction,
        ElementProperty,
        IonProperty,
        Miedema,
        OxidationStates,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
        YangSolidSolution,
    )
    from matminer.featurizers.structure import (
        BagofBonds,
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        PartialRadialDistributionFunction,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )
    from matminer.featurizers.dos import (
        DOSFeaturizer,
        SiteDOS,
        Hybridization,
        DosAsymmetry,
    )
    from matminer.featurizers.bandstructure import (
        BandFeaturizer,
        BranchPointEnergy
    )

    composition_featurizers = (
        AtomicOrbitals(),
        AtomicPackingEfficiency(),
        BandCenter(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        IonProperty(),
        Miedema(),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
        YangSolidSolution(),
    )

    oxid_composition_featurizers = (
        ElectronegativityDiff(),
        OxidationStates(),
    )

    structure_featurizers = (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        RadialDistributionFunction(),
        CoulombMatrix(),
        #PartialRadialDistributionFunction(), #Introduces a large amount of features
        SineCoulombMatrix(),
        EwaldEnergy(),
        BondFractions(),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
        XRDPowderPattern(),
    )
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

    dos_featurizers = (
        DOSFeaturizer(),
        SiteDOS(),
        Hybridization()
    )

    band_featurizers = (
        BandFeaturizer(),
        BranchPointEnergy()
    )

    def __init__(self, n_jobs=None):
        """Store the requested number of parallel jobs on ``self._n_jobs``."""
        self._n_jobs = n_jobs

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Orbital characters (s/p/d/f) are encoded as 1-4 and HOMO/LUMO element
        symbols are replaced by their atomic number (-1 for non-string
        entries).
        """
        df = super().featurize_composition(df)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
        df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
            _orbitals
        )
        df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
            _orbitals
        )

        df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )
        df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
            lambda x: -1 if not isinstance(x, str) else Element(x).Z
        )

        return clean_df(df)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The radial distribution function column is expanded into one column
        per distance bin (first 50 bins of the first row), the crystal system
        is encoded as an integer and the centrosymmetry flag as 0/1.
        """
        df = super().featurize_structure(df)

        dist = df["RadialDistributionFunction|radial distribution function"].iloc[0][
            "distances"
        ][:50]
        for i, d in enumerate(dist):
            _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(
                d
            )
            # The lambda runs immediately inside apply(), so it sees the
            # current value of `i`.
            df[_rdf_key] = df[
                "RadialDistributionFunction|radial distribution function"
            ].apply(lambda x: x["distribution"][i])

        df = df.drop("RadialDistributionFunction|radial distribution function", axis=1)

        # NOTE(review): the key "orthorombic" looks misspelled; if the
        # featurizer reports "orthorhombic" those rows map to NaN. Left
        # unchanged because trained models may rely on it -- confirm.
        _crystal_system = {
            "cubic": 1,
            "tetragonal": 2,
            "orthorombic": 3,
            "hexagonal": 4,
            "trigonal": 5,
            "monoclinic": 6,
            "triclinic": 7,
        }

        def _int_map(x):
            """Encode a truthy flag as 1; falsy or missing (NaN) as 0."""
            # NaN never compares equal to anything (not even itself) and is
            # truthy, so it has to be detected explicitly.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return 0
            return 1 if x else 0

        df["GlobalSymmetryFeatures|crystal_system"] = df[
            "GlobalSymmetryFeatures|crystal_system"
        ].map(_crystal_system)
        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
            "GlobalSymmetryFeatures|is_centrosymmetric"
        ].map(_int_map)

        return clean_df(df)

    def featurize_dos(self, df):
        """Applies the preset DOS featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The VBM/CBM specie columns are one-hot encoded, the VBM/CBM orbital
        characters are encoded as 1-4, and the ";"-separated location
        columns are split into three float columns each. The raw ``dos``
        column is dropped.
        """

        df = super().featurize_dos(df)

        hotencodeColumns = ["DOSFeaturizer|vbm_specie_1", "DOSFeaturizer|cbm_specie_1"]

        one_hot = pd.get_dummies(df[hotencodeColumns])
        df = df.drop(hotencodeColumns, axis=1).join(one_hot)

        _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

        df["DOSFeaturizer|vbm_character_1"] = df[
            "DOSFeaturizer|vbm_character_1"
        ].map(_orbitals)
        df["DOSFeaturizer|cbm_character_1"] = df[
            "DOSFeaturizer|cbm_character_1"
        ].map(_orbitals)

        # Splitting one feature into several floating features
        # e.g. number;number;number into three columns
        splitColumns = ["DOSFeaturizer|cbm_location_1", "DOSFeaturizer|vbm_location_1"]

        for column in splitColumns:
            try:
                newColumns = df[column].str.split(";", n=2, expand=True)
                for i in range(3):
                    # Builtin float: the `np.float` alias no longer exists in
                    # current NumPy, and the resulting AttributeError used to
                    # be swallowed, silently skipping the split.
                    df[column + "_" + str(i)] = np.array(newColumns[i]).astype(float)
            except (KeyError, AttributeError, ValueError, TypeError):
                # Best effort: a missing column, non-string data, fewer than
                # three parts or unparsable numbers skip this column.
                continue
        df = df.drop(splitColumns, axis=1)
        df = df.drop(["dos"], axis=1)
        return clean_df(df)

    def featurize_bandstructure(self, df):
        """Applies the preset band structure featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        The ``is_gap_direct`` flag is encoded as 0/1 and the raw
        ``bandstructure`` column is dropped.
        """

        df = super().featurize_bandstructure(df)

        def _int_map(x):
            """Map "False"/"True" (by string form) to 0/1, anything else to None."""
            if str(x) == "False":
                return 0
            elif str(x) == "True":
                return 1
            return None

        df["BandFeaturizer|is_gap_direct"] = df[
            "BandFeaturizer|is_gap_direct"
        ].map(_int_map)

        df = df.drop(["bandstructure"], axis=1)

        return clean_df(df)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Columns that are zero in every row are dropped.
        """

        # Shortened/renamed column prefixes passed to the parent featurizer.
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
            "GaussianSymmFunc": "ChemEnvSiteFingerprint|GaussianSymmFunc",
        }
        df = super().featurize_site(df, aliases=aliases)
        df = df.loc[:, (df != 0).any(axis=0)]

        return clean_df(df)
Example #18
0
class SineCoulombMatrix(MaterialStructureFeaturizer):
    """
    Calculate sine Coulomb matrix for crystals.

    A variant of Coulomb matrix for periodic crystals.

    The sine Coulomb matrix is identical to the Coulomb matrix, except
    that the inverse distance function is replaced by the inverse of
    sin**2 of the vector between sites which are periodic in the
    dimensions of the crystal lattice.

    Features are flattened into a vector of matrix eigenvalues by default
    for ML-readiness. To ensure that all feature vectors are equal
    length, the maximum number of atoms (eigenvalues) in the input
    dataset must be specified.

    This featurizer requires the optional dependencies pymatgen and
    matminer. It may be useful when crystal structures with 3D coordinates
    are available.

    See [1]_ for more details.

    References
    ----------
    .. [1] Faber et al. "Crystal Structure Representations for Machine
           Learning Models of Formation Energies", Inter. J. Quantum Chem.
           115, 16, 2015. https://arxiv.org/abs/1503.07406

    Examples
    --------
    >>> import deepchem as dc
    >>> import pymatgen as mg
    >>> lattice = mg.core.Lattice.cubic(4.2)
    >>> structure = mg.core.Structure(lattice, ["Cs", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
    >>> featurizer = dc.feat.SineCoulombMatrix(max_atoms=2)
    >>> features = featurizer.featurize([structure])
    >>> type(features[0])
    <class 'numpy.ndarray'>
    >>> features[0].shape # (max_atoms,)
    (2,)


    Note
    ----
    This class requires matminer and Pymatgen to be installed.
    """

    def __init__(self, max_atoms: int = 100, flatten: bool = True):
        """
        Parameters
        ----------
        max_atoms: int (default 100)
            Maximum number of atoms for any crystal in the dataset. Used to
            pad the Coulomb matrix.
        flatten: bool (default True)
            Return flattened vector of matrix eigenvalues.
        """
        self.max_atoms = max_atoms
        self.flatten = flatten
        # matminer featurizer instance; created lazily on the first
        # ``_featurize`` call so that matminer is only imported when needed.
        self.scm: Any = None

    def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        datapoint: pymatgen.core.Structure
            A periodic crystal composed of a lattice and a sequence of atomic
            sites with 3D coordinates and elements.
        **kwargs
            Only the deprecated ``struct`` keyword is consulted: it is used
            as the structure when ``datapoint`` is ``None``.

        Returns
        -------
        features: np.ndarray
            2D sine Coulomb matrix with shape (max_atoms, max_atoms),
            or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ImportError
            If matminer is not installed.

        Warns
        -----
        DeprecationWarning
            If the structure is supplied through the ``struct`` keyword.
        """
        if 'struct' in kwargs and datapoint is None:
            import warnings

            datapoint = kwargs.get("struct")
            # Emit a warning rather than raising: raising here would make the
            # fallback assignment above unreachable in practice and break the
            # legacy keyword that is only "being phased out".
            warnings.warn(
                'Struct is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        if self.scm is None:
            try:
                from matminer.featurizers.structure import SineCoulombMatrix as SCM
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires matminer to be installed.") from err
            # Always request the unflattened matrix; flattening to eigenvalues
            # (with padding to ``max_atoms``) is done below.
            self.scm = SCM(flatten=False)

        # Get full N x N SCM
        sine_mat = self.scm.featurize(datapoint)

        if self.flatten:
            eigs, _ = np.linalg.eig(sine_mat)
            # NOTE(review): ``eigs[0]`` assumes the featurizer output is a
            # one-element batch wrapping the N x N matrix - confirm against
            # matminer.
            features = np.zeros(self.max_atoms)
            features[:len(eigs[0])] = eigs[0]
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
        sizes = np.array([self.length(row) for row in X])
        y_pred = self.predict(X) / sizes
        y_true = y / sizes
        return sklearn.metrics.r2_score(y_true, y_pred)

    def length(self, vec):
        """Return how many entries of ``vec`` are not equal to zero."""
        nonzero_entries = vec[vec != 0]
        return len(nonzero_entries)


# SCM evaluation
# DIAG is forwarded as the featurizer's ``diag_elems`` argument below.
DIAG = True
print("DIAG ELEMS", DIAG)

# Featurize dataframe with sine coulomb matrix and time it
# (wall-clock interval measured with the monotonic clock; it covers
# featurizer construction as well as the fit/featurize call).
start = time.monotonic()
# NOTE(review): ``diag_elems`` is not accepted by the SineCoulombMatrix class
# defined earlier in this file (its __init__ takes max_atoms and flatten) -
# presumably a different SineCoulombMatrix (e.g. matminer's) is imported for
# this script; confirm.
scm = SineCoulombMatrix(diag_elems=DIAG, flatten=True)
# Set the number of jobs for parallelization
scm.set_n_jobs(NJOBS)
# Rebinds ``df`` to the featurized dataframe built from the 'structure' column.
df = scm.fit_featurize_dataframe(df, 'structure')
# Disabled alternative: take the eigenvalues of the SCMs (sorted in
# descending order) to form vector descriptors by hand.
# df['sine coulomb matrix'] = pd.Series([np.sort(np.linalg.eigvals(s))[::-1]
#                                        for s in df['sine coulomb matrix']],
#                                       df.index)
finish = time.monotonic()
print("TIME TO FEATURIZE SCM %f SECONDS" % (finish - start))
print()

# Set up KRR model
krr = KrrScm()
# Show the names of the model's parameters.
print(krr.get_params().keys())