Beispiel #1
0
    def test_prdf(self):
        """Check selected PRDF values for diamond, CsCl and Ni3Al.

        ``featurize`` is expected to return a ``(prdf, distances)`` pair in
        which ``prdf`` is keyed by ``(element, element)`` tuples. The
        reference numbers were derived by performing the calculation in
        another code.

        Uses ``assertEqual`` / ``assertAlmostEqual`` rather than the
        ``assertEquals`` / ``assertAlmostEquals`` aliases, which are
        deprecated and no longer exist in Python 3.12+.
        """
        # Test a few peaks in diamond
        p, r = PartialRadialDistributionFunction().featurize(self.diamond)
        self.assertEqual(len(p), 1)
        self.assertEqual(p[('C', 'C')][int(round(1.4 / 0.1))], 0)
        self.assertAlmostEqual(p[('C', 'C')][int(round(1.5 / 0.1))],
                               1.324451676)
        self.assertAlmostEqual(r.max(), 19.9)
        self.assertAlmostEqual(p[('C', 'C')][int(round(19.9 / 0.1))],
                               0.07197902)

        # Test a few peaks in CsCl, make sure it gets all types correctly
        p, r = PartialRadialDistributionFunction().featurize(self.cscl,
                                                             cutoff=10)
        self.assertEqual(len(p), 4)
        self.assertAlmostEqual(r.max(), 9.9)
        self.assertAlmostEqual(p[('Cs', 'Cl')][int(round(3.6 / 0.1))],
                               0.477823197)
        self.assertAlmostEqual(p[('Cl', 'Cs')][int(round(3.6 / 0.1))],
                               0.477823197)
        self.assertAlmostEqual(p[('Cs', 'Cs')][int(round(3.6 / 0.1))], 0)

        # Do Ni3Al, make sure it captures the antisymmetry of Ni/Al sites:
        # the (Ni, Al) and (Al, Ni) entries are expected to differ.
        p, r = PartialRadialDistributionFunction().featurize(self.ni3al,
                                                             cutoff=10,
                                                             bin_size=0.5)
        self.assertEqual(len(p), 4)
        self.assertAlmostEqual(p[('Ni', 'Al')][int(round(2 / 0.5))],
                               0.125236677)
        self.assertAlmostEqual(p[('Al', 'Ni')][int(round(2 / 0.5))],
                               0.37571003)
        self.assertAlmostEqual(p[('Al', 'Al')][int(round(2 / 0.5))], 0)
Beispiel #2
0
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features.

    The radial distribution function column is expanded into one scalar
    column per distance bin (first 50 bins only), the crystal system is
    encoded as an integer, and the centrosymmetry flag is cast to `int`.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, passed through
            `clean_df` before being returned.

    """

    logging.info("Applying structure featurizers...")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    structure_features = [
         DensityFeatures(),
         GlobalSymmetryFeatures(),
         RadialDistributionFunction(),
         CoulombMatrix(),
         PartialRadialDistributionFunction(),
         SineCoulombMatrix(),
         EwaldEnergy(),
         BondFractions(),
         StructuralHeterogeneity(),
         MaximumPackingEfficiency(),
         ChemicalOrdering(),
         XRDPowderPattern(),
         BagofBonds()
    ]

    # Each featurizer is fitted on the structures first; the result of
    # `fit` is what gets handed to MultipleFeaturizer.
    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Flatten the (featurizer, label) MultiIndex into "featurizer|label".
    df.columns = df.columns.map('|'.join).str.strip('|')

    rdf_column = "RadialDistributionFunction|radial distribution function"

    # Use the first row *by position*: a label lookup (`[0]`) fails when the
    # incoming frame does not carry a row labelled 0 (e.g. after filtering).
    dist = df[rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "{}|d_{:.2f}".format(rdf_column, d)
        # Bind `i` explicitly so the lambda never depends on the loop
        # variable's later value.
        df[_rdf_key] = df[rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(rdf_column, axis=1)

    # Integer encoding of the crystal system. The key must be spelled
    # "orthorhombic"; the former "orthorombic" key never matched, so
    # orthorhombic structures were mapped to NaN.
    _crystal_system = {
        "cubic": 1, "tetragonal": 2, "orthorhombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(int)

    return clean_df(df)
Beispiel #3
0
    def test_prdf(self):
        """Exercise ``PartialRadialDistributionFunction`` end to end.

        Covers the raw ``compute_prdf`` output for diamond, CsCl and Ni3Al,
        element selection in ``fit``, the generated feature labels,
        ``featurize`` and ``featurize_dataframe``. The reference peak values
        were derived by performing the calculation in another code.
        """

        def bin_index(radius, width=0.1):
            # Index used to address the PRDF value for a given distance.
            return int(round(radius / width))

        def refit_elements(feat):
            # Fit on all three test structures and report the element set.
            feat.fit([self.diamond, self.cscl, self.ni3al])
            return set(feat.elements_)

        # --- Diamond: a single pair type with known peak heights
        radii, pair_rdf = PartialRadialDistributionFunction().compute_prdf(
            self.diamond)
        self.assertEqual(len(pair_rdf.values()), 1)
        self.assertAlmostEqual(pair_rdf[('C', 'C')][bin_index(1.4)], 0)
        self.assertAlmostEqual(pair_rdf[('C', 'C')][bin_index(1.5)],
                               1.32445167622)
        self.assertAlmostEqual(max(radii), 19.9)
        self.assertAlmostEqual(pair_rdf[('C', 'C')][bin_index(19.9)],
                               0.07197902)

        # --- CsCl: every ordered pair type must be present
        radii, pair_rdf = PartialRadialDistributionFunction(
            cutoff=10).compute_prdf(self.cscl)
        self.assertEqual(len(pair_rdf.values()), 4)
        self.assertAlmostEqual(max(radii), 9.9)
        cscl_expected = (
            (('Cs', 'Cl'), 0.477823197),
            (('Cl', 'Cs'), 0.477823197),
            (('Cs', 'Cs'), 0),
        )
        for pair, value in cscl_expected:
            self.assertAlmostEqual(pair_rdf[pair][bin_index(3.6)], value)

        # --- Ni3Al: the Ni/Al and Al/Ni entries are expected to differ
        radii, pair_rdf = PartialRadialDistributionFunction(
            cutoff=10, bin_size=0.5).compute_prdf(self.ni3al)
        self.assertEqual(len(pair_rdf.values()), 4)
        ni3al_expected = (
            (('Ni', 'Al'), 0.125236677),
            (('Al', 'Ni'), 0.37571003),
            (('Al', 'Al'), 0),
        )
        for pair, value in ni3al_expected:
            self.assertAlmostEqual(pair_rdf[pair][bin_index(2, 0.5)], value)

        # --- fit(): element discovery, then exclusion, then inclusion
        prdf_feat = PartialRadialDistributionFunction()
        self.assertEqual({'Cs', 'Cl', 'C', 'Ni', 'Al'},
                         refit_elements(prdf_feat))

        prdf_feat.exclude_elems = ['Cs', 'Al']
        self.assertEqual({'Cl', 'C', 'Ni'}, refit_elements(prdf_feat))

        prdf_feat.include_elems = ['H']
        self.assertEqual({'H', 'Cl', 'C', 'Ni'}, refit_elements(prdf_feat))

        # --- feature labels for a two-element system
        prdf_feat.exclude_elems = ()
        prdf_feat.include_elems = ()
        prdf_feat.elements_ = ['Al', 'Ni']
        label_list = prdf_feat.feature_labels()
        n_bins = len(prdf_feat._make_bins()) - 1

        self.assertEqual(3 * n_bins, len(label_list))
        self.assertIn('Al-Ni PRDF r=0.00-0.10', label_list)

        # --- featurize() must agree with compute_prdf()
        prdf_feat.elements_ = ['C']
        feature_vec = prdf_feat.featurize(self.diamond)
        _, pair_rdf = prdf_feat.compute_prdf(self.diamond)
        self.assertArrayAlmostEqual(feature_vec, pair_rdf[('C', 'C')])

        # --- featurize_dataframe()
        frame = pd.DataFrame.from_dict(
            {"structure": [self.diamond, self.cscl]})
        prdf_feat.fit(frame["structure"])
        frame = prdf_feat.featurize_dataframe(frame, col_id="structure")
        self.assertEqual(frame["Cs-Cl PRDF r=0.00-0.10"][0], 0.0)
        self.assertAlmostEqual(frame["Cl-Cl PRDF r=19.70-19.80"][1], 0.049, 3)
        self.assertEqual(frame["Cl-Cl PRDF r=19.90-20.00"][0], 0.0)

        # --- labels and features must come out in the same order
        prdf_feat.elements_ = ['Al', 'Ni']
        feature_vec = prdf_feat.featurize(self.ni3al)
        label_list = prdf_feat.feature_labels()
        _, pair_rdf = prdf_feat.compute_prdf(self.ni3al)
        self.assertEqual((n_bins * 3, ), feature_vec.shape)
        for block, prefix in enumerate(('Al-Al', 'Al-Ni', 'Ni-Ni')):
            self.assertTrue(label_list[block * n_bins].startswith(prefix))
        ordered_pairs = (('Al', 'Al'), ('Al', 'Ni'), ('Ni', 'Ni'))
        self.assertArrayAlmostEqual(
            feature_vec,
            np.hstack([pair_rdf[pair] for pair in ordered_pairs]))
        CIFfiles.append(i)  #List of CIF files

# Build, in CIFfiles order:
#   structs    - the pymatgen Structure parsed from each CIF file
#   namelist   - each file name without its extension (DataFrame row labels)
#   structlist - one single-element row per structure (DataFrame data)
# Every file is parsed exactly once; the DataFrame rows and `structs` refer
# to the same Structure objects instead of two separately parsed copies.
structlist = []
namelist = []
structs = []
namecolumns = ['structure']
for i in CIFfiles:
    # Converts CIF to pymatgen structure object
    parsed_structure = Structure.from_file(directoryname + i)
    structs.append(parsed_structure)
    structlist.append([parsed_structure])
    namelist.append(os.path.splitext(i)[0])  # Collects all the structure names
# Creates a pandas DataFrame whose single "structure" column holds the
# structures and whose row labels are the structure names.
dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns)

# These two featurizers are fitted on the full set of structures up front.
# NOTE(review): fit() is handed np.asarray(structs) rather than the plain
# list - confirm the array conversion keeps one Structure per element.
p = PartialRadialDistributionFunction()
p.fit(np.asarray(structs))

c = CoulombMatrix()
c.fit(np.asarray(structs))

erdf = ElectronicRadialDistributionFunction()
erdf.cutoff = 10  # longest diagonal of lattice...I picked a number semi-arbitrarily
#Featurizes the structures
featurizer = MultipleFeaturizer([
    ElementProperty.from_preset('magpie'),
    OxidationStates(),
    AtomicOrbitals(),
    BandCenter(),
    ElectronegativityDiff(),