Beispiel #1
0
    def featurize_structures(self, featurizer=None, **kwargs):
        """
        Compute features for every structure in ``hypo_structures``.

        If ``hypo_structures`` is still None, a warning is issued and
        ``get_structures`` is called first. The complete feature table
        (including rows with missing values) is kept in ``_features_df``;
        rows containing any missing value are dropped from the table
        stored in ``features``, and the matching structures are exposed
        as ``valid_structures``.

        Args:
            featurizer (Featurizer): A MatMiner Featurizer. When falsy,
                a MultipleFeaturizer with the PRB Ward Voronoi
                descriptors is used.
            **kwargs (dict): forwarded to the ``featurize_many`` method
                of the featurizer.

        Returns:
            (pandas.DataFrame): the cleaned features, with columns
                sorted by name.

        """
        if self.hypo_structures is None:
            warnings.warn("No structures available. Generating structures.")
            self.get_structures()

        print("Generating features")
        if not featurizer:
            ward_cn = SiteStatsFingerprint.from_preset(
                "CoordinationNumber_ward-prb-2017")
            ward_lpd = SiteStatsFingerprint.from_preset(
                "LocalPropertyDifference_ward-prb-2017")
            featurizer = MultipleFeaturizer([
                ward_cn,
                StructuralHeterogeneity(),
                ChemicalOrdering(),
                MaximumPackingEfficiency(),
                ward_lpd,
                StructureComposition(Stoichiometry()),
                StructureComposition(ElementProperty.from_preset("magpie")),
                StructureComposition(ValenceOrbital(props=['frac'])),
                StructureComposition(IonProperty(fast=True)),
            ])

        structures = self.hypo_structures['structure']
        # ignore_errors=True keeps failed structures as rows with missing
        # values instead of raising; they are filtered out below.
        rows = featurizer.featurize_many(
            structures, ignore_errors=True, **kwargs)

        compositions = [entry.composition for entry in structures]

        frame = pd.DataFrame.from_records(
            rows, columns=featurizer.feature_labels())
        self._features_df = frame
        frame.index = self.hypo_structures.index
        frame['N_species'] = [len(c.elements) for c in compositions]
        frame['Composition'] = [c.formula for c in compositions]
        frame['structure'] = structures

        cleaned = frame.dropna(axis=0, how='any')
        self.features = cleaned.reindex(sorted(cleaned.columns), axis=1)

        self._valid_structure_labels = list(self.features.index)
        self.valid_structures = self.hypo_structures.loc[
            self._valid_structure_labels]

        summary = "{} out of {} structures were successfully featurized."
        print(summary.format(len(self.features), len(frame)))
        return self.features
Beispiel #2
0
def similarity(_parents, target):
    """Score how close each parent structure is to the target structure.

    All structures are featurized with the Ward PRB 2017 descriptor set,
    scaled with the pickled scaler, and compared to the target by
    pairwise distance in the scaled feature space. Each distance is then
    mapped onto a 0..1 grid (step 0.01) by picking the entry of the
    pickled ``quantiles`` array closest to it.

    Args:
        _parents: iterable of structures to compare against ``target``.
        target: the reference structure.

    Returns:
        numpy.ndarray: one score per parent, in the order given. Parents
        whose featurization produced any missing value get -1.

    NOTE(review): ``quantiles`` is presumably a 1-D array of 101 distance
    quantiles matching the linspace grid -- confirm against the code that
    produced ``quantiles.pickle``.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK: celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task
        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        # celery is optional; without it there is nothing to work around.
        pass

    x_target = pd.DataFrame.from_records([featurizer.featurize(target)],
                                         columns=featurizer.feature_labels())
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=featurizer.feature_labels(),
    )
    # Remember which parents failed to featurize before the missing values
    # are overwritten; their scores are forced to -1 at the end.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    # Placeholder so the scaler accepts the rows; the result is discarded.
    x_parent.fillna(100000, inplace=True)

    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    # NOTE: pickle files are loaded from the project's own settings
    # directory; never point settings.rxn_files at untrusted data.
    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent. The target is stacked last so it ends up as X[-1].
    X = scaler.transform(pd.concat([x_parent, x_target]))

    D = [pairwise_distances(np.array([row, X[-1]]))[0, 1] for row in X[:-1]]

    # Loop-invariant score grid: index i corresponds to score i / 100.
    grid = np.linspace(0, 1, 101)
    _res = np.array([grid[np.abs(quantiles - d).argmin()] for d in D])
    _res[nulls] = -1
    return _res
def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Featurize a single structure and return the features as a dict.

    Args:
        structure: the structure to featurize.
        mode: ``'all'`` computes the full descriptor set; any other value
            restricts it to the featurizers that do not need a Voronoi
            tessellation.

    Returns:
        dict mapping each feature label to its value, plus a ``'volume'``
        entry holding ``structure.volume``.
    """
    # Composition-derived descriptors are shared by both modes.
    composition_parts = [
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset('magpie')),
        StructureComposition(ValenceOrbital(props=['frac'])),
    ]

    if mode == 'all':
        parts = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
        ] + composition_parts
    else:
        # Only the featurizers that work without a Voronoi tessellation.
        parts = [DensityFeatures()] + composition_parts

    featurizer = MultipleFeaturizer(parts)
    values = featurizer.featurize(structure)

    matminer_dict = dict(zip(featurizer.feature_labels(), values))
    matminer_dict['volume'] = structure.volume
    return matminer_dict
Beispiel #4
0
    def test_composition_features(self):
        """StructureComposition must mirror the wrapped composition featurizer."""
        magpie = ElementProperty.from_preset("magpie")
        wrapper = StructureComposition(featurizer=magpie)

        # Fitting only has to complete without raising.
        wrapper.fit([self.nacl, self.diamond])

        # Featurizing the structure must equal featurizing its composition.
        from_structure = wrapper.featurize(self.nacl)
        from_composition = magpie.featurize(self.nacl.composition)
        self.assertArrayAlmostEqual(from_composition, from_structure)

        # Citation and implementor metadata are passed through unchanged.
        self.assertEqual(magpie.citations(), wrapper.citations())
        self.assertEqual(magpie.implementors(), wrapper.implementors())
Beispiel #5
0
                      'MeltingT', 'NsValence', 'NpValence', 'NdValence',
                      'NfValence', 'NValence', 'NsUnfilled', 'NpUnfilled',
                      'NdUnfilled', 'NfUnfilled', 'NUnfilled', 'GSvolume_pa',
                      'SpaceGroupNumber', 'GSbandgap', 'GSmagmom')

# Build the feature set with the matminer package.
featurizer = MultipleFeaturizer([
    # from_preset is a factory: call it on the class rather than on a
    # throwaway default-constructed instance.
    SiteStatsFingerprint(CoordinationNumber.from_preset('VoronoiNN'),
                         stats=('mean', 'std_dev', 'minimum', 'maximum')),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint(
        LocalPropertyDifference(properties=element_properties),
        stats=('mean', 'std_dev', 'minimum', 'maximum', 'range')),
    StructureComposition(Stoichiometry()),
    StructureComposition(ElementProperty.from_preset("magpie")),
    StructureComposition(ValenceOrbital(props=['frac'])),
    StructureComposition(IonProperty(fast=True))
])

# Generate the features from the 'structure' column of df. With
# ignore_errors=True a structure that fails to featurize does not abort
# the run.
feature_data = featurizer.featurize_dataframe(df,
                                              col_id=['structure'],
                                              ignore_errors=True)
# "structure", "compound possible" and "material_id" are not reasonable
# physical features, so these three columns are dropped.
feature_data = feature_data.drop(
    columns=["structure", "compound possible", "material_id"])
# Write the data to a CSV file for later use.
feature_data.to_csv("data_delta_e_data.csv", index=False)
from sklearn.model_selection import KFold, cross_val_score
# Collect the names of all CIF files found in the examples directory.
directoryname = '../examples/'  # directory that is scanned
allfiles = os.listdir(directoryname)
CIFfiles = [
    entry for entry in allfiles if os.path.splitext(entry)[-1] == '.cif'
]

# For each CIF file build a single-element row holding the parsed
# pymatgen structure, and use the file name without extension as its label.
namecolumns = ['structure']
structlist = [[Structure.from_file(directoryname + cif)] for cif in CIFfiles]
namelist = [os.path.splitext(cif)[0] for cif in CIFfiles]

# DataFrame with one 'structure' column, indexed by structure name.
dftest = pd.DataFrame(data=structlist, index=namelist, columns=namecolumns)

# Featurizers to apply; more can be appended to this list.
featurizer = MultipleFeaturizer([
    StructuralHeterogeneity(),
    # Derives the composition from each structure before featurizing it.
    StructureComposition(ElementProperty.from_preset('magpie')),
])

# Featurize the whole DataFrame in one call.
r = featurizer.featurize_dataframe(dftest, ['structure'])