Beispiel #1
0
    def test_multiindex_in_multifeaturizer(self):
        """Check the sub-featurizer keys MultipleFeaturizer emits with ``multiindex=True``.

        The scenario is repeated for both values of ``iterate_over_entries``
        and covers inputs with flat, two-level and three-level column indices.
        """
        for by_entry in (True, False):
            combined = MultipleFeaturizer([self.multi, self.single],
                                          iterate_over_entries=by_entry)

            flat = pd.DataFrame({'x': [1, 2, 3]})

            two_level = pd.DataFrame({'x': [1, 2, 3]})
            two_level.columns = pd.MultiIndex.from_product(
                (["Custom"], two_level.columns.values))

            three_level = pd.DataFrame({'x': [1, 2, 3]})
            three_level.columns = pd.MultiIndex.from_product(
                (["Custom"], ["Custom2"], three_level.columns.values))

            # Flat input: the source column is expected under "Input Data".
            flat = combined.featurize_dataframe(flat, 'x', multiindex=True)
            flat_expectations = (
                (("Input Data", "x"), 1),
                (("MultipleFeatureFeaturizer", "w"), 0),
                (("SingleFeaturizer", "y"), 2),
            )
            for key, first_value in flat_expectations:
                self.assertEqual(flat[key].iloc[0], first_value)

            # Two-level input: the source column keeps its own top-level key.
            two_level = combined.featurize_dataframe(
                two_level, ("Custom", 'x'), multiindex=True)
            two_level_expectations = (
                (("Custom", "x"), 1),
                (("MultipleFeatureFeaturizer", "w"), 0),
                (("SingleFeaturizer", "y"), 2),
            )
            for key, first_value in two_level_expectations:
                self.assertEqual(two_level[key].iloc[0], first_value)

            # More than two column levels must be rejected with IndexError.
            self.assertRaises(IndexError,
                              self.multi.featurize_dataframe,
                              three_level,
                              ("Custom", "Custom2", 'x'),
                              multiindex=True)
Beispiel #2
0
 def test_multi_featurizer(self):
     """Combine two FunctionFeaturizers in a MultipleFeaturizer, apply them to
     columns 'a', 'b' and 'c' of ``self.test_df`` and check the result length."""
     squares = FunctionFeaturizer(expressions=["x ** 2"])
     exp_and_inverse = FunctionFeaturizer(expressions=["exp(x)", "1 / x"])
     combined = MultipleFeaturizer([squares, exp_and_inverse])
     # inplace=False: work on the returned frame, not on self.test_df
     featurized = combined.fit_featurize_dataframe(
         self.test_df, ['a', 'b', 'c'], inplace=False)
     self.assertEqual(len(featurized), 11)
Beispiel #3
0
 def __init__(self, pbar=False):
     """Create the regressor, the formula converter and the featurizer.

     Args:
         pbar: stored unchanged on ``self.pbar``.
     """
     self.regressor = RandomForestRegressor(n_estimators=500, n_jobs=-1, verbose=3)
     self.stc = StrToComposition()
     # Magpie element properties plus element fractions, applied together
     self.featurizer = MultipleFeaturizer([
         ElementProperty.from_preset("magpie"),
         ElementFraction(),
     ])
     self.pbar = pbar
Beispiel #4
0
    def test_multiindex_in_multifeaturizer(self):
        """Make sure MultipleFeaturizer returns the correct sub-featurizer
        multiindex keys.

        Both ``iterate_over_entries`` settings are tested. The frame returned
        by ``featurize_dataframe`` is the one inspected, so the test does not
        depend on the input frame being modified in place.
        """
        for iter_entries in [True, False]:

            mf = MultipleFeaturizer([self.multi, self.single],
                                    iterate_over_entries=iter_entries)

            df_1lvl = pd.DataFrame({'x': [1, 2, 3]})
            df_2lvl = pd.DataFrame({'x': [1, 2, 3]})
            df_2lvl.columns = pd.MultiIndex.from_product((["Custom"],
                                                          df_2lvl.columns.values))
            df_3lvl = pd.DataFrame({'x': [1, 2, 3]})
            df_3lvl.columns = pd.MultiIndex.from_product((["Custom"],
                                                          ["Custom2"],
                                                          df_3lvl.columns.values))

            # If input dataframe has flat column index
            df_1lvl = mf.featurize_dataframe(df_1lvl, 'x', multiindex=True)
            self.assertEqual(df_1lvl[("Input Data", "x")].iloc[0], 1)
            self.assertEqual(df_1lvl[("MultipleFeatureFeaturizer", "w")].iloc[0], 0)
            self.assertEqual(df_1lvl[("SingleFeaturizer", "y")].iloc[0], 2)

            # If input dataframe has 2-lvl column index
            df_2lvl = mf.featurize_dataframe(df_2lvl, ("Custom", 'x'), multiindex=True)
            self.assertEqual(df_2lvl[("Custom", "x")].iloc[0], 1)
            self.assertEqual(df_2lvl[("MultipleFeatureFeaturizer", "w")].iloc[0], 0)
            self.assertEqual(df_2lvl[("SingleFeaturizer", "y")].iloc[0], 2)

            # If input dataframe has 2+ lvl column index
            with self.assertRaises(IndexError):
                _ = self.multi.featurize_dataframe(df_3lvl,
                                                   ("Custom", "Custom2", 'x'),
                                                   multiindex=True)
Beispiel #5
0
 def test_multi_featurizer(self):
     """Apply a MultipleFeaturizer built from two FunctionFeaturizers to
     columns 'a', 'b' and 'c' of ``self.test_df`` and check the result length."""
     expression_sets = (["x ** 2"], ["exp(x)", "1 / x"])
     parts = [FunctionFeaturizer(expressions=exprs) for exprs in expression_sets]
     # inplace=False: the returned frame is inspected, self.test_df is left alone
     result = MultipleFeaturizer(parts).fit_featurize_dataframe(
         self.test_df, ['a', 'b', 'c'], inplace=False)
     self.assertEqual(len(result), 11)
Beispiel #6
0
def featurize_structure(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with structural
    features from matminer.

    Currently applies the set of all matminer structure features. The
    dict-valued radial distribution function column is expanded into one
    scalar column per distance (first 50 distances), and the crystal system
    and centrosymmetry columns are converted to integers.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, passed through `clean_df`.

    """

    logging.info("Applying structure featurizers...")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    structure_features = [
         DensityFeatures(),
         GlobalSymmetryFeatures(),
         RadialDistributionFunction(),
         CoulombMatrix(),
         PartialRadialDistributionFunction(),
         SineCoulombMatrix(),
         EwaldEnergy(),
         BondFractions(),
         StructuralHeterogeneity(),
         MaximumPackingEfficiency(),
         ChemicalOrdering(),
         XRDPowderPattern(),
         BagofBonds()
    ]

    featurizer = MultipleFeaturizer([feature.fit(df["structure"]) for feature in structure_features])

    df = featurizer.featurize_dataframe(df, "structure", multiindex=True, ignore_errors=True)
    # Flatten the two-level column index into "Featurizer|label" strings.
    df.columns = df.columns.map('|'.join).str.strip('|')

    _rdf_column = "RadialDistributionFunction|radial distribution function"

    # Positional access: the frame keeps the caller's index, which need not
    # contain the label 0.
    dist = df[_rdf_column].iloc[0]['distances'][:50]
    for i, d in enumerate(dist):
        _rdf_key = "RadialDistributionFunction|radial distribution function|d_{:.2f}".format(d)
        # Bind ``i`` at definition time so the lambda never sees a later value.
        df[_rdf_key] = df[_rdf_column].apply(lambda x, i=i: x['distribution'][i])

    df = df.drop(_rdf_column, axis=1)

    # "orthorhombic" is the spelling reported by the symmetry analysis; the
    # historical misspelling is kept as an alias so that it still maps to 3.
    _crystal_system = {
        "cubic": 1, "tetragonal": 2, "orthorhombic": 3, "orthorombic": 3,
        "hexagonal": 4, "trigonal": 5, "monoclinic": 6, "triclinic": 7
    }

    df["GlobalSymmetryFeatures|crystal_system"] = df["GlobalSymmetryFeatures|crystal_system"].map(_crystal_system)
    df["GlobalSymmetryFeatures|is_centrosymmetric"] = df["GlobalSymmetryFeatures|is_centrosymmetric"].map(int)

    return clean_df(df)
Beispiel #7
0
    def test_multiple(self):
        """Exercise MultipleFeaturizer built from the single- and multi-feature
        fixtures, once per ``iterate_over_entries`` setting."""
        for by_entry in (True, False):
            combo = MultipleFeaturizer([self.single, self.multi],
                                       iterate_over_entries=by_entry)
            frame = self.make_test_data()

            self.assertArrayAlmostEqual([2, 0, 3], combo.featurize(1))
            self.assertArrayEqual(['A'], combo.citations())

            authors = combo.implementors()
            for name in ('Us', 'Them'):
                self.assertIn(name, authors)
            self.assertEqual(2, len(authors))

            # featurize_dataframe (not overridden) must not emit any warning
            with warnings.catch_warnings(record=True) as caught:
                frame = combo.featurize_dataframe(frame, 'x')
                self.assertEqual(len(caught), 0)
            expected_columns = (
                ('y', [2, 3, 4]),
                ('w', [0, 1, 2]),
                ('z', [3, 4, 5]),
            )
            for column, values in expected_columns:
                self.assertArrayAlmostEqual(frame[column], values)

            # Add a matrix-valued featurizer; this combination is built with
            # the default iteration mode.
            combo = MultipleFeaturizer([self.single, self.multi,
                                        MatrixFeaturizer()])
            frame = self.make_test_data()
            with warnings.catch_warnings(record=True) as caught:
                frame = combo.featurize_dataframe(frame, 'x')
                self.assertEqual(len(caught), 0)

            first_matrix = frame['representation'][0]
            self.assertArrayAlmostEqual(first_matrix,
                                        [[1.0, 0.0], [0.0, 1.0]])
Beispiel #8
0
    def __init__(self):
        """Create the combined composition featurizer and the
        formula-to-composition converter."""
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)
        self.str2composition = StrToComposition()
Beispiel #9
0
    def test_caching(self):
        """Test whether MultipleFeaturizer properly caches, using the hit/miss
        counters of ``_get_all_nearest_neighbors``."""
        presets = ("LocalPropertyDifference_ward-prb-2017",
                   "CoordinationNumber_ward-prb-2017")
        # Caching is only enabled when iterating over entries
        feat = MultipleFeaturizer(
            [SiteStatsFingerprint.from_preset(name) for name in presets],
            iterate_over_entries=True)

        # Start from an empty cache so the counters below are absolute
        _get_all_nearest_neighbors.cache_clear()

        # Two simple-cubic cells that differ only in the element
        data = pd.DataFrame({'strcs': [
            Structure([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]], [element], [[0, 0, 0]])
            for element in ("Al", "Ni")
        ]})

        # Featurize each structure directly, then inspect the counters
        for row in (0, 1):
            feat.featurize(data['strcs'][row])
        stats = _get_all_nearest_neighbors.cache_info()
        self.assertEqual(2, stats.hits)
        self.assertEqual(2, stats.misses)

        # Same counters are expected when going through featurize_dataframe
        feat.set_n_jobs(1)
        _get_all_nearest_neighbors.cache_clear()
        feat.featurize_dataframe(data, 'strcs')

        stats = _get_all_nearest_neighbors.cache_info()
        self.assertEqual(2, stats.hits)
        self.assertEqual(2, stats.misses)
Beispiel #10
0
    def test_ignore_errors(self):
        """Check ``ignore_errors`` / ``return_errors`` handling of MultipleFeaturizer.

        Every combination of multiindex on/off, returning errors or not,
        one or two jobs, and iteration over entries or featurizers is run.
        """
        switches = (True, False)
        for use_multiindex, with_errors, n_jobs, by_entry in product(
                switches, switches, (1, 2), switches):

            mf = MultipleFeaturizer([self.multi, self.single],
                                    iterate_over_entries=by_entry)
            # The string entry makes featurization of the first row fail
            data = pd.DataFrame({'x': ['a', 2, 3]})

            mf.set_n_jobs(n_jobs)

            # featurize_many must complete; error columns widen each row
            rows = mf.featurize_many(data['x'], ignore_errors=True,
                                     return_errors=with_errors)
            self.assertEqual(5 if with_errors else 3, len(rows[0]))

            # Same through featurize_dataframe (one extra column: the input)
            frame = mf.featurize_dataframe(data, 'x', ignore_errors=True,
                                           return_errors=with_errors,
                                           multiindex=use_multiindex)
            self.assertEqual(6 if with_errors else 4, len(frame.columns))

            # The exception text is only checked for the flat-index case
            if with_errors and not use_multiindex:
                first_row = frame.iloc[0]
                self.assertIn('TypeError', first_row['SingleFeaturizer Exceptions'])

            # Without ignore_errors the failure must propagate
            self.assertRaises(TypeError, mf.featurize_many, [['a'], [1], [2]])
Beispiel #11
0
class FeatureGenerator:
    """
        Wrapper class that generates several types of elemental features
        through a single combined featurizer.
    """
    def __init__(self):
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)

        self.str2composition = StrToComposition()

    def generate(self, df: pd.DataFrame, ignore_errors: bool = False):
        """
            Generate features from a dataframe with a "formula" column that
            contains chemical formulas of the compositions.

            Rows containing any missing value after the formula conversion
            are dropped before featurization. The returned frame also gets an
            "NComp" column holding ``len`` of each "composition" entry.
        """
        converted = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors)
        complete_rows = converted.dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            complete_rows, col_id='composition', ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)
        return featurized
Beispiel #12
0
    def featurize_structures(self, featurizer=None, **kwargs):
        """
        Featurizes the hypothetical structures held in ``hypo_structures``.
        Structures for which featurization leaves any missing value are
        removed, and the remaining ones are made available as
        ``valid_structures``.

        Args:
            featurizer (Featurizer): A MatMiner Featurizer.
                Defaults to MultipleFeaturizer with PRB Ward
                Voronoi descriptors.
            **kwargs (dict): kwargs passed to featurize_many
                method of featurizer.

        Returns:
            (pandas.DataFrame): features, with columns sorted by name

        """
        # Explicit ``is None`` test rather than a truthiness test; presumably
        # because hypo_structures is a pandas object - confirm.
        if self.hypo_structures is None:
            warnings.warn("No structures available. Generating structures.")
            self.get_structures()

        print("Generating features")
        if not featurizer:
            featurizer = MultipleFeaturizer([
                SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
                StructuralHeterogeneity(),
                ChemicalOrdering(),
                MaximumPackingEfficiency(),
                SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
                StructureComposition(Stoichiometry()),
                StructureComposition(ElementProperty.from_preset("magpie")),
                StructureComposition(ValenceOrbital(props=['frac'])),
                StructureComposition(IonProperty(fast=True))
            ])

        structures = self.hypo_structures['structure']
        raw_features = featurizer.featurize_many(
            structures, ignore_errors=True, **kwargs)

        species_counts = [len(s.composition.elements) for s in structures]
        formulas = [s.composition.formula for s in structures]

        frame = pd.DataFrame.from_records(
            raw_features, columns=featurizer.feature_labels())
        self._features_df = frame
        frame.index = self.hypo_structures.index
        frame['N_species'] = species_counts
        frame['Composition'] = formulas
        frame['structure'] = structures

        # Keep only fully featurized rows, then order the columns by name
        complete = frame.dropna(axis=0, how='any')
        self.features = complete.reindex(sorted(complete.columns), axis=1)

        self._valid_structure_labels = list(self.features.index)
        self.valid_structures = self.hypo_structures.loc[self._valid_structure_labels]

        n_valid = self.features.shape[0]
        n_total = self._features_df.shape[0]
        print("{} out of {} structures were successfully featurized.".format(
            n_valid, n_total))
        return self.features
Beispiel #13
0
    def test_multifeatures(self):
        """Check MultipleFeaturizer with sub-featurizers that take several
        input arguments, one of which returns its features as a 2D array."""
        # Make a test dataset with two input variables
        data = self.make_test_data()
        data['x2'] = [4, 5, 6]

        # Create a second featurizer
        class MultiArgs2(SingleFeaturizerMultiArgs):
            def featurize(self, *x):
                # Making a 2D array to test whether MultipleFeaturizer
                #  can handle featurizers that have both 1D vectors with
                #  singleton dimensions (e.g., shape==(4,1)) and those
                #  without (e.g., shape==(4,))
                return [super(MultiArgs2, self).featurize(*x)]

            def feature_labels(self):
                return ['y2']

        multiargs2 = MultiArgs2()

        # Create featurizer
        multi_f = MultipleFeaturizer([self.multiargs, multiargs2])
        multi_f.set_n_jobs(1)

        # Test featurize with multiple arguments
        features = multi_f.featurize(0, 2)
        self.assertArrayAlmostEqual([2, 2], features)

        # Test dataframe
        data = multi_f.featurize_dataframe(data, ['x', 'x2'])
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12)
        self.assertEqual(['y', 'y2'], multi_f.feature_labels())
        self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]], data[['y',
                                                                    'y2']])
Beispiel #14
0
    def _fit_apply_featurizers(self,
                               df: pd.DataFrame,
                               featurizers: Iterable[BaseFeaturizer],
                               column: str,
                               fit_to_df: bool = True) -> pd.DataFrame:
        """ Optionally fit each featurizer to the chosen column of the input
        pd.DataFrame, then apply all of them as one MultipleFeaturizer.
        Arguments:
            df: The DataFrame to featurize.
            featurizers: The matminer featurizers to (optionally) fit and
                apply to the DataFrame.
            column: The name of the column to apply the featurizers to.
            fit_to_df: Whether or not to fit the featurizers to the
                input dataframe. If not true, it will be assumed that
                any featurizers that required fitting have already been
                fitted.
        Returns:
            pandas.DataFrame: the decorated DataFrame.
        """
        if fit_to_df:
            featurizers = [feat.fit(df[column]) for feat in featurizers]
        combined = MultipleFeaturizer(featurizers)

        # Only override the job count when one was configured
        n_jobs = self._n_jobs
        if n_jobs is not None:
            combined.set_n_jobs(n_jobs)

        return combined.featurize_dataframe(
            df, column, multiindex=True, ignore_errors=True)
Beispiel #15
0
    def test_caching(self):
        """Test whether MultipleFeaturizer properly caches, using the hit/miss
        counters of ``_get_all_nearest_neighbors``."""

        # have to iterate over entries to enable caching
        feat = MultipleFeaturizer([
            SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
            SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017")
        ], iterate_over_entries=True)

        # Reset the cache before tests: the counters asserted below are
        # absolute, so entries left by earlier users of the cache would
        # make them order-dependent.
        _get_all_nearest_neighbors.cache_clear()

        # Create a dataframe with two SC structures in it
        data = pd.DataFrame({'strcs': [
            Structure([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]], ["Al"], [[0, 0, 0]]),
            Structure([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]], ["Ni"], [[0, 0, 0]]),
        ]})

        # Call featurize on both, check the number of cache misses/hits
        feat.featurize(data['strcs'][0])
        feat.featurize(data['strcs'][1])
        self.assertEqual(2, _get_all_nearest_neighbors.cache_info().hits)
        self.assertEqual(2, _get_all_nearest_neighbors.cache_info().misses)

        # Verify the number of cache misses, it should be the same as before
        feat.set_n_jobs(1)
        _get_all_nearest_neighbors.cache_clear()
        feat.featurize_dataframe(data, 'strcs')

        self.assertEqual(2, _get_all_nearest_neighbors.cache_info().hits)
        self.assertEqual(2, _get_all_nearest_neighbors.cache_info().misses)
Beispiel #16
0
    def test_ignore_errors(self):
        """Check how MultipleFeaturizer behaves on failing entries.

        Iterates over every combination of: multiindex or not, returning
        errors or not, single or parallel execution, and iteration over
        entries or over featurizers.
        """
        on_off = (True, False)
        for multiindex, return_errs, jobs, over_entries in product(
                on_off, on_off, (1, 2), on_off):

            mf = MultipleFeaturizer([self.multi, self.single],
                                    iterate_over_entries=over_entries)
            # First entry is a string, which causes a featurization error
            data = pd.DataFrame({'x': ['a', 2, 3]})

            # Set the number of threads
            mf.set_n_jobs(jobs)

            # With ignore_errors the call must finish
            expected_width = 5 if return_errs else 3
            many = mf.featurize_many(data['x'], ignore_errors=True,
                                     return_errors=return_errs)
            self.assertEqual(expected_width, len(many[0]))

            # The dataframe variant carries the input column as well
            expected_columns = 6 if return_errs else 4
            featurized = mf.featurize_dataframe(data, 'x', ignore_errors=True,
                                                return_errors=return_errs,
                                                multiindex=multiindex)
            self.assertEqual(expected_columns, len(featurized.columns))

            # Error text is only inspected for the single-index case
            if return_errs and not multiindex:
                self.assertIn('TypeError',
                              featurized.iloc[0]['SingleFeaturizer Exceptions'])

            # Errors are raised when they are not ignored
            self.assertRaises(TypeError, mf.featurize_many, [['a'], [1], [2]])
Beispiel #17
0
    def test_multifeatures(self):
        """Check MultipleFeaturizer with two sub-featurizers that each take
        several input arguments."""
        # Make a test dataset with two input variables
        data = self.make_test_data()
        data['x2'] = [4, 5, 6]

        multiargs2 = MultiArgs2()

        # Create featurizer
        multi_f = MultipleFeaturizer([self.multiargs, multiargs2])

        # Test featurize with multiple arguments
        features = multi_f.featurize(0, 2)
        self.assertArrayAlmostEqual([2, 2], features)

        # Test dataframe
        data = multi_f.featurize_dataframe(data, ['x', 'x2'])
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12)
        self.assertEqual(['y', 'y2'], multi_f.feature_labels())
        self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]], data[['y', 'y2']])
 def featurizer(self):
     """Build a MultipleFeaturizer whose CrystalNN and local-property
     parts are configured with ``self.cutoff``."""
     radius = self.cutoff
     components = [
         CrystalNNFingerprint.from_preset("ops", search_cutoff=radius),
         LocalPropertyStatsNew.from_preset("interpretable", cutoff=radius),
         GaussianSymmFunc(),
     ]
     return MultipleFeaturizer(components)
Beispiel #19
0
    def test_multitype_multifeat(self):
        """Test MultipleFeaturizer when a featurizer returns a non-numeric type"""
        # Covers iteration over entries as well as over featurizers
        for by_entry in (True, False):
            combo = MultipleFeaturizer(
                [SingleFeaturizer(), MultiTypeFeaturizer()],
                iterate_over_entries=by_entry)
            combo.set_n_jobs(1)

            # Featurize fresh test data on column 'x'
            frame = combo.featurize_dataframe(self.make_test_data(), 'x')

            # The generated columns must keep their individual dtypes
            feature_columns = combo.feature_labels()
            observed = frame[feature_columns].dtypes.astype(str).tolist()
            self.assertArrayEqual(['int64', 'object', 'int64'], observed)
            self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Featurize one structure and return its features as a dictionary.

    Args:
        structure: the structure handed to the featurizers.
        mode: ``'all'`` selects the full featurizer set; any other value
            selects the reduced set that needs no Voronoi tessellation.

    Returns:
        dict mapping every feature label to its value, plus a ``'volume'``
        entry holding ``structure.volume``.
    """
    if mode != 'all':
        # Calculate only those which do not need a Voronoi tessellation
        components = [
            DensityFeatures(),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    else:
        components = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    featurizer = MultipleFeaturizer(components)

    values = featurizer.featurize(structure)

    properties = {
        label: value
        for label, value in zip(featurizer.feature_labels(), values)
    }
    properties['volume'] = structure.volume
    return properties
Beispiel #21
0
    def test_multifeatures_multiargs(self):
        """Check MultipleFeaturizer with multi-argument sub-featurizers, with
        and without a column multiindex, for both iteration modes."""
        second = MultiArgs2()
        expected = [[5, 5], [7, 7], [9, 9]]
        first_key = ("SingleFeaturizerMultiArgs", "y")
        second_key = ("MultiArgs2", "y2")

        for by_entry in (True, False):
            # Test data with two input variables
            frame = self.make_test_data()
            frame['x2'] = [4, 5, 6]

            combo = MultipleFeaturizer([self.multiargs, second],
                                       iterate_over_entries=by_entry)

            # Direct call with multiple arguments
            self.assertArrayAlmostEqual([2, 2], combo.featurize(0, 2))

            # Flat column index
            frame = combo.featurize_dataframe(frame, ['x', 'x2'])
            self.assertEqual(['y', 'y2'], combo.feature_labels())
            self.assertArrayAlmostEqual(expected, frame[['y', 'y2']])

            # Column multiindex: features are keyed by featurizer class name
            frame = combo.featurize_dataframe(frame, ['x', 'x2'],
                                              multiindex=True)
            self.assertIn(second_key, frame.columns)
            self.assertIn(first_key, frame.columns)
            self.assertArrayAlmostEqual(expected,
                                        frame[[first_key, second_key]])
Beispiel #22
0
def featurize_composition(df: pd.DataFrame) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with composition
    features from matminer.

    Currently applies the set of all matminer composition features, followed
    by two oxidation-state based featurizers. Orbital characters and
    HOMO/LUMO elements are converted to integers, and infinities and NaNs
    are replaced by 0.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, passed through `clean_df`.

    """
    def _flatten(columns):
        # Join the column levels with '|' and drop leading/trailing separators
        return columns.map('|'.join).str.strip('|')

    def _atomic_number(symbol):
        # Non-string entries (e.g. missing values) are encoded as -1
        return Element(symbol).Z if isinstance(symbol, str) else -1

    logging.info("Applying composition featurizers...")
    df = df.copy()
    df['composition'] = df['structure'].apply(lambda s: s.composition)

    composition_featurizers = [
        ElementProperty.from_preset("magpie"),
        AtomicOrbitals(),
        BandCenter(),
        # ElectronAffinity(), - This descriptor was not used in the paper preset
        Stoichiometry(),
        ValenceOrbital(),
        IonProperty(),
        ElementFraction(),
        TMetalFraction(),
        # CohesiveEnergy(), - This descriptor was not used in the paper preset
        Miedema(),
        YangSolidSolution(),
        AtomicPackingEfficiency(),
    ]
    featurizer = MultipleFeaturizer(composition_featurizers)

    df = featurizer.featurize_dataframe(df, "composition", multiindex=True, ignore_errors=True)
    df.columns = _flatten(df.columns)

    ox_featurizer = MultipleFeaturizer([OxidationStates(), ElectronegativityDiff()])

    # The flattened name of the original composition column is the input here
    df = CompositionToOxidComposition().featurize_dataframe(df, "Input Data|composition")

    df = ox_featurizer.featurize_dataframe(df, "composition_oxid", multiindex=True, ignore_errors=True)
    # Blank the top level so already-flattened names are not prefixed again
    df = df.rename(columns={'Input Data': ''})
    df.columns = _flatten(df.columns)

    _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

    for column in ('AtomicOrbitals|HOMO_character', 'AtomicOrbitals|LUMO_character'):
        df[column] = df[column].map(_orbitals)

    for column in ('AtomicOrbitals|HOMO_element', 'AtomicOrbitals|LUMO_element'):
        df[column] = df[column].apply(_atomic_number)

    df = df.replace([np.inf, -np.inf, np.nan], 0)

    return clean_df(df)
    def __init__(self, structure: Structure, outpath: Union[str, Path]):
        """Generates features for a structure

        Sets up logging, creates the output directory when it is missing,
        derives the dump file name from the structure formula and builds the
        site featurizer.

        Args:
            structure (Structure): Pymatgen Structure object
            outpath (Union[str, Path]): path to which the features will be
                dumped. ``""`` or ``None`` means no directory is created and
                the dump file name carries no directory prefix.
        """
        featurizelogger = logging.getLogger("Featurize")
        featurizelogger.setLevel(logging.INFO)
        logging.basicConfig(
            format="%(filename)s: %(message)s",
            level=logging.INFO,
        )

        self.outpath = outpath
        if ((outpath != "") and (outpath is not None)
                and (not os.path.exists(self.outpath))):
            os.mkdir(self.outpath)
        self.logger = featurizelogger
        self.path = None
        self.structure = structure
        self.metal_sites = []
        self.metal_indices = []
        self.features = []
        # ``self.path`` is always None here, so the dump file is named after
        # the structure formula. A None outpath is treated like "" so that
        # os.path.join does not raise TypeError.
        outdir = "" if self.outpath is None else self.outpath
        self.outname = os.path.join(
            outdir,
            "".join([self.structure.formula.replace(" ", "_"), ".pkl"]),
        )
        self.featurizer = MultipleFeaturizer([
            CrystalNNFingerprint.from_preset("ops"),
            LocalPropertyStatsNew.from_preset("interpretable"),
            GaussianSymmFunc(),
        ])
Beispiel #24
0
def similarity(_parents, target):
    """Score how similar each parent is to ``target`` in feature space.

    Both the parents and the target are featurized with the same
    MultipleFeaturizer, scaled with a pickled scaler, and compared by pairwise
    distance. Each distance is then mapped onto a 0..1 grid of 101 points via
    the closest entry of the pickled ``quantiles`` array.

    Args:
        _parents: iterable of inputs accepted by the featurizers
            (presumably pymatgen structures; verify against caller).
        target: single input of the same kind as the parents.

    Returns:
        numpy.ndarray: one score per parent. Parents whose featurization
        produced any null value get the score -1.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task
        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        pass

    x_target = pd.DataFrame.from_records([featurizer.featurize(target)],
                                         columns=featurizer.feature_labels())
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=featurizer.feature_labels(),
    )
    # Remember which parents failed to featurize before the nulls are
    # overwritten with a placeholder value; they are flagged with -1 below.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    x_parent.fillna(100000, inplace=True)

    # Put both frames into the same (alphabetical) column order
    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the target
    # row below the parent rows in the same way.
    X = scaler.transform(pd.concat([x_parent, x_target]))
    target_row = X[-1]

    D = [pairwise_distances(np.array([row, target_row]))[0, 1]
         for row in X[:-1]]

    # The grid is loop-invariant, so build it once
    grid = np.linspace(0, 1, 101)
    _res = np.array([grid[np.abs(quantiles - d).argmin()] for d in D])
    _res[nulls] = -1
    return _res
Beispiel #25
0
    def __init__(self, structure, outpath):
        """Prepare logging, the output location and the featurizer.

        Args:
            structure: structure to featurize; its ``formula`` attribute is
                used to name the output file
            outpath (str): path to which the features will be dumped; the
                directory is created when the path is non-empty and missing
        """
        log = logging.getLogger('Featurize')
        log.setLevel(logging.INFO)
        logging.basicConfig(format='%(filename)s: %(message)s',
                            level=logging.INFO)

        self.outpath = outpath
        needs_directory = outpath != '' and not os.path.exists(self.outpath)
        if needs_directory:
            os.mkdir(self.outpath)

        self.logger = log
        self.path = None
        self.structure = structure
        self.metal_sites, self.metal_indices, self.features = [], [], []

        # The pickle is named after the source file when a path is known and
        # after the chemical formula otherwise.
        if self.path is None:
            stem = self.structure.formula.replace(' ', '_')
        else:
            stem = Path(self.path).stem
        self.outname = os.path.join(self.outpath, stem + '.pkl')

        site_featurizers = [
            CrystalNNFingerprint.from_preset('ops'),
            LocalPropertyStatsNew.from_preset('interpretable'),
            GaussianSymmFunc(),
        ]
        self.featurizer = MultipleFeaturizer(site_featurizers)
Beispiel #26
0
    def test_multiple(self):
        """Check featurize, citations, implementors and dataframe output
        of a MultipleFeaturizer built from the single and multi featurizers.
        """
        multi_f = MultipleFeaturizer([self.single, self.multi])
        data = self.make_test_data()

        self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

        self.assertArrayEqual(['A'], multi_f.citations())

        implementors = multi_f.implementors()
        self.assertIn('Us', implementors)
        self.assertIn('Them', implementors)
        # assertEqual, not the assertEquals alias (removed in Python 3.12)
        self.assertEqual(2, len(implementors))

        # The return value is not re-assigned: the assertions below rely on
        # `data` itself receiving the new feature columns.
        multi_f.featurize_dataframe(data, 'x')
        self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
        self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
        self.assertArrayAlmostEqual(data['z'], [3, 4, 5])
Beispiel #27
0
class FeatureGenerator:
    """Wrapper class that generates several types of elemental features."""

    def __init__(self):
        composition_featurizers = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(composition_featurizers)
        self.str2composition = StrToComposition()

    def generate(self,
                 df: pd.DataFrame,
                 ignore_errors: bool = False,
                 drop_mode=True):
        """Compute composition features for every formula in ``df``.

        Args:
            df: dataframe with a "formula" column holding chemical formulas.
            ignore_errors: forwarded to both featurization steps.
            drop_mode: when true, drop every column whose name starts with
                "Magpie" and contains "mode".

        Returns:
            The featurized dataframe. Rows containing any missing value after
            the formula conversion are removed, and an "NComp" column holds
            ``len`` of each composition.
        """
        with_composition = self.str2composition.featurize_dataframe(
            df, "formula", ignore_errors=ignore_errors).dropna()
        featurized = self.feature_calculators.featurize_dataframe(
            with_composition, col_id='composition',
            ignore_errors=ignore_errors)
        featurized["NComp"] = featurized["composition"].apply(len)

        if not drop_mode:
            return featurized

        magpie_mode_columns = [
            name for name in featurized.columns
            if "mode" in name and name.startswith("Magpie")
        ]
        return featurized.drop(columns=magpie_mode_columns)
Beispiel #28
0
class RFEstimator(BaseTesterEstimator):
    """Random-forest estimator fed with magpie element properties and
    element fractions computed from the inputs."""

    def __init__(self, pbar=False):
        self.pbar = pbar
        self.stc = StrToComposition()
        self.featurizer = MultipleFeaturizer([
            ElementProperty.from_preset("magpie"),
            ElementFraction(),
        ])
        self.regressor = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, verbose=3)

    def _generate_features(self, x):
        """Convert the inputs to compositions and return their feature array."""
        compositions = []
        for converted in self.stc.featurize_many(x, pbar=self.pbar):
            # StrToComposition yields one-element results; keep the composition
            compositions.append(converted[0])
        feature_rows = self.featurizer.featurize_many(compositions,
                                                      pbar=self.pbar)
        return np.asarray(feature_rows)

    def fit(self, x, y):
        """Featurize ``x`` and fit the random forest against ``y``."""
        self.regressor.fit(self._generate_features(x), y)

    def predict(self, x):
        """Featurize ``x`` and return the random forest predictions."""
        return self.regressor.predict(self._generate_features(x))
    def test_multiple(self):
        """Check featurize, citations, implementors and warning-free
        dataframe featurization of a MultipleFeaturizer.
        """
        multi_f = MultipleFeaturizer([self.single, self.multi])
        data = self.make_test_data()

        self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

        self.assertArrayEqual(['A'], multi_f.citations())

        implementors = multi_f.implementors()
        self.assertIn('Us', implementors)
        self.assertIn('Them', implementors)
        # assertEqual, not the assertEquals alias (removed in Python 3.12)
        self.assertEqual(2, len(implementors))

        # Ensure BaseFeaturizer operation without overridden featurize_dataframe:
        # featurizing must not emit any warning.
        with warnings.catch_warnings(record=True) as w:
            # The return value is not re-assigned: the assertions below rely
            # on `data` itself receiving the new feature columns.
            multi_f.featurize_dataframe(data, 'x')
            self.assertEqual(len(w), 0)
        self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
        self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
        self.assertArrayAlmostEqual(data['z'], [3, 4, 5])
    def test_multitype_multifeat(self):
        """MultipleFeaturizer must keep column types when one featurizer
        returns a non-numeric value."""
        # Single-process featurizer combining a numeric and a mixed-type one
        combined = MultipleFeaturizer(
            [SingleFeaturizer(), MultiTypeFeaturizer()])
        combined.set_n_jobs(1)

        # Featurize the standard test frame on column 'x'
        featurized = combined.featurize_dataframe(self.make_test_data(), 'x')

        # Compare the dtypes of the generated feature columns by name
        feature_columns = combined.feature_labels()
        found_dtypes = featurized[feature_columns].dtypes.astype(str).tolist()
        self.assertArrayEqual(['int64', 'object', 'int64'], found_dtypes)
        self.assertArrayAlmostEqual(featurized['y'], [2, 3, 4])
Beispiel #31
0
# Keep only the lowest-energy structure for every composition
data['composition'] = data['structure'].apply(lambda struct: struct.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0])

# Once sorted by energy above hull, the first row per formula is the one kept
data.sort_values('e_above_hull', ascending=True, inplace=True)
data.drop_duplicates('integer_formula', keep='first', inplace=True)
print('Reduced dataset to', len(data), 'unique compositions.')

data.reset_index(inplace=True, drop=True)

# Composition-based featurizer
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
])

# Compute the features in a single process
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Impute missing feature values, then fit a random forest.
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer` - confirm the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')

# Persist the trained model with pickle
with open('model.pkl', 'wb') as model_file:
    pkl.dump(model, model_file)
# Combined featurizer built from a long list of individual featurizers.
# The list mixes featurizers used directly, site featurizers wrapped in
# SiteStatsFingerprint, and composition featurizers wrapped in
# StructureComposition.
# NOTE(review): presumably the list order fixes the order of the generated
# feature columns - confirm against the matminer documentation.
featurizer = MultipleFeaturizer([
    GlobalSymmetryFeatures(),
    ElectronicRadialDistributionFunction(cutoff=7.5),
    # Site featurizers wrapped in SiteStatsFingerprint
    SiteStatsFingerprint(AGNIFingerprints(directions=(None, 'x', 'y'))),
    SiteStatsFingerprint(OPSiteFingerprint()),
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    SiteStatsFingerprint(GaussianSymmFunc()),
    SiteStatsFingerprint(EwaldSiteEnergy(accuracy=3)),
    DensityFeatures(),
    SiteStatsFingerprint(
        GeneralizedRadialDistributionFunction.from_preset('gaussian')),
    # Local property differences with an explicit property list read from
    # the MagpieData source
    SiteStatsFingerprint(
        LocalPropertyDifference(
            data_source=MagpieData(),
            properties=[
                "Number", "MendeleevNumber", "AtomicWeight", "MeltingT",
                "Column", "Row", "CovalentRadius", "Electronegativity",
                "NsValence", "NpValence", "NdValence", "NfValence", "NValence",
                "NsUnfilled", "NpUnfilled", "NdUnfilled", "NfUnfilled",
                "NUnfilled", "GSvolume_pa", "GSbandgap", "GSmagmom"
            ])),
    SiteStatsFingerprint(SiteElementalProperty.from_preset("seko-prb-2017")),
    EwaldEnergy(),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    # Composition featurizers wrapped in StructureComposition
    StructureComposition(ElementProperty.from_preset('magpie')),
    StructureComposition(AtomicOrbitals()),
    StructureComposition(BandCenter()),
    StructureComposition(ElectronegativityDiff()),
    StructureComposition(ElectronAffinity()),
    StructureComposition(Stoichiometry()),
    StructureComposition(ValenceOrbital()),
    StructureComposition(IonProperty()),
    StructureComposition(Miedema()),
    StructureComposition(YangSolidSolution())
])
Beispiel #33
0
def load_data_zT():
    """Load the thermoelectric features and responses, building them if needed.

    The cached CSV files in the results directory are read first. When a file
    is missing, the data is downloaded from Citrination, featurized from the
    chemical formulas, reduced to rows where all three responses are present,
    and written back to the cache.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where
        ``X_all`` is the feature array, ``Y_all`` the response array whose
        first column is the squared Seebeck coefficient, ``sign`` an array of
        +1/-1 per response, and ``keys_response`` the response column names.
        ``prefix`` is not defined in this function (presumably a module-level
        name; verify).

    Raises:
        KeyError: if the cache is missing and the environment variable
            ``CITRINATION_API_KEY`` is not set.
    """
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared', 'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1  # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first CSV column is the index written by to_csv below; drop it
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search
        # Dataset to query
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))

        query_result = search_client.pif_search(system_query)
        print("    Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]
        # Raw response columns, as named in the downloaded data
        keys_original = [
            'Seebeck coefficient', 'Electrical resistivity',
            'Thermal conductivity'
        ]
        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]

        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print("    Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only the rows where every response value is present.
        # The index labels are used as row positions for both X and df_data.
        index_valid_all = df_data[keys_original].dropna().index.values
        X_all = X[index_valid_all, :]
        Y_all = df_data[keys_original].iloc[index_valid_all].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print("    Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all, columns=keys_response).to_csv(results_dir +
                                                               file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix
class GetFeatures:  # pylint:disable=too-many-instance-attributes
    """Featurize the metal sites of one structure and optionally pickle them."""
    def __init__(self, structure: Structure, outpath: Union[str, Path]):
        """Set up logging, the output location and the site featurizer.

        Args:
            structure (Structure): Pymatgen Structure object
            outpath (Union[str, Path]): directory to which the features will
                be dumped. It is created when it is given and does not exist
                yet. With an empty string or None the pickle file name is
                used without a directory prefix.
        """
        featurizelogger = logging.getLogger("Featurize")
        featurizelogger.setLevel(logging.INFO)
        logging.basicConfig(
            format="%(filename)s: %(message)s",
            level=logging.INFO,
        )

        self.outpath = outpath
        if ((outpath != "") and (outpath is not None)
                and (not os.path.exists(self.outpath))):
            os.mkdir(self.outpath)
        self.logger = featurizelogger
        self.path = None
        self.structure = structure
        self.metal_sites = []
        self.metal_indices = []
        self.features = []
        # No source path is known yet, so name the pickle after the formula;
        # from_file overrides this with the stem of the structure file.
        self.outname = self._outname_for(
            self.structure.formula.replace(" ", "_"))
        self.featurizer = MultipleFeaturizer([
            CrystalNNFingerprint.from_preset("ops"),
            LocalPropertyStatsNew.from_preset("interpretable"),
            GaussianSymmFunc(),
        ])

    def _outname_for(self, stem: str) -> str:
        """Return the path of the pickle file ``<stem>.pkl`` in the output
        directory."""
        # os.path.join raises TypeError on None, although None is an accepted
        # outpath, so fall back to an empty directory prefix.
        directory = "" if self.outpath is None else self.outpath
        return os.path.join(directory, "".join([stem, ".pkl"]))

    @classmethod
    def from_file(cls, structurepath: Union[str, Path],
                  outpath: Union[str, Path]) -> object:
        """Construct a featurizer class from path to structure
            and an output path

        Args:
            structurepath (Union[str, Path]): Path to structure file
            outpath (Union[str, Path]): Path to which the outputs should be written.

        Raises:
            ValueError: If the structure file could not be read

        Returns:
            object: Instance of the GetFeatures class
        """
        s = GetFeatures._read_safe(structurepath)
        featureclass = cls(s, outpath)
        featureclass.path = structurepath
        # Name the output after the structure file instead of the formula
        featureclass.outname = featureclass._outname_for(
            Path(featureclass.path).stem)
        return featureclass

    @classmethod
    def from_string(cls, structurestring: str, outpath: Union[str,
                                                              Path]) -> object:
        """Constructor for the webapp, using a string of a structure file,
        e.g., a CIF

        Args:
            structurestring (str): File content of a CIF as string
            outpath (Union[str, Path]): Path to which the output should be written.

        Raises:
            ValueError: In case the CIF could not be parsed

        Returns:
            object: Instance of GetFeatures
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                cp = CifParser.from_string(structurestring)
                # Only the first structure found in the CIF is used
                s = cp.get_structures()[0]
            except Exception as execp:
                raise ValueError("Pymatgen could not parse ciffile") from execp
            else:
                return cls(s, outpath)

    @staticmethod
    def _read_safe(path: Union[str, Path]):
        """Read a structure file, failing early if it cannot be loaded.

        Args:
            path (Union[str, Path]): Path to the structure file

        Raises:
            ValueError: If reading or converting the file fails

        Returns:
            The structure obtained from ``AseAtomsAdaptor.get_structure``
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                atoms = read(path)
                structure = AseAtomsAdaptor.get_structure(
                    atoms)  # ase parser is a bit more robust
                return structure
            except Exception as execpt:  # pylint: disable=broad-except
                raise ValueError("Could not read structure") from execpt

    def _get_metal_sites(self):
        """Stores all metal sites of structure to list

        A site counts as metal when the first element of its species is a
        metal. Sites and their indices are appended to ``metal_sites`` and
        ``metal_indices``.
        """
        for idx, site in enumerate(self.structure):
            if site.species.elements[0].is_metal:
                self.metal_sites.append(site)
                self.metal_indices.append(idx)

    def _get_feature_vectors(self, site):
        """Runs matminer on one site

        Args:
            site: index of the site in the structure

        Returns:
            The result of ``self.featurizer.featurize`` for that site
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            X = self.featurizer.featurize(self.structure, site)

        return X

    def _featurize_metal_sites(self):
        """Append one feature dictionary per collected metal site to
        ``self.features``."""
        self.logger.info("iterating over {} metal sites".format(
            len(self.metal_sites)))
        for site_index, metal_site in zip(self.metal_indices,
                                          self.metal_sites):
            self.features.append({
                "metal": metal_site.species_string,
                "feature": self._get_feature_vectors(site_index),
                "coords": metal_site.coords,
            })

    def _dump_features(self):
        """Dumps all the features into one pickle file"""
        with open(self.outname, "wb") as filehandle:
            pickle.dump(list(self.features), filehandle)

    def return_features(self) -> List[dict]:
        """Runs featurization and returns a list of dictionaries

        Featurization errors are logged instead of raised, so the returned
        list holds whatever was collected before the error.

        Returns:
            List[dict]: List of dictionaries of the form
                {"metal": ..., "feature": ..., "coords": ...},
                i.e. features for one metal site
        """
        self._get_metal_sites()
        try:
            self._featurize_metal_sites()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error("Could not featurize because of {}".format(e))

        return self.features

    def _run_featurization(self):
        """Featurize all metal sites and dump the result to the pickle file.

        Deprecated: emits a DeprecationWarning. Errors during featurization
        or dumping are logged instead of raised.
        """
        warnings.warn(
            "This method is deprecated, and will be removed in a future release",
            DeprecationWarning,
        )
        self._get_metal_sites()
        try:
            self._featurize_metal_sites()
            self._dump_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error("could not featurize {} because of {}".format(
                self.path, e))
Beispiel #35
0
class GetFeatures:
    """Featurize the metal sites of one structure and optionally pickle them."""
    def __init__(self, structure, outpath):
        """Set up logging, the output location and the site featurizer.

        Args:
            structure: structure to featurize; its ``formula`` attribute is
                used to name the output file
            outpath (str): path to which the features will be dumped; the
                directory is created when the path is non-empty and missing
        """
        featurizelogger = logging.getLogger('Featurize')
        featurizelogger.setLevel(logging.INFO)
        logging.basicConfig(
            format='%(filename)s: %(message)s',
            level=logging.INFO,
        )

        self.outpath = outpath
        if outpath != '' and not os.path.exists(self.outpath):
            os.mkdir(self.outpath)
        self.logger = featurizelogger
        self.path = None
        self.structure = structure
        self.metal_sites = []
        self.metal_indices = []
        self.features = []
        # No source path is known yet, so name the pickle after the formula;
        # from_file overrides this with the stem of the structure file.
        self.outname = os.path.join(
            self.outpath,
            ''.join([self.structure.formula.replace(' ', '_'), '.pkl']),
        )
        self.featurizer = MultipleFeaturizer([
            CrystalNNFingerprint.from_preset('ops'),
            LocalPropertyStatsNew.from_preset('interpretable'),
            GaussianSymmFunc(),
        ])

    @classmethod
    def from_file(cls, structurepath, outpath):
        """
        Construct a featurizer class from path to structure and an output path

        Raises:
            ValueError: if the structure file could not be read
        """
        s = GetFeatures.read_safe(structurepath)
        featureclass = cls(s, outpath)
        featureclass.path = structurepath
        # Name the output after the structure file instead of the formula
        featureclass.outname = os.path.join(
            featureclass.outpath,
            ''.join([Path(featureclass.path).stem, '.pkl']))
        return featureclass

    @classmethod
    def from_string(cls, structurestring, outpath):
        """
        Constructor for the webapp, taking the content of a CIF as string

        Raises:
            ValueError: if the CIF could not be parsed
        """
        from pymatgen.io.cif import CifParser

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                cp = CifParser.from_string(structurestring)
                # Only the first structure found in the CIF is used
                s = cp.get_structures()[0]
            except Exception as exc:
                # Chain explicitly so the parser error is kept as the cause
                raise ValueError('Pymatgen could not parse ciffile') from exc
            else:
                return cls(s, outpath)

    @staticmethod
    def read_safe(path):
        """Read a structure file, failing early if it cannot be loaded.

        Raises:
            ValueError: if reading or converting the file fails

        Returns:
            The structure obtained from ``AseAtomsAdaptor.get_structure``
        """
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                atoms = read(path)
                structure = AseAtomsAdaptor.get_structure(
                    atoms)  # ase parser is a bit more robust
                return structure
            except Exception as exc:  # pylint: disable=broad-except
                # Chain explicitly so the reader error is kept as the cause
                raise ValueError('Could not read structure') from exc

    def get_metal_sites(self):
        """Stores all metal sites of structure to list

        A site counts as metal when the first element of its species is a
        metal. Sites and their indices are appended to ``metal_sites`` and
        ``metal_indices``.
        """
        for idx, site in enumerate(self.structure):
            if site.species.elements[0].is_metal:
                self.metal_sites.append(site)
                self.metal_indices.append(idx)

    def get_feature_vectors(self, site):
        """Runs matminer on one site, given by its index in the structure"""
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            X = self.featurizer.featurize(self.structure, site)

        return X

    def _featurize_metal_sites(self):
        """Append one feature dictionary per collected metal site to
        ``self.features``."""
        self.logger.info('iterating over {} metal sites'.format(
            len(self.metal_sites)))
        for site_index, metal_site in zip(self.metal_indices,
                                          self.metal_sites):
            self.features.append({
                'metal': metal_site.species_string,
                'feature': self.get_feature_vectors(site_index),
                'coords': metal_site.coords,
            })

    def dump_features(self):
        """Dumps all the features into one pickle file"""
        with open(self.outname, 'wb') as filehandle:
            pickle.dump(list(self.features), filehandle)

    def return_features(self):
        """Runs featurization and returns the list of feature dictionaries.

        Each entry has the keys 'metal', 'feature' and 'coords'.
        Featurization errors are logged instead of raised, so the returned
        list holds whatever was collected before the error.
        """
        self.get_metal_sites()
        try:
            self._featurize_metal_sites()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error('could not featurize because of {}'.format(e))

        return self.features

    def run_featurization(self):
        """Featurize all metal sites and dump the result to the pickle file.

        Errors during featurization or dumping are logged instead of raised.
        """
        self.get_metal_sites()
        try:
            self._featurize_metal_sites()
            self.dump_features()
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error('could not featurize {} because of {}'.format(
                self.path, e))
Beispiel #36
-1
    def test_multiple(self):
        # test iterating over both entries and featurizers
        for iter_entries in [True, False]:
            multi_f = MultipleFeaturizer([self.single, self.multi],
                                         iterate_over_entries=iter_entries)
            data = self.make_test_data()

            self.assertArrayAlmostEqual([2, 0, 3], multi_f.featurize(1))

            self.assertArrayEqual(['A'], multi_f.citations())

            implementors = multi_f.implementors()
            self.assertIn('Us', implementors)
            self.assertIn('Them', implementors)
            self.assertEqual(2, len(implementors))

            # Ensure BaseFeaturizer operation without overriden featurize_dataframe
            with warnings.catch_warnings(record=True) as w:
                multi_f.featurize_dataframe(data, 'x')
                self.assertEqual(len(w), 0)
            self.assertArrayAlmostEqual(data['y'], [2, 3, 4])
            self.assertArrayAlmostEqual(data['w'], [0, 1, 2])
            self.assertArrayAlmostEqual(data['z'], [3, 4, 5])

            f = MatrixFeaturizer()
            multi_f = MultipleFeaturizer([self.single, self.multi, f])
            data = self.make_test_data()
            with warnings.catch_warnings(record=True) as w:
                multi_f.featurize_dataframe(data, 'x')
                self.assertEqual(len(w), 0)

            self.assertArrayAlmostEqual(data['representation'][0],
                                        [[1.0, 0.0], [0.0, 1.0]])