def test_StructureFeaturizers_needs_fitting(self):
    """Check that ``needs_fit`` reflects the structure featurizer set used.

    An AutoFeaturizer built from the ``best`` structure set is expected to
    report ``needs_fit`` as False, while one built from the ``all`` set is
    expected to report it as True.
    """
    structure_sets = StructureFeaturizers()
    no_fit_set = structure_sets.best
    need_fit_set = StructureFeaturizers().all

    autofeaturizer_no_fit = AutoFeaturizer(
        featurizers={"structure": no_fit_set}
    )
    autofeaturizer_need_fit = AutoFeaturizer(
        featurizers={"structure": need_fit_set}
    )

    self.assertTrue(autofeaturizer_need_fit.needs_fit)
    self.assertFalse(autofeaturizer_no_fit.needs_fit)
def test_structure_featurizers(self):
    """Compare the structure featurizers in ``self.allfs`` to those in ``sf``.

    Some class names are skipped: two hard-coded ones, every featurizer in
    the ``matrix`` set of StructureFeaturizers, and "SOAP" when ``dscribe``
    is falsy.
    """
    skipped = ["StructureComposition", "CGCNNFeaturizer"]
    skipped.extend(
        featurizer.__class__.__name__
        for featurizer in StructureFeaturizers().matrix
    )
    if not dscribe:
        skipped.append("SOAP")

    expected = self.get_featurizers(sf, skipped)
    implemented = self.allfs.structure
    self._test_features_implemented(implemented, expected)
def test_presets(self):
    """Check the featurizer count produced by the "fast" preset.

    After fitting on the first ``self.limit`` rows of ``self.test_df``, the
    number of structure plus composition featurizers held by the
    AutoFeaturizer must equal the size of the combined ``fast`` sets.
    """
    sample = copy.copy(self.test_df.iloc[:self.limit])
    autofeaturizer = AutoFeaturizer(preset="fast")
    # Only the fitted state of the AutoFeaturizer is inspected below; the
    # transformed frame itself is not needed.
    autofeaturizer.fit_transform(sample, "K_VRH")

    expected = CompositionFeaturizers().fast + StructureFeaturizers().fast
    n_used = sum(
        len(autofeaturizer.featurizers[ftype])
        for ftype in ("structure", "composition")
    )
    self.assertEqual(n_used, len(expected))
def test_presets(self):
    """Check the featurizer count produced by the "express" preset.

    After fitting on the first ``self.limit`` rows of ``self.test_df``, the
    structure, composition and removed featurizers together must account
    for every featurizer in the combined ``express`` sets.
    """
    sample = copy.copy(self.test_df.iloc[: self.limit])
    autofeaturizer = AutoFeaturizer(preset="express")
    # Only the fitted state of the AutoFeaturizer is inspected below; the
    # transformed frame itself is not needed.
    autofeaturizer.fit_transform(sample, "K_VRH")

    expected = (
        CompositionFeaturizers().express + StructureFeaturizers().express
    )
    n_kept = sum(
        len(autofeaturizer.featurizers[ftype])
        for ftype in ("structure", "composition")
    )
    n_total = n_kept + len(autofeaturizer.removed_featurizers)
    self.assertEqual(n_total, len(expected))
def __init__(self, cache_src=None, preset=None, featurizers=None,
             exclude=None, functionalize=False, ignore_cols=None,
             ignore_errors=True, drop_inputs=True, guess_oxistates=True,
             multiindex=False, do_precheck=True, n_jobs=None, logger=True,
             composition_col="composition", structure_col="structure",
             bandstructure_col="bandstructure", dos_col="dos"):
    """Validate the configuration and resolve the featurizer sets to use.

    Exactly one of ``preset`` and ``featurizers`` must be supplied. With a
    preset, the featurizers for each of the four input columns are looked up
    as the attribute named ``preset`` on the matching featurizer-set class.
    With ``featurizers``, the given dict is validated and used as-is.

    Args:
        cache_src (str): Cache file name. If given it must contain "json"
            (case-insensitive). Stored on the instance.
        preset (str): Name of the preset attribute to read from each
            featurizer-set class (e.g. "express", "all", "debug", "heavy").
        featurizers (dict): Maps a column name (one of the four ``*_col``
            arguments) to an iterable of featurizer objects. Every
            featurizer's class name must appear in the ``all`` set of the
            matching featurizer-set class.
        exclude (list): Passed as ``exclude=`` to each featurizer-set class
            when a preset is used. Falsy values become an empty list.
        functionalize, ignore_errors, drop_inputs, guess_oxistates,
        multiindex, do_precheck, n_jobs: Stored on the instance unchanged;
            not otherwise used by this constructor.
        ignore_cols (list): Stored on the instance; falsy values become an
            empty list.
        logger: Forwarded to ``self.get_logger``; the result is stored as
            ``self._logger``.
        composition_col (str): Key used for composition featurizers.
        structure_col (str): Key used for structure featurizers.
        bandstructure_col (str): Key used for band structure featurizers.
        dos_col (str): Key used for DOS featurizers.

    Raises:
        AutomatminerError: If both or neither of ``preset`` and
            ``featurizers`` are given.
        TypeError: If ``featurizers`` is given but is not a dict.
        KeyError: If ``featurizers`` has keys other than the four column
            names.
        ValueError: If a supplied featurizer is not in the supported set for
            its column, or if ``cache_src`` does not contain "json".
    """
    if featurizers and preset:
        raise AutomatminerError("Featurizers and preset were both set. "
                                "Please either use a preset ('express', "
                                "'all', 'debug', 'heavy') or set "
                                "featurizers manually.")
    if not featurizers and not preset:
        raise AutomatminerError("Please specify set(s) of featurizers to "
                                "use either through the featurizers "
                                "argument or through the preset argument.")

    self.cache_src = cache_src
    self.preset = "express" if preset is None else preset
    self._logger = self.get_logger(logger)
    self.featurizers = featurizers
    self.exclude = exclude if exclude else []
    self.functionalize = functionalize
    self.ignore_cols = ignore_cols or []
    self.is_fit = False
    self.fitted_input_df = None
    self.converted_input_df = None
    self.ignore_errors = ignore_errors
    self.drop_inputs = drop_inputs
    self.multiindex = multiindex
    self.do_precheck = do_precheck
    self.n_jobs = n_jobs
    self.guess_oxistates = guess_oxistates
    self.features = []
    self.auto_featurizer = self.featurizers is None
    self.removed_featurizers = None
    self.composition_col = composition_col
    self.structure_col = structure_col
    self.bandstruct_col = bandstructure_col
    self.dos_col = dos_col

    _supported_featurizers = {
        composition_col: CompositionFeaturizers,
        structure_col: StructureFeaturizers,
        bandstructure_col: BSFeaturizers,
        dos_col: DOSFeaturizers,
    }

    # user-set featurizers
    if self.featurizers:
        if not isinstance(self.featurizers, dict):
            raise TypeError("Featurizers must be a dictionary with keys "
                            "matching your {}".format(_COMMON_COL_ERR_STR))

        invalid_ftypes = [
            f for f in self.featurizers if f not in _supported_featurizers
        ]
        if invalid_ftypes:
            # Name the offending keys so the caller can see what to fix.
            raise KeyError(
                "The following keys were specified as featurizer"
                " types but were not set in {}: {}"
                "".format(_COMMON_COL_ERR_STR, invalid_ftypes))

        for ftype, fset in self.featurizers.items():
            _allowed = {
                f.__class__.__name__
                for f in _supported_featurizers[ftype]().all
            }
            for f in fset:
                if f.__class__.__name__ not in _allowed:
                    raise ValueError(
                        "The {} featurizer {} is not supported by "
                        "AutoFeaturizer. Try updating your version of "
                        "automatminer and matminer.".format(ftype, f))

    # auto-set featurizers
    else:
        self.featurizers = {
            featurizer_type: getattr(
                featurizer_set(exclude=self.exclude), self.preset)
            for featurizer_type, featurizer_set
            in _supported_featurizers.items()
        }

    # Check if any featurizers need fitting (useful for MatPipe)
    fittable_fs = StructureFeaturizers().need_fit
    self.fittable_fcls = {f.__class__.__name__ for f in fittable_fs}

    # Currently structure featurizers are the only featurizer types which
    # can be fittable. A user-supplied dict may omit the structure column
    # entirely, in which case nothing needs fitting.
    structure_fs = self.featurizers.get(self.structure_col, ())
    self.needs_fit = any(
        f.__class__.__name__ in self.fittable_fcls for f in structure_fs
    )

    if self.needs_fit and self.cache_src:
        self.logger.warning(
            self._log_prefix +
            "Using cached features on fittable featurizers! "
            "Please make sure you are not benchmarking with "
            "these options enabled; it is likely you will be "
            "leaking data (i.e., features) from the testing "
            "sets into the training.")

    if self.cache_src and "json" not in self.cache_src.lower():
        raise ValueError("The cache_src filename does not contain json. "
                         "JSON is the required file type for featurizer "
                         "caching.")

    self.min_precheck_frac = 0.9
def setUp(self):
    """Create one instance of each featurizer-set class for the tests.

    Also records the preset attribute names listed in ``required_attrs``.
    """
    self.required_attrs = ["express", "heavy", "debug", "all"]

    # Short attribute name -> featurizer-set class, instantiated in order.
    fixtures = (
        ("c", CompositionFeaturizers),
        ("s", StructureFeaturizers),
        ("b", BSFeaturizers),
        ("d", DOSFeaturizers),
    )
    for attr_name, featurizer_set_cls in fixtures:
        setattr(self, attr_name, featurizer_set_cls())