    def test_benchmarking(self):
        pipe = MatPipe(**debug_config)
        df = self.df.iloc[500:700]
        df_test = pipe.benchmark(df, self.target, test_spec=0.25)
        self.assertEqual(df_test.shape[0], 50)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue(r2_score(true, test) > 0.5)
    def test_instantiation(self):
        learner = self.config["learner"]
        autofeaturizer = self.config["autofeaturizer"]
        with self.assertRaises(AutomatminerError):
            MatPipe(learner=learner)
        with self.assertRaises(AutomatminerError):
            MatPipe(autofeaturizer=autofeaturizer)
        with self.assertRaises(AutomatminerError):
            MatPipe(autofeaturizer=autofeaturizer, learner=learner)
        MatPipe()
        MatPipe(**self.config)
    def setUp(self):
        df = load_dataset("elastic_tensor_2015").rename(
            columns={"formula": "composition"})
        self.df = df[["composition", "K_VRH"]]
        self.df_struc = df[["composition", "structure", "K_VRH"]]
        self.extra_features = df["G_VRH"]
        self.target = "K_VRH"
        self.config = get_preset_config("debug_single")
        self.config_cached = get_preset_config("debug_single",
                                               cache_src=CACHE_SRC)
        self.pipe = MatPipe(**self.config)
        self.pipe_cached = MatPipe(**self.config_cached)
    def test_from_preset(self):
        for preset in get_available_presets():
            MatPipe.from_preset(preset)
        MatPipe.from_preset("debug", cache_src="some_file.json")
        MatPipe.from_preset("debug")
        MatPipe.from_preset("debug", log_level=1)
    def test_user_features(self):
        pipe = MatPipe(**debug_config)
        df = self.df
        df["G_VRH"] = self.extra_features
        self.assertTrue("G_VRH" in df.columns)
        self.assertTrue("K_VRH" in df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertTrue("G_VRH" in pipe.learner.features)
        df_test = pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue(r2_score(true, test) > 0.75)
    def test_persistence(self):
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        # Load test
        self.pipe.save(filename=PIPE_PATH)
        self.pipe = MatPipe.load(PIPE_PATH)
        df_test = self.pipe.predict(self.df[-220:-201])
        self.assertTrue(self.target in df_test.columns)
        self.assertTrue(self.target + " predicted" in df_test.columns)

        # Version test
        self.pipe.version = "not a real version"
        self.pipe.save(VERSION_PIPE_PATH)
        with self.assertRaises(AutomatminerError):
            MatPipe.load(VERSION_PIPE_PATH)
    def test_persistence_and_digest(self):
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        filename = os.path.join(test_dir, PIPE_PATH)
        self.pipe.save(filename=filename)
        self.pipe = MatPipe.load(filename, logger=False)
        df_test = self.pipe.predict(self.df[-220:-201], self.target)
        self.assertTrue(self.target in df_test.columns)
        self.assertTrue(self.target + " predicted" in df_test.columns)

        digest_file = os.path.join(test_dir, DIGEST_PATH)
        digest = self.pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertTrue(isinstance(digest, str))
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        learner_name = pipe_config_dict["learner_name"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_file based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)

        if not os.path.exists(base_save_dir):
            os.makedirs(base_save_dir)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs passed into RF regressor/classifier "
                "because rf is being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError("{} not supported yet!"
                             "".format(learner_name))
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }
        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # The dataset should already be set up correctly as json beforehand.
        # This includes targets being converted to classification, removing
        # extra columns, and having the names of the featurization columns set
        # to the same as in the matpipe config, etc.
        df = load_dataframe_from_json(data_file)

        pipe.fit(df, target)
        pipe.save(os.path.join(base_save_dir, "pipe.p"))
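# The comment in run_task above assumes the dataframe has already been cleaned
# and serialized to json, with column names matching the matpipe config. A
# minimal sketch of that preparation step, assuming the elastic_tensor_2015
# dataset and the "composition"/"K_VRH" column convention used in the tests
# above, might look like this (the output filename is illustrative only):
import os

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.utils.io import store_dataframe_as_json

df_prep = load_dataset("elastic_tensor_2015").rename(
    columns={"formula": "composition"})  # featurization col name matches config
df_prep = df_prep[["composition", "K_VRH"]]  # drop extra columns, keep target

data_dir = os.environ["AMM_DATASET_DIR"]
store_dataframe_as_json(df_prep, os.path.join(data_dir, "elasticity_K_VRH.json"))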
    def test_transferability(self):
        df_train = self.df.iloc[:200]
        df_test = self.df.iloc[201:250]
        pipe = MatPipe(**debug_config)
        pipe.fit(df_train, self.target)
        df_test = pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue("composition" not in df_test.columns)
        self.assertTrue(r2_score(true, test) > 0.5)

        # Use the same pipe object by refitting and reusing
        df_train2 = self.df.iloc[250:450]
        df_test2 = self.df.iloc[451:500]
        pipe.fit(df_train2, self.target)
        df_test2 = pipe.predict(df_test2, self.target)
        true2 = df_test2[self.target]
        test2 = df_test2[self.target + " predicted"]
        self.assertTrue("composition" not in df_test2.columns)
        self.assertTrue(r2_score(true2, test2) > 0.5)
        if len(feature_ids) != 1:
            raise ValueError("This method does not yet support computing "
                             "plots for more than one feature at a time.")
        axs = self.interpreter.partial_dependence.plot_partial_dependence(
            feature_ids,
            self.model,
            sample=False,
            progressbar=False,
            with_variance=True
        )
        return axs[0][0], axs[0][1]


if __name__ == '__main__':
    if not os.path.exists("tests/test_pipe.p"):
        df = load_dataset('elastic_tensor_2015')
        df = df[["formula", "K_VRH"]]
        df = df.rename({"formula": "composition"}, axis=1)
        fitted_pipeline = MatPipe().fit(df, "K_VRH")
        fitted_pipeline.save("tests/test_pipe.p")
    else:
        fitted_pipeline = MatPipe.load("tests/test_pipe.p")

    analyzer = Analytics(fitted_pipeline)
    feature_importance = analyzer.get_feature_importance()
    for feature in reversed(feature_importance.index):
        analyzer.plot_partial_dependence(feature)
class TestMatPipe(unittest.TestCase):
    def setUp(self):
        df = load_dataset("elastic_tensor_2015").rename(
            columns={"formula": "composition"})
        self.df = df[["composition", "K_VRH"]]
        self.df_struc = df[["composition", "structure", "K_VRH"]]
        self.extra_features = df["G_VRH"]
        self.target = "K_VRH"
        self.config = get_preset_config(config_preset, n_jobs=n_jobs)
        self.config_cached = get_preset_config(config_preset,
                                               cache_src=CACHE_SRC,
                                               n_jobs=n_jobs)
        self.pipe = MatPipe(**self.config)
        self.pipe_cached = MatPipe(**self.config_cached)

    @unittest.skipIf("transferability" in skip, reason)
    def test_transferability(self):
        df_train = self.df.iloc[:200]
        df_test = self.df.iloc[201:250]
        self.pipe.fit(df_train, self.target)
        df_test = self.pipe.predict(df_test)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue("composition" not in df_test.columns)
        self.assertTrue(r2_score(true, test) > 0.5)

        # Use the same pipe object by refitting and reusing
        df_train2 = self.df.iloc[250:450]
        df_test2 = self.df.iloc[451:500]
        self.pipe.fit(df_train2, self.target)
        df_test2 = self.pipe.predict(df_test2)
        true2 = df_test2[self.target]
        test2 = df_test2[self.target + " predicted"]
        self.assertTrue("composition" not in df_test2.columns)
        self.assertTrue(r2_score(true2, test2) > 0.5)

    @unittest.skipIf("user_features" in skip, reason)
    def test_user_features(self):
        df = self.df
        df["G_VRH"] = self.extra_features
        self.assertTrue("G_VRH" in df.columns)
        self.assertTrue("K_VRH" in df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        self.pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertTrue("G_VRH" in self.pipe.learner.features)
        df_test = self.pipe.predict(df_test)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue(r2_score(true, test) > 0.75)

    @unittest.skipIf("predict_kwargs" in skip, reason)
    def test_predict_kwargs(self):
        # Test mat_pipe.predict()'s ignore and output_col kwargs.
        df_train = self.df.iloc[:200]
        df_test = self.df.iloc[201:250]
        ef = "ExtraFeature"
        df_test[ef] = [i + 100 for i in range(df_test.shape[0])]
        self.pipe.fit(df_train, self.target)

        self.assertTrue(ef in df_test.columns)
        self.assertTrue("composition" in df_test.columns)

        ignore = [ef, "composition"]
        predicted_ignored = self.pipe.predict(df_test, ignore=ignore)
        self.assertTrue(ef in predicted_ignored.columns)
        self.assertTrue("composition" in predicted_ignored.columns)

        predicted_none = self.pipe.predict(df_test, ignore=None)
        self.assertFalse(ef in predicted_none.columns)
        self.assertFalse("composition" in predicted_none.columns)

        some = ["composition"]
        predicted_some = self.pipe.predict(df_test, ignore=some)
        self.assertFalse(ef in predicted_some.columns)
        self.assertTrue("composition" in predicted_some.columns)

        output_col_name = self.target + "_pred"
        predicted_custom_col = self.pipe.predict(df_test,
                                                 output_col=output_col_name)
        self.assertTrue(output_col_name in predicted_custom_col)

    @unittest.skipIf("benchmarking" in skip, reason)
    def test_benchmarking_no_cache(self):
        pipe = self.pipe
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=True, pipe=pipe)
        self._run_benchmark(cache=False, pipe=pipe)

    @unittest.skipIf("benchmarking" in skip, reason)
    def test_benchmarking_cache(self):
        pipe = self.pipe_cached
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=False, pipe=pipe)
        self._run_benchmark(cache=True, pipe=pipe)

    @unittest.skipIf("persistence" in skip, reason)
    def test_persistence(self):
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        # Load test
        self.pipe.save(filename=PIPE_PATH)
        self.pipe = MatPipe.load(PIPE_PATH)
        df_test = self.pipe.predict(self.df[-220:-201])
        self.assertTrue(self.target in df_test.columns)
        self.assertTrue(self.target + " predicted" in df_test.columns)

        # Version test
        self.pipe.version = "not a real version"
        self.pipe.save(VERSION_PIPE_PATH)
        with self.assertRaises(AutomatminerError):
            MatPipe.load(VERSION_PIPE_PATH)

    @unittest.skipIf("digests" in skip, reason)
    def test_summarize_and_inspect(self):
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        for ext in AMM_SUPPORTED_EXTS:
            digest = self.pipe.inspect(filename=DIGEST_PATH + ext)
            self.assertTrue(os.path.isfile(DIGEST_PATH + ext))
            self.assertTrue(isinstance(digest, dict))

        for ext in AMM_SUPPORTED_EXTS:
            digest = self.pipe.summarize(filename=DIGEST_PATH + ext)
            self.assertTrue(os.path.isfile(DIGEST_PATH + ext))
            self.assertTrue(isinstance(digest, dict))

    def _run_benchmark(self, cache, pipe):
        # Test static, regular benchmark (no fittable featurizers)
        df = self.df.iloc[500:600]
        kfold = KFold(n_splits=2)
        df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
        self.assertEqual(len(df_tests), kfold.n_splits)

        # Make sure we retain a good amount of test samples...
        df_tests_all = pd.concat(df_tests)
        self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

        # Test static subset of kfold
        df2 = self.df.iloc[500:550]
        df_tests2 = pipe.benchmark(df2, self.target, kfold, fold_subset=[0],
                                   cache=cache)
        self.assertEqual(len(df_tests2), 1)

    def tearDown(self) -> None:
        digests = [DIGEST_PATH + ext for ext in AMM_SUPPORTED_EXTS]
        for remnant in [CACHE_SRC, PIPE_PATH, VERSION_PIPE_PATH, *digests]:
            if os.path.exists(remnant):
                os.remove(remnant)
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        fold = fw_spec["fold"]
        kfold_config = fw_spec["kfold_config"]
        target = fw_spec["target"]
        data_pickle = fw_spec["data_pickle"]
        clf_pos_label = fw_spec["clf_pos_label"]
        problem_type = fw_spec["problem_type"]
        learner_name = pipe_config_dict["learner_name"]
        cache = fw_spec["cache"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_pickle based on computing resource
        data_dir = os.environ['AMM_DATASET_DIR']
        data_file = os.path.join(data_dir, data_pickle)

        # Modify save_dir based on computing resource
        bench_dir = os.environ['AMM_BENCH_DIR']
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)
        save_dir = fw_spec.pop("save_dir")
        save_dir = os.path.join(base_save_dir, save_dir)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        from multiprocessing import cpu_count
        ont = os.environ.get("OMP_NUM_THREADS", None)
        print("Number of omp threads: {}".format(ont))
        print("Number of cpus: {}".format(cpu_count()))
        # n_jobs = int(cpu_count()/2)
        # print("Setting number of featurization jobs to: {}".format(n_jobs))
        # autofeaturizer_kwargs["n_jobs"] = n_jobs
        # learner_kwargs["verbosity"] = 3

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs passed into RF regressor/classifier "
                "because rf is being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs))
        else:
            raise ValueError("{} not supported by RunPipe yet!"
                             "".format(learner_name))

        if cache:
            autofeaturizer_kwargs["cache_src"] = os.path.join(
                base_save_dir, "features.json")
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)
        }
        logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
        pipe = MatPipe(**pipe_config, logger=logger)

        # Set up dataset
        # The dataset should already be set up correctly as a pickle beforehand.
        # This includes targets being converted to classification, removing
        # extra columns, and having the names of the featurization columns set
        # to the same as in the matpipe config, etc.
        df = pd.read_pickle(data_file)

        # Check other parameters that would otherwise not be checked until
        # after benchmarking; hopefully this saves some errors at the end
        # during scoring.
        if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
            raise ValueError("Problem must be either classification or "
                             "regression.")
        elif problem_type == AMM_CLF_NAME:
            if not isinstance(clf_pos_label, (str, bool)):
                raise TypeError("The classification positive label should be "
                                "a string or bool, not {}."
"".format(type(clf_pos_label))) elif clf_pos_label not in df[target]: raise ValueError("The classification positive label should be" "present in the target column.") elif len(df[target].unique()) > 2: raise ValueError("Only binary classification scoring available" "at this time.") # Set up testing scheme if problem_type == AMM_REG_NAME: kfold = KFold(**kfold_config) else: kfold = StratifiedKFold(**kfold_config) if fold >= kfold.n_splits: raise ValueError("{} is out of range for KFold with n_splits=" "{}".format(fold, kfold)) # Run the benchmark t1 = time.time() results = pipe.benchmark(df, target, kfold, fold_subset=[fold], cache=True) result_df = results[0] elapsed_time = time.time() - t1 # Save everything pipe.save(os.path.join(save_dir, "pipe.p")) pipe.digest(os.path.join(save_dir, "digest.txt")) result_df.to_csv(os.path.join(save_dir, "test_df.csv")) pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv")) # Evaluate model true = result_df[target] test = result_df[target + " predicted"] pass_to_storage = {} if problem_type == AMM_REG_NAME: pass_to_storage["r2"] = r2_score(true, test) pass_to_storage["mae"] = mean_absolute_error(true, test) pass_to_storage['rmse'] = sqrt(mean_squared_error(true, test)) elif problem_type == AMM_CLF_NAME: pass_to_storage["f1"] = f1_score(true, test, pos_label=clf_pos_label) pass_to_storage["roc_auc"] = roc_auc_score(true, test) pass_to_storage["accuracy"] = accuracy_score(true, test) else: raise ValueError("Scoring method for problem type {} not supported" "".format(problem_type)) # Extract important details for storage try: # TPOT Adaptor best_pipeline = [ str(step) for step in pipe.learner.best_pipeline.steps ] except AttributeError: best_pipeline = str(pipe.learner.best_pipeline) features = pipe.learner.features n_features = len(features) fold_orig = list(kfold.split(df, y=df[target]))[fold] n_samples_train_original = len(fold_orig[0]) n_samples_test_original = len(fold_orig[1]) pass_to_storage.update({ "target": target, "best_pipeline": best_pipeline, "elapsed_time": elapsed_time, "features": features, "n_features": n_features, "n_test_samples_original": n_samples_test_original, "n_train_samples_original": n_samples_train_original, "n_train_samples": len(pipe.post_fit_df), "n_test_samples": len(test), "test_sample_frac_retained": len(test) / n_samples_test_original, "completion_time": datetime.datetime.now(), "base_save_dir": base_save_dir, "save_dir": save_dir }) fw_spec.update(pass_to_storage)
    def test_persistence_and_digest(self):
        pipe = MatPipe(**debug_config)
        with self.assertRaises(NotFittedError):
            pipe.save()
        df = self.df[-200:]
        pipe.fit(df, self.target)

        filename = os.path.join(test_dir, "test_pipe.p")
        pipe.save(filename=filename)
        pipe = MatPipe.load(filename, logger=False)
        df_test = pipe.predict(self.df[-220:-201], self.target)
        self.assertTrue(self.target in df_test.columns)
        self.assertTrue(self.target + " predicted" in df_test.columns)

        digest_file = os.path.join(test_dir, "matdigest.txt")
        digest = pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertTrue(isinstance(digest, str))
class TestMatPipe(unittest.TestCase):
    def setUp(self):
        df = load_dataset("elastic_tensor_2015").rename(
            columns={"formula": "composition"})
        self.df = df[["composition", "K_VRH"]]
        self.df_struc = df[["composition", "structure", "K_VRH"]]
        self.extra_features = df["G_VRH"]
        self.target = "K_VRH"
        self.config = get_preset_config("debug_single")
        self.config_cached = get_preset_config("debug_single",
                                               cache_src=CACHE_SRC)
        self.pipe = MatPipe(**self.config)
        self.pipe_cached = MatPipe(**self.config_cached)

    def test_transferability(self):
        df_train = self.df.iloc[:200]
        df_test = self.df.iloc[201:250]
        self.pipe.fit(df_train, self.target)
        df_test = self.pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue("composition" not in df_test.columns)
        self.assertTrue(r2_score(true, test) > 0.5)

        # Use the same pipe object by refitting and reusing
        df_train2 = self.df.iloc[250:450]
        df_test2 = self.df.iloc[451:500]
        self.pipe.fit(df_train2, self.target)
        df_test2 = self.pipe.predict(df_test2, self.target)
        true2 = df_test2[self.target]
        test2 = df_test2[self.target + " predicted"]
        self.assertTrue("composition" not in df_test2.columns)
        self.assertTrue(r2_score(true2, test2) > 0.5)

    def test_user_features(self):
        df = self.df
        df["G_VRH"] = self.extra_features
        self.assertTrue("G_VRH" in df.columns)
        self.assertTrue("K_VRH" in df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        self.pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertTrue("G_VRH" in self.pipe.learner.features)
        df_test = self.pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertTrue(r2_score(true, test) > 0.75)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_no_cache(self):
        pipe = self.pipe
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=True, pipe=pipe)
        self._run_benchmark(cache=False, pipe=pipe)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_cache(self):
        pipe = self.pipe_cached
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=False, pipe=pipe)
        self._run_benchmark(cache=True, pipe=pipe)

    def test_persistence_and_digest(self):
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        filename = os.path.join(test_dir, PIPE_PATH)
        self.pipe.save(filename=filename)
        self.pipe = MatPipe.load(filename, logger=False)
        df_test = self.pipe.predict(self.df[-220:-201], self.target)
        self.assertTrue(self.target in df_test.columns)
        self.assertTrue(self.target + " predicted" in df_test.columns)

        digest_file = os.path.join(test_dir, DIGEST_PATH)
        digest = self.pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertTrue(isinstance(digest, str))

    def _run_benchmark(self, cache, pipe):
        # Test static, regular benchmark (no fittable featurizers)
        df = self.df.iloc[500:600]
        kfold = KFold(n_splits=5)
        df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
        self.assertEqual(len(df_tests), kfold.n_splits)

        # Make sure we retain a good amount of test samples...
        df_tests_all = pd.concat(df_tests)
        self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

        # Test static subset of kfold
        df2 = self.df.iloc[500:550]
        df_tests2 = pipe.benchmark(df2, self.target, kfold,
                                   fold_subset=[0, 3], cache=cache)
        self.assertEqual(len(df_tests2), 2)

    def tearDown(self):
        for remnant in [CACHE_SRC, PIPE_PATH, DIGEST_PATH]:
            if os.path.exists(remnant):
                os.remove(remnant)
                else:
                    removed_feat = idx
                if removed_feat not in rm_feats:
                    rm_feats.append(removed_feat)
                    self.logger.debug('"{}" correlates strongly with '
                                      '"{}"'.format(feature, idx))
                    self.logger.debug(
                        'removing "{}"...'.format(removed_feat))
                if removed_feat == feature:
                    break
        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info('These {} features were removed due to cross '
                             'correlation with the current features greater '
                             'than {}:\n{}'.format(len(rm_feats), R_max,
                                                   rm_feats))
        return df


if __name__ == "__main__":
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.pipeline import MatPipe, debug_config

    target = "eij_max"
    df = load_dataset("piezoelectric_tensor").rename(
        columns={"formula": "composition"})[[target, "composition",
                                             "structure"]]

    mp = MatPipe(**debug_config)
    df2 = mp.benchmark(df, target, test_spec=0.2)
    print(df2)
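# The fragment above sits in the middle of the reducer's cross-correlation
# loop. For context, the underlying idea can be sketched in a few standalone
# lines: this is an illustration of the technique only, not automatminer's
# FeatureReducer, and the function and column names are hypothetical.
import numpy as np
import pandas as pd


def drop_cross_correlated(df: pd.DataFrame, r_max: float = 0.95) -> pd.DataFrame:
    """Drop features whose absolute Pearson correlation with an
    already-retained feature exceeds r_max."""
    corr = df.corr().abs()
    keep = []
    for col in corr.columns:
        if all(corr.loc[col, kept] <= r_max for kept in keep):
            keep.append(col)
    return df[keep]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.normal(size=200)
    demo = pd.DataFrame({"a": x,
                         "b": x + 0.01 * rng.normal(size=200),  # ~duplicate of a
                         "c": rng.normal(size=200)})
    # "b" is dropped because it correlates with "a" above the threshold
    print(drop_cross_correlated(demo, r_max=0.95).columns.tolist())  # ['a', 'c']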