예제 #1
0
 def test_benchmarking(self):
     """Benchmark a debug pipeline on rows 500-699 with ``test_spec=0.25``.

     Checks that the returned test frame has 50 rows and that the predicted
     column scores an r2 above 0.5 against the true target column.
     """
     subset = self.df.iloc[500:700]
     holdout = MatPipe(**debug_config).benchmark(
         subset, self.target, test_spec=0.25)
     self.assertEqual(holdout.shape[0], 50)

     predicted_col = self.target + " predicted"
     score = r2_score(holdout[self.target], holdout[predicted_col])
     self.assertTrue(score > 0.5)
예제 #2
0
 def test_instantiation(self):
     """Check which keyword combinations MatPipe accepts at construction.

     Passing only a learner, only an autofeaturizer, or just those two must
     raise AutomatminerError; no arguments or the full config must succeed.
     """
     learner = self.config["learner"]
     autofeaturizer = self.config["autofeaturizer"]

     # Partial configurations that the constructor is expected to reject.
     rejected_kwargs = (
         {"learner": learner},
         {"autofeaturizer": autofeaturizer},
         {"autofeaturizer": autofeaturizer, "learner": learner},
     )
     for kwargs in rejected_kwargs:
         with self.assertRaises(AutomatminerError):
             MatPipe(**kwargs)

     # Configurations that must construct without raising.
     MatPipe()
     MatPipe(**self.config)
예제 #3
0
 def setUp(self):
     """Load the elastic tensor dataset and build the pipelines under test.

     Sets the composition-only and composition+structure frames, the extra
     "G_VRH" feature column, the target name, and two MatPipe instances
     (one built from a config that uses ``CACHE_SRC`` as its cache source).
     """
     raw = load_dataset("elastic_tensor_2015")
     renamed = raw.rename(columns={"formula": "composition"})

     self.target = "K_VRH"
     self.df = renamed[["composition", "K_VRH"]]
     self.df_struc = renamed[["composition", "structure", "K_VRH"]]
     self.extra_features = renamed["G_VRH"]

     preset_name = "debug_single"
     self.config = get_preset_config(preset_name)
     self.config_cached = get_preset_config(
         preset_name, cache_src=CACHE_SRC)
     self.pipe = MatPipe(**self.config)
     self.pipe_cached = MatPipe(**self.config_cached)
예제 #4
0
    def test_from_preset(self):
        """Construct a MatPipe from every available preset without error.

        Also builds the "debug" preset with a cache source, with no extra
        arguments, and with an explicit log level.
        """
        for preset_name in get_available_presets():
            MatPipe.from_preset(preset_name)

        # Same order as before: cache_src first, then plain, then log_level.
        debug_variants = (
            {"cache_src": "some_file.json"},
            {},
            {"log_level": 1},
        )
        for extra_kwargs in debug_variants:
            MatPipe.from_preset("debug", **extra_kwargs)
예제 #5
0
    def test_user_features(self):
        """Fit with a user-supplied feature column and check it is used.

        Adds "G_VRH" as an extra column, fits on the first 200 rows, and
        checks that the learner kept "G_VRH" as a feature and that the
        predictions on rows 201-249 reach an r2 score above 0.75.
        """
        pipe = MatPipe(**debug_config)
        # Work on a copy so the shared fixture frame is not mutated; adding a
        # column in place to a frame derived from another frame can also
        # trigger pandas' SettingWithCopyWarning.
        df = self.df.copy()
        df["G_VRH"] = self.extra_features
        self.assertIn("G_VRH", df.columns)
        self.assertIn("K_VRH", df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertIn("G_VRH", pipe.learner.features)
        df_test = pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        # assertGreater reports the actual score when the check fails.
        self.assertGreater(r2_score(true, test), 0.75)
예제 #6
0
        def test_persistence(self):
            """Check save/load round-tripping and the version check on load.

            Saving an unfitted pipe must raise NotFittedError. A fitted pipe is
            saved, reloaded and used to predict; a pipe saved with a bogus
            version string must raise AutomatminerError when loaded.
            """
            with self.assertRaises(NotFittedError):
                self.pipe.save()
            self.pipe.fit(self.df[-200:], self.target)

            # Round trip: the reloaded pipe must still be able to predict.
            self.pipe.save(filename=PIPE_PATH)
            self.pipe = MatPipe.load(PIPE_PATH)
            predictions = self.pipe.predict(self.df[-220:-201])
            for expected_col in (self.target, self.target + " predicted"):
                self.assertTrue(expected_col in predictions.columns)

            # A pipe saved under an unknown version must be rejected on load.
            self.pipe.version = "not a real version"
            self.pipe.save(VERSION_PIPE_PATH)
            with self.assertRaises(AutomatminerError):
                MatPipe.load(VERSION_PIPE_PATH)
예제 #7
0
    def test_persistence_and_digest(self):
        """Check save/load round-tripping and digest file creation.

        Saving an unfitted pipe must raise NotFittedError. After fitting, the
        pipe is saved under ``test_dir``, reloaded, used to predict, and asked
        for a digest, which must be a string and be written to disk.
        """
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        self.pipe.fit(self.df[-200:], self.target)

        pipe_file = os.path.join(test_dir, PIPE_PATH)
        self.pipe.save(filename=pipe_file)
        self.pipe = MatPipe.load(pipe_file, logger=False)
        predictions = self.pipe.predict(self.df[-220:-201], self.target)
        for expected_col in (self.target, self.target + " predicted"):
            self.assertTrue(expected_col in predictions.columns)

        digest_file = os.path.join(test_dir, DIGEST_PATH)
        summary = self.pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertTrue(isinstance(summary, str))
예제 #8
0
    def run_task(self, fw_spec):
        """Fit a single MatPipe on a stored dataset and save it as "pipe.p".

        Args:
            fw_spec (dict): Task spec. Reads the keys "pipe_config" (with
                "learner_name" and the learner/reducer/cleaner/autofeaturizer
                kwargs dicts), "target", "data_file" and "base_save_dir".

        Raises:
            KeyError: If a required spec key or the AMM_DATASET_DIR /
                AMM_SINGLE_FIT_DIR environment variable is missing.
            ValueError: If "learner_name" is neither "TPOTAdaptor" nor "rf".
        """
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        learner_name = pipe_config_dict["learner_name"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_file based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)

        # exist_ok avoids the check-then-create race when several workers
        # start with the same save directory at the same time.
        os.makedirs(base_save_dir, exist_ok=True)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError("{} not supported yet!" "".format(learner_name))
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }
        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # Dataset should already be set up correctly as json beforehand.
        # this includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = load_dataframe_from_json(data_file)

        pipe.fit(df, target)
        pipe.save(os.path.join(base_save_dir, "pipe.p"))
예제 #9
0
    def test_transferability(self):
        """Fit and predict twice with the same pipe on disjoint row ranges.

        Each round checks that the "composition" column is absent from the
        prediction frame and that the r2 score is above 0.5. The second round
        refits the same pipe object on different rows.
        """
        pipe = MatPipe(**debug_config)
        predicted_col = self.target + " predicted"

        # (training rows, testing rows) for the first fit and for the refit.
        rounds = (
            (slice(None, 200), slice(201, 250)),
            (slice(250, 450), slice(451, 500)),
        )
        for train_rows, test_rows in rounds:
            fit_frame = self.df.iloc[train_rows]
            holdout = self.df.iloc[test_rows]
            pipe.fit(fit_frame, self.target)
            predictions = pipe.predict(holdout, self.target)
            score = r2_score(predictions[self.target],
                             predictions[predicted_col])
            self.assertTrue("composition" not in predictions.columns)
            self.assertTrue(score > 0.5)
예제 #10
0
        if len(feature_ids) != 1:
            raise ValueError("Error, this method does not yet support "
                             "computing plots for more than one feature at a "
                             "time")

        axs = self.interpreter.partial_dependence.plot_partial_dependence(
            feature_ids, self.model, sample=False, progressbar=False,
            with_variance=True
        )

        return axs[0][0], axs[0][1]


if __name__ == '__main__':
    # Fit and save a pipeline on the first run; reuse the saved file afterwards.
    if not os.path.exists("tests/test_pipe.p"):
        df = load_dataset('elastic_tensor_2015')
        df = df[["formula", "K_VRH"]]
        df = df.rename({"formula": "composition"}, axis=1)

        fitted_pipeline = MatPipe().fit(df, "K_VRH")
        fitted_pipeline.save("tests/test_pipe.p")
    else:
        # Call load on the class: constructing a default MatPipe only to
        # discard it in favour of the loaded one is wasted work.
        fitted_pipeline = MatPipe.load("tests/test_pipe.p")

    analyzer = Analytics(fitted_pipeline)

    feature_importance = analyzer.get_feature_importance()

    # Walk the importance index in reverse order, one plot per feature.
    for feature in reversed(feature_importance.index):
        analyzer.plot_partial_dependence(feature)
예제 #11
0
    class TestMatPipe(unittest.TestCase):
        """MatPipe tests configured from the enclosing scope.

        ``config_preset`` and ``n_jobs`` select the pipeline configuration;
        a test group is skipped (with message ``reason``) when its name is in
        ``skip``.
        """

        def setUp(self):
            """Load the dataset and build a plain and a cached pipeline."""
            df = load_dataset("elastic_tensor_2015").rename(
                columns={"formula": "composition"})
            self.df = df[["composition", "K_VRH"]]
            self.df_struc = df[["composition", "structure", "K_VRH"]]
            self.extra_features = df["G_VRH"]
            self.target = "K_VRH"
            self.config = get_preset_config(config_preset, n_jobs=n_jobs)
            self.config_cached = get_preset_config(config_preset,
                                                   cache_src=CACHE_SRC,
                                                   n_jobs=n_jobs)
            self.pipe = MatPipe(**self.config)
            self.pipe_cached = MatPipe(**self.config_cached)

        @unittest.skipIf("transferability" in skip, reason)
        def test_transferability(self):
            """Fit/predict twice with the same pipe on disjoint row ranges."""
            df_train = self.df.iloc[:200]
            df_test = self.df.iloc[201:250]
            self.pipe.fit(df_train, self.target)
            df_test = self.pipe.predict(df_test)
            true = df_test[self.target]
            test = df_test[self.target + " predicted"]
            self.assertNotIn("composition", df_test.columns)
            self.assertGreater(r2_score(true, test), 0.5)

            # Use the same pipe object by refitting and reusing
            df_train2 = self.df.iloc[250:450]
            df_test2 = self.df.iloc[451:500]
            self.pipe.fit(df_train2, self.target)
            df_test2 = self.pipe.predict(df_test2)
            true2 = df_test2[self.target]
            test2 = df_test2[self.target + " predicted"]
            self.assertNotIn("composition", df_test2.columns)
            self.assertGreater(r2_score(true2, test2), 0.5)

        @unittest.skipIf("user_features" in skip, reason)
        def test_user_features(self):
            """Fit with an extra user feature and check the learner keeps it."""
            # Copy first: self.df is a column subset of the loaded frame, so
            # adding a column in place mutates the fixture and can trigger
            # pandas' SettingWithCopyWarning.
            df = self.df.copy()
            df["G_VRH"] = self.extra_features
            self.assertIn("G_VRH", df.columns)
            self.assertIn("K_VRH", df.columns)
            df_train = df.iloc[:200]
            df_test = df.iloc[201:250]
            self.pipe.fit(df_train, self.target)

            # If shear modulus is included as a feature it should probably show up
            # in the final pipeline
            self.assertIn("G_VRH", self.pipe.learner.features)
            df_test = self.pipe.predict(df_test)
            true = df_test[self.target]
            test = df_test[self.target + " predicted"]
            self.assertGreater(r2_score(true, test), 0.75)

        @unittest.skipIf("predict_kwargs" in skip, reason)
        def test_predict_kwargs(self):
            """Check the ``ignore`` and ``output_col`` kwargs of predict()."""
            # Test mat_pipe.predict()'s ignore and output_col kwargs.
            df_train = self.df.iloc[:200]
            # Copy the slice before adding a column so the assignment does not
            # write into (or warn about) the frame it was sliced from.
            df_test = self.df.iloc[201:250].copy()
            ef = "ExtraFeature"
            df_test[ef] = [i + 100 for i in range(df_test.shape[0])]
            self.pipe.fit(df_train, self.target)

            self.assertIn(ef, df_test.columns)
            self.assertIn("composition", df_test.columns)

            # Both ignored columns must be present in the prediction frame.
            ignore = [ef, "composition"]
            predicted_ignored = self.pipe.predict(df_test, ignore=ignore)
            self.assertIn(ef, predicted_ignored.columns)
            self.assertIn("composition", predicted_ignored.columns)

            # With nothing ignored, neither column is in the output.
            predicted_none = self.pipe.predict(df_test, ignore=None)
            self.assertNotIn(ef, predicted_none.columns)
            self.assertNotIn("composition", predicted_none.columns)

            # Only the ignored column is in the output.
            some = ["composition"]
            predicted_some = self.pipe.predict(df_test, ignore=some)
            self.assertNotIn(ef, predicted_some.columns)
            self.assertIn("composition", predicted_some.columns)

            output_col_name = self.target + "_pred"
            predicted_custom_col = self.pipe.predict(
                df_test, output_col=output_col_name)
            self.assertIn(output_col_name, predicted_custom_col)

        @unittest.skipIf("benchmarking" in skip, reason)
        def test_benchmarking_no_cache(self):
            """Benchmark the uncached pipe; a cached run must be rejected."""
            pipe = self.pipe
            # Make sure we can't run a cached run with no cache AF and cache pipe
            with self.assertRaises(AutomatminerError):
                self._run_benchmark(cache=True, pipe=pipe)

            self._run_benchmark(cache=False, pipe=pipe)

        @unittest.skipIf("benchmarking" in skip, reason)
        def test_benchmarking_cache(self):
            """Benchmark the cached pipe; an uncached run must be rejected."""
            pipe = self.pipe_cached

            # Make sure we can't run a cached run with no cache AF and cache pipe
            with self.assertRaises(AutomatminerError):
                self._run_benchmark(cache=False, pipe=pipe)
            self._run_benchmark(cache=True, pipe=pipe)

        @unittest.skipIf("persistence" in skip, reason)
        def test_persistence(self):
            """Check save/load round-tripping and the version check on load."""
            with self.assertRaises(NotFittedError):
                self.pipe.save()
            df = self.df[-200:]
            self.pipe.fit(df, self.target)

            # Load test
            self.pipe.save(filename=PIPE_PATH)
            self.pipe = MatPipe.load(PIPE_PATH)
            df_test = self.pipe.predict(self.df[-220:-201])
            self.assertIn(self.target, df_test.columns)
            self.assertIn(self.target + " predicted", df_test.columns)

            # Version test
            self.pipe.version = "not a real version"
            self.pipe.save(VERSION_PIPE_PATH)
            with self.assertRaises(AutomatminerError):
                MatPipe.load(VERSION_PIPE_PATH)

        @unittest.skipIf("digests" in skip, reason)
        def test_summarize_and_inspect(self):
            """inspect() and summarize() must return dicts and write files."""
            df = self.df[-200:]
            self.pipe.fit(df, self.target)

            for ext in AMM_SUPPORTED_EXTS:
                digest = self.pipe.inspect(filename=DIGEST_PATH + ext)
                self.assertTrue(os.path.isfile(DIGEST_PATH + ext))
                self.assertIsInstance(digest, dict)

            for ext in AMM_SUPPORTED_EXTS:
                digest = self.pipe.summarize(filename=DIGEST_PATH + ext)
                self.assertTrue(os.path.isfile(DIGEST_PATH + ext))
                self.assertIsInstance(digest, dict)

        def _run_benchmark(self, cache, pipe):
            """Run a 2-fold benchmark and a single-fold subset benchmark.

            Args:
                cache: Value forwarded as the ``cache`` kwarg of benchmark().
                pipe: The MatPipe to benchmark.
            """
            # Test static, regular benchmark (no fittable featurizers)
            df = self.df.iloc[500:600]
            kfold = KFold(n_splits=2)
            df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
            self.assertEqual(len(df_tests), kfold.n_splits)

            # Make sure we retain a good amount of test samples...
            df_tests_all = pd.concat(df_tests)
            self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

            # Test static subset of kfold
            df2 = self.df.iloc[500:550]
            df_tests2 = pipe.benchmark(df2,
                                       self.target,
                                       kfold,
                                       fold_subset=[0],
                                       cache=cache)
            self.assertEqual(len(df_tests2), 1)

        def tearDown(self) -> None:
            """Delete any cache, pipe or digest files left behind by a test."""
            digests = [DIGEST_PATH + ext for ext in AMM_SUPPORTED_EXTS]
            for remnant in [CACHE_SRC, PIPE_PATH, VERSION_PIPE_PATH, *digests]:
                if os.path.exists(remnant):
                    os.remove(remnant)
예제 #12
0
    def run_task(self, fw_spec):
        """Benchmark one fold of a MatPipe and store the scores in ``fw_spec``.

        Builds a MatPipe from the spec, benchmarks it on the requested fold,
        saves the pipe, digest, test frame and fitted frame under the save
        directory, and updates ``fw_spec`` in place with the scores and run
        metadata.

        Args:
            fw_spec (dict): Task spec. Reads "pipe_config", "fold",
                "kfold_config", "target", "data_pickle", "clf_pos_label",
                "problem_type", "cache" and "base_save_dir"; pops "save_dir".

        Raises:
            KeyError: If a required spec key or the AMM_DATASET_DIR /
                AMM_BENCH_DIR environment variable is missing.
            TypeError: If the classification positive label is not a str or
                bool.
            ValueError: If the learner or problem type is unsupported, the
                positive label is absent from the target column, the target
                has more than two classes, or ``fold`` is out of range.
        """
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        fold = fw_spec["fold"]
        kfold_config = fw_spec["kfold_config"]
        target = fw_spec["target"]
        data_pickle = fw_spec["data_pickle"]
        clf_pos_label = fw_spec["clf_pos_label"]
        problem_type = fw_spec["problem_type"]
        learner_name = pipe_config_dict["learner_name"]
        cache = fw_spec["cache"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_pickle based on computing resource
        data_dir = os.environ['AMM_DATASET_DIR']
        data_file = os.path.join(data_dir, data_pickle)

        # Modify save_dir based on computing resource
        bench_dir = os.environ['AMM_BENCH_DIR']
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)
        save_dir = fw_spec.pop("save_dir")
        save_dir = os.path.join(base_save_dir, save_dir)

        # exist_ok avoids the check-then-create race when several fold tasks
        # start at the same time under the same base directory.
        os.makedirs(save_dir, exist_ok=True)

        from multiprocessing import cpu_count
        ont = os.environ.get("OMP_NUM_THREADS", None)
        print("Number of omp threads: {}".format(ont))
        print("Number of cpus: {}".format(cpu_count()))
        # Disabled overrides, kept for reference:
        # n_jobs = int(cpu_count()/2)
        # print("Setting number of featurization jobs to: {}".format(n_jobs))
        # autofeaturizer_kwargs["n_jobs"] = n_jobs
        # learner_kwargs["verbosity"] = 3

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs))
        else:
            raise ValueError("{} not supported by RunPipe yet!"
                             "".format(learner_name))
        if cache:
            # All folds sharing base_save_dir use the same feature cache file.
            autofeaturizer_kwargs["cache_src"] = os.path.join(
                base_save_dir, "features.json")
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)
        }

        logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
        pipe = MatPipe(**pipe_config, logger=logger)

        # Set up dataset
        # Dataset should already be set up correctly as pickle beforehand.
        # this includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = pd.read_pickle(data_file)

        # Check other parameters that would otherwise not be checked until after
        # benchmarking, hopefully saves some errors at the end during scoring.
        if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
            raise ValueError("Problem must be either classification or "
                             "regression.")
        elif problem_type == AMM_CLF_NAME:
            if not isinstance(clf_pos_label, (str, bool)):
                raise TypeError(
                    "The classification positive label should be a "
                    "string, or bool not {}."
                    "".format(type(clf_pos_label)))
            # `label in series` tests the Series *index*, not its values, so
            # membership has to be checked against the values explicitly.
            elif not df[target].isin([clf_pos_label]).any():
                raise ValueError("The classification positive label should be "
                                 "present in the target column.")
            elif len(df[target].unique()) > 2:
                raise ValueError("Only binary classification scoring available "
                                 "at this time.")

        # Set up testing scheme
        if problem_type == AMM_REG_NAME:
            kfold = KFold(**kfold_config)
        else:
            kfold = StratifiedKFold(**kfold_config)
        if fold >= kfold.n_splits:
            raise ValueError("{} is out of range for KFold with n_splits="
                             "{}".format(fold, kfold.n_splits))

        # Run the benchmark
        t1 = time.time()
        # Honour the spec's cache flag: cache_src is only configured above
        # when `cache` is truthy, so forcing cache=True here would ask for a
        # cached run from an autofeaturizer that has no cache source.
        results = pipe.benchmark(df,
                                 target,
                                 kfold,
                                 fold_subset=[fold],
                                 cache=bool(cache))
        result_df = results[0]
        elapsed_time = time.time() - t1

        # Save everything
        pipe.save(os.path.join(save_dir, "pipe.p"))
        pipe.digest(os.path.join(save_dir, "digest.txt"))
        result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
        pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

        # Evaluate model
        true = result_df[target]
        test = result_df[target + " predicted"]

        pass_to_storage = {}
        if problem_type == AMM_REG_NAME:
            pass_to_storage["r2"] = r2_score(true, test)
            pass_to_storage["mae"] = mean_absolute_error(true, test)
            pass_to_storage['rmse'] = sqrt(mean_squared_error(true, test))
        elif problem_type == AMM_CLF_NAME:
            pass_to_storage["f1"] = f1_score(true,
                                             test,
                                             pos_label=clf_pos_label)
            pass_to_storage["roc_auc"] = roc_auc_score(true, test)
            pass_to_storage["accuracy"] = accuracy_score(true, test)
        else:
            raise ValueError("Scoring method for problem type {} not supported"
                             "".format(problem_type))

        # Extract important details for storage
        try:
            # TPOT Adaptor
            best_pipeline = [
                str(step) for step in pipe.learner.best_pipeline.steps
            ]
        except AttributeError:
            # best_pipeline has no `.steps`; store its string form instead.
            best_pipeline = str(pipe.learner.best_pipeline)

        features = pipe.learner.features
        n_features = len(features)
        # Re-split the raw frame to recover the fold's original sample counts,
        # against which the retained test samples are compared below.
        fold_orig = list(kfold.split(df, y=df[target]))[fold]
        n_samples_train_original = len(fold_orig[0])
        n_samples_test_original = len(fold_orig[1])

        pass_to_storage.update({
            "target": target,
            "best_pipeline": best_pipeline,
            "elapsed_time": elapsed_time,
            "features": features,
            "n_features": n_features,
            "n_test_samples_original": n_samples_test_original,
            "n_train_samples_original": n_samples_train_original,
            "n_train_samples": len(pipe.post_fit_df),
            "n_test_samples": len(test),
            "test_sample_frac_retained": len(test) / n_samples_test_original,
            "completion_time": datetime.datetime.now(),
            "base_save_dir": base_save_dir,
            "save_dir": save_dir
        })
        fw_spec.update(pass_to_storage)
예제 #13
0
    def test_persistence_and_digest(self):
        """Check save/load round-tripping and digest file creation.

        An unfitted debug pipe must raise NotFittedError on save. After
        fitting, it is saved under ``test_dir``, reloaded, used to predict,
        and asked for a digest, which must be a string and exist on disk.
        """
        fresh_pipe = MatPipe(**debug_config)
        with self.assertRaises(NotFittedError):
            fresh_pipe.save()
        fresh_pipe.fit(self.df[-200:], self.target)

        pipe_file = os.path.join(test_dir, "test_pipe.p")
        fresh_pipe.save(filename=pipe_file)
        reloaded = MatPipe.load(pipe_file, logger=False)
        predictions = reloaded.predict(self.df[-220:-201], self.target)
        for expected_col in (self.target, self.target + " predicted"):
            self.assertTrue(expected_col in predictions.columns)

        digest_file = os.path.join(test_dir, "matdigest.txt")
        summary = reloaded.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertTrue(isinstance(summary, str))
예제 #14
0
class TestMatPipe(unittest.TestCase):
    """MatPipe tests using the "debug_single" preset configuration."""

    def setUp(self):
        """Load the dataset and build a plain and a cached pipeline."""
        df = load_dataset("elastic_tensor_2015").rename(
            columns={"formula": "composition"})
        self.df = df[["composition", "K_VRH"]]
        self.df_struc = df[["composition", "structure", "K_VRH"]]
        self.extra_features = df["G_VRH"]
        self.target = "K_VRH"
        self.config = get_preset_config("debug_single")
        self.config_cached = get_preset_config("debug_single",
                                               cache_src=CACHE_SRC)
        self.pipe = MatPipe(**self.config)
        self.pipe_cached = MatPipe(**self.config_cached)

    def test_transferability(self):
        """Fit/predict twice with the same pipe on disjoint row ranges."""
        df_train = self.df.iloc[:200]
        df_test = self.df.iloc[201:250]
        self.pipe.fit(df_train, self.target)
        df_test = self.pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertNotIn("composition", df_test.columns)
        self.assertGreater(r2_score(true, test), 0.5)

        # Use the same pipe object by refitting and reusing
        df_train2 = self.df.iloc[250:450]
        df_test2 = self.df.iloc[451:500]
        self.pipe.fit(df_train2, self.target)
        df_test2 = self.pipe.predict(df_test2, self.target)
        true2 = df_test2[self.target]
        test2 = df_test2[self.target + " predicted"]
        self.assertNotIn("composition", df_test2.columns)
        self.assertGreater(r2_score(true2, test2), 0.5)

    def test_user_features(self):
        """Fit with an extra user feature and check the learner keeps it."""
        # Copy first: self.df is a column subset of the loaded frame, so
        # adding a column in place mutates the fixture and can trigger
        # pandas' SettingWithCopyWarning.
        df = self.df.copy()
        df["G_VRH"] = self.extra_features
        self.assertIn("G_VRH", df.columns)
        self.assertIn("K_VRH", df.columns)
        df_train = df.iloc[:200]
        df_test = df.iloc[201:250]
        self.pipe.fit(df_train, self.target)

        # If shear modulus is included as a feature it should probably show up
        # in the final pipeline
        self.assertIn("G_VRH", self.pipe.learner.features)
        df_test = self.pipe.predict(df_test, self.target)
        true = df_test[self.target]
        test = df_test[self.target + " predicted"]
        self.assertGreater(r2_score(true, test), 0.75)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_no_cache(self):
        """Benchmark the uncached pipe; a cached run must be rejected."""
        pipe = self.pipe
        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=True, pipe=pipe)

        self._run_benchmark(cache=False, pipe=pipe)

    @unittest.skipIf(int(os.environ.get("SKIP_INTENSIVE", 0)),
                     "Test too intensive for CircleCI commit builds.")
    def test_benchmarking_cache(self):
        """Benchmark the cached pipe; an uncached run must be rejected."""
        pipe = self.pipe_cached

        # Make sure we can't run a cached run with no cache AF and cache pipe
        with self.assertRaises(AutomatminerError):
            self._run_benchmark(cache=False, pipe=pipe)
        self._run_benchmark(cache=True, pipe=pipe)

    def test_persistence_and_digest(self):
        """Check save/load round-tripping and digest file creation."""
        with self.assertRaises(NotFittedError):
            self.pipe.save()
        df = self.df[-200:]
        self.pipe.fit(df, self.target)

        filename = os.path.join(test_dir, PIPE_PATH)
        self.pipe.save(filename=filename)
        self.pipe = MatPipe.load(filename, logger=False)
        df_test = self.pipe.predict(self.df[-220:-201], self.target)
        self.assertIn(self.target, df_test.columns)
        self.assertIn(self.target + " predicted", df_test.columns)

        digest_file = os.path.join(test_dir, DIGEST_PATH)
        digest = self.pipe.digest(filename=digest_file)
        self.assertTrue(os.path.isfile(digest_file))
        self.assertIsInstance(digest, str)

    def _run_benchmark(self, cache, pipe):
        """Run a 5-fold benchmark and a two-fold subset benchmark.

        Args:
            cache: Value forwarded as the ``cache`` kwarg of benchmark().
            pipe: The MatPipe to benchmark.
        """
        # Test static, regular benchmark (no fittable featurizers)
        df = self.df.iloc[500:600]
        kfold = KFold(n_splits=5)
        df_tests = pipe.benchmark(df, self.target, kfold, cache=cache)
        self.assertEqual(len(df_tests), kfold.n_splits)

        # Make sure we retain a good amount of test samples...
        df_tests_all = pd.concat(df_tests)
        self.assertGreaterEqual(len(df_tests_all), 0.95 * len(df))

        # Test static subset of kfold
        df2 = self.df.iloc[500:550]
        df_tests2 = pipe.benchmark(df2,
                                   self.target,
                                   kfold,
                                   fold_subset=[0, 3],
                                   cache=cache)
        self.assertEqual(len(df_tests2), 2)

    def tearDown(self):
        """Delete any cache, pipe or digest files left behind by a test."""
        # test_persistence_and_digest writes under test_dir, so remove both
        # the bare paths and their test_dir-joined forms. When the two forms
        # coincide, the second pass finds nothing left to remove.
        remnants = [CACHE_SRC, PIPE_PATH, DIGEST_PATH]
        remnants += [
            os.path.join(test_dir, path) for path in (PIPE_PATH, DIGEST_PATH)
        ]
        for remnant in remnants:
            if os.path.exists(remnant):
                os.remove(remnant)
예제 #15
0
                        else:
                            removed_feat = idx
                        if removed_feat not in rm_feats:
                            rm_feats.append(removed_feat)
                            self.logger.debug('"{}" correlates strongly with '
                                              '"{}"'.format(feature, idx))
                            self.logger.debug(
                                'removing "{}"...'.format(removed_feat))
                        if removed_feat == feature:
                            break
        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info('These {} features were removed due to cross '
                             'correlation with the current features more than '
                             '{}:\n{}'.format(len(rm_feats), R_max, rm_feats))
        return df


if __name__ == "__main__":
    # Demo run: benchmark a debug pipeline on the piezoelectric tensor
    # dataset with test_spec=0.2 and print the resulting frame.
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.pipeline import MatPipe, debug_config

    target = "eij_max"

    # Rename the formula column, then keep only the columns used below.
    df = load_dataset("piezoelectric_tensor")
    df = df.rename(columns={"formula": "composition"})
    df = df[[target, "composition", "structure"]]

    mp = MatPipe(**debug_config)
    df2 = mp.benchmark(df, target, test_spec=0.2)
    print(df2)