def test_benchmarking(self):
    pipe = MatPipe(**debug_config)
    df = self.df.iloc[500:700]
    df_test = pipe.benchmark(df, self.target, test_spec=0.25)
    self.assertEqual(df_test.shape[0], 50)
    true = df_test[self.target]
    test = df_test[self.target + " predicted"]
    self.assertTrue(r2_score(true, test) > 0.5)
def run_task(self, fw_spec):
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    fold = fw_spec["fold"]
    kfold_config = fw_spec["kfold_config"]
    target = fw_spec["target"]
    data_pickle = fw_spec["data_pickle"]
    clf_pos_label = fw_spec["clf_pos_label"]
    problem_type = fw_spec["problem_type"]
    learner_name = pipe_config_dict["learner_name"]
    cache = fw_spec["cache"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_pickle based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_pickle)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_BENCH_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)
    save_dir = fw_spec.pop("save_dir")
    save_dir = os.path.join(base_save_dir, save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    from multiprocessing import cpu_count
    ont = os.environ.get("OMP_NUM_THREADS", None)
    print("Number of omp threads: {}".format(ont))
    print("Number of cpus: {}".format(cpu_count()))
    # n_jobs = int(cpu_count()/2)
    # print("Setting number of featurization jobs to: {}".format(n_jobs))
    # autofeaturizer_kwargs["n_jobs"] = n_jobs
    # learner_kwargs["verbosity"] = 3

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifier because "
            "rf is being used."
        )
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError(
            "{} not supported by RunPipe yet!".format(learner_name)
        )
    if cache:
        autofeaturizer_kwargs["cache_src"] = os.path.join(
            base_save_dir, "features.json"
        )
    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
    pipe = MatPipe(**pipe_config, logger=logger)

    # Set up dataset
    # Dataset should already be set up correctly as a pickle beforehand.
    # This includes targets being converted to classification, removing
    # extra columns, and having the names of featurization cols set to the
    # same as the matpipe config, etc.
    df = pd.read_pickle(data_file)

    # Check other parameters that would otherwise not be checked until
    # after benchmarking; hopefully saves some errors at the end during
    # scoring.
    if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
        raise ValueError("Problem must be either classification or "
                         "regression.")
    elif problem_type == AMM_CLF_NAME:
        if not isinstance(clf_pos_label, (str, bool)):
            raise TypeError("The classification positive label should be "
                            "a string or bool, not {}."
                            "".format(type(clf_pos_label)))
        elif clf_pos_label not in df[target]:
            raise ValueError("The classification positive label should be "
                             "present in the target column.")
        elif len(df[target].unique()) > 2:
            raise ValueError("Only binary classification scoring available "
                             "at this time.")

    # Set up testing scheme
    if problem_type == AMM_REG_NAME:
        kfold = KFold(**kfold_config)
    else:
        kfold = StratifiedKFold(**kfold_config)
    if fold >= kfold.n_splits:
        raise ValueError("{} is out of range for KFold with n_splits="
                         "{}".format(fold, kfold.n_splits))

    # Run the benchmark
    t1 = time.time()
    results = pipe.benchmark(df, target, kfold, fold_subset=[fold],
                             cache=True)
    result_df = results[0]
    elapsed_time = time.time() - t1

    # Save everything
    pipe.save(os.path.join(save_dir, "pipe.p"))
    pipe.digest(os.path.join(save_dir, "digest.txt"))
    result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
    pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

    # Evaluate model
    true = result_df[target]
    test = result_df[target + " predicted"]
    pass_to_storage = {}
    if problem_type == AMM_REG_NAME:
        pass_to_storage["r2"] = r2_score(true, test)
        pass_to_storage["mae"] = mean_absolute_error(true, test)
        pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
    elif problem_type == AMM_CLF_NAME:
        pass_to_storage["f1"] = f1_score(true, test,
                                         pos_label=clf_pos_label)
        pass_to_storage["roc_auc"] = roc_auc_score(true, test)
        pass_to_storage["accuracy"] = accuracy_score(true, test)
    else:
        raise ValueError("Scoring method for problem type {} not supported"
                         "".format(problem_type))

    # Extract important details for storage
    try:
        # TPOT Adaptor
        best_pipeline = [
            str(step) for step in pipe.learner.best_pipeline.steps
        ]
    except AttributeError:
        best_pipeline = str(pipe.learner.best_pipeline)
    features = pipe.learner.features
    n_features = len(features)
    fold_orig = list(kfold.split(df, y=df[target]))[fold]
    n_samples_train_original = len(fold_orig[0])
    n_samples_test_original = len(fold_orig[1])

    pass_to_storage.update({
        "target": target,
        "best_pipeline": best_pipeline,
        "elapsed_time": elapsed_time,
        "features": features,
        "n_features": n_features,
        "n_test_samples_original": n_samples_test_original,
        "n_train_samples_original": n_samples_train_original,
        "n_train_samples": len(pipe.post_fit_df),
        "n_test_samples": len(test),
        "test_sample_frac_retained": len(test) / n_samples_test_original,
        "completion_time": datetime.datetime.now(),
        "base_save_dir": base_save_dir,
        "save_dir": save_dir,
    })
    fw_spec.update(pass_to_storage)
                else:
                    removed_feat = idx
                if removed_feat not in rm_feats:
                    rm_feats.append(removed_feat)
                    self.logger.debug('"{}" correlates strongly with '
                                      '"{}"'.format(feature, idx))
                    self.logger.debug(
                        'removing "{}"...'.format(removed_feat))
                if removed_feat == feature:
                    break

        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info('These {} features were removed due to cross '
                             'correlation with the current features more '
                             'than {}:\n{}'.format(len(rm_feats), R_max,
                                                   rm_feats))
        return df


if __name__ == "__main__":
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.pipeline import MatPipe, debug_config

    target = "eij_max"
    df = load_dataset("piezoelectric_tensor").rename(
        columns={"formula": "composition"})[[target, "composition",
                                             "structure"]]

    mp = MatPipe(**debug_config)
    df2 = mp.benchmark(df, target, test_spec=0.2)
    print(df2)