def setup_learning_curve_point(  # pylint:disable=too-many-arguments, too-many-locals
        trainxpath, trainypath, validxpath, validypath, points, basepath):
    # Set up directories
    learning_curve_point_dir = os.path.join(
        basepath, "_".join(["model", str(int(points))]))
    make_if_not_exist(learning_curve_point_dir)
    datadir = os.path.join(learning_curve_point_dir, "data")
    make_if_not_exist(datadir)
    modeldir = os.path.join(learning_curve_point_dir, "model")
    make_if_not_exist(modeldir)

    # Select diverse subset
    x = np.load(trainxpath)
    y = np.load(trainypath)
    xvalid = np.load(validxpath)
    yvalid = np.load(validypath)

    xsummarized, ysummarized = summarize_data(x, y, points)
    xsummarized_valid, ysummarized_valid = summarize_data(
        xvalid, yvalid, points)

    np.save(os.path.join(datadir, "features"), xsummarized)
    np.save(os.path.join(datadir, "labels"), ysummarized)
    np.save(os.path.join(datadir, "features_valid"), xsummarized_valid)
    np.save(os.path.join(datadir, "labels_valid"), ysummarized_valid)

    return datadir, modeldir
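# Hypothetical sketch of the summarize_data helper used above for the
# "diverse subset" selection. The real implementation lives elsewhere in the
# repository and may differ; this version assumes a KMeans-based selection
# that keeps the `points` samples closest to the cluster centres.
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min


def summarize_data(x, y, points):
    kmeans = KMeans(n_clusters=int(points), random_state=0).fit(x)
    # for every cluster centre, pick the index of the nearest sample
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, x)
    return x[closest], y[closest]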
def main(modelpath, xtrainpath, ytrainpath, xtestpath, ytestpath, outdir):  # pylint:disable=too-many-arguments,too-many-locals
    """CLI"""
    scalerpath = os.path.join(modelpath, "scaler_0.joblib")
    assert os.path.exists(scalerpath)
    scaler = joblib.load(scalerpath)
    make_if_not_exist(outdir)

    print("loading data")
    X_train = scaler.transform(np.load(xtrainpath))
    X_test = scaler.transform(np.load(xtestpath))
    y_train = np.load(ytrainpath).astype(int)  # use the builtin int; np.int was removed from NumPy
    y_test = np.load(ytestpath).astype(int)

    models = glob(os.path.join(modelpath, "*.joblib"))

    print("now starting dask and running the actual computation")
    global cluster
    global client
    cluster = LocalCluster(memory_limit="28GB", n_workers=4)
    client = Client(cluster)

    relevant_models = [[model, joblib.load(model)] for model in models
                       if "scaler" not in model]
    bvpartial = partial(bv_decomp_wrapper,
                        xtrain=X_train,
                        ytrain=y_train,
                        xtest=X_test,
                        ytest=y_test)
    futures = client.map(bvpartial, relevant_models)
    results = client.gather(futures)

    print("finished crunching, now dumping results")
    with open(os.path.join(outdir, "bv_decomposition.pkl"), "wb") as fh:
        pickle.dump(results, fh)
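# Hypothetical sketch of bv_decomp_wrapper; the real helper is defined
# elsewhere in the repository. This version assumes it wraps mlxtend's
# bias_variance_decomp and receives one of the [path, estimator] pairs
# collected in relevant_models above.
from mlxtend.evaluate import bias_variance_decomp


def bv_decomp_wrapper(name_and_model, xtrain, ytrain, xtest, ytest):
    name, estimator = name_and_model
    avg_loss, avg_bias, avg_var = bias_variance_decomp(
        estimator, xtrain, ytrain, xtest, ytest, loss="0-1_loss", num_rounds=20)
    return {"model": name, "loss": avg_loss, "bias": avg_bias, "variance": avg_var}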
def setup_learning_curve_point(trainxpath, trainypath, points):
    # Set up directories
    learning_curve_point_dir = "_".join(["model", str(int(points))])
    make_if_not_exist(learning_curve_point_dir)
    datadir = os.path.join(learning_curve_point_dir, "data")
    make_if_not_exist(datadir)
    modeldir = os.path.join(learning_curve_point_dir, "model")
    make_if_not_exist(modeldir)

    # Select diverse subset
    x = np.load(trainxpath)
    y = np.load(trainypath)
    xsummarized, ysummarized = summarize_data(x, y, points)

    np.save(os.path.join(datadir, "features"), xsummarized)
    np.save(os.path.join(datadir, "labels"), ysummarized)

    return datadir, modeldir
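# make_if_not_exist is called throughout these scripts but not shown here;
# it presumably just wraps os.makedirs so that repeated calls are safe.
# This is an assumed sketch, not the repository's actual implementation.
import os


def make_if_not_exist(path):
    os.makedirs(path, exist_ok=True)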
def main(submit):
    make_if_not_exist("learning_curves")
    for estimator in to_analyze:
        modelpath = underscore_join(["model", estimator])
        if check_if_model_exists(modelpath):
            for model in glob(os.path.join(modelpath, "*.joblib")):
                if "ensemble" in model:
                    p = Path(model)
                    modelbasename = os.path.join(
                        "learning_curves",
                        underscore_join([estimator, p.stem]))
                    make_if_not_exist(modelbasename)
                    for point in POINTS:
                        modelpointname = os.path.join(modelbasename, str(point))
                        make_if_not_exist(modelpointname)
                        command = write_run_command(
                            model,
                            underscore_join(["data", estimator]),
                            os.path.join(
                                underscore_join(["houldout", estimator]),
                                "valid"),
                            underscore_join(["houldout", estimator]),
                            modelpointname,
                            point,
                        )
                        submission_name = underscore_join(
                            [estimator, Path(modelpointname).stem])
                        write_slurmfile(submission_name, command)
                        if submit:
                            subprocess.call(
                                "sbatch submit_{}.slurm".format(submission_name),
                                shell=True,
                            )
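# Hypothetical sketches of the underscore_join and write_slurmfile helpers
# called above (write_run_command is likewise defined elsewhere). The SLURM
# header is a placeholder; resource requests and wall time are
# cluster-specific assumptions.
def underscore_join(parts):
    return "_".join(str(part) for part in parts)


def write_slurmfile(name, command):
    # writes submit_<name>.slurm, matching the sbatch call in main()
    script = "\n".join([
        "#!/bin/bash -l",
        "#SBATCH --job-name={}".format(name),
        "#SBATCH --ntasks=1",
        "#SBATCH --cpus-per-task=4",
        "#SBATCH --time=24:00:00",
        "",
        command,
        "",
    ])
    with open("submit_{}.slurm".format(name), "w") as fh:
        fh.write(script)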
def main(  # pylint:disable=too-many-arguments, too-many-locals
    model,
    xtrainpath,
    ytrainpath,
    xtestpath,
    ytestpath,
    featurenamespath,
    outpath,
    rounds,
    points,
    use_shap,
):
    # load model and data and also scale the data
    print("loading model and data")
    model = joblib.load(model)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(np.load(xtrainpath))
    y_train = np.load(ytrainpath)
    X_test = scaler.transform(np.load(xtestpath))
    y_test = np.load(ytestpath)

    X_train, y_train = summarize_data(X_train, y_train, points)
    X_test, y_test = summarize_data(X_test, y_test, points)

    # load the feature names
    feature_names = read_pickle(featurenamespath)

    permuation_importance_partial = partial(permuation_importance_wrapper,
                                            rounds=rounds,
                                            model=model)
    sets = [(X_train, y_train, "train"), (X_test, y_test, "test")]

    if not use_shap:
        print("starting permutation feature importance")
        # We run permutation feature importance for `rounds` repetitions,
        # using balanced accuracy as the metric
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            results = []
            for result in executor.map(permuation_importance_partial, sets):
                results.append(result)

        make_if_not_exist(outpath)
        with open(os.path.join(outpath, "permutation_feature_importance.pkl"),
                  "wb") as fh:
            pickle.dump(results, fh)
    else:
        print("starting SHAP feature importance")
        make_if_not_exist(outpath)
        explainer = shap.KernelExplainer(
            model.predict, X_train
        )  # note that we use the training set as the background dataset to integrate out features
        shap_values = explainer.shap_values(X_test)

        shap_values_df = pd.DataFrame()
        shap_values_df["feature"] = feature_names
        for i, shap_value in enumerate(shap_values):
            # average impact of each feature on the model output,
            # mean(abs(shap_values)), per target
            abs_mean_shap_values = np.mean(np.abs(shap_value), axis=0)
            expected_value = (explainer.expected_value[i]
                              if explainer.expected_value[i] is not None else None)
            shap_values_df["shap_value_target_{}".format(
                str(i))] = abs_mean_shap_values
            shap_values_df["expected_value_target_{}".format(
                str(i))] = expected_value

        joblib.dump(explainer, os.path.join(outpath, "shap_explainer"))
        shap_values_df.to_csv(os.path.join(outpath, "shap_df.csv"), index=False)
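# Hypothetical sketch of permuation_importance_wrapper (spelling kept to
# match the call in main()); the actual helper is defined elsewhere. It is
# assumed here to wrap sklearn.inspection.permutation_importance with
# balanced accuracy, as the comment in main() suggests.
from sklearn.inspection import permutation_importance


def permuation_importance_wrapper(data, rounds, model):
    x, y, label = data  # one of the (features, labels, "train"/"test") tuples
    result = permutation_importance(model,
                                    x,
                                    y,
                                    scoring="balanced_accuracy",
                                    n_repeats=rounds)
    return label, result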