def predict(self, force_refresh):
    """Build the prediction command line and pass it to ``self.classify``.

    The ``MODEL`` option is interpreted as a file location first (resolved
    through ``get_output_loc``); when no such file exists it is matched
    against the names of ``self.dependencies`` and replaced by that task's
    ``model_filename`` output.

    Returns ``False`` when ``MODEL`` is not set, otherwise whatever
    ``self.classify`` returns.
    """
    model = self.options.get("MODEL")
    if model is None:
        self.logger.error(
            "If you are in predict model, please specify a MODEL in OPTS. Either a file location or a training task name."
        )
        return False

    resolved_path = get_output_loc(model)
    if os.path.exists(resolved_path):
        model = resolved_path
    else:
        # If its not a file, it must be a task
        for t in self.dependencies:
            if model == t.name:
                self.logger.debug(
                    f"Found task dependency {t.name} with model file {t.output['model_filename']}"
                )
                model = t.output["model_filename"]
                break
        else:
            # Previously this case was silent and the raw option value ended
            # up in the command; keep going, but tell the user.
            self.logger.warning(
                f"MODEL {model} is neither an existing file ({resolved_path}) nor the name of a task dependency"
            )

    ia_types = self.get_simulation_dependency().output["types_dict"]["IA"]
    types = " ".join(str(a) for a in ia_types)
    if not types:
        # No Ia types listed by the simulation: fall back to "1".
        types = "1"

    command = (f"-p "
               f"--features {self.features} "
               f"--done_file {self.done_file} "
               f"--model {model} "
               f"--types {types} "
               f"--name {self.get_prob_column_name()} "
               f"--output {self.predictions_filename} "
               f"{self.fitres_file}")
    return self.classify(force_refresh, command)
def predict(self, force_refresh):
    """Assemble the restore/test command line and run it through ``self.classify``.

    ``MODEL`` from the options is used as a file location when
    ``get_output_loc(MODEL)`` exists; otherwise it is compared with the names
    of ``self.dependencies`` and swapped for the matching task's
    ``model_filename`` output.

    Returns ``False`` if ``MODEL`` is missing, else the result of
    ``self.classify``.
    """
    model = self.options.get("MODEL")
    if model is None:
        self.logger.error(
            "If you are in predict model, please specify a MODEL in OPTS. Either a file location or a training task name."
        )
        return False

    potential_path = get_output_loc(model)
    if not os.path.exists(potential_path):
        if "/" in model:
            self.logger.warning(
                f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}"
            )
        # If its not a file, it must be a task
        for dep in self.dependencies:
            if model != dep.name:
                continue
            self.logger.debug(
                f"Found task dependency {dep.name} with model file {dep.output['model_filename']}"
            )
            model = dep.output["model_filename"]
    else:
        self.logger.debug(f"Found existing model file at {potential_path}")
        model = potential_path

    # Pieces are concatenated verbatim; each one carries its own trailing
    # space where a separator is wanted.
    pieces = [
        "--nc 4 ",
        "--nclass 2 ",
        f"--ft {self.features} ",
        "--restore ",
        f"--pklfile {model} ",
        "--pklformat FITRES ",
        f"{self.get_rf_conf()}",
        f"--test {self.fitres_file} ",
        f"--filedir {self.output_dir} ",
        f"--done_file {self.done_file} ",
        "--use_filenames ",
    ]
    command = "".join(pieces)
    return self.classify(force_refresh, command)
def __init__(self, name, output_dir, dependencies, mode, options):
    """Initialise paths, the network variant and the batch-script template.

    ``VARIANT`` is read from ``options`` (default ``"vanilla"``), lower-cased,
    and must be one of vanilla, variational or bayesian.  The conda
    environment and classifier location come from the ``SuperNNova`` section
    of the global config.
    """
    super().__init__(name, output_dir, dependencies, mode, options)
    self.global_config = get_config()
    snn_config = self.global_config["SuperNNova"]
    self.conda_env = snn_config["conda_env"]
    self.path_to_classifier = get_output_loc(snn_config["location"])

    self.dump_dir = output_dir + "/dump"
    self.job_base_name = os.path.basename(output_dir)
    self.tmp_output = None
    self.done_file = os.path.join(self.output_dir, "done_task.txt")

    allowed_variants = ["vanilla", "variational", "bayesian"]
    self.variant = options.get("VARIANT", "vanilla").lower()
    assert self.variant in allowed_variants, \
        f"Variant {self.variant} is not vanilla, variational or bayesian"

    # Batch script with str.format-style placeholders; the shebang has to be
    # the very first line, so the literal starts right after the quotes.
    self.slurm = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --time=15:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --partition=gpu2
#SBATCH --gres=gpu:1
#SBATCH --output=output.log
#SBATCH --account=pi-rkessler
#SBATCH --mem=64GB
source activate {conda_env}
module load cuda
echo `which python`
cd {path_to_classifier}
python run.py --data --sntypes '{sntypes}' --dump_dir {dump_dir} --raw_dir {photometry_dir} {fit_dir} {phot} {clump} {test_or_train}
python run.py --use_cuda {cyclic} --sntypes '{sntypes}' --done_file {done_file} --dump_dir {dump_dir} {cyclic} {variant} {model} {phot} {command}
"""
def __init__(self, name, output_dir, dependencies, options, recal_aggtask):
    """Collect the classifier dependencies and prepare the output file names.

    One merged csv, one calibration csv and one list of ``.key`` files (one
    per light-curve-fit name) are planned for every entry in the simulation
    task's ``sim_folders`` output.  ``PLOT`` may be a boolean, in which case
    the bundled ``external/aggregator_plot.py`` is used, or a path to a
    plotting script; a missing script is reported through
    ``Task.fail_config``.
    """
    super().__init__(name, output_dir, dependencies=dependencies)
    self.passed = False
    self.options = options
    self.recal_aggtask = recal_aggtask
    self.id = "CID"
    self.type_name = "SNTYPE"

    self.classifiers = [dep for dep in dependencies if isinstance(dep, Classifier)]
    self.lcfit_deps = [clas.get_fit_dependency(output=False) for clas in self.classifiers]
    unique_names = set(fit.output["name"] for fit in self.lcfit_deps if fit is not None)
    self.lcfit_names = list(unique_names)
    # The output records the real (possibly empty) list; the dummy entry
    # below is only used for building file names on this instance.
    self.output["lcfit_names"] = self.lcfit_names
    if not self.lcfit_names:
        self.logger.debug(
            "No jobs depend on the LCFIT, so adding a dummy one")
        self.lcfit_names = [""]

    self.sim_task = self.get_underlying_sim_task()
    self.output["sim_name"] = self.sim_task.name
    self.num_versions = len(self.sim_task.output["sim_folders"])

    self.output_dfs = []
    self.output_dfs_key = []
    self.output_cals = []
    for i in range(self.num_versions):
        self.output_dfs.append(os.path.join(self.output_dir, f"merged_{i}.csv"))
        self.output_dfs_key.append([
            os.path.join(self.output_dir, f"merged_{fit_name}_{i}.key")
            for fit_name in self.lcfit_names
        ])
        self.output_cals.append(os.path.join(self.output_dir, f"calibration_{i}.csv"))

    self.include_type = bool(options.get("INCLUDE_TYPE", False))
    self.plot = options.get("PLOT", True)
    self.plot_all = options.get("PLOT_ALL", False)
    self.output["classifiers"] = self.classifiers
    self.output["calibration_files"] = self.output_cals

    if isinstance(self.plot, bool):
        # Default script sits in external/ next to the file defining this method.
        script = os.path.dirname(
            inspect.stack()[0][1]) + "/external/aggregator_plot.py"
    else:
        script = self.plot
    self.python_file = get_output_loc(script)

    if not os.path.exists(self.python_file):
        Task.fail_config(
            f"Attempting to find python file {self.python_file} but it's not there!"
        )
def __init__(self, name, output_dir, dependencies, mode, options): super().__init__(name, output_dir, dependencies, mode, options) self.global_config = get_config() self.num_jobs = 4 self.conda_env = self.global_config["ArgonneClassifier"]["conda_env"] self.path_to_classifier = get_output_loc(self.global_config["ArgonneClassifier"]["location"]) self.job_base_name = os.path.basename(output_dir) self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR") self.model_pk_file = "modelpkl.pkl" self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file) self.slurm = """#!/bin/bash
def predict(self, force_refresh):
    """Apply a trained model with ``nearnbr_apply.exe`` and record the outcome.

    ``MODEL`` from the options may be the name of a task in
    ``self.dependencies`` (its ``model_filename`` output is used) or a file
    location; either way the result is resolved with ``get_output_loc``.

    The executable is only run when ``force_refresh`` is set or the hash of
    ``self.name + model_path`` differs from the stored one.  Its output goes
    to ``self.logging_file`` and ``SUCCESS`` / ``FAILURE`` is written to
    ``self.done_file`` according to the return code.

    Returns ``False`` when the model file cannot be found, ``True`` otherwise
    (including when the executable itself fails).

    Raises ``AssertionError`` if ``MODEL`` is not set.
    """
    train_info = self.get_fit_dependency()
    model = self.options.get("MODEL")
    assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
    for t in self.dependencies:
        if model == t.name:
            self.logger.debug(
                f"Found task dependency {t.name} with model file {t.output['model_filename']}"
            )
            model = t.output["model_filename"]

    model_path = get_output_loc(model)
    self.logger.debug(f"Looking for model in {model_path}")
    if not os.path.exists(model_path):
        self.logger.error(f"Cannot find {model_path}")
        return False

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(self.name + model_path)

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating")
        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        # Build argv as a list: splitting a formatted string on spaces breaks
        # as soon as any of the paths contains a space.
        argv = [
            'nearnbr_apply.exe',
            '-inFile_data', str(train_info["fitres_file"]),
            '-inFile_MLpar', str(model_path),
            '-outFile', str(self.outfile_predict),
            '-varName_prob', str(self.get_prob_column_name()),
        ]
        cmd_job = " ".join(argv)
        self.logger.debug(f"Executing command {cmd_job}")
        with open(self.logging_file, "w") as f:
            val = subprocess.run(argv,
                                 stdout=f,
                                 stderr=subprocess.STDOUT,
                                 cwd=self.output_dir)
        with open(self.done_file, "w") as f:
            f.write("SUCCESS" if val.returncode == 0 else "FAILURE")
    else:
        self.logger.debug("Not regenerating")
    return True
def __init__(self, name, output_dir, options, dependencies=None): super().__init__(name, output_dir, dependencies=dependencies) self.options = options self.global_config = get_config() self.logfile = os.path.join(self.output_dir, "output.log") self.conda_env = self.global_config["DataSkimmer"]["conda_env"] self.path_to_task = output_dir self.raw_dir = self.options.get("RAW_DIR") self.genversion = os.path.basename(self.raw_dir) self.data_path = os.path.dirname(self.raw_dir) self.job_name = f"DATAPREP_{self.name}" self.output["genversion"] = self.genversion self.output["data_path"] = self.data_path self.output["photometry_dir"] = get_output_loc(self.raw_dir) self.output["raw_dir"] = self.raw_dir self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT") self.output["clump_file"] = self.clump_file self.slurm = """#!/bin/bash #SBATCH --job-name={job_name} #SBATCH --time=0:20:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --partition=broadwl #SBATCH --output={log_file} #SBATCH --account=pi-rkessler #SBATCH --mem=2GB cd {path_to_task} snana.exe clump.nml if [ $? -eq 0 ]; then echo SUCCESS > {done_file} else echo FAILURE > {done_file} fi """ self.clump_command = """#
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) self.global_config = get_config() self.num_jobs = 4 self.conda_env = self.global_config["SNIRF"]["conda_env"] self.path_to_classifier = get_output_loc( self.global_config["SNIRF"]["location"]) self.job_base_name = os.path.basename( Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir) self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR") self.validate_model() self.model_pk_file = "model.pkl" self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file) self.fitopt = options.get("FITOPT", "DEFAULT") self.fitres_filename = None self.fitres_file = None self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) self.batch_replace = self.options.get("BATCH_REPLACE", {}) self.slurm = """{sbatch_header}
def __init__(self, name, output_dir, dependencies, mode, options, index=0, model_name=None): super().__init__(name, output_dir, dependencies, mode, options, index=index, model_name=model_name) self.global_config = get_config() self.num_jobs = 4 self.conda_env = self.global_config["SNIRF"]["conda_env"] self.path_to_classifier = get_output_loc( self.global_config["SNIRF"]["location"]) self.job_base_name = os.path.basename( Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir) self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR") self.validate_model() self.model_pk_file = "model.pkl" self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file) self.fitopt = options.get("FITOPT", "DEFAULT") lcfit = self.get_fit_dependency() self.fitres_filename = lcfit["fitopt_map"][self.fitopt] self.fitres_file = os.path.abspath( os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename)) self.slurm = """#!/bin/bash
def predict(self, force_refresh):
    """Assemble the restore/test command line and run it through ``self.classify``.

    ``MODEL`` from the options is used as a file location when
    ``get_output_loc(MODEL)`` exists, in which case the *resolved* location
    is what goes into ``--pklfile``.  Otherwise it is matched against the
    names of ``self.dependencies`` and replaced by that task's
    ``model_filename`` output.

    Returns ``False`` if ``MODEL`` is missing, else the result of
    ``self.classify``.
    """
    model = self.options.get("MODEL")
    if model is None:
        self.logger.error("If you are in predict model, please specify a MODEL in OPTS. Either a file location or a training task name.")
        return False

    potential_path = get_output_loc(model)
    if os.path.exists(potential_path):
        # Hand the classifier the path that was actually checked, not the
        # raw option string.
        self.logger.debug(f"Found existing model file at {potential_path}")
        model = potential_path
    else:
        if "/" in model:
            self.logger.warning(f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}")
        # If its not a file, it must be a task
        for t in self.dependencies:
            if model == t.name:
                self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}")
                model = t.output["model_filename"]

    command = (
        f"--nc 4 "
        f"--nclass 2 "
        f"--ft {self.features} "
        f"--restore "
        f"--pklfile {model} "
        f"--pklformat FITRES "
        f"--test {self.get_fits_file()} "
        f"--filedir {self.output_dir} "
        f"--done_file {self.done_file} "
        f"--use_filenames "
    )
    return self.classify(force_refresh, command)
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None):
    """Read the SuperNNova options, optional yml inputs and batch settings.

    Options read here (with defaults): ``VARIANT`` ("vanilla"), ``REDSHIFT``
    (True), ``NORM`` ("cosmo"), ``CYCLIC`` (True), ``SEED`` (0),
    ``BATCH_SIZE`` (128), ``NUM_LAYERS`` (2), ``HIDDEN_DIM`` (32),
    ``DATA_YML`` / ``CLASSIFICATION_YML`` (None), ``BATCH_FILE`` (None) and
    ``BATCH_REPLACE`` ({}).  ``GPU`` and ``CLEAN`` come from ``config``.

    ``DATA_YML`` and ``CLASSIFICATION_YML`` must be given together.  When
    both are present their contents are loaded and the variant is taken from
    the classification yml; when only one is present an error is logged and
    the task is set up as if no yml had been given.

    Raises ``AssertionError`` for an unsupported ``NORM`` or variant.
    """
    super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.dump_dir = output_dir + "/dump"
    self.job_base_name = os.path.basename(output_dir)
    self.gpu = config.get("GPU", True)
    self.tmp_output = None
    self.done_file = os.path.join(self.output_dir, "done_task.txt")
    self.done_file2 = os.path.join(self.output_dir, "done_task2.txt")
    self.variant = options.get("VARIANT", "vanilla").lower()
    self.redshift = "zspe" if options.get("REDSHIFT", True) else "none"
    self.norm = options.get("NORM", "cosmo")
    self.cyclic = options.get("CYCLIC", True)
    self.seed = options.get("SEED", 0)
    self.clean = config.get("CLEAN", True)
    self.batch_size = options.get("BATCH_SIZE", 128)
    self.num_layers = options.get("NUM_LAYERS", 2)
    self.hidden_dim = options.get("HIDDEN_DIM", 32)

    # Setup yml files
    self.data_yml_file = options.get("DATA_YML", None)
    self.output_data_yml = os.path.join(self.output_dir, "data.yml")
    self.classification_yml_file = options.get("CLASSIFICATION_YML", None)
    self.output_classification_yml = os.path.join(self.output_dir, "classification.yml")

    # Start from the "no yml" state so every branch below leaves these
    # attributes defined.
    self.data_yml = None
    self.classification_yml = None
    self.has_yml = False

    # XOR - only runs if either but not both yml's are None
    if (self.data_yml_file is None) ^ (self.classification_yml_file is None):
        # Report the option values themselves; the loaded contents do not
        # exist at this point.
        self.logger.error(
            f"If using yml inputs, both 'DATA_YML' (currently {self.data_yml_file}) and 'CLASSIFICATION_YML' (currently {self.classification_yml_file}) must be provided"
        )
    elif self.data_yml_file is not None:
        with open(self.data_yml_file, 'r') as f:
            self.data_yml = f.read()
        with open(self.classification_yml_file, 'r') as f:
            self.classification_yml = f.read()
        self.has_yml = True
        self.variant = self.get_variant_from_yml(self.classification_yml)

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.validate_model()

    assert self.norm in [
        "global",
        "cosmo",
        "perfilter",
        "cosmo_quantile",
        "none",
    ], f"Norm option is set to {self.norm}, needs to be one of 'global', 'cosmo', 'perfilter', 'cosmo_quantile', 'none'"
    assert self.variant in [
        "vanilla", "variational", "bayesian"
    ], f"Variant {self.variant} is not vanilla, variational or bayesian"

    # Two-part template: batch header followed by the task setup text.
    self.slurm = """{sbatch_header}
{task_setup}
"""
    self.conda_env = self.global_config["SuperNNova"]["conda_env"]
    self.path_to_classifier = get_output_loc(
        self.global_config["SuperNNova"]["location"])
def classify(self, training):
    """Write the SuperNNova batch script and submit it with ``sbatch``.

    In prediction mode (``training`` false) the ``MODEL`` option is resolved,
    first as a file location and otherwise as the name of a dependency whose
    ``model_filename`` output is used.  The supernova type mapping comes from
    ``self.get_types()``; when that yields nothing a built-in table is used,
    otherwise an Ia entry and a core-collapse entry are injected if missing.

    The script text is hashed; when ``self._check_regenerate`` says nothing
    changed, ``self.should_be_done()`` is called instead of resubmitting.

    Returns ``True``.  Raises ``AssertionError`` in prediction mode when no
    model is given or the model file does not exist.
    """
    model = self.options.get("MODEL")
    model_path = ""
    if not training:
        assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
        if not os.path.exists(get_output_loc(model)):
            for dep in self.dependencies:
                if model != dep.name:
                    continue
                self.logger.debug(
                    f"Found task dependency {dep.name} with model file {dep.output['model_filename']}"
                )
                model = dep.output["model_filename"]
        model_path = get_output_loc(model)
        self.logger.debug(f"Looking for model in {model_path}")
        assert os.path.exists(model_path), f"Cannot find {model_path}"

    types = self.get_types()
    if types is not None:
        self.logger.debug(f"Input types set to {types}")
        labels = [value.upper() for value in types.values()]
        has_ia = "IA" in labels
        has_cc = any(label in ["II", "IBC"] for label in labels)

        if not has_ia:
            self.logger.debug("No Ia type found, injecting type")
            types[1] = "Ia"

            def _ia_first(item):
                # Sort by type code, forcing the injected code 1 to the front.
                code = item[0]
                return -1 if code == 1 else code

            types = dict(sorted(types.items(), key=_ia_first))
            self.logger.debug(f"Inject types with Ias are {types}")
        if not has_cc:
            self.logger.debug("No cc type found, injecting type")
            types[29] = "II"
    else:
        types = OrderedDict([
            ("1", "Ia"),
            ("0", "unknown"),
            ("2", "SNIax"),
            ("3", "SNIa-pec"),
            ("20", "SNIIP"),
            ("21", "SNIIL"),
            ("22", "SNIIn"),
            ("29", "SNII"),
            ("32", "SNIb"),
            ("33", "SNIc"),
            ("39", "SNIbc"),
            ("41", "SLSN-I"),
            ("42", "SLSN-II"),
            ("43", "SLSN-R"),
            ("80", "AGN"),
            ("81", "galaxy"),
            ("98", "None"),
            ("99", "pending"),
            ("101", "Ia"),
            ("120", "SNII"),
            ("130", "SNIbc"),
        ])
    str_types = json.dumps(types)
    self.logger.debug(f"Types set to {str_types}")

    sim_dep = self.get_simulation_dependency()
    photometry_dir = sim_dep.output["photometry_dirs"][self.index]
    self.raw_dir = photometry_dir

    fit = self.get_fit_dependency()
    if fit is None:
        fit_dir = ""
    else:
        fit_dir = f"--fits_dir {fit['fitres_dirs'][self.index]}"

    if self.variant in ["vanilla", "variational"] and self.cyclic:
        cyclic = "--cyclic"
    else:
        cyclic = ""
    batch_size = f"--batch_size {self.batch_size}"
    num_layers = f"--num_layers {self.num_layers}"
    hidden_dim = f"--hidden_dim {self.hidden_dim}"
    variant = f"--model {self.variant}"
    if self.variant == "bayesian":
        variant += " --num_inference_samples 20"

    clump = sim_dep.output.get("clump_file")
    clump_txt = "" if clump is None else f"--photo_window_files {clump}"

    # Batch header: user-supplied file if given, else the gpu/cpu default.
    if self.batch_file is not None:
        with open(self.batch_file, 'r') as f:
            self.sbatch_header = f.read()
    elif self.gpu:
        self.sbatch_header = self.sbatch_gpu_header
    else:
        self.sbatch_header = self.sbatch_cpu_header
    self.sbatch_header = self.clean_header(self.sbatch_header)

    if self.has_yml:
        self.update_yml()
        setup_file = "supernnova_yml"
    else:
        setup_file = "supernnova"

    header_dict = {
        "REPLACE_NAME": self.job_base_name,
        "REPLACE_WALLTIME": "23:00:00",
        "REPLACE_LOGFILE": "output.log",
        "REPLACE_MEM": "32GB",
        "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"]
    }
    header_dict = merge_dict(header_dict, self.batch_replace)
    self.update_header(header_dict)

    setup_dict = {
        "conda_env": self.conda_env,
        "dump_dir": self.dump_dir,
        "photometry_dir": photometry_dir,
        "fit_dir": fit_dir,
        "path_to_classifier": self.path_to_classifier,
        "job_name": self.job_base_name,
        "command": "--train_rnn" if training else "--validate_rnn",
        "sntypes": str_types,
        "variant": variant,
        "cyclic": cyclic,
        "model": "" if training else f"--model_files {model_path}",
        "phot": "",
        "test_or_train": "" if training else "--data_testing",
        "redshift": "--redshift " + self.redshift,
        "norm": "--norm " + self.norm,
        "done_file": self.done_file,
        "clump": clump_txt,
        "done_file2": self.done_file2,
        "partition": "gpu2" if self.gpu else "broadwl",
        "gres": "#SBATCH --gres=gpu:1" if self.gpu else "",
        "cuda": "--use_cuda" if self.gpu else "",
        "clean_command": f"rm -rf {self.dump_dir}/processed" if self.clean else "",
        "seed": f"--seed {self.seed}" if self.seed else "",
        "batch_size": batch_size,
        "num_layers": num_layers,
        "hidden_dim": hidden_dim,
        "data_yml": self.output_data_yml,
        "classification_yml": self.output_classification_yml,
        "classification_command": "train_rnn" if training else "validate_rnn"
    }

    format_dict = {
        "sbatch_header": self.sbatch_header,
        "task_setup": self.update_setup(setup_dict, self.task_setup[setup_file])
    }

    slurm_output_file = self.output_dir + "/job.slurm"
    self.logger.info(
        f"Running SuperNNova, slurm job outputting to {slurm_output_file}")
    slurm_text = self.slurm.format(**format_dict)

    new_hash = self.get_hash_from_string(slurm_text)

    if self._check_regenerate(new_hash):
        self.logger.info("Rerunning. Cleaning output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        if self.has_yml:
            with open(self.output_data_yml, 'w') as f:
                f.write(self.data_yml)
            with open(self.output_classification_yml, 'w') as f:
                f.write(self.classification_yml)

        self.save_new_hash(new_hash)

        with open(slurm_output_file, "w") as f:
            f.write(slurm_text)

        self.logger.info(
            f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova"
        )
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.should_be_done()

    return True
def __init__(self, name, output_dir, options, global_config, dependencies=None): super().__init__(name, output_dir, dependencies=dependencies) self.options = options self.global_config = global_config self.job_name = os.path.basename( Path(output_dir).parents[1]) + "_COSMOMC_" + name self.logfile = os.path.join(self.output_dir, "output.log") self.path_to_cosmomc = get_output_loc( self.global_config["CosmoMC"]["location"]) self.create_cov_dep = self.get_dep(CreateCov) self.blind = self.create_cov_dep.output[ "blind"] if self.create_cov_dep is not None else self.options.get( "BLIND", False) assert isinstance( self.blind, (bool, np.bool_)), "Blind should be set to a boolan value!" self.ini_prefix = options.get("INI") self.static = self.ini_prefix in ["cmb_omw", "cmb_omol"] self.static_path = "cosmomc_static_chains/" if self.create_cov_dep is None: self.ini_files = [f"{self.ini_prefix}.ini"] self.num_walkers = 4 self.covopts = ["ALL"] self.covopts_numbers = [0] self.labels = [self.name] self.num_jobs = 1 else: self.num_walkers = options.get("NUM_WALKERS", 8) avail_cov_opts = self.create_cov_dep.output["covopts"] self.covopts = options.get("COVOPTS") or list( avail_cov_opts.keys()) self.covopts_numbers = [avail_cov_opts[k] for k in self.covopts] self.ini_files = [ f"{self.ini_prefix}_{num}.ini" for num in self.covopts_numbers ] self.output["hubble_plot"] = self.create_cov_dep.output[ "hubble_plot"] self.output["bcor_name"] = self.create_cov_dep.output["bcor_name"] self.labels = [self.name + "_" + c for c in self.covopts] self.num_jobs = len(self.covopts) self.chain_dir = os.path.join(self.output_dir, "chains/") self.param_dict = { l: os.path.join(self.chain_dir, i.replace(".ini", ".paramnames")) for l, i in zip(self.covopts, self.ini_files) } self.done_files = [f"done_{num}.txt" for num in self.covopts_numbers] self.chain_dict = { l: os.path.join(self.chain_dir, i.replace(".ini", f"_{n + 1}.txt")) for l, i in zip(self.covopts, self.ini_files) for n in 
range(self.num_walkers) } self.base_dict = { l: os.path.join(self.chain_dir, i.replace(".ini", "")) for l, i in zip(self.covopts, self.ini_files) for n in range(self.num_walkers) } self.output["chain_dir"] = self.chain_dir self.output["param_dict"] = self.param_dict self.output["chain_dict"] = self.chain_dict self.output["base_dict"] = self.base_dict self.output["covopts"] = self.covopts self.output["blind"] = self.blind self.output["label"] = (self.options.get( "LABEL", f"({' + '.join(self.ini_prefix.upper().split('_')[:-1])})") + " " + (self.create_cov_dep.output["name"] if self.create_cov_dep is not None else "")) # TODO: Better logic here please final = self.ini_prefix.split("_")[-1] ps = { "omw": ["omegam", "w"], "omol": ["omegam", "omegal"], "wnu": ["w", "nu"], "wwa": ["w", "wa"] } self.output["cosmology_params"] = ps[final] self.slurm = """#!/bin/bash
def __init__(self, name, output_dir, dependencies, mode, options, index=0, model_name=None):
    """Initialise paths, options and the batch-script template.

    Options read here (with defaults): ``VARIANT`` ("vanilla", lower-cased),
    ``REDSHIFT`` (True, mapped to "zspe" / "none") and ``NORM`` ("cosmo").
    The conda environment and classifier location come from the
    ``SuperNNova`` section of the global config.

    Raises ``AssertionError`` for an unsupported ``NORM`` or variant.
    """
    super().__init__(name, output_dir, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.dump_dir = output_dir + "/dump"
    self.job_base_name = os.path.basename(output_dir)
    self.gpu = True
    self.tmp_output = None
    self.done_file = os.path.join(self.output_dir, "done_task.txt")
    self.done_file2 = os.path.join(self.output_dir, "done_task2.txt")

    self.variant = options.get("VARIANT", "vanilla").lower()
    use_redshift = options.get("REDSHIFT", True)
    self.redshift = "zspe" if use_redshift else "none"
    self.norm = options.get("NORM", "cosmo")
    self.validate_model()

    allowed_norms = ["global", "cosmo", "perfilter"]
    allowed_variants = ["vanilla", "variational", "bayesian"]
    assert self.norm in allowed_norms, \
        f"Norm option is set to {self.norm}, needs to be one of 'global', 'cosmo', 'perfilter'"
    assert self.variant in allowed_variants, \
        f"Variant {self.variant} is not vanilla, variational or bayesian"

    # Batch script with str.format-style placeholders.  The first python call
    # is checked via $?; on failure FAILURE is written to {done_file2} and
    # the second call is skipped.
    self.slurm = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --time=23:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --partition=gpu2
#SBATCH --gres=gpu:1
#SBATCH --output=output.log
#SBATCH --account=pi-rkessler
#SBATCH --mem=64GB
source activate {conda_env}
module load cuda
echo `which python`
cd {path_to_classifier}
echo "#################TIMING Starting here: `date`"
python run.py --data --sntypes '{sntypes}' --dump_dir {dump_dir} --raw_dir {photometry_dir} {fit_dir} {phot} {clump} {norm} {test_or_train}
if [ $? -ne 0 ]; then
    echo FAILURE > {done_file2}
else
    echo "#################TIMING Database done now, starting classifier: `date`"
    python run.py --use_cuda {cyclic} --sntypes '{sntypes}' --done_file {done_file} --batch_size 20 --dump_dir {dump_dir} {cyclic} {variant} {model} {phot} {redshift} {norm} {command}
    if [ $? -eq 0 ]; then
        rm -rf {dump_dir}/processed
        echo SUCCESS > {done_file2}
    else
        echo FAILURE > {done_file2}
    fi
fi
echo "#################TIMING Classifier finished: `date`"
"""
    snn_config = self.global_config["SuperNNova"]
    self.conda_env = snn_config["conda_env"]
    self.path_to_classifier = get_output_loc(snn_config["location"])
def classify(self, training, force_refresh):
    """Fill in the batch-script template and submit it with ``sbatch``.

    In prediction mode (``training`` false) the ``MODEL`` option is resolved,
    first as a file location and otherwise as the name of a dependency whose
    ``model_filename`` output is used.  The supernova type mapping comes from
    ``self.get_types()``; when that yields nothing a built-in table is used,
    otherwise an Ia entry (moved to the front) and a core-collapse entry are
    injected if missing.

    The formatted script is hashed.  Unless ``force_refresh`` is set, an
    unchanged hash means nothing is resubmitted and ``self.should_be_done()``
    is called instead.

    Returns ``True``.  Raises ``AssertionError`` in prediction mode when no
    model is given or the model file does not exist.
    """
    model = self.options.get("MODEL")
    model_path = ""
    if not training:
        assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
        if not os.path.exists(get_output_loc(model)):
            for dep in self.dependencies:
                if model != dep.name:
                    continue
                self.logger.debug(
                    f"Found task dependency {dep.name} with model file {dep.output['model_filename']}"
                )
                model = dep.output["model_filename"]
        model_path = get_output_loc(model)
        self.logger.debug(f"Looking for model in {model_path}")
        assert os.path.exists(model_path), f"Cannot find {model_path}"

    types = self.get_types()
    if types is not None:
        self.logger.debug(f"Input types set to {types}")
        labels = [value.upper() for value in types.values()]
        has_ia = "IA" in labels
        has_cc = any(label in ["II", "IBC"] for label in labels)
        if not has_ia:
            self.logger.debug("No Ia type found, injecting type")
            types.update({"1": "Ia"})
            types.move_to_end("1", last=False)
        if not has_cc:
            self.logger.debug("No cc type found, injecting type")
            types.update({"29": "II"})
    else:
        types = OrderedDict([
            ("1", "Ia"),
            ("0", "unknown"),
            ("2", "SNIax"),
            ("3", "SNIa-pec"),
            ("20", "SNIIP"),
            ("21", "SNIIL"),
            ("22", "SNIIn"),
            ("29", "SNII"),
            ("32", "SNIb"),
            ("33", "SNIc"),
            ("39", "SNIbc"),
            ("41", "SLSN-I"),
            ("42", "SLSN-II"),
            ("43", "SLSN-R"),
            ("80", "AGN"),
            ("81", "galaxy"),
            ("98", "None"),
            ("99", "pending"),
            ("101", "Ia"),
            ("120", "SNII"),
            ("130", "SNIbc"),
        ])
    str_types = json.dumps(types)
    self.logger.debug(f"Types set to {str_types}")

    sim_dep = self.get_simulation_dependency()
    photometry_dir = sim_dep.output["photometry_dirs"][self.index]

    fit = self.get_fit_dependency()
    if fit is None:
        fit_dir = ""
    else:
        fit_dir = f"--fits_dir {fit['fitres_dirs'][self.index]}"

    if self.variant in ["vanilla", "variational"]:
        cyclic = "--cyclic"
    else:
        cyclic = ""
    variant = f"--model {self.variant}"
    if self.variant == "bayesian":
        variant += " --num_inference_samples 20"

    clump = sim_dep.output.get("clump_file")
    clump_txt = "" if clump is None else f"--photo_window_files {clump}"

    format_dict = {
        "conda_env": self.conda_env,
        "dump_dir": self.dump_dir,
        "photometry_dir": photometry_dir,
        "fit_dir": fit_dir,
        "path_to_classifier": self.path_to_classifier,
        "job_name": self.job_base_name,
        "command": "--train_rnn" if training else "--validate_rnn",
        "sntypes": str_types,
        "variant": variant,
        "cyclic": cyclic,
        "model": "" if training else f"--model_files {model_path}",
        "phot": "",
        "test_or_train": "" if training else "--data_testing",
        "redshift": "--redshift " + self.redshift,
        "norm": "--norm " + self.norm,
        "done_file": self.done_file,
        "clump": clump_txt,
        "done_file2": self.done_file2,
    }

    slurm_output_file = self.output_dir + "/job.slurm"
    self.logger.info(
        f"Running SuperNNova, slurm job outputting to {slurm_output_file}")
    slurm_text = self.slurm.format(**format_dict)

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(slurm_text)

    if force_refresh or new_hash != old_hash:
        self.logger.info("Rerunning. Cleaning output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        with open(slurm_output_file, "w") as f:
            f.write(slurm_text)

        self.logger.info(
            f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova"
        )
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
        self.should_be_done()

    return True
def __init__(self, name, output_dir, config, dependencies, options, recal_aggtask):
    """Collect classifier dependencies, plan output files and build the merge map.

    One merged csv, one calibration csv and one list of ``.key`` files (one
    per light-curve-fit name) are planned for every entry in the simulation
    task's ``sim_folders`` output.  ``PLOT`` may be a boolean, in which case
    the bundled ``external/aggregator_plot.py`` is used, or a path to a
    plotting script.

    ``MERGE_CLASSIFIERS`` in the task config maps a probability column name
    to one or more substrings; a classifier whose output name contains any of
    them is assigned that column (prefixed with ``PROB_`` if needed).
    Classifiers matching nothing keep their own probability column name, and
    matching more than one key is reported through ``Task.fail_config``.
    """
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.passed = False
    self.options = options
    self.recal_aggtask = recal_aggtask
    self.id = "CID"
    self.type_name = "SNTYPE"

    self.classifiers = [dep for dep in dependencies if isinstance(dep, Classifier)]
    self.lcfit_deps = [clas.get_fit_dependency(output=False) for clas in self.classifiers]
    unique_names = set(fit.output["name"] for fit in self.lcfit_deps if fit is not None)
    self.lcfit_names = list(unique_names)
    # The output records the real (possibly empty) list; the dummy entry
    # below is only used for building file names on this instance.
    self.output["lcfit_names"] = self.lcfit_names
    if not self.lcfit_names:
        self.logger.debug("No jobs depend on the LCFIT, so adding a dummy one")
        self.lcfit_names = [""]

    self.sim_task = self.get_underlying_sim_task()
    self.output["sim_name"] = self.sim_task.name
    self.num_versions = len(self.sim_task.output["sim_folders"])

    self.output_dfs = []
    self.output_dfs_key = []
    self.output_cals = []
    for i in range(self.num_versions):
        self.output_dfs.append(os.path.join(self.output_dir, f"merged_{i}.csv"))
        self.output_dfs_key.append(
            [os.path.join(self.output_dir, f"merged_{fit_name}_{i}.key") for fit_name in self.lcfit_names]
        )
        self.output_cals.append(os.path.join(self.output_dir, f"calibration_{i}.csv"))

    self.include_type = bool(options.get("INCLUDE_TYPE", False))
    self.plot = options.get("PLOT", False)
    self.plot_all = options.get("PLOT_ALL", False)
    self.output["classifier_names"] = [clas.name for clas in self.classifiers]
    self.output["classifier_indexes"] = [clas.index for clas in self.classifiers]
    self.output["calibration_files"] = self.output_cals
    self.output["empty_agg"] = False

    if isinstance(self.plot, bool):
        # Default script sits in external/ next to the file defining this method.
        script = os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py"
    else:
        script = self.plot
    self.python_file = get_output_loc(script)

    if not os.path.exists(self.python_file):
        Task.fail_config(f"Attempting to find python file {self.python_file} but it's not there!")

    merge_classifiers = self.config.get("MERGE_CLASSIFIERS")
    self.classifier_merge = {clas.output['name']: clas.get_prob_column_name() for clas in self.classifiers}
    if merge_classifiers is not None:
        self.classifier_merge = dict()
        for clas in self.classifiers:
            matched_columns = []
            for key in merge_classifiers.keys():
                mask_list = ensure_list(merge_classifiers[key])
                if not any(mask in clas.output['name'] for mask in mask_list):
                    continue
                column = key if key[:5] == "PROB_" else "PROB_" + key
                matched_columns.append(column)

            if len(matched_columns) == 0:
                self.classifier_merge[clas.output['name']] = clas.get_prob_column_name()
            elif len(matched_columns) == 1:
                self.classifier_merge[clas.output['name']] = matched_columns[0]
            else:
                Task.fail_config(f"Classifier task {clas.output['name']} matched multiple MERGE_CLASSIFIERS keys: {matched_columns}. Please provide more specific keys")
    self.logger.debug(f"Classifier merge = {self.classifier_merge}")
    self.output["classifier_merge"] = self.classifier_merge
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create the classifier tasks described by the ``CLASSIFICATION`` config section.

    For every entry in ``c["CLASSIFICATION"]`` the classifier class is looked
    up through ``ClassifierFactory`` and one task is created per matching
    (simulation, light curve fit) combination and per sim/fit version.

    Args:
        c: Full configuration mapping; only ``CLASSIFICATION`` is read.
        prior_tasks: Previously created tasks, searched for sim and LC fit tasks.
        base_output_dir: Root directory under which ``{stage_number}_CLAS`` lives.
        stage_number: Stage index used in the output directory name.
        prefix: Unused here; kept for interface compatibility.
        global_config: Unused here; kept for interface compatibility.

    Returns:
        list: The classifier tasks that were created, in creation order.

    Raises:
        AssertionError: If MODE is not train/predict, if train mode is used
            with a classifier that cannot train, if a training sim/fit has
            more than one version, or if a MODEL task has a different class.
        ValueError: If a run has neither a sim task nor an LC fit task.
    """
    from pippin.classifiers.factory import ClassifierFactory

    def _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=None, extra=None):
        # The sim name is only used when there is no fit name to identify the run.
        sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name
        fit_name = "" if fit_name is None else "_" + fit_name
        extra_name = "" if extra is None else "_" + extra
        index = "" if index is None else f"_{index}"
        return f"{base_output_dir}/{stage_number}_CLAS/{clas_name}{index}{sim_name}{fit_name}{extra_name}"

    def get_num_ranseed(sim_task, lcfit_task):
        # Number of versions, taken from the sim when present, else from the fit.
        if sim_task is not None:
            return len(sim_task.output["sim_folders"])
        if lcfit_task is not None:
            return len(lcfit_task.output["fitres_dirs"])
        raise ValueError("Classifier dependency has no sim_task or lcfit_task?")

    def _build_tasks(cls, clas_name, name, config, mode, options, sim_task, lcfit_task, deps, extra=None, **cls_kwargs):
        # Create one classifier task per version of the given sim / LC fit pair.
        sim_name = sim_task.name if sim_task is not None else None
        fit_name = lcfit_task.name if lcfit_task is not None else None
        indexes = get_num_ranseed(sim_task, lcfit_task)
        created = []
        for i in range(indexes):
            # Only number the output directories when there are several versions.
            num = i + 1 if indexes > 1 else None
            clas_output_dir = _get_clas_output_dir(
                base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra
            )
            cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i, **cls_kwargs)
            Task.logger.info(
                f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
            )
            created.append(cc)
        return created

    tasks = []
    lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
    sim_tasks = Task.get_task_of_type(prior_tasks, DataPrep, SNANASimulation)

    for clas_name in c.get("CLASSIFICATION", []):
        config = c["CLASSIFICATION"][clas_name]
        name = config["CLASSIFIER"]
        cls = ClassifierFactory.get(name)
        options = config.get("OPTS", {})
        if "MODE" not in config:
            Task.fail_config(f"Classifier task {clas_name} needs to specify MODE as train or predict")
        mode = config["MODE"].lower()
        assert mode in ["train", "predict"], "MODE should be either train or predict"
        if mode == "train":
            mode = Classifier.TRAIN
        else:
            mode = Classifier.PREDICT

        # Validate that train is not used on certain classifiers
        if mode == Classifier.TRAIN:
            assert name not in [
                "PerfectClassifier", "UnityClassifier", "FitProbClassifier"
            ], f"Can not use train mode with {name}"

        needs_sim, needs_lc = cls.get_requirements(options)

        # Anything needing LC fits runs once per fit (paired with the fit's
        # first dependency); sim-only classifiers run once per sim.
        runs = []
        if needs_lc:
            runs = [(l.dependencies[0], l) for l in lcfit_tasks]
        elif needs_sim:
            runs = [(s, None) for s in sim_tasks]
        else:
            Task.logger.warning(f"Classifier {name} does not need sims or fits. Wat.")

        num_gen = 0
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_fit = config.get("MASK_FIT", "")

        for s, l in runs:
            sim_name = s.name if s is not None else None
            fit_name = l.name if l is not None else None

            # MASK keeps its existing behaviour of filtering on the simulation name.
            matched_sim = True
            matched_fit = True
            if mask:
                matched_sim = matched_sim and mask in sim_name
            if mask_sim:
                matched_sim = matched_sim and mask_sim in sim_name
            # MASK_FIT filters on the LC fit name; it cannot apply to runs
            # that have no LC fit.
            if mask_fit and fit_name is not None:
                matched_fit = matched_fit and mask_fit in fit_name
            if not matched_fit or not matched_sim:
                continue

            deps = []
            if s is not None:
                deps.append(s)
            if l is not None:
                deps.append(l)

            model = options.get("MODEL")

            # Validate to make sure training samples only have one sim.
            if mode == Classifier.TRAIN:
                if s is not None:
                    folders = s.output["sim_folders"]
                    assert (
                        len(folders) == 1
                    ), f"Training requires one version of the sim, you have {len(folders)} for sim task {s}. Make sure your training sim doesn't set RANSEED_CHANGE"
                if l is not None:
                    folders = l.output["fitres_dirs"]
                    assert (
                        len(folders) == 1
                    ), f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {l}. Make sure your training sim doesn't set RANSEED_CHANGE"

            common = (cls, clas_name, name, config, mode, options, s, l)
            new_tasks = []
            if model is None:
                new_tasks = _build_tasks(*common, deps)
            elif "/" in model or "." in model:
                # MODEL looks like a file location.
                potential_path = get_output_loc(model)
                if os.path.exists(potential_path):
                    extra = os.path.basename(os.path.dirname(potential_path))
                    new_tasks = _build_tasks(*common, deps, extra=extra, model_name=extra)
                else:
                    Task.fail_config(
                        f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}"
                    )
            else:
                # MODEL names a previously created classifier task. Iterate
                # over a snapshot so the tasks created here are not
                # themselves re-examined by this loop.
                for t in list(tasks):
                    if model == t.name:
                        extra = t.get_unique_name()
                        assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!"
                        new_tasks.extend(_build_tasks(*common, deps + [t], extra=extra))

            num_gen += len(new_tasks)
            tasks.extend(new_tasks)

        if num_gen == 0:
            Task.fail_config(
                f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits"
            )
    return tasks
def __init__(self, name, output_dir, config, options, global_config, dependencies=None): super().__init__(name, output_dir, config=config, dependencies=dependencies) self.options = options self.global_config = get_config() self.logfile = os.path.join(self.output_dir, "output.log") self.conda_env = self.global_config["DataSkimmer"]["conda_env"] self.path_to_task = output_dir self.unparsed_raw = self.options.get("RAW_DIR") self.raw_dir = get_data_loc(self.unparsed_raw) if self.raw_dir is None: Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}") self.genversion = os.path.basename(self.raw_dir) self.data_path = os.path.dirname(self.raw_dir) if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir: self.logger.debug("Removing PRIVATE_DATA_PATH from NML file") self.data_path = "" self.job_name = os.path.basename( Path(output_dir).parents[1]) + "_DATAPREP_" + self.name self.output_info = os.path.join(self.output_dir, f"{self.genversion}.YAML") self.output["genversion"] = self.genversion self.opt_setpkmjd = options.get("OPT_SETPKMJD", 16) self.photflag_mskrej = options.get("PHOTFLAG_MSKREJ", 1016) self.output["data_path"] = self.data_path self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)] self.output["sim_folders"] = [get_output_loc(self.raw_dir)] self.output["raw_dir"] = self.raw_dir self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT") self.output["clump_file"] = self.clump_file self.output["ranseed_change"] = False is_sim = options.get("SIM", False) self.output["is_sim"] = is_sim self.output["blind"] = options.get("BLIND", True) self.types_dict = options.get("TYPES") if self.types_dict is None: self.types_dict = { "IA": [1], "NONIA": [ 2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81 ] } else: for key in self.types_dict.keys(): self.types_dict[key] = [int(c) for c in self.types_dict[key]] self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: 
self.batch_file = get_data_loc(self.batch_file) self.batch_replace = self.options.get("BATCH_REPLACE", {}) self.logger.debug(f"\tIA types are {self.types_dict['IA']}") self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}") self.output["types_dict"] = self.types_dict self.types = OrderedDict() for n in self.types_dict["IA"]: self.types.update({n: "Ia"}) for n in self.types_dict["NONIA"]: self.types.update({n: "II"}) self.output["types"] = self.types self.slurm = """{sbatch_header} {task_setup}""" self.clump_command = """#
def __init__(self, name, output_dir, options, global_config, dependencies=None): super().__init__(name, output_dir, dependencies=dependencies) self.options = options self.global_config = get_config() self.logfile = os.path.join(self.output_dir, "output.log") self.conda_env = self.global_config["DataSkimmer"]["conda_env"] self.path_to_task = output_dir self.unparsed_raw = self.options.get("RAW_DIR") self.raw_dir = get_data_loc(self.unparsed_raw) if self.raw_dir is None: Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}") self.genversion = os.path.basename(self.raw_dir) self.data_path = os.path.dirname(self.raw_dir) if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir: self.logger.debug("Removing PRIVATE_DATA_PATH from NML file") self.data_path = "" self.job_name = os.path.basename( Path(output_dir).parents[1]) + "_DATAPREP_" + self.name self.output["genversion"] = self.genversion self.output["data_path"] = self.data_path self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)] self.output["sim_folders"] = [get_output_loc(self.raw_dir)] self.output["raw_dir"] = self.raw_dir self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT") self.output["clump_file"] = self.clump_file self.output["ranseed_change"] = False is_sim = options.get("SIM", False) self.output["is_sim"] = is_sim self.output["blind"] = options.get("BLIND", not is_sim) self.types_dict = options.get("TYPES") if self.types_dict is None: self.types_dict = { "IA": [1], "NONIA": [ 2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 42, 43, 80, 81 ] } else: for key in self.types_dict.keys(): self.types_dict[key] = [int(c) for c in self.types_dict[key]] self.logger.debug(f"\tIA types are {self.types_dict['IA']}") self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}") self.output["types_dict"] = self.types_dict self.types = OrderedDict() for n in self.types_dict["IA"]: self.types.update({n: "Ia"}) for n in self.types_dict["NONIA"]: 
self.types.update({n: "II"}) self.output["types"] = self.types self.slurm = """#!/bin/bash #SBATCH --job-name={job_name} #SBATCH --time=0:20:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --partition=broadwl #SBATCH --output={log_file} #SBATCH --account=pi-rkessler #SBATCH --mem=2GB cd {path_to_task} snana.exe clump.nml if [ $? -eq 0 ]; then echo SUCCESS > {done_file} else echo FAILURE > {done_file} fi """ self.clump_command = """#
def classify(self, training, force_refresh):
    """Write the SuperNNova slurm script and submit it when needed.

    The script text is produced by formatting ``self.slurm``. It is only
    written and submitted with ``sbatch`` when ``force_refresh`` is set or
    its hash differs from the stored one; in that case the output
    directory is wiped first.

    Args:
        training: True to train (``--train_rnn``), False to predict
            (``--validate_rnn``) with the model given by the ``MODEL`` option.
        force_refresh: Rerun even if the stored hash matches.

    Returns:
        bool: Always True.

    Raises:
        AssertionError: When predicting and ``MODEL`` is missing or the
            resolved model path does not exist.
    """
    use_photometry = self.options.get("USE_PHOTOMETRY", False)
    model = self.options.get("MODEL")

    model_path = ""
    if not training:
        assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
        # MODEL may name a dependency task, in which case its model file is used.
        for dep in self.dependencies:
            if model == dep.name:
                self.logger.debug(
                    f"Found task dependency {dep.name} with model file {dep.output['model_filename']}"
                )
                model = dep.output["model_filename"]
        model_path = get_output_loc(model)
        self.logger.debug(f"Looking for model in {model_path}")
        assert os.path.exists(model_path), f"Cannot find {model_path}"

    types = self.get_types()
    if types is None:
        # Fallback type mapping used when get_types() provides none.
        types = OrderedDict([
            ("1", "Ia"),
            ("0", "unknown"),
            ("2", "SNIax"),
            ("3", "SNIa-pec"),
            ("20", "SNIIP"),
            ("21", "SNIIL"),
            ("22", "SNIIn"),
            ("29", "SNII"),
            ("32", "SNIb"),
            ("33", "SNIc"),
            ("39", "SNIbc"),
            ("41", "SLSN-I"),
            ("42", "SLSN-II"),
            ("43", "SLSN-R"),
            ("80", "AGN"),
            ("81", "galaxy"),
            ("98", "None"),
            ("99", "pending"),
        ])
    str_types = json.dumps(types)

    sim_dep = self.get_simulation_dependency()
    light_curve_dir = sim_dep.output["photometry_dir"]

    fit = self.get_fit_dependency()
    if fit is None:
        fit_dir = ""
    else:
        fit_dir = f"--fits_dir {fit['fitres_dir']}"

    if self.variant in ["vanilla", "variational"]:
        cyclic = "--cyclic"
    else:
        cyclic = ""
    variant = f"--model {self.variant}"

    clump = sim_dep.output.get("clump_file")
    clump_txt = "" if clump is None else f"--photo_window_files {clump}"

    # Values substituted into the placeholders of the self.slurm template.
    format_dict = {
        "job_name": self.job_base_name,
        "conda_env": self.conda_env,
        "path_to_classifier": self.path_to_classifier,
        "sntypes": str_types,
        "dump_dir": self.dump_dir,
        "photometry_dir": light_curve_dir,
        "fit_dir": fit_dir,
        "phot": "--source_data photometry" if use_photometry else "",
        "clump": clump_txt,
        "test_or_train": "" if training else "--data_testing",
        "cyclic": cyclic,
        "done_file": self.done_file,
        "variant": variant,
        "model": "" if training else f"--model_files {model_path}",
        "command": "--train_rnn" if training else "--validate_rnn",
    }

    slurm_output_file = self.output_dir + "/job.slurm"
    self.logger.info(f"Running SuperNNova, slurm job outputting to {slurm_output_file}")
    slurm_text = self.slurm.format(**format_dict)

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(slurm_text)

    if force_refresh or new_hash != old_hash:
        self.logger.info("Rerunning. Cleaning output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        with open(slurm_output_file, "w") as handle:
            handle.write(slurm_text)

        self.logger.info(
            f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova"
        )
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")

    return True
def __init__(self, name, output_dir, config, options, global_config, dependencies=None): super().__init__(name, output_dir, config=config, dependencies=dependencies) self.options = options self.global_config = global_config self.job_name = os.path.basename( Path(output_dir).parents[1]) + "_COSMOMC_" + name self.logfile = os.path.join(self.output_dir, "output.log") self.path_to_cosmomc = get_output_loc( self.global_config["CosmoMC"]["location"]) self.create_cov_dep = self.get_dep(CreateCov) self.blind = self.create_cov_dep.output[ "blind"] if self.create_cov_dep is not None else self.options.get( "BLIND", False) assert isinstance( self.blind, (bool, np.bool_)), "Blind should be set to a boolan value!" self.ini_prefix = options.get("INI").replace(".ini", "") self.static = self.ini_prefix.replace(".ini", "") in ["cmb_omw", "cmb_omol"] self.static_path = "cosmomc_static_chains/" if self.create_cov_dep is None: self.ini_files = [f"{self.ini_prefix}.ini"] self.num_walkers = 4 self.covopts = ["ALL"] self.covopts_numbers = [0] self.labels = [self.name] self.num_jobs = 1 else: self.num_walkers = options.get("NUM_WALKERS", 8) avail_cov_opts = self.create_cov_dep.output["covopts"] self.covopts = options.get("COVOPTS") or list( avail_cov_opts.keys()) self.covopts_numbers = [avail_cov_opts[k] for k in self.covopts] self.ini_files = [ f"{self.ini_prefix}_{num}.ini" for num in self.covopts_numbers ] self.output["hubble_plot"] = self.create_cov_dep.output[ "hubble_plot"] self.output["bcor_name"] = self.create_cov_dep.output["bcor_name"] self.labels = [self.name + "_" + c for c in self.covopts] self.num_jobs = len(self.covopts) self.ntasks = 10 self.logger.debug(f"Num Walkers: {self.num_walkers}") self.chain_dir = os.path.join(self.output_dir, "chains/") self.param_dict = { l: os.path.join(self.chain_dir, i.replace(".ini", ".paramnames")) for l, i in zip(self.covopts, self.ini_files) } self.done_files = [f"done_{num}.txt" for num in self.covopts_numbers] self.chain_dict = { l: 
os.path.join(self.chain_dir, i.replace(".ini", f"_{n + 1}.txt")) for l, i in zip(self.covopts, self.ini_files) for n in range(self.ntasks) } self.base_dict = { l: os.path.join(self.chain_dir, i.replace(".ini", "")) for l, i in zip(self.covopts, self.ini_files) for n in range(self.ntasks) } self.output["chain_dir"] = self.chain_dir self.output["param_dict"] = self.param_dict self.output["chain_dict"] = self.chain_dict self.output["base_dict"] = self.base_dict self.output["covopts"] = self.covopts self.output["blind"] = self.blind self.output["label"] = (self.options.get( "LABEL", f"({' + '.join(self.ini_prefix.upper().split('_')[:-1])})") + " " + (self.create_cov_dep.output["name"] if self.create_cov_dep is not None else "")) # TODO: Better logic here please final = self.ini_prefix.split("_")[-1] ps = { "omw": ["omegam", "w"], "flatomol": ["omegam"], "omol": ["omegam", "omegal"], "wnu": ["w", "nu"], "wwa": ["w", "wa"] } if final not in ps.keys(): self.fail_config( f"The filename passed in ({self.ini_prefix}) needs to have format 'components_cosmology.ini', where the cosmology is omw, omol, wnu or wwa. Is this a custom file?" ) self.output["cosmology_params"] = ps[final] self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) self.batch_replace = self.options.get("BATCH_REPLACE", {}) self.slurm = """{sbatch_header}