def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    tasks = []
    all_deps = Task.match_tasks_of_type(None, prior_tasks, DataPrep, SNANASimulation)
    for fit_name in config.get("LCFIT", []):
        num_matches = 0
        fit_config = config["LCFIT"][fit_name]
        mask = fit_config.get("MASK", "")
        sim_tasks = Task.match_tasks_of_type(mask, prior_tasks, DataPrep, SNANASimulation)
        for sim in sim_tasks:
            num_matches += 1
            fit_output_dir = f"{base_output_dir}/{stage_number}_LCFIT/{fit_name}_{sim.name}"
            f = SNANALightCurveFit(f"{fit_name}_{sim.name}", fit_output_dir, sim, fit_config, global_config)
            Task.logger.info(f"Creating fitting task {fit_name} with {f.num_jobs} jobs, for simulation {sim.name}")
            tasks.append(f)
        if num_matches == 0:
            Task.fail_config(f"LCFIT task {fit_name} with mask '{mask}' matched no sim_names: {[sim.name for sim in all_deps]}")
    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

    def _get_wfit_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_COSMOFIT/WFIT/{name}"

    tasks = []
    key = "WFIT"
    for name in c.get(key, []):
        config = c[key].get(name, {})
        name = f"WFIT_{name}"
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")
        ctasks = [ctask for ctask in create_cov_tasks if mask in ctask.name]
        t = WFit(name, _get_wfit_dir(base_output_dir, stage_number, name), ctasks, config, options, global_config)
        Task.logger.info(f"Creating WFit task {name} with {t.num_jobs} jobs")
        tasks.append(t)
        if len(create_cov_tasks) == 0:
            Task.fail_config(f"WFit task {name} has no create_cov task to run on!")
    return tasks
def classify(self):
    new_hash = self.get_hash_from_string(self.name + f"{self.prob_ia}_{self.prob_cc}")
    if self._check_regenerate(new_hash):
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        try:
            name = self.get_prob_column_name()
            cid = "CID"
            s = self.get_simulation_dependency()
            df = None
            phot_dir = s.output["photometry_dirs"][self.index]
            headers = [os.path.join(phot_dir, a) for a in os.listdir(phot_dir) if "HEAD" in a]
            if not headers:
                Task.fail_config(f"No HEAD fits files found in {phot_dir}!")
            else:
                types = self.get_simulation_dependency().output["types_dict"]
                self.logger.debug(f"Input types are {types}")
                for h in headers:
                    with fits.open(h) as hdul:
                        data = hdul[1].data
                        snid = np.array(data.field("SNID"))
                        sntype = np.array(data.field("SNTYPE")).astype(np.int64)
                        is_ia = np.isin(sntype, types["IA"])
                        prob = (is_ia * self.prob_ia) + (~is_ia * self.prob_cc)
                        dataframe = pd.DataFrame({cid: snid, name: prob})
                        dataframe[cid] = dataframe[cid].apply(str)
                        dataframe[cid] = dataframe[cid].str.strip()
                        if df is None:
                            df = dataframe
                        else:
                            df = pd.concat([df, dataframe])
                df.drop_duplicates(subset=cid, inplace=True)
            self.logger.info(f"Saving probabilities to {self.output_file}")
            df.to_csv(self.output_file, index=False, float_format="%0.4f")
            chown_dir(self.output_dir)
            with open(self.done_file, "w") as f:
                f.write("SUCCESS")
            self.save_new_hash(new_hash)
        except Exception as e:
            self.logger.exception(e, exc_info=True)
            self.passed = False
            with open(self.done_file, "w") as f:
                f.write("FAILED")
            return False
    else:
        self.should_be_done()
    self.passed = True
    return True
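
# A minimal, self-contained sketch of the probability assignment used in
# classify() above: events whose SNTYPE appears in the IA list receive
# prob_ia, everything else receives prob_cc. All values below are toy
# inputs for illustration, not real pippin outputs.
import numpy as np
import pandas as pd

types = {"IA": [1, 101], "NONIA": [20, 120]}
snid = np.array(["1001", "1002", "1003"])
sntype = np.array([1, 20, 101])
prob_ia, prob_cc = 0.99, 0.01  # assumed toy probabilities

is_ia = np.isin(sntype, types["IA"])
prob = (is_ia * prob_ia) + (~is_ia * prob_cc)
print(pd.DataFrame({"CID": snid, "PROB": prob}))  # rows get 0.99, 0.01, 0.99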
def get_tasks(task_config, prior_tasks, output_dir, stage_num, prefix, global_config):
    from pippin.cosmofitters.factory import FitterFactory

    Task.logger.debug("Setting up CosmoFit tasks")
    tasks = []
    for fitter_name in task_config.get("COSMOFIT", []):
        Task.logger.info(f"Found fitter of type {fitter_name}, generating tasks.")
        config = {fitter_name: task_config["COSMOFIT"][fitter_name]}
        Task.logger.debug(f"Config for {fitter_name}: {config}")
        fitter = FitterFactory.get(fitter_name.lower())
        Task.logger.debug(f"Fitter class for {fitter_name}: {fitter}")
        if fitter is None:
            Task.logger.error(f"Fitter of type {fitter_name} not found, perhaps it's a typo? Skipping.")
            continue
        Task.logger.debug(f"get_tasks function for {fitter_name}: {fitter.get_tasks}")
        ts = fitter.get_tasks(config, prior_tasks, output_dir, stage_num, prefix, global_config)
        Task.logger.debug(f"{fitter} tasks: {ts}")
        tasks += ts
    if len(tasks) == 0:
        Task.fail_config("No CosmoFit tasks generated!")
    Task.logger.info(f"Generated {len(tasks)} CosmoFit tasks.")
    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    agg_tasks = Task.get_task_of_type(prior_tasks, Aggregator)
    lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
    tasks = []

    def _get_merge_output_dir(base_output_dir, stage_number, merge_name, lcfit_name):
        return f"{base_output_dir}/{stage_number}_MERGE/{merge_name}_{lcfit_name}"

    for name in c.get("MERGE", []):
        num_gen = 0
        config = c["MERGE"].get(name, {})
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_lc = config.get("MASK_FIT", "")
        mask_agg = config.get("MASK_AGG", "")

        for lcfit in lcfit_tasks:
            if mask and mask not in lcfit.name:
                continue
            if mask_lc and mask_lc not in lcfit.name:
                continue
            sim = lcfit.get_dep(SNANASimulation, DataPrep)
            if mask and mask not in sim.name:
                continue
            if mask_sim and mask_sim not in sim.name:
                continue
            for agg in agg_tasks:
                if mask_agg and mask_agg not in agg.name:
                    continue
                if mask and mask not in agg.name:
                    continue
                # Check if the sim is the same for both
                if sim != agg.get_underlying_sim_task():
                    continue
                num_gen += 1
                merge_name2 = f"{name}_{lcfit.name}"
                task = Merger(merge_name2, _get_merge_output_dir(base_output_dir, stage_number, name, lcfit.name), config, [lcfit, agg], options)
                Task.logger.info(f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs")
                tasks.append(task)
        if num_gen == 0:
            Task.fail_config(f"Merger {name} with mask {mask} matched no combination of aggregators and fits")
    return tasks
def __init__(self, name, output_dir, dependencies, options, recal_aggtask):
    super().__init__(name, output_dir, dependencies=dependencies)
    self.passed = False
    self.classifiers = [d for d in dependencies if isinstance(d, Classifier)]
    self.lcfit_deps = [c.get_fit_dependency(output=False) for c in self.classifiers]
    self.lcfit_names = list(set([l.output["name"] for l in self.lcfit_deps if l is not None]))
    self.output["lcfit_names"] = self.lcfit_names
    if not self.lcfit_names:
        self.logger.debug("No jobs depend on the LCFIT, so adding a dummy one")
        self.lcfit_names = [""]

    self.sim_task = self.get_underlying_sim_task()
    self.output["sim_name"] = self.sim_task.name
    self.recal_aggtask = recal_aggtask
    self.num_versions = len(self.sim_task.output["sim_folders"])

    self.output_dfs = [os.path.join(self.output_dir, f"merged_{i}.csv") for i in range(self.num_versions)]
    self.output_dfs_key = [[os.path.join(self.output_dir, f"merged_{l}_{i}.key") for l in self.lcfit_names] for i in range(self.num_versions)]
    self.output_cals = [os.path.join(self.output_dir, f"calibration_{i}.csv") for i in range(self.num_versions)]

    self.id = "CID"
    self.type_name = "SNTYPE"
    self.options = options
    self.include_type = bool(options.get("INCLUDE_TYPE", False))
    self.plot = options.get("PLOT", True)
    self.plot_all = options.get("PLOT_ALL", False)
    self.output["classifiers"] = self.classifiers
    self.output["calibration_files"] = self.output_cals

    if isinstance(self.plot, bool):
        self.python_file = os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py"
    else:
        self.python_file = self.plot
    self.python_file = get_output_loc(self.python_file)

    if not os.path.exists(self.python_file):
        Task.fail_config(f"Attempting to find python file {self.python_file} but it's not there!")
def get_tasks(configs, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    def _get_analyse_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_ANALYSE/{name}"

    tasks = []
    key = "ANALYSE"
    for cname in configs.get(key, []):
        config = configs[key].get(cname, {})
        if config is None:
            config = {}
        options = config.get("OPTS", {})

        mask_cosmofit = config.get("MASK_COSMOFIT")
        mask_biascor = config.get("MASK_BIASCOR")
        if config.get("HISTOGRAM") is not None:
            Task.fail_config("Sorry to do this, but please change HISTOGRAM into MASK_LCFIT to bring it into line with others.")
        mask_lcfit = config.get("MASK_LCFIT")
        # TODO: Add aggregation to compile all the plots here

        deps_cosmofit = Task.match_tasks_of_type(mask_cosmofit, prior_tasks, CosmoFit, match_none=False, allowed_failure=True)
        Task.logger.debug(f"deps_cosmofit: {deps_cosmofit}")
        deps_biascor = Task.match_tasks_of_type(mask_biascor, prior_tasks, BiasCor, match_none=False)
        Task.logger.debug(f"deps_biascor: {deps_biascor}")
        deps_lcfit = Task.match_tasks_of_type(mask_lcfit, prior_tasks, SNANALightCurveFit, match_none=False)
        Task.logger.debug(f"deps_lcfit: {deps_lcfit}")

        deps = deps_cosmofit + deps_biascor + deps_lcfit
        if len(deps) == 0:
            Task.fail_config(f"Analyse task {cname} has no dependencies!")

        a = AnalyseChains(cname, _get_analyse_dir(base_output_dir, stage_number, cname), config, options, deps)
        Task.logger.info(f"Creating Analyse task {cname} for {[c.name for c in deps]} with {a.num_jobs} jobs")
        tasks.append(a)
    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

    def _get_cosmomc_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_COSMOFIT/COSMOMC/{name}"

    tasks = []
    key = "COSMOMC"
    for cname in c.get(key, []):
        config = c[key].get(cname, {})
        options = config.get("OPTS", {})
        mask = config.get("MASK_CREATE_COV", config.get("MASK", ""))

        # Check if this is static. Could scan the folder, but we don't have all the chains yet.
        # TODO: Update this when I have all the chains
        if options.get("INI") in ["cmb_omw", "cmb_omol"]:
            a = CosmoMC(cname, _get_cosmomc_dir(base_output_dir, stage_number, cname), config, options, global_config)
            Task.logger.info(f"Creating CosmoMC task {cname} with {a.num_jobs} jobs")
            tasks.append(a)
        else:
            for ctask in create_cov_tasks:
                if mask not in ctask.name:
                    continue
                name = f"COSMOMC_{cname}_{ctask.name}"
                a = CosmoMC(name, _get_cosmomc_dir(base_output_dir, stage_number, name), config, options, global_config, dependencies=[ctask])
                Task.logger.info(f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs")
                tasks.append(a)
            if len(create_cov_tasks) == 0:
                Task.fail_config(f"CosmoMC task {cname} has no create_cov task to run on!")
    return tasks
def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    tasks = []
    for name in config.get("DATAPREP", []):
        output_dir = f"{base_output_dir}/{stage_number}_DATAPREP/{name}"
        options = config["DATAPREP"][name].get("OPTS")
        if options is None:
            Task.fail_config(f"DATAPREP task {name} needs to specify OPTS!")
        s = DataPrep(name, output_dir, options, global_config)
        Task.logger.debug(f"Creating data prep task {name} with {s.num_jobs} jobs, output to {output_dir}")
        tasks.append(s)
    return tasks
def validate_model(self):
    if self.mode == Classifier.PREDICT:
        model = self.options.get("MODEL")
        if model is None:
            Task.fail_config(f"Classifier {self.name} is in predict mode but does not have a model specified")
        model_classifier = self.get_model_classifier()
        if model_classifier is not None and model_classifier.name == model:
            return True
        path = get_data_loc(model)
        if not os.path.exists(path):
            Task.fail_config(f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path")
    return True
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    biascor_tasks = Task.get_task_of_type(prior_tasks, BiasCor)

    def _get_createcov_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_CREATE_COV/{name}"

    tasks = []
    for cname in c.get("CREATE_COV", []):
        config = c["CREATE_COV"][cname]
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", config.get("MASK_BIASCOR", ""))

        for btask in biascor_tasks:
            if mask not in btask.name:
                continue
            num = len(btask.output["subdirs"])
            for i in range(num):
                ii = "" if num == 1 else f"_{i + 1}"
                name = f"{cname}_{btask.name}{ii}"
                a = CreateCov(name, _get_createcov_dir(base_output_dir, stage_number, name), config, options, global_config, dependencies=[btask], index=i)
                Task.logger.info(f"Creating createcov task {name} for {btask.name} with {a.num_jobs} jobs")
                tasks.append(a)

        if len(biascor_tasks) == 0:
            Task.fail_config(f"Create cov task {cname} has no biascor task to run on!")
    return tasks
def validate_classifiers(classifier_names):
    prob_col = []
    for name in classifier_names:
        col = prob_cols.get(name)
        if col is None:
            # Check whether it is instead the prob_col name
            if name in prob_cols.values():
                prob_col.append(name)
            else:
                Task.fail_config(f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!")
        else:
            prob_col.append(col)
    if len(set(prob_col)) > 1:
        Task.fail_config(f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage.")
    else:
        Task.logger.debug(f"Classifiers {classifier_names} map to {prob_col[0]}")
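
# Toy demonstration of the consistency check in validate_classifiers above:
# every named classifier must resolve to a single probability column. The
# prob_cols mapping here is illustrative, not a real pipeline output.
prob_cols = {"SNN_TEST": "PROB_SNNTEST", "SNN_TRAIN": "PROB_SNNTEST", "FITPROB": "PROB_FITPROB"}

cols = {prob_cols[n] for n in ["SNN_TEST", "SNN_TRAIN"]}
assert len(cols) == 1  # both map to PROB_SNNTEST, so the check passes

cols = {prob_cols[n] for n in ["SNN_TEST", "FITPROB"]}
assert len(cols) > 1   # two different columns; validate_classifiers would fail_config here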
def resolve_classifiers(names):
    task = [c for c in classifier_tasks if c.name in names]
    if len(task) == 0:
        if len(names) > 1:
            Task.fail_config(f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!")
        Task.logger.info(f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead.")
        task = [c for c in classifier_tasks if prob_cols[c.name] in names]
        if len(task) == 0:
            choices = [prob_cols[c.name] for c in classifier_tasks]
            message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
            Task.fail_config(message)
        else:
            task = [task[0]]
    elif len(task) > 1:
        choices = list(set([prob_cols[c.name] for c in task]))
        if len(choices) == 1:
            task = [task[0]]
        else:
            Task.fail_config(f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}")
    return task[0]  # We only care about the prob column name
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    sim_tasks = Task.get_task_of_type(prior_tasks, SNANASimulation, DataPrep)
    classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)

    def _get_aggregator_dir(base_output_dir, stage_number, agg_name):
        return f"{base_output_dir}/{stage_number}_AGG/{agg_name}"

    tasks = []
    # Check for recalibration, and if so, find that task first
    for agg_name in c.get("AGGREGATION", []):
        config = c["AGGREGATION"][agg_name]
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_clas = config.get("MASK_CLAS", "")
        recalibration = config.get("RECALIBRATION")
        recal_simtask = None
        recal_aggtask = None
        if recalibration:
            recal_sim = [i for i, s in enumerate(sim_tasks) if s.name == recalibration]
            if len(recal_sim) == 0:
                Task.fail_config(f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}")
            elif len(recal_sim) > 1:
                Task.fail_config(f"Recalibration sim {recalibration} matched multiple sims: {[s.name for s in sim_tasks]}")
            # Move the recal sim task to the front of the queue so it executes first
            recal_sim_index = recal_sim[0]
            recal_simtask = sim_tasks[recal_sim_index]
            sim_tasks.insert(0, sim_tasks.pop(recal_sim_index))

        for sim_task in sim_tasks:
            if (mask_sim not in sim_task.name or mask not in sim_task.name) and recal_simtask != sim_task:
                continue
            agg_name2 = f"{agg_name}_{sim_task.name}"
            deps = [
                c
                for c in classifier_tasks
                if mask in c.name and mask_clas in c.name and c.mode == Classifier.PREDICT and c.get_simulation_dependency() == sim_task
            ]
            if len(deps) == 0:
                deps = [sim_task]
            if recalibration and sim_task != recal_simtask:
                if recal_aggtask is None:
                    Task.fail_config(f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order.")
                else:
                    deps.append(recal_aggtask)
            a = Aggregator(agg_name2, _get_aggregator_dir(base_output_dir, stage_number, agg_name2), config, deps, options, recal_aggtask)
            if sim_task == recal_simtask:
                recal_aggtask = a
            Task.logger.info(f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs")
            tasks.append(a)
    return tasks
def __init__(self, name, output_dir, create_cov_tasks, config, options, global_config):
    # First check if all required options exist
    # In this case, WFITOPTS must exist with at least 1 entry
    self.wfitopts = options.get("WFITOPTS")
    if self.wfitopts is None:
        Task.fail_config(f"You have not specified any WFITOPTS for task {name}")
    Task.logger.debug(f"WFITOPTS for task {name}: {self.wfitopts}")
    if len(self.wfitopts) == 0:
        Task.fail_config(f"WFITOPTS for task {name} does not have any options!")

    base_file = get_data_loc("wfit/input_file.INPUT")
    super().__init__(name, output_dir, config, base_file, default_assignment=": ", dependencies=create_cov_tasks)

    self.num_jobs = len(self.wfitopts)
    self.create_cov_tasks = create_cov_tasks
    self.logger.debug(f"CreateCov tasks: {self.create_cov_tasks}")
    self.create_cov_dirs = [os.path.join(t.output_dir, "output") for t in self.create_cov_tasks]
    self.logger.debug(f"CreateCov directories: {self.create_cov_dirs}")
    self.options = options
    self.global_config = global_config

    self.done_file = os.path.join(self.output_dir, "output", "ALL.DONE")
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_WFIT_" + name
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.input_name = f"{self.job_name}.INPUT"
    self.input_file = os.path.join(self.output_dir, self.input_name)
def resolve_merged_fitres_files(name, classifier_name):
    task = [m for m in merge_tasks if m.output["lcfit_name"] == name]
    if len(task) == 0:
        valid = [m.output["lcfit_name"] for m in merge_tasks]
        message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
        Task.fail_config(message)
    elif len(task) > 1:
        message = f"Resolved multiple merge tasks {task} for name {name}"
        Task.fail_config(message)
    else:
        if classifier_name is not None and classifier_name not in task[0].output["classifier_names"]:
            if prob_cols[classifier_name] not in [prob_cols[n] for n in task[0].output["classifier_names"]]:
                Task.logger.warning(
                    f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                    f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                )
    return task[0]
def __init__(self, name, output_dir, config, options, global_config, dependencies=None):
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.options = options
    self.global_config = get_config()
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
    self.path_to_task = output_dir

    self.unparsed_raw = self.options.get("RAW_DIR")
    self.raw_dir = get_data_loc(self.unparsed_raw)
    if self.raw_dir is None:
        Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")
    self.genversion = os.path.basename(self.raw_dir)
    self.data_path = os.path.dirname(self.raw_dir)
    if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
        self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
        self.data_path = ""
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name

    self.output_info = os.path.join(self.output_dir, f"{self.genversion}.YAML")
    self.output["genversion"] = self.genversion
    self.opt_setpkmjd = options.get("OPT_SETPKMJD", 16)
    self.photflag_mskrej = options.get("PHOTFLAG_MSKREJ", 1016)
    self.output["data_path"] = self.data_path
    self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
    self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
    self.output["raw_dir"] = self.raw_dir
    self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT")
    self.output["clump_file"] = self.clump_file
    self.output["ranseed_change"] = False
    is_sim = options.get("SIM", False)
    self.output["is_sim"] = is_sim
    self.output["blind"] = options.get("BLIND", True)

    self.types_dict = options.get("TYPES")
    if self.types_dict is None:
        self.types_dict = {"IA": [1], "NONIA": [2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81]}
    else:
        for key in self.types_dict.keys():
            self.types_dict[key] = [int(c) for c in self.types_dict[key]]

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
    self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
    self.output["types_dict"] = self.types_dict
    self.types = OrderedDict()
    for n in self.types_dict["IA"]:
        self.types.update({n: "Ia"})
    for n in self.types_dict["NONIA"]:
        self.types.update({n: "II"})
    self.output["types"] = self.types

    self.slurm = """{sbatch_header}
{task_setup}"""
    self.clump_command = """#
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    merge_tasks = Task.get_task_of_type(prior_tasks, Merger)
    prob_cols = {k: v for d in [t.output["classifier_merge"] for t in merge_tasks] for k, v in d.items()}
    classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)
    tasks = []

    def _get_biascor_output_dir(base_output_dir, stage_number, biascor_name):
        return f"{base_output_dir}/{stage_number}_BIASCOR/{biascor_name}"

    for name in c.get("BIASCOR", []):
        gname = name
        config = c["BIASCOR"][name]
        options = config.get("OPTS", {})
        deps = []

        # Create dict but swap out the names for tasks
        # do this for key 0 and for muopts
        # modify config directly
        # create copy to start with to keep labels if needed
        config_copy = copy.deepcopy(config)

        # Should return a single classifier task which maps to the desired prob column
        def resolve_classifiers(names):
            task = [c for c in classifier_tasks if c.name in names]
            if len(task) == 0:
                if len(names) > 1:
                    Task.fail_config(f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!")
                Task.logger.info(f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead.")
                task = [c for c in classifier_tasks if prob_cols[c.name] in names]
                if len(task) == 0:
                    choices = [prob_cols[c.name] for c in classifier_tasks]
                    message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
                    Task.fail_config(message)
                else:
                    task = [task[0]]
            elif len(task) > 1:
                choices = list(set([prob_cols[c.name] for c in task]))
                if len(choices) == 1:
                    task = [task[0]]
                else:
                    Task.fail_config(f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}")
            return task[0]  # We only care about the prob column name

        def resolve_merged_fitres_files(name, classifier_name):
            task = [m for m in merge_tasks if m.output["lcfit_name"] == name]
            if len(task) == 0:
                valid = [m.output["lcfit_name"] for m in merge_tasks]
                message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
                Task.fail_config(message)
            elif len(task) > 1:
                message = f"Resolved multiple merge tasks {task} for name {name}"
                Task.fail_config(message)
            else:
                if classifier_name is not None and classifier_name not in task[0].output["classifier_names"]:
                    if prob_cols[classifier_name] not in [prob_cols[n] for n in task[0].output["classifier_names"]]:
                        Task.logger.warning(
                            f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                            f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                        )
            return task[0]

        # Ensure classifiers point to the same prob column
        def validate_classifiers(classifier_names):
            prob_col = []
            for name in classifier_names:
                col = prob_cols.get(name)
                if col is None:
                    # Check whether it is instead the prob_col name
                    if name in prob_cols.values():
                        prob_col.append(name)
                    else:
                        Task.fail_config(f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!")
                else:
                    prob_col.append(col)
            if len(set(prob_col)) > 1:
                Task.fail_config(f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage.")
            else:
                Task.logger.debug(f"Classifiers {classifier_names} map to {prob_col[0]}")

        def resolve_conf(subdict, default=None):
            """Resolve the sub-dictionary and keep track of all the dependencies"""
            deps = []

            # If this is a muopt, allow access to the base config's resolution
            is_base = default is None
            if default is None:
                default = {}

            # Get the specific classifier
            classifier_names = subdict.get("CLASSIFIER")  # Specific classifier name
            if classifier_names is not None:
                classifier_names = ensure_list(classifier_names)
                validate_classifiers(classifier_names)

            # Only if all classifiers point to the same prob_column should you continue
            classifier_task = None
            if classifier_names is not None:
                classifier_task = resolve_classifiers(classifier_names)
            classifier_dep = classifier_task or default.get("CLASSIFIER")  # For resolving merge tasks
            if classifier_dep is not None:
                classifier_dep = classifier_dep.name
            if "CLASSIFIER" in subdict:
                subdict["CLASSIFIER"] = classifier_task
                if classifier_task is not None:
                    deps.append(classifier_task)

            # Get the Ia sims
            simfile_ia = subdict.get("SIMFILE_BIASCOR")
            if is_base and simfile_ia is None:
                Task.fail_config("You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output")
            if simfile_ia is not None:
                simfile_ia = ensure_list(simfile_ia)
                simfile_ia_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_ia]
                deps += simfile_ia_tasks
                subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

            # Resolve the cc sims
            simfile_cc = subdict.get("SIMFILE_CCPRIOR")
            if is_base and simfile_cc is None:
                message = "No SIMFILE_CCPRIOR specified. Hope you're doing a Ia only analysis"
                Task.logger.warning(message)
            if simfile_cc is not None:
                simfile_cc = ensure_list(simfile_cc)
                simfile_cc_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_cc]
                deps += simfile_cc_tasks
                subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

            return deps  # Changes to dict are by ref, will modify original

        deps += resolve_conf(config)

        # Resolve the data section
        data_names = config.get("DATA")
        if data_names is None:
            Task.fail_config("For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task")
        data_names = ensure_list(data_names)
        class_task = config.get("CLASSIFIER")
        class_name = class_task.name if class_task is not None else None
        data_tasks = [resolve_merged_fitres_files(s, class_name) for s in data_names]
        deps += data_tasks
        config["DATA"] = data_tasks

        config["PROB_COLS"] = prob_cols

        # Resolve every MUOPT
        muopts = config.get("MUOPTS", {})
        for label, mu_conf in muopts.items():
            deps += resolve_conf(mu_conf, default=config)

        task = BiasCor(name, _get_biascor_output_dir(base_output_dir, stage_number, name), config, deps, options, global_config)
        Task.logger.info(f"Creating biascor task {name} with {task.num_jobs} jobs")
        tasks.append(task)

    return tasks
def __init__(self, name, output_dir, sim_task, config, global_config):
    self.config = config
    self.global_config = global_config

    base = config.get("BASE")
    if base is None:
        Task.fail_config(f"You have not specified a BASE nml file for task {name}")
    self.base_file = get_data_loc(base)
    if self.base_file is None:
        Task.fail_config(f"Base file {base} cannot be found for task {name}")

    super().__init__(name, output_dir, self.base_file, " = ", dependencies=[sim_task])

    self.sim_task = sim_task
    self.sim_version = sim_task.output["genversion"]
    self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
    self.lc_output_dir = os.path.join(self.output_dir, "output")
    self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
    self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]]

    self.logging_file = self.config_path.replace(".nml", ".nml_log")
    self.done_file = f"{self.output_dir}/FINISHED.DONE"
    secondary_log = os.path.join(self.lc_log_dir, "MERGELOGS/MERGE2.LOG")
    self.log_files = [self.logging_file, secondary_log]
    self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
    self.display_threshold = 8
    self.output["fitres_dirs"] = self.fitres_dirs
    self.output["nml_file"] = self.config_path
    self.output["genversion"] = self.sim_version
    self.output["sim_name"] = sim_task.output["name"]
    self.output["blind"] = sim_task.output["blind"]
    self.output["lc_output_dir"] = self.lc_output_dir
    self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

    is_data = False
    for d in self.dependencies:
        if isinstance(d, DataPrep):
            is_data = not d.output["is_sim"]
    self.output["is_data"] = is_data

    # Loading fitopts
    fitopts = config.get("FITOPTS", [])
    if isinstance(fitopts, str):
        fitopts = [fitopts]
    self.logger.debug("Loading fitopts")
    self.fitopts = []
    for f in fitopts:
        potential_path = get_data_loc(f)
        if os.path.exists(potential_path):
            self.logger.debug(f"Loading in fitopts from {potential_path}")
            with open(potential_path) as fp:
                new_fitopts = list(fp.read().splitlines())
            self.fitopts += new_fitopts
            self.logger.debug(f"Loaded {len(new_fitopts)} fitopts from {potential_path}")
        else:
            assert "[" in f and "]" in f, f"Manual fitopt {f} for lcfit {self.name} should specify a label in square brackets"
            if not f.startswith("FITOPT:"):
                f = "FITOPT: " + f
            self.logger.debug(f"Adding manual fitopt {f}")
            self.fitopts.append(f)

    # Map the fitopt outputs
    mapped = {"DEFAULT": "FITOPT000.FITRES"}
    mapped2 = {0: "DEFAULT"}
    for i, line in enumerate(self.fitopts):
        label = line.split("[")[1].split("]")[0]
        mapped[line] = f"FITOPT{i + 1:03d}.FITRES"
        mapped2[i + 1] = label
    self.output["fitopt_map"] = mapped
    self.output["fitopt_index"] = mapped2
    self.output["fitres_file"] = os.path.join(self.fitres_dirs[0], mapped["DEFAULT"])

    self.options = self.config.get("OPTS", {})
    # Try to determine how many jobs will be put in the queue
    try:
        batch_info = self.options.get("BATCH_INFO") or self.get_property("BATCH_INFO", assignment=": ")
        self.num_jobs = int(batch_info.split()[-1])
    except Exception:
        self.num_jobs = 10
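
# Sketch of the FITOPT bookkeeping in the __init__ above, using a toy
# fitopts list. Each labelled FITOPT line maps to the zero-padded FITRES
# file name that SNANA writes for it; FITOPT000 is reserved for the
# default fit. The variation strings below are illustrative only.
fitopts = ["FITOPT: [ZSHIFT] REDSHIFT_FINAL_SHIFT 0.001", "FITOPT: [MWEBV] RV_MWCOLORLAW 2.1"]

mapped = {"DEFAULT": "FITOPT000.FITRES"}
labels = {0: "DEFAULT"}
for i, line in enumerate(fitopts):
    label = line.split("[")[1].split("]")[0]   # text between the square brackets
    mapped[line] = f"FITOPT{i + 1:03d}.FITRES"
    labels[i + 1] = label
print(mapped)  # 'FITOPT001.FITRES', 'FITOPT002.FITRES' for the two entries
print(labels)  # {0: 'DEFAULT', 1: 'ZSHIFT', 2: 'MWEBV'}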
def resolve_conf(subdict, default=None):
    """Resolve the sub-dictionary and keep track of all the dependencies"""
    deps = []

    # If this is a muopt, allow access to the base config's resolution
    is_base = default is None
    if default is None:
        default = {}

    # Get the specific classifier
    classifier_names = subdict.get("CLASSIFIER")  # Specific classifier name
    if classifier_names is not None:
        classifier_names = ensure_list(classifier_names)
        validate_classifiers(classifier_names)

    # Only if all classifiers point to the same prob_column should you continue
    classifier_task = None
    if classifier_names is not None:
        classifier_task = resolve_classifiers(classifier_names)
    classifier_dep = classifier_task or default.get("CLASSIFIER")  # For resolving merge tasks
    if classifier_dep is not None:
        classifier_dep = classifier_dep.name
    if "CLASSIFIER" in subdict:
        subdict["CLASSIFIER"] = classifier_task
        if classifier_task is not None:
            deps.append(classifier_task)

    # Get the Ia sims
    simfile_ia = subdict.get("SIMFILE_BIASCOR")
    if is_base and simfile_ia is None:
        Task.fail_config("You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output")
    if simfile_ia is not None:
        simfile_ia = ensure_list(simfile_ia)
        simfile_ia_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_ia]
        deps += simfile_ia_tasks
        subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

    # Resolve the cc sims
    simfile_cc = subdict.get("SIMFILE_CCPRIOR")
    if is_base and simfile_cc is None:
        message = "No SIMFILE_CCPRIOR specified. Hope you're doing a Ia only analysis"
        Task.logger.warning(message)
    if simfile_cc is not None:
        simfile_cc = ensure_list(simfile_cc)
        simfile_cc_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_cc]
        deps += simfile_cc_tasks
        subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

    return deps  # Changes to dict are by ref, will modify original
def __init__(self, name, output_dir, config, global_config, combine="combine.input"):
    self.data_dirs = global_config["DATA_DIRS"]
    base_file = get_data_loc(combine)
    super().__init__(name, output_dir, config, base_file, ": ")

    # Check for any replacements
    path_sndata_sim = get_config().get("SNANA").get("sim_dir")
    self.logger.debug(f"Setting PATH_SNDATA_SIM to {path_sndata_sim}")
    self.yaml["CONFIG"]["PATH_SNDATA_SIM"] = path_sndata_sim

    self.genversion = self.config["GENVERSION"]
    if len(self.genversion) < 30:
        self.genprefix = self.genversion
    else:
        hash = get_hash(self.genversion)[:5]
        self.genprefix = self.genversion[:25] + hash

    self.options = self.config.get("OPTS", {})
    self.reserved_keywords = ["BASE"]
    self.reserved_top = ["GENVERSION", "GLOBAL", "OPTS", "EXTERNAL"]
    self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name
    self.global_config = global_config

    self.sim_log_dir = f"{self.output_dir}/LOGS"
    self.total_summary = os.path.join(self.sim_log_dir, "MERGE.LOG")
    self.done_file = f"{self.output_dir}/LOGS/ALL.DONE"
    self.logging_file = self.config_path.replace(".input", ".LOG")
    self.kill_file = self.config_path.replace(".input", "_KILL.LOG")

    if "EXTERNAL" not in self.config.keys():
        # Determine the type of each component
        keys = [k for k in self.config.keys() if k not in self.reserved_top]
        self.base_ia = []
        self.base_cc = []
        types = {}
        types_dict = {"IA": [], "NONIA": []}
        for k in keys:
            d = self.config[k]
            base_file = d.get("BASE")
            if base_file is None:
                Task.fail_config(f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file")
            base_path = get_data_loc(base_file)
            if base_path is None:
                Task.fail_config(f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}")

            gentype, genmodel = None, None
            with open(base_path) as f:
                for line in f.read().splitlines():
                    if line.upper().strip().startswith("GENTYPE:"):
                        gentype = line.upper().split(":")[1].strip()
                    if line.upper().strip().startswith("GENMODEL:"):
                        genmodel = line.upper().split(":")[1].strip()
            gentype = gentype or d.get("GENTYPE")
            if gentype is None:
                Task.fail_config(f"The simulation component {k} needs to specify a GENTYPE in its input file")
            gentype = int(gentype)
            genmodel = genmodel or d.get("GENMODEL")

            if not gentype:
                Task.fail_config(f"Cannot find GENTYPE for component {k} and base file {base_path}")
            if not genmodel:
                Task.fail_config(f"Cannot find GENMODEL for component {k} and base file {base_path}")

            type2 = 100 + gentype
            if "SALT2" in genmodel:
                self.base_ia.append(base_file)
                types[gentype] = "Ia"
                types[type2] = "Ia"
                types_dict["IA"].append(gentype)
                types_dict["IA"].append(type2)
            else:
                self.base_cc.append(base_file)
                types[gentype] = "II"
                types[type2] = "II"
                types_dict["NONIA"].append(gentype)
                types_dict["NONIA"].append(type2)

        sorted_types = dict(sorted(types.items()))
        self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
        self.output["types_dict"] = types_dict
        self.output["types"] = sorted_types

        rankeys = [r for r in self.config["GLOBAL"].keys() if r.startswith("RANSEED_")]
        value = int(self.config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1
        self.set_num_jobs(2 * value)

        self.output["blind"] = self.options.get("BLIND", False)
        self.derived_batch_info = None

        # Make sure we have at least one component to simulate
        if len(self.base_ia + self.base_cc) == 0:
            Task.fail_config("Your sim has no components specified! Please add something to simulate!")

        # Try to determine how many jobs will be put in the queue
        # First see if it's been explicitly set
        num_jobs = self.options.get("NUM_JOBS")
        if num_jobs is not None:
            self.num_jobs = num_jobs
            self.logger.debug(f"Num jobs set by NUM_JOBS option to {self.num_jobs}")
        else:
            try:
                # If BATCH_INFO is set, we'll use that
                batch_info = self.config.get("GLOBAL", {}).get("BATCH_INFO")
                default_batch_info = self.yaml["CONFIG"].get("BATCH_INFO")

                # If it's not set, check for RANSEED_REPEAT or RANSEED_CHANGE
                if batch_info is None:
                    ranseed_repeat = self.config.get("GLOBAL", {}).get("RANSEED_REPEAT")
                    ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
                    default = self.yaml.get("CONFIG", {}).get("RANSEED_REPEAT")
                    ranseed = ranseed_repeat or ranseed_change or default
                    if ranseed:
                        num_jobs = int(ranseed.strip().split()[0])
                        self.logger.debug(f"Found a ranseed with {num_jobs} jobs, deriving batch info")
                        comps = default_batch_info.strip().split()
                        comps[-1] = str(num_jobs)
                        self.derived_batch_info = " ".join(comps)
                        self.num_jobs = num_jobs
                        self.logger.debug(f"Num jobs set by RANSEED to {self.num_jobs}")
                else:
                    self.num_jobs = int(batch_info.split()[-1])
                    self.logger.debug(f"Num jobs set by BATCH_INFO to {self.num_jobs}")
            except Exception:
                self.logger.warning(f"Unable to determine how many jobs simulation {self.name} has")
                self.num_jobs = 1

        self.output["genversion"] = self.genversion
        self.output["genprefix"] = self.genprefix

        self.ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
        base = os.path.expandvars(self.global_config["SNANA"]["sim_dir"])
        self.output["ranseed_change"] = self.ranseed_change is not None
        self.output["ranseed_change_val"] = self.ranseed_change
        self.get_sim_folders(base, self.genversion)
        self.output["sim_folders"] = self.sim_folders
    else:
        self.sim_folders = self.output["sim_folders"]
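
# A small sketch of the num_jobs derivation above, assuming a toy
# RANSEED_REPEAT value and a BATCH_INFO line in the SNANA submit style
# (the template path below is illustrative). The job count is the first
# token of the ranseed entry, and it replaces the last token of the
# batch-submit line to derive the batch info.
ranseed = "10 12345"
default_batch_info = "sbatch $SBATCH_TEMPLATES/SBATCH_sandyb.TEMPLATE 20"

num_jobs = int(ranseed.strip().split()[0])  # -> 10
comps = default_batch_info.strip().split()
comps[-1] = str(num_jobs)
derived_batch_info = " ".join(comps)        # ...TEMPLATE 10
print(num_jobs, derived_batch_info)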
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    from pippin.classifiers.factory import ClassifierFactory

    def _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=None, extra=None):
        sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name
        fit_name = "" if fit_name is None else "_" + fit_name
        extra_name = "" if extra is None else "_" + extra
        index = "" if index is None else f"_{index}"
        return f"{base_output_dir}/{stage_number}_CLAS/{clas_name}{index}{sim_name}{fit_name}{extra_name}"

    def get_num_ranseed(sim_task, lcfit_task):
        if sim_task is not None:
            return len(sim_task.output["sim_folders"])
        if lcfit_task is not None:
            return len(lcfit_task.output["fitres_dirs"])
        raise ValueError("Classifier dependency has no sim_task or lcfit_task?")

    tasks = []
    lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
    sim_tasks = Task.get_task_of_type(prior_tasks, DataPrep, SNANASimulation)
    for clas_name in c.get("CLASSIFICATION", []):
        config = c["CLASSIFICATION"][clas_name]
        name = config["CLASSIFIER"]
        cls = ClassifierFactory.get(name)
        options = config.get("OPTS", {})
        if "MODE" not in config:
            Task.fail_config(f"Classifier task {clas_name} needs to specify MODE as train or predict")
        mode = config["MODE"].lower()
        assert mode in ["train", "predict"], "MODE should be either train or predict"
        if mode == "train":
            mode = Classifier.TRAIN
        else:
            mode = Classifier.PREDICT

        # Validate that train is not used on certain classifiers
        if mode == Classifier.TRAIN:
            assert name not in ["PerfectClassifier", "UnityClassifier", "FitProbClassifier"], f"Can not use train mode with {name}"

        needs_sim, needs_lc = cls.get_requirements(options)
        runs = []
        if needs_sim and needs_lc:
            runs = [(l.dependencies[0], l) for l in lcfit_tasks]
        elif needs_sim:
            runs = [(s, None) for s in sim_tasks]
        elif needs_lc:
            runs = [(l.dependencies[0], l) for l in lcfit_tasks]
        else:
            Task.logger.warning(f"Classifier {name} does not need sims or fits. Wat.")

        num_gen = 0
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_fit = config.get("MASK_FIT", "")
        for s, l in runs:
            sim_name = s.name if s is not None else None
            fit_name = l.name if l is not None else None
            matched_sim = True
            matched_fit = True
            if mask:
                matched_sim = matched_sim and mask in sim_name
            if mask_sim:
                matched_sim = matched_sim and mask_sim in sim_name
            if mask and fit_name is not None:
                matched_fit = matched_fit and mask in fit_name
            if mask_fit and fit_name is not None:
                matched_fit = matched_fit and mask_fit in fit_name
            if not matched_fit or not matched_sim:
                continue
            deps = []
            if s is not None:
                deps.append(s)
            if l is not None:
                deps.append(l)

            model = options.get("MODEL")

            # Validate to make sure training samples only have one sim.
            if mode == Classifier.TRAIN:
                if s is not None:
                    folders = s.output["sim_folders"]
                    assert len(folders) == 1, f"Training requires one version of the sim, you have {len(folders)} for sim task {s}. Make sure your training sim doesn't set RANSEED_CHANGE"
                if l is not None:
                    folders = l.output["fitres_dirs"]
                    assert len(folders) == 1, f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {l}. Make sure your training sim doesn't set RANSEED_CHANGE"

            if model is not None:
                if "/" in model or "." in model:
                    potential_path = get_output_loc(model)
                    if os.path.exists(potential_path):
                        extra = os.path.basename(os.path.dirname(potential_path))
                        # Nasty duplicate code, TODO fix this
                        indexes = get_num_ranseed(s, l)
                        for i in range(indexes):
                            num = i + 1 if indexes > 1 else None
                            clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra)
                            cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i, model_name=extra)
                            Task.logger.info(f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}")
                            num_gen += 1
                            tasks.append(cc)
                    else:
                        Task.fail_config(f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}")
                else:
                    for t in tasks:
                        if model == t.name:
                            # deps.append(t)
                            extra = t.get_unique_name()
                            assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!"
                            indexes = get_num_ranseed(s, l)
                            for i in range(indexes):
                                num = i + 1 if indexes > 1 else None
                                clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra)
                                cc = cls(clas_name, clas_output_dir, config, deps + [t], mode, options, index=i)
                                Task.logger.info(f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}")
                                num_gen += 1
                                tasks.append(cc)
            else:
                indexes = get_num_ranseed(s, l)
                for i in range(indexes):
                    num = i + 1 if indexes > 1 else None
                    clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num)
                    cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i)
                    Task.logger.info(f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}")
                    num_gen += 1
                    tasks.append(cc)
        if num_gen == 0:
            Task.fail_config(f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits")
    return tasks
def __init__(self, name, output_dir, genversion, config, global_config, combine="combine.input"):
    self.data_dirs = global_config["DATA_DIRS"]
    base_file = get_data_loc(combine)
    super().__init__(name, output_dir, base_file, ": ")

    self.genversion = genversion
    if len(genversion) < 30:
        self.genprefix = self.genversion
    else:
        hash = get_hash(self.genversion)[:5]
        self.genprefix = self.genversion[:25] + hash

    self.config = config
    self.options = config.get("OPTS", {})
    self.reserved_keywords = ["BASE"]
    self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name

    # Determine the type of each component
    keys = [k for k in config.keys() if k != "GLOBAL" and k != "OPTS"]
    self.base_ia = []
    self.base_cc = []
    types = {}
    types_dict = {"IA": [], "NONIA": []}
    for k in keys:
        d = config[k]
        base_file = d.get("BASE")
        if base_file is None:
            Task.fail_config(f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file")
        base_path = get_data_loc(base_file)
        if base_path is None:
            Task.fail_config(f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}")

        gentype, genmodel = None, None
        with open(base_path) as f:
            for line in f.read().splitlines():
                if line.upper().strip().startswith("GENTYPE:"):
                    gentype = line.upper().split(":")[1].strip()
                if line.upper().strip().startswith("GENMODEL:"):
                    genmodel = line.upper().split(":")[1].strip()
        gentype = gentype or d.get("GENTYPE")
        genmodel = genmodel or d.get("GENMODEL")

        if not gentype:
            Task.fail_config(f"Cannot find GENTYPE for component {k} and base file {base_path}")
        if not genmodel:
            Task.fail_config(f"Cannot find GENMODEL for component {k} and base file {base_path}")

        type2 = "1" + f"{int(gentype):02d}"
        if "SALT2" in genmodel:
            self.base_ia.append(base_file)
            types[gentype] = "Ia"
            types[type2] = "Ia"
            types_dict["IA"].append(int(gentype))
            types_dict["IA"].append(int(type2))
        else:
            self.base_cc.append(base_file)
            types[gentype] = "II"
            types[type2] = "II"
            types_dict["NONIA"].append(int(gentype))
            types_dict["NONIA"].append(int(type2))

    sorted_types = collections.OrderedDict(sorted(types.items()))
    self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
    self.output["types_dict"] = types_dict
    self.output["types"] = sorted_types

    self.global_config = global_config

    rankeys = [r for r in config["GLOBAL"].keys() if r.startswith("RANSEED_")]
    value = int(config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1
    self.set_num_jobs(2 * value)

    self.sim_log_dir = f"{self.output_dir}/LOGS"
    self.total_summary = os.path.join(self.sim_log_dir, "TOTAL_SUMMARY.LOG")
    self.done_file = f"{self.output_dir}/FINISHED.DONE"
    self.logging_file = self.config_path.replace(".input", ".LOG")
    self.output["blind"] = self.options.get("BLIND", False)
    self.derived_batch_info = None

    # Make sure we have at least one component to simulate
    if len(self.base_ia + self.base_cc) == 0:
        Task.fail_config("Your sim has no components specified! Please add something to simulate!")

    # Try to determine how many jobs will be put in the queue
    try:
        # If BATCH_INFO is set, we'll use that
        batch_info = self.config.get("GLOBAL", {}).get("BATCH_INFO")
        default_batch_info = self.get_property("BATCH_INFO", assignment=": ")

        # If it's not set, check for RANSEED_REPEAT or RANSEED_CHANGE
        if batch_info is None:
            ranseed_repeat = self.config.get("GLOBAL", {}).get("RANSEED_REPEAT")
            ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
            ranseed = ranseed_repeat or ranseed_change
            if ranseed:
                num_jobs = int(ranseed.strip().split()[0])
                self.logger.debug(f"Found a ranseed with {num_jobs} jobs, deriving batch info")
                comps = default_batch_info.strip().split()
                comps[-1] = str(num_jobs)
                self.derived_batch_info = " ".join(comps)
                self.num_jobs = num_jobs
        else:
            self.num_jobs = int(default_batch_info.split()[-1])
    except Exception:
        self.logger.warning(f"Unable to determine how many jobs simulation {self.name} has")
        self.num_jobs = 10

    self.output["genversion"] = self.genversion
    self.output["genprefix"] = self.genprefix

    ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
    base = os.path.expandvars(f"{self.global_config['SNANA']['sim_dir']}/{self.genversion}")
    if ranseed_change:
        num_sims = int(ranseed_change.split()[0])
        self.logger.debug(f"Detected ranseed change with {num_sims} sims, updating sim_folders")
        self.sim_folders = [base + f"-{i + 1:04d}" for i in range(num_sims)]
    else:
        self.sim_folders = [base]
    self.output["ranseed_change"] = ranseed_change is not None
    self.output["sim_folders"] = self.sim_folders
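
# Sketch of the sim_folders naming above for a RANSEED_CHANGE run: SNANA
# writes one GENVERSION folder per seed, suffixed -0001, -0002, and so
# on. The base path and seed count here are toy values.
base = "/scratch/SIM/MY_GENVERSION"
ranseed_change = "3 12345"

num_sims = int(ranseed_change.split()[0])
sim_folders = [base + f"-{i + 1:04d}" for i in range(num_sims)]
print(sim_folders)  # ['/scratch/SIM/MY_GENVERSION-0001', '...-0002', '...-0003']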
def __init__(self, name, output_dir, config, dependencies, options, recal_aggtask):
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.passed = False
    self.classifiers = [d for d in dependencies if isinstance(d, Classifier)]
    self.lcfit_deps = [c.get_fit_dependency(output=False) for c in self.classifiers]
    self.lcfit_names = list(set([l.output["name"] for l in self.lcfit_deps if l is not None]))
    self.output["lcfit_names"] = self.lcfit_names
    if not self.lcfit_names:
        self.logger.debug("No jobs depend on the LCFIT, so adding a dummy one")
        self.lcfit_names = [""]

    self.sim_task = self.get_underlying_sim_task()
    self.output["sim_name"] = self.sim_task.name
    self.recal_aggtask = recal_aggtask
    self.num_versions = len(self.sim_task.output["sim_folders"])

    self.output_dfs = [os.path.join(self.output_dir, f"merged_{i}.csv") for i in range(self.num_versions)]
    self.output_dfs_key = [[os.path.join(self.output_dir, f"merged_{l}_{i}.key") for l in self.lcfit_names] for i in range(self.num_versions)]
    self.output_cals = [os.path.join(self.output_dir, f"calibration_{i}.csv") for i in range(self.num_versions)]

    self.id = "CID"
    self.type_name = "SNTYPE"
    self.options = options
    self.include_type = bool(options.get("INCLUDE_TYPE", False))
    self.plot = options.get("PLOT", False)
    self.plot_all = options.get("PLOT_ALL", False)
    self.output["classifier_names"] = [c.name for c in self.classifiers]
    self.output["classifier_indexes"] = [c.index for c in self.classifiers]
    self.output["calibration_files"] = self.output_cals
    self.output["empty_agg"] = False
    if isinstance(self.plot, bool):
        self.python_file = os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py"
    else:
        self.python_file = self.plot
    self.python_file = get_output_loc(self.python_file)

    if not os.path.exists(self.python_file):
        Task.fail_config(f"Attempting to find python file {self.python_file} but it's not there!")

    merge_classifiers = self.config.get("MERGE_CLASSIFIERS")
    self.classifier_merge = {c.output["name"]: c.get_prob_column_name() for c in self.classifiers}
    if merge_classifiers is not None:
        self.classifier_merge = dict()
        for c in self.classifiers:
            prob_col = []
            for prob_col_name in merge_classifiers.keys():
                mask_list = ensure_list(merge_classifiers[prob_col_name])
                match = any(m in c.output["name"] for m in mask_list)
                if match:
                    if prob_col_name[:5] != "PROB_":
                        prob_col_name = "PROB_" + prob_col_name
                    prob_col.append(prob_col_name)
            if len(prob_col) == 1:
                self.classifier_merge[c.output["name"]] = prob_col[0]
            elif len(prob_col) == 0:
                self.classifier_merge[c.output["name"]] = c.get_prob_column_name()
            else:
                Task.fail_config(f"Classifier task {c.output['name']} matched multiple MERGE_CLASSIFIERS keys: {prob_col}. Please provide more specific keys")
    self.logger.debug(f"Classifier merge = {self.classifier_merge}")
    self.output["classifier_merge"] = self.classifier_merge
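
# Toy walk-through of the MERGE_CLASSIFIERS matching above: each key is a
# target probability column, each value a list of substring masks tested
# against the classifier name, and matched columns are prefixed with
# PROB_ if needed. The names and masks here are illustrative only.
merge_classifiers = {"SNN": ["SNN_TEST", "SNN_TRAIN"], "PROB_FITPROB": ["FITPROB"]}
classifier_name = "SNN_TEST_SIMA"

merged_col = None
for prob_col_name, masks in merge_classifiers.items():
    if any(m in classifier_name for m in masks):
        if prob_col_name[:5] != "PROB_":
            prob_col_name = "PROB_" + prob_col_name
        merged_col = prob_col_name
print(merged_col)  # PROB_SNN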
def __init__(self, name, output_dir, options, global_config, dependencies=None):
    super().__init__(name, output_dir, dependencies=dependencies)
    self.options = options
    self.global_config = get_config()
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
    self.path_to_task = output_dir

    self.unparsed_raw = self.options.get("RAW_DIR")
    self.raw_dir = get_data_loc(self.unparsed_raw)
    if self.raw_dir is None:
        Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")
    self.genversion = os.path.basename(self.raw_dir)
    self.data_path = os.path.dirname(self.raw_dir)
    if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
        self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
        self.data_path = ""
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name

    self.output["genversion"] = self.genversion
    self.output["data_path"] = self.data_path
    self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
    self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
    self.output["raw_dir"] = self.raw_dir
    self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT")
    self.output["clump_file"] = self.clump_file
    self.output["ranseed_change"] = False
    is_sim = options.get("SIM", False)
    self.output["is_sim"] = is_sim
    self.output["blind"] = options.get("BLIND", not is_sim)

    self.types_dict = options.get("TYPES")
    if self.types_dict is None:
        self.types_dict = {"IA": [1], "NONIA": [2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81]}
    else:
        for key in self.types_dict.keys():
            self.types_dict[key] = [int(c) for c in self.types_dict[key]]

    self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
    self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
    self.output["types_dict"] = self.types_dict
    self.types = OrderedDict()
    for n in self.types_dict["IA"]:
        self.types.update({n: "Ia"})
    for n in self.types_dict["NONIA"]:
        self.types.update({n: "II"})
    self.output["types"] = self.types

    self.slurm = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --time=0:20:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=broadwl
#SBATCH --output={log_file}
#SBATCH --account=pi-rkessler
#SBATCH --mem=2GB

cd {path_to_task}
snana.exe clump.nml
if [ $? -eq 0 ]; then
    echo SUCCESS > {done_file}
else
    echo FAILURE > {done_file}
fi
"""
    self.clump_command = """#
def __init__(self, name, output_dir, sim_task, config, global_config):
    self.config = config
    self.global_config = global_config

    base = config.get("BASE")
    if base is None:
        Task.fail_config(f"You have not specified a BASE nml file for task {name}")
    self.base_file = get_data_loc(base)
    if self.base_file is None:
        Task.fail_config(f"Base file {base} cannot be found for task {name}")

    super().__init__(name, output_dir, config, self.base_file, " = ", dependencies=[sim_task])

    self.sim_task = sim_task
    self.sim_version = sim_task.output["genversion"]
    self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
    self.lc_output_dir = os.path.join(self.output_dir, "output")
    self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
    self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]]

    self.logging_file = self.config_path.replace(".nml", ".LOG")
    self.kill_file = self.config_path.replace(".nml", "_KILL.LOG")
    self.done_file = f"{self.lc_output_dir}/ALL.DONE"
    self.merge_log = os.path.join(self.lc_output_dir, "MERGE.LOG")

    self.log_files = [self.logging_file]
    self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
    self.display_threshold = 8
    self.output["fitres_dirs"] = self.fitres_dirs
    self.output["base_file"] = self.base_file
    self.output["nml_file"] = self.config_path
    self.output["genversion"] = self.sim_version
    self.output["sim_name"] = sim_task.output["name"]
    self.output["blind"] = sim_task.output["blind"]
    self.output["lc_output_dir"] = self.lc_output_dir
    self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

    self.validate_fitopts(config)

    is_data = False
    for d in self.dependencies:
        if isinstance(d, DataPrep):
            is_data = not d.output["is_sim"]
    self.output["is_data"] = is_data

    self.options = self.config.get("OPTS", {})
    # Try to determine how many jobs will be put in the queue
    # First see if it's been explicitly set
    num_jobs = self.options.get("NUM_JOBS")
    if num_jobs is not None:
        self.num_jobs = num_jobs
        self.logger.debug("Num jobs set by NUM_JOBS option")
    else:
        try:
            batch_info = self.options.get("BATCH_INFO") or self.yaml["CONFIG"].get("BATCH_INFO")
            self.num_jobs = int(batch_info.split()[-1])
            self.logger.debug("Num jobs set by BATCH_INFO")
        except Exception:
            self.logger.warning("Could not determine BATCH_INFO for job, setting num_jobs to 10")
            self.num_jobs = 10
            self.logger.debug("Num jobs set to default")