def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create one Aggregator task per AGGREGATION entry and matching simulation.

    Args:
        c: Run configuration mapping; only the ``AGGREGATION`` section is read.
        prior_tasks: Tasks from earlier stages. Simulation / data-prep tasks
            and classifier tasks are extracted from it.
        base_output_dir: Root directory for stage output.
        stage_number: Stage index, used as the output directory prefix.
        prefix: Not used by this function.
        global_config: Not used by this function.

    Returns:
        list: The created Aggregator tasks. When ``RECALIBRATION`` is set, the
        task for the recalibration simulation is created first and is added
        as a dependency of every other aggregator of the same entry.

    Configuration problems are reported through ``Task.fail_config``.
    """
    sim_tasks = Task.get_task_of_type(prior_tasks, SNANASimulation, DataPrep)
    classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)

    def _get_aggregator_dir(base_output_dir, stage_number, agg_name):
        return f"{base_output_dir}/{stage_number}_AGG/{agg_name}"

    tasks = []

    for agg_name in c.get("AGGREGATION", []):
        config = c["AGGREGATION"][agg_name]
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_clas = config.get("MASK_CLAS", "")
        recalibration = config.get("RECALIBRATION")
        recal_simtask = None
        recal_aggtask = None

        # Check for recalibration, and if so, find that task first
        if recalibration:
            recal_sim = [i for i, s in enumerate(sim_tasks) if s.name == recalibration]
            if not recal_sim:
                Task.fail_config(f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}")
            elif len(recal_sim) > 1:
                # The original message claimed the sim was missing; the real
                # problem in this branch is an ambiguous name.
                Task.fail_config(f"Recalibration sim {recalibration} matches more than one of the available sims: {[s.name for s in sim_tasks]}")

            # Move the recal sim task to the front of the queue so it executes first
            recal_sim_index = recal_sim[0]
            recal_simtask = sim_tasks[recal_sim_index]
            sim_tasks.insert(0, sim_tasks.pop(recal_sim_index))

        for sim_task in sim_tasks:
            # The recalibration sim must always be aggregated, whatever the
            # masks say, because the other aggregators depend on it. The
            # explicit grouping matters: without it ``and`` binds tighter than
            # ``or`` and a MASK_SIM mismatch would drop the recalibration sim.
            masked_out = mask_sim not in sim_task.name or mask not in sim_task.name
            if masked_out and recal_simtask != sim_task:
                continue

            agg_name2 = f"{agg_name}_{sim_task.name}"
            deps = [
                clas
                for clas in classifier_tasks
                if mask in clas.name
                and mask_clas in clas.name
                and clas.mode == Classifier.PREDICT
                and clas.get_simulation_dependency() == sim_task
            ]
            if not deps:
                # No classifier predictions for this sim: depend on the sim itself.
                deps = [sim_task]

            if recalibration and sim_task != recal_simtask:
                if recal_aggtask is None:
                    Task.fail_config(f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order.")
                else:
                    deps.append(recal_aggtask)

            a = Aggregator(agg_name2, _get_aggregator_dir(base_output_dir, stage_number, agg_name2), config, deps, options, recal_aggtask)
            if sim_task == recal_simtask:
                recal_aggtask = a
            Task.logger.info(f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs")
            tasks.append(a)

    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create a Merger task for every light-curve fit / aggregator pair that shares a simulation.

    Each entry of the ``MERGE`` section of ``c`` is processed in turn. The
    optional ``MASK``, ``MASK_SIM``, ``MASK_FIT`` and ``MASK_AGG`` substrings
    restrict which fits, simulations and aggregators are paired; an empty
    mask accepts everything. An entry that produces no task is reported
    through ``Task.fail_config``.

    Returns:
        list: The created Merger tasks.
    """
    aggregators = Task.get_task_of_type(prior_tasks, Aggregator)
    fits = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
    merged = []

    def _passes(mask_value, task):
        # An empty mask filters nothing; the task name is only read otherwise.
        return not mask_value or mask_value in task.name

    for name in c.get("MERGE", []):
        config = c["MERGE"].get(name, {})
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_lc = config.get("MASK_FIT", "")
        mask_agg = config.get("MASK_AGG", "")

        created = 0
        for lcfit in fits:
            if not (_passes(mask, lcfit) and _passes(mask_lc, lcfit)):
                continue

            sim = lcfit.get_dep(SNANASimulation, DataPrep)
            if not (_passes(mask, sim) and _passes(mask_sim, sim)):
                continue

            for agg in aggregators:
                if not (_passes(mask_agg, agg) and _passes(mask, agg)):
                    continue
                # Only pair a fit with an aggregator built on the same sim
                if sim != agg.get_underlying_sim_task():
                    continue

                created += 1
                merge_name2 = f"{name}_{lcfit.name}"
                output_dir = f"{base_output_dir}/{stage_number}_MERGE/{name}_{lcfit.name}"
                task = Merger(merge_name2, output_dir, config, [lcfit, agg], options)
                Task.logger.info(f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs")
                merged.append(task)

        if created == 0:
            Task.fail_config(f"Merger {name} with mask {mask} matched no combination of aggregators and fits")

    return merged
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create one WFit task per entry of the ``WFIT`` config section.

    Args:
        c: Run configuration mapping; only the ``WFIT`` section is read.
        prior_tasks: Tasks from earlier stages; CreateCov tasks are extracted.
        base_output_dir: Root directory for stage output.
        stage_number: Stage index, used as the output directory prefix.
        prefix: Not used by this function.
        global_config: Passed through to each WFit task.

    Returns:
        list: The created WFit tasks. Each receives the CreateCov tasks whose
        name contains the entry's ``MASK`` (an empty mask selects all).

    An entry with no CreateCov task to run on is reported through
    ``Task.fail_config`` before the WFit task is constructed.
    """
    create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

    def _get_wfit_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_COSMOFIT/WFIT/{name}"

    tasks = []
    key = "WFIT"
    for label in c.get(key, []):
        config = c[key].get(label)
        if config is None:
            # An entry written with no body arrives as None; treat it as empty,
            # as the other stage factories in this file do.
            config = {}
        name = f"WFIT_{label}"
        options = config.get("OPTS", {})
        mask = config.get("MASK", "")

        ctasks = [ctask for ctask in create_cov_tasks if mask in ctask.name]
        # Validate the mask-filtered list, and do it before building the task.
        # This covers both "no CreateCov tasks at all" and "mask matched none".
        if not ctasks:
            Task.fail_config(f"WFit task {name} has no create_cov task to run on!")

        t = WFit(name, _get_wfit_dir(base_output_dir, stage_number, name), ctasks, config, options, global_config)
        Task.logger.info(f"Creating WFit task {name} {t.num_jobs} jobs")
        tasks.append(t)

    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create the CosmoMC tasks described by the ``COSMOMC`` config section.

    An entry whose ``OPTS.INI`` is one of the static chains (``cmb_omw`` or
    ``cmb_omol``) produces a single task with no dependencies. Any other
    entry produces one task per CreateCov task whose name contains the
    entry's mask (``MASK_CREATE_COV``, falling back to ``MASK``).

    Args:
        c: Run configuration mapping; only the ``COSMOMC`` section is read.
        prior_tasks: Tasks from earlier stages; CreateCov tasks are extracted.
        base_output_dir: Root directory for stage output.
        stage_number: Stage index, used as the output directory prefix.
        prefix: Not used by this function.
        global_config: Passed through to each CosmoMC task.

    Returns:
        list: The created CosmoMC tasks.

    A non-static entry with no CreateCov tasks available is reported through
    ``Task.fail_config``.
    """
    create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

    def _get_cosmomc_dir(base_output_dir, stage_number, name):
        return f"{base_output_dir}/{stage_number}_COSMOFIT/COSMOMC/{name}"

    # Check if this is static. Could scan the folder, but dont have all the chains yet.
    # TODO: Update this when I have all the chains
    static_inis = ("cmb_omw", "cmb_omol")

    tasks = []
    key = "COSMOMC"
    for cname in c.get(key, []):
        config = c[key].get(cname)
        if config is None:
            # An entry written with no body arrives as None; treat it as empty.
            config = {}
        options = config.get("OPTS", {})
        if options is None:
            # Same for an empty OPTS block; options.get() is called just below.
            options = {}
        mask = config.get("MASK_CREATE_COV", config.get("MASK", ""))

        if options.get("INI") in static_inis:
            a = CosmoMC(cname, _get_cosmomc_dir(base_output_dir, stage_number, cname), config, options, global_config)
            Task.logger.info(f"Creating CosmoMC task {cname} for {a.num_jobs} jobs")
            tasks.append(a)
            continue

        if len(create_cov_tasks) == 0:
            Task.fail_config(f"CosmoMC task {cname} has no create_cov task to run on!")

        for ctask in create_cov_tasks:
            if mask not in ctask.name:
                continue
            name = f"COSMOMC_{cname}_{ctask.name}"
            a = CosmoMC(name, _get_cosmomc_dir(base_output_dir, stage_number, name), config, options, global_config, dependencies=[ctask])
            Task.logger.info(f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs")
            tasks.append(a)

    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create CreateCov tasks for every ``CREATE_COV`` entry and matching BiasCor task.

    For each BiasCor task whose name contains the entry's mask (``MASK``,
    falling back to ``MASK_BIASCOR``), one CreateCov task is created per item
    of that task's ``output["subdirs"]``. When there is more than one
    subdirectory the task name gets a 1-based ``_<n>`` suffix. An entry
    processed while no BiasCor tasks exist is reported through
    ``Task.fail_config``.

    Returns:
        list: The created CreateCov tasks.
    """
    biascor_tasks = Task.get_task_of_type(prior_tasks, BiasCor)
    created = []

    for cname in c.get("CREATE_COV", []):
        config = c["CREATE_COV"][cname]
        if config is None:
            config = {}
        options = config.get("OPTS", {})
        mask = config.get("MASK", config.get("MASK_BIASCOR", ""))

        for btask in biascor_tasks:
            if mask in btask.name:
                subdir_count = len(btask.output["subdirs"])
                for i in range(subdir_count):
                    # Only number the tasks when there is something to tell apart
                    suffix = f"_{i + 1}" if subdir_count != 1 else ""
                    name = f"{cname}_{btask.name}{suffix}"
                    output_dir = f"{base_output_dir}/{stage_number}_CREATE_COV/{name}"
                    a = CreateCov(name, output_dir, config, options, global_config, dependencies=[btask], index=i)
                    Task.logger.info(f"Creating createcov task {name} for {btask.name} with {a.num_jobs} jobs")
                    created.append(a)

        if not biascor_tasks:
            Task.fail_config(f"Create cov task {cname} has no biascor task to run on!")

    return created
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create the classifier tasks described by the ``CLASSIFICATION`` config section.

    For every entry the classifier class is looked up through
    ``ClassifierFactory`` and asked (``get_requirements``) whether it needs
    simulations and/or light-curve fits. One task is created per matching
    (simulation, fit) pair and per random-seed realisation of that pair.

    Masks (all are substring tests; an empty mask accepts everything):
        * ``MASK`` and ``MASK_SIM`` are tested against the simulation name.
        * ``MASK_FIT`` is tested against the light-curve fit name. It is
          ignored for runs that have no fit.

    ``OPTS.MODEL`` may be a path (contains ``/`` or ``.``) to an existing
    model, or the name of a classifier task created earlier in this call, in
    which case that task is added as a dependency.

    Args:
        c: Run configuration mapping; only ``CLASSIFICATION`` is read.
        prior_tasks: Tasks from earlier stages; fit and sim tasks are extracted.
        base_output_dir: Root directory for stage output.
        stage_number: Stage index, used as the output directory prefix.
        prefix: Not used by this function.
        global_config: Not used by this function.

    Returns:
        list: The created classifier tasks.

    Raises:
        AssertionError: For an invalid ``MODE``, for train mode on a
            classifier that cannot be trained, for a training input with more
            than one realisation, or for a MODEL task of a different class.
        ValueError: If a run has neither a simulation nor a fit.

    Other configuration problems are reported through ``Task.fail_config``.
    """
    from pippin.classifiers.factory import ClassifierFactory

    def _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=None, extra=None):
        # The fit name is used when there is one; the sim name only otherwise.
        sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name
        fit_name = "" if fit_name is None else "_" + fit_name
        extra_name = "" if extra is None else "_" + extra
        index = "" if index is None else f"_{index}"
        return f"{base_output_dir}/{stage_number}_CLAS/{clas_name}{index}{sim_name}{fit_name}{extra_name}"

    def get_num_ranseed(sim_task, lcfit_task):
        if sim_task is not None:
            return len(sim_task.output["sim_folders"])
        if lcfit_task is not None:
            return len(lcfit_task.output["fitres_dirs"])
        raise ValueError("Classifier dependency has no sim_task or lcfit_task?")

    def _matches(mask_value, task_name):
        # A mask only filters when it is set and there is a name to test.
        return not mask_value or task_name is None or mask_value in task_name

    def _spawn(cls, clas_name, classifier, config, mode, options, sim, lcfit, deps, extra=None, **cls_kwargs):
        # Build one classifier task per random-seed realisation of (sim, lcfit).
        sim_name = sim.name if sim is not None else None
        fit_name = lcfit.name if lcfit is not None else None
        created = []
        indexes = get_num_ranseed(sim, lcfit)
        for i in range(indexes):
            # Output directories are only numbered when there are several realisations
            num = i + 1 if indexes > 1 else None
            clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra)
            cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i, **cls_kwargs)
            Task.logger.info(f"Creating classification task {classifier} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}")
            created.append(cc)
        return created

    tasks = []
    lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
    sim_tasks = Task.get_task_of_type(prior_tasks, DataPrep, SNANASimulation)

    for clas_name in c.get("CLASSIFICATION", []):
        config = c["CLASSIFICATION"][clas_name]
        name = config["CLASSIFIER"]
        cls = ClassifierFactory.get(name)
        options = config.get("OPTS", {})
        if options is None:
            # An empty OPTS block arrives as None; options.get() is used below.
            options = {}

        if "MODE" not in config:
            Task.fail_config(f"Classifier task {clas_name} needs to specify MODE as train or predict")
        mode = config["MODE"].lower()
        assert mode in ["train", "predict"], "MODE should be either train or predict"
        if mode == "train":
            mode = Classifier.TRAIN
        else:
            mode = Classifier.PREDICT

        # Validate that train is not used on certain classifiers
        if mode == Classifier.TRAIN:
            assert name not in ["PerfectClassifier", "UnityClassifier", "FitProbClassifier"], f"Can not use train mode with {name}"

        needs_sim, needs_lc = cls.get_requirements(options)

        # A classifier that needs fits runs once per fit (the fit's first
        # dependency is used as its simulation), whether or not it also
        # needs the sim.
        if needs_lc:
            runs = [(fit.dependencies[0], fit) for fit in lcfit_tasks]
        elif needs_sim:
            runs = [(sim, None) for sim in sim_tasks]
        else:
            runs = []
            Task.logger.warning(f"Classifier {name} does not need sims or fits. Wat.")

        num_gen = 0
        mask = config.get("MASK", "")
        mask_sim = config.get("MASK_SIM", "")
        mask_fit = config.get("MASK_FIT", "")

        for sim, lcfit in runs:
            sim_name = sim.name if sim is not None else None
            fit_name = lcfit.name if lcfit is not None else None

            matched_sim = _matches(mask, sim_name) and _matches(mask_sim, sim_name)
            # MASK_FIT must be tested against the fit name. The previous code
            # re-tested MASK_SIM against the sim name here, so MASK_FIT had no
            # effect at all.
            matched_fit = _matches(mask_fit, fit_name)
            if not matched_fit or not matched_sim:
                continue

            deps = []
            if sim is not None:
                deps.append(sim)
            if lcfit is not None:
                deps.append(lcfit)

            model = options.get("MODEL")

            # Validate to make sure training samples only have one sim.
            if mode == Classifier.TRAIN:
                if sim is not None:
                    folders = sim.output["sim_folders"]
                    assert (
                        len(folders) == 1
                    ), f"Training requires one version of the sim, you have {len(folders)} for sim task {sim}. Make sure your training sim doesn't set RANSEED_CHANGE"
                if lcfit is not None:
                    folders = lcfit.output["fitres_dirs"]
                    assert (
                        len(folders) == 1
                    ), f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {lcfit}. Make sure your training sim doesn't set RANSEED_CHANGE"

            new_tasks = []
            if model is None:
                new_tasks = _spawn(cls, clas_name, name, config, mode, options, sim, lcfit, deps)
            elif "/" in model or "." in model:
                # MODEL looks like a path to an already trained model
                potential_path = get_output_loc(model)
                if os.path.exists(potential_path):
                    extra = os.path.basename(os.path.dirname(potential_path))
                    new_tasks = _spawn(cls, clas_name, name, config, mode, options, sim, lcfit, deps, extra=extra, model_name=extra)
                else:
                    Task.fail_config(f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}")
            else:
                # MODEL names a classifier task created earlier in this call.
                # New tasks are collected separately and added afterwards, so
                # the list being searched is never modified mid-iteration.
                for t in tasks:
                    if model == t.name:
                        extra = t.get_unique_name()
                        assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!"
                        new_tasks += _spawn(cls, clas_name, name, config, mode, options, sim, lcfit, deps + [t], extra=extra)

            tasks.extend(new_tasks)
            num_gen += len(new_tasks)

        if num_gen == 0:
            Task.fail_config(f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits")

    return tasks
def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
    """Create one BiasCor task per entry of the ``BIASCOR`` config section.

    The names given under ``CLASSIFIER``, ``SIMFILE_BIASCOR``,
    ``SIMFILE_CCPRIOR`` and ``DATA`` (in the entry itself and in each of its
    ``MUOPTS``) are resolved to classifier / merge tasks. The entry's config
    dictionary is modified IN PLACE: those keys end up holding the resolved
    tasks, and ``PROB_COLS`` is added.

    Args:
        c: Run configuration mapping; only the ``BIASCOR`` section is read.
        prior_tasks: Tasks from earlier stages; Merger and Classifier tasks
            are extracted.
        base_output_dir: Root directory for stage output.
        stage_number: Stage index, used as the output directory prefix.
        prefix: Not used by this function.
        global_config: Passed through to each BiasCor task.

    Returns:
        list: The created BiasCor tasks, each depending on every task that
        was resolved for it.

    Configuration problems are reported through ``Task.fail_config``.
    """
    merge_tasks = Task.get_task_of_type(prior_tasks, Merger)
    # Mapping taken from each merge task's "classifier_merge" output; it is
    # used below as classifier name -> probability column name. On duplicate
    # keys the later merge task wins.
    prob_cols = {}
    for merge_task in merge_tasks:
        prob_cols.update(merge_task.output["classifier_merge"])
    classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)
    tasks = []

    def _get_biascor_output_dir(base_output_dir, stage_number, biascor_name):
        return f"{base_output_dir}/{stage_number}_BIASCOR/{biascor_name}"

    # Should return a single classifier task which maps to the desired prob column
    def resolve_classifiers(names):
        matches = [clas for clas in classifier_tasks if clas.name in names]
        if not matches:
            if len(names) > 1:
                Task.fail_config(f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!")
            Task.logger.info(f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead.")
            # .get(): a classifier without a prob column simply cannot match
            matches = [clas for clas in classifier_tasks if prob_cols.get(clas.name) in names]
            if not matches:
                # List the columns that WERE available, not those of the
                # (empty) match list.
                choices = [prob_cols.get(clas.name) for clas in classifier_tasks]
                message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
                Task.fail_config(message)
        elif len(matches) > 1:
            choices = list({prob_cols.get(clas.name) for clas in matches})
            if len(choices) != 1:
                Task.fail_config(f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}")
        return matches[0]  # We only care about the prob column name

    def resolve_merged_fitres_files(gname, name, classifier_name):
        # Find the single merge task whose light-curve fit is called `name`.
        matches = [m for m in merge_tasks if m.output["lcfit_name"] == name]
        if not matches:
            valid = [m.output["lcfit_name"] for m in merge_tasks]
            message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
            Task.fail_config(message)
        elif len(matches) > 1:
            message = f"Resolved multiple merge tasks {matches} for name {name}"
            Task.fail_config(message)
        else:
            merged_classifiers = matches[0].output["classifier_names"]
            if classifier_name is not None and classifier_name not in merged_classifiers:
                # Not merged under that name; it may still share a prob column.
                if prob_cols.get(classifier_name) not in [prob_cols.get(n) for n in merged_classifiers]:
                    Task.logger.warning(
                        f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                        f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                    )
        return matches[0]

    # Ensure classifiers point to the same prob column
    def validate_classifiers(classifier_names):
        prob_col = []
        for name in classifier_names:
            col = prob_cols.get(name)
            if col is None:
                # Check whether it is instead the prob_col name
                if name in prob_cols.values():
                    prob_col.append(name)
                else:
                    Task.fail_config(f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!")
            else:
                prob_col.append(col)
        if len(set(prob_col)) > 1:
            Task.fail_config(
                f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage."
            )
        elif prob_col:
            Task.logger.debug(f"Classifiers {classifier_names} map to {prob_col[0]}")

    def resolve_conf(gname, subdict, default=None):
        """ Resolve the sub-dictionary and keep track of all the dependencies """
        deps = []

        # If this is a muopt, allow access to the base config's resolution.
        # Remember whether this is the base config BEFORE replacing `default`,
        # otherwise the base-only checks below can never fire.
        is_base = default is None
        if is_base:
            default = {}

        # Get the specific classifier. Only if all classifiers point to the
        # same prob_column should you continue.
        classifier_task = None
        classifier_names = subdict.get("CLASSIFIER")
        if classifier_names is not None:
            classifier_names = ensure_list(classifier_names)
            validate_classifiers(classifier_names)
            classifier_task = resolve_classifiers(classifier_names)

        # For resolving merge tasks
        classifier_dep = classifier_task or default.get("CLASSIFIER")
        if classifier_dep is not None:
            classifier_dep = classifier_dep.name
        if "CLASSIFIER" in subdict:
            subdict["CLASSIFIER"] = classifier_task
        if classifier_task is not None:
            deps.append(classifier_task)

        # Get the Ia sims
        simfile_ia = subdict.get("SIMFILE_BIASCOR")
        if is_base and simfile_ia is None:
            Task.fail_config("You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output")
        if simfile_ia is not None:
            simfile_ia = ensure_list(simfile_ia)
            simfile_ia_tasks = [resolve_merged_fitres_files(gname, s, classifier_dep) for s in simfile_ia]
            deps += simfile_ia_tasks
            subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

        # Resolve the cc sims
        simfile_cc = subdict.get("SIMFILE_CCPRIOR")
        if is_base and simfile_cc is None:
            Task.logger.warning("No SIMFILE_CCPRIOR specified. Hope you're doing a Ia only analysis")
        if simfile_cc is not None:
            simfile_cc = ensure_list(simfile_cc)
            simfile_cc_tasks = [resolve_merged_fitres_files(gname, s, classifier_dep) for s in simfile_cc]
            deps += simfile_cc_tasks
            subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

        return deps  # Changes to dict are by ref, will modify original

    for name in c.get("BIASCOR", []):
        config = c["BIASCOR"][name]
        if config is None:
            # An entry with no body: fall through to the "missing key"
            # messages below instead of crashing on None.get().
            config = {}
        options = config.get("OPTS", {})

        # Swap the names in the config for the tasks they refer to
        deps = resolve_conf(name, config)

        # Resolve the data section
        data_names = config.get("DATA")
        if data_names is None:
            Task.fail_config("For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task")
        data_names = ensure_list(data_names)
        # By now config["CLASSIFIER"], if present, holds the resolved task
        class_task = config.get("CLASSIFIER")
        class_name = class_task.name if class_task is not None else None
        data_tasks = [resolve_merged_fitres_files(name, s, class_name) for s in data_names]
        deps += data_tasks
        config["DATA"] = data_tasks

        config["PROB_COLS"] = prob_cols

        # Resolve every MUOPT (an empty MUOPTS block arrives as None)
        muopts = config.get("MUOPTS") or {}
        for mu_conf in muopts.values():
            deps += resolve_conf(name, mu_conf, default=config)

        task = BiasCor(name, _get_biascor_output_dir(base_output_dir, stage_number, name), config, deps, options, global_config)
        Task.logger.info(f"Creating biascor task {name} with {task.num_jobs} jobs")
        tasks.append(task)

    return tasks