Example 1
    def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        tasks = []

        all_deps = Task.match_tasks_of_type(None, prior_tasks, DataPrep,
                                            SNANASimulation)
        for fit_name in config.get("LCFIT", []):
            num_matches = 0
            fit_config = config["LCFIT"][fit_name]
            mask = fit_config.get("MASK", "")

            sim_tasks = Task.match_tasks_of_type(mask, prior_tasks, DataPrep,
                                                 SNANASimulation)
            for sim in sim_tasks:
                num_matches += 1
                fit_output_dir = f"{base_output_dir}/{stage_number}_LCFIT/{fit_name}_{sim.name}"
                f = SNANALightCurveFit(f"{fit_name}_{sim.name}",
                                       fit_output_dir, sim, fit_config,
                                       global_config)
                Task.logger.info(
                    f"Creating fitting task {fit_name} with {f.num_jobs} jobs, for simulation {sim.name}"
                )
                tasks.append(f)
            if num_matches == 0:
                Task.fail_config(
                    f"LCFIT task {fit_name} with mask '{mask}' matched no sim_names: {[sim.name for sim in all_deps]}"
                )
        return tasks
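
For context, a minimal sketch of the config shape this get_tasks walks (the fit name and mask value are illustrative, not from the source): each entry under LCFIT may carry a MASK that is substring-matched against sim task names.

    config = {
        "LCFIT": {
            "FITSYS": {            # hypothetical fit name
                "MASK": "SIM_DES"  # substring-matched against sim task names
            }
        }
    }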
Example 2
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

        def _get_wfit_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_COSMOFIT/WFIT/{name}"

        tasks = []
        key = "WFIT"
        for name in c.get(key, []):
            config = c[key].get(name, {})
            name = f"WFIT_{name}"
            options = config.get("OPTS", {})

            mask = config.get("MASK", "")

            ctasks = [
                ctask for ctask in create_cov_tasks if mask in ctask.name
            ]
            if len(ctasks) == 0:
                Task.fail_config(
                    f"WFit task {name} has no create_cov task to run on!")

            t = WFit(name, _get_wfit_dir(base_output_dir, stage_number, name),
                     ctasks, config, options, global_config)
            Task.logger.info(f"Creating WFit task {name} with {t.num_jobs} jobs")
            tasks.append(t)
        return tasks
Example 3
    def classify(self):
        new_hash = self.get_hash_from_string(self.name +
                                             f"{self.prob_ia}_{self.prob_cc}")

        if self._check_regenerate(new_hash):
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            try:
                name = self.get_prob_column_name()
                cid = "CID"
                s = self.get_simulation_dependency()
                df = None
                phot_dir = s.output["photometry_dirs"][self.index]
                headers = [
                    os.path.join(phot_dir, a) for a in os.listdir(phot_dir)
                    if "HEAD" in a
                ]
                if not headers:
                    Task.fail_config(
                        f"No HEAD fits files found in {phot_dir}!")
                else:
                    types = self.get_simulation_dependency(
                    ).output["types_dict"]
                    self.logger.debug(f"Input types are {types}")

                    for h in headers:
                        with fits.open(h) as hdul:
                            data = hdul[1].data
                            snid = np.array(data.field("SNID"))
                            sntype = np.array(data.field("SNTYPE")).astype(
                                np.int64)
                            is_ia = np.isin(sntype, types["IA"])
                            prob = (is_ia * self.prob_ia) + (~is_ia *
                                                             self.prob_cc)

                            dataframe = pd.DataFrame({cid: snid, name: prob})
                            dataframe[cid] = dataframe[cid].apply(str)
                            dataframe[cid] = dataframe[cid].str.strip()
                            if df is None:
                                df = dataframe
                            else:
                                df = pd.concat([df, dataframe])
                    df.drop_duplicates(subset=cid, inplace=True)

                self.logger.info(f"Saving probabilities to {self.output_file}")
                df.to_csv(self.output_file, index=False, float_format="%0.4f")
                chown_dir(self.output_dir)
                with open(self.done_file, "w") as f:
                    f.write("SUCCESS")
                self.save_new_hash(new_hash)
            except Exception as e:
                self.logger.exception(e, exc_info=True)
                self.passed = False
                with open(self.done_file, "w") as f:
                    f.write("FAILED")
                return False
        else:
            self.should_be_done()
        self.passed = True
        return True
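
The probability assignment in the loop above is plain boolean arithmetic on numpy arrays; a self-contained sketch with made-up values:

    import numpy as np

    types = {"IA": [1, 101]}            # assumed shape of types_dict
    sntype = np.array([1, 2, 101, 42])  # SNTYPE values from a HEAD file
    is_ia = np.isin(sntype, types["IA"])
    prob_ia, prob_cc = 0.9, 0.1         # hypothetical fixed probabilities
    prob = (is_ia * prob_ia) + (~is_ia * prob_cc)
    print(prob)  # [0.9 0.1 0.9 0.1]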
Example 4
    def get_tasks(task_config, prior_tasks, output_dir, stage_num, prefix,
                  global_config):
        from pippin.cosmofitters.factory import FitterFactory
        Task.logger.debug("Setting up CosmoFit tasks")

        tasks = []

        for fitter_name in task_config.get("COSMOFIT", []):
            Task.logger.info(
                f"Found fitter of type {fitter_name}, generating tasks.")
            config = {fitter_name: task_config["COSMOFIT"][fitter_name]}
            Task.logger.debug(f"Config for {fitter_name}: {config}")
            fitter = FitterFactory.get(fitter_name.lower())
            Task.logger.debug(f"Fitter class for {fitter_name}: {fitter}")
            if fitter is None:
                Task.logger.error(
                    f"Fitter of type {fitter_name} not found, perhaps it's a typo? Skipping."
                )
                continue
            Task.logger.debug(
                f"get_task function for {fitter_name}: {fitter.get_tasks}")
            ts = fitter.get_tasks(config, prior_tasks, output_dir, stage_num,
                                  prefix, global_config)
            Task.logger.debug(f"{fitter} tasks: {ts}")
            tasks += ts
        if len(tasks) == 0:
            Task.fail_config("No CosmoFit tasks generated!")
        Task.logger.info(f"Generated {len(tasks)} CosmoFit tasks.")
        return tasks
Example 5
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        agg_tasks = Task.get_task_of_type(prior_tasks, Aggregator)
        lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
        tasks = []

        def _get_merge_output_dir(base_output_dir, stage_number, merge_name,
                                  lcfit_name):
            return f"{base_output_dir}/{stage_number}_MERGE/{merge_name}_{lcfit_name}"

        for name in c.get("MERGE", []):
            num_gen = 0
            config = c["MERGE"].get(name, {})
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_lc = config.get("MASK_FIT", "")
            mask_agg = config.get("MASK_AGG", "")

            for lcfit in lcfit_tasks:
                if mask and mask not in lcfit.name:
                    continue
                if mask_lc and mask_lc not in lcfit.name:
                    continue
                sim = lcfit.get_dep(SNANASimulation, DataPrep)
                if mask and mask not in sim.name:
                    continue
                if mask_sim and mask_sim not in sim.name:
                    continue

                for agg in agg_tasks:
                    if mask_agg and mask_agg not in agg.name:
                        continue
                    if mask and mask not in agg.name:
                        continue

                    # Check if the sim is the same for both
                    if sim != agg.get_underlying_sim_task():
                        continue
                    num_gen += 1

                    merge_name2 = f"{name}_{lcfit.name}"
                    task = Merger(
                        merge_name2,
                        _get_merge_output_dir(base_output_dir, stage_number,
                                              name, lcfit.name), config,
                        [lcfit, agg], options)
                    Task.logger.info(
                        f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs"
                    )
                    tasks.append(task)
            if num_gen == 0:
                Task.fail_config(
                    f"Merger {name} with mask {mask} matched no combination of aggregators and fits"
                )
        return tasks
Example 6
    def __init__(self, name, output_dir, dependencies, options, recal_aggtask):
        super().__init__(name, output_dir, dependencies=dependencies)
        self.passed = False
        self.classifiers = [
            d for d in dependencies if isinstance(d, Classifier)
        ]
        self.lcfit_deps = [
            c.get_fit_dependency(output=False) for c in self.classifiers
        ]
        self.lcfit_names = list(
            set([l.output["name"] for l in self.lcfit_deps if l is not None]))
        self.output["lcfit_names"] = self.lcfit_names
        if not self.lcfit_names:
            self.logger.debug(
                "No jobs depend on the LCFIT, so adding a dummy one")
            self.lcfit_names = [""]

        self.sim_task = self.get_underlying_sim_task()
        self.output["sim_name"] = self.sim_task.name
        self.recal_aggtask = recal_aggtask
        self.num_versions = len(self.sim_task.output["sim_folders"])

        self.output_dfs = [
            os.path.join(self.output_dir, f"merged_{i}.csv")
            for i in range(self.num_versions)
        ]
        self.output_dfs_key = [[
            os.path.join(self.output_dir, f"merged_{l}_{i}.key")
            for l in self.lcfit_names
        ] for i in range(self.num_versions)]
        self.output_cals = [
            os.path.join(self.output_dir, f"calibration_{i}.csv")
            for i in range(self.num_versions)
        ]

        self.id = "CID"
        self.type_name = "SNTYPE"
        self.options = options
        self.include_type = bool(options.get("INCLUDE_TYPE", False))
        self.plot = options.get("PLOT", True)
        self.plot_all = options.get("PLOT_ALL", False)
        self.output["classifiers"] = self.classifiers
        self.output["calibration_files"] = self.output_cals
        if isinstance(self.plot, bool):
            self.python_file = os.path.dirname(
                inspect.stack()[0][1]) + "/external/aggregator_plot.py"
        else:
            self.python_file = self.plot
        self.python_file = get_output_loc(self.python_file)

        if not os.path.exists(self.python_file):
            Task.fail_config(
                f"Attempting to find python file {self.python_file} but it's not there!"
            )
Example 7
    def get_tasks(configs, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        def _get_analyse_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_ANALYSE/{name}"

        tasks = []
        key = "ANALYSE"
        for cname in configs.get(key, []):
            config = configs[key].get(cname, {})
            if config is None:
                config = {}
            options = config.get("OPTS", {})

            mask_cosmofit = config.get("MASK_COSMOFIT")
            mask_biascor = config.get("MASK_BIASCOR")
            if config.get("HISTOGRAM") is not None:
                Task.fail_config(
                    "Sorry to do this, but please change HISTOGRAM into MASK_LCFIT to bring it into line with others."
                )
            mask_lcfit = config.get("MASK_LCFIT")
            # TODO: Add aggregation to compile all the plots here

            deps_cosmofit = Task.match_tasks_of_type(mask_cosmofit,
                                                     prior_tasks,
                                                     CosmoFit,
                                                     match_none=False,
                                                     allowed_failure=True)
            Task.logger.debug(f"deps_cosmofit: {deps_cosmofit}")
            deps_biascor = Task.match_tasks_of_type(mask_biascor,
                                                    prior_tasks,
                                                    BiasCor,
                                                    match_none=False)
            Task.logger.debug(f"deps_biascor: {deps_biascor}")
            deps_lcfit = Task.match_tasks_of_type(mask_lcfit,
                                                  prior_tasks,
                                                  SNANALightCurveFit,
                                                  match_none=False)
            Task.logger.debug(f"deps_lcfit: {deps_lcfit}")

            deps = deps_cosmofit + deps_biascor + deps_lcfit
            if len(deps) == 0:
                Task.fail_config(f"Analyse task {cname} has no dependencies!")

            a = AnalyseChains(
                cname, _get_analyse_dir(base_output_dir, stage_number, cname),
                config, options, deps)
            Task.logger.info(
                f"Creating Analyse task {cname} for {[c.name for c in deps]} with {a.num_jobs} jobs"
            )
            tasks.append(a)

        return tasks
Example 8
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov)

        def _get_cosmomc_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_COSMOFIT/COSMOMC/{name}"

        tasks = []
        key = "COSMOMC"
        for cname in c.get(key, []):
            config = c[key].get(cname, {})
            options = config.get("OPTS", {})

            mask = config.get("MASK_CREATE_COV", config.get("MASK", ""))

            # Check if this is static. Could scan the folder, but don't have all the chains yet.
            # TODO: Update this when I have all the chains
            if options.get("INI") in ["cmb_omw", "cmb_omol"]:
                a = CosmoMC(
                    cname,
                    _get_cosmomc_dir(base_output_dir, stage_number, cname),
                    config, options, global_config)
                Task.logger.info(
                    f"Creating CosmoMC task {cname} for {a.num_jobs} jobs")
                tasks.append(a)

            else:
                for ctask in create_cov_tasks:
                    if mask not in ctask.name:
                        continue
                    name = f"COSMOMC_{cname}_{ctask.name}"
                    a = CosmoMC(name,
                                _get_cosmomc_dir(base_output_dir, stage_number,
                                                 name),
                                config,
                                options,
                                global_config,
                                dependencies=[ctask])
                    Task.logger.info(
                        f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs"
                    )
                    tasks.append(a)

                if len(create_cov_tasks) == 0:
                    Task.fail_config(
                        f"CosmoMC task {cname} has no create_cov task to run on!"
                    )

        return tasks
Example 9
 def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix,
               global_config):
     tasks = []
     for name in config.get("DATAPREP", []):
         output_dir = f"{base_output_dir}/{stage_number}_DATAPREP/{name}"
         options = config["DATAPREP"][name].get("OPTS")
         if options is None:
             Task.fail_config(
                 f"DATAPREP task {name} needs to specify OPTS!")
         s = DataPrep(name, output_dir, options, global_config)
         Task.logger.debug(
             f"Creating data prep task {name} with {s.num_jobs} jobs, output to {output_dir}"
         )
         tasks.append(s)
     return tasks
Example 10
 def validate_model(self):
     if self.mode == Classifier.PREDICT:
         model = self.options.get("MODEL")
         if model is None:
             Task.fail_config(
                 f"Classifier {self.name} is in predict mode but does not have a model specified"
             )
         model_classifier = self.get_model_classifier()
         if model_classifier is not None and model_classifier.name == model:
             return True
         path = get_data_loc(model)
         if not os.path.exists(path):
             Task.fail_config(
                 f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path"
             )
     return True
Example 11
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):

        biascor_tasks = Task.get_task_of_type(prior_tasks, BiasCor)

        def _get_createcov_dir(base_output_dir, stage_number, name):
            return f"{base_output_dir}/{stage_number}_CREATE_COV/{name}"

        tasks = []
        for cname in c.get("CREATE_COV", []):
            config = c["CREATE_COV"][cname]
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", config.get("MASK_BIASCOR", ""))

            for btask in biascor_tasks:
                if mask not in btask.name:
                    continue

                num = len(btask.output["subdirs"])
                for i in range(num):
                    ii = "" if num == 1 else f"_{i + 1}"

                    name = f"{cname}_{btask.name}{ii}"
                    a = CreateCov(name,
                                  _get_createcov_dir(base_output_dir,
                                                     stage_number, name),
                                  config,
                                  options,
                                  global_config,
                                  dependencies=[btask],
                                  index=i)
                    Task.logger.info(
                        f"Creating createcov task {name} for {btask.name} with {a.num_jobs} jobs"
                    )
                    tasks.append(a)

            if len(biascor_tasks) == 0:
                Task.fail_config(
                    f"Create cov task {cname} has no biascor task to run on!")

        return tasks
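
The suffix logic above, shown in isolation: a biascor task with a single subdir yields an unsuffixed name, while several subdirs yield _1, _2, ... (num is illustrative):

    num = 3
    suffixes = ["" if num == 1 else f"_{i + 1}" for i in range(num)]
    print(suffixes)  # ['_1', '_2', '_3']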
Example 12
 def validate_classifiers(classifier_names):
     prob_col = []
     for name in classifier_names:
         col = prob_cols.get(name)
         if col is None:
             # Check whether it is instead the prob_col name
             if name in prob_cols.values():
                 prob_col.append(name)
             else:
                 Task.fail_config(
                     f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!"
                 )
         else:
             prob_col.append(col)
     if len(set(prob_col)) > 1:
         Task.fail_config(
             f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage."
         )
     else:
         Task.logger.debug(
             f"Classifiers {classifier_names} map to {prob_col[0]}")
Example 13
 def resolve_classifiers(names):
     task = [c for c in classifier_tasks if c.name in names]
     if len(task) == 0:
         if len(names) > 1:
             Task.fail_config(
                 f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!"
             )
         Task.logger.info(
             f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead."
         )
         task = [
             c for c in classifier_tasks
             if prob_cols[c.name] in names
         ]
         if len(task) == 0:
              choices = [prob_cols[c.name] for c in classifier_tasks]
             message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
             Task.fail_config(message)
         else:
             task = [task[0]]
     elif len(task) > 1:
         choices = list(set([prob_cols[c.name] for c in task]))
         if len(choices) == 1:
             task = [task[0]]
         else:
             Task.fail_config(
                 f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}"
             )
     return task[0]  # We only care about the prob column name
Example 14
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config):
        sim_tasks = Task.get_task_of_type(prior_tasks, SNANASimulation, DataPrep)
        classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)

        def _get_aggregator_dir(base_output_dir, stage_number, agg_name):
            return f"{base_output_dir}/{stage_number}_AGG/{agg_name}"

        tasks = []

        # Check for recalibration, and if so, find that task first
        for agg_name in c.get("AGGREGATION", []):
            config = c["AGGREGATION"][agg_name]
            if config is None:
                config = {}
            options = config.get("OPTS", {})
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_clas = config.get("MASK_CLAS", "")
            recalibration = config.get("RECALIBRATION")
            recal_simtask = None
            recal_aggtask = None
            if recalibration:
                recal_sim = [i for i, s in enumerate(sim_tasks) if s.name == recalibration]

                if len(recal_sim) == 0:
                    Task.fail_config(f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}")
                elif len(recal_sim) > 1:
                    Task.fail_config(f"Recalibration aggregation {recalibration} not in the list of available aggs: {[s.name for s in sim_tasks]}")

                # Move the recal sim task to the front of the queue so it executes first
                recal_sim_index = recal_sim[0]
                recal_simtask = sim_tasks[recal_sim_index]
                sim_tasks.insert(0, sim_tasks.pop(recal_sim_index))

            for sim_task in sim_tasks:
                if mask_sim not in sim_task.name or (mask not in sim_task.name and recal_simtask != sim_task):
                    continue

                agg_name2 = f"{agg_name}_{sim_task.name}"
                deps = [
                    c
                    for c in classifier_tasks
                    if mask in c.name and mask_clas in c.name and c.mode == Classifier.PREDICT and c.get_simulation_dependency() == sim_task
                ]
                if len(deps) == 0:
                    deps = [sim_task]

                if recalibration and sim_task != recal_simtask:
                    if recal_aggtask is None:
                        Task.fail_config(f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order.")
                    else:
                        deps.append(recal_aggtask)
                a = Aggregator(agg_name2, _get_aggregator_dir(base_output_dir, stage_number, agg_name2), config, deps, options, recal_aggtask)
                if sim_task == recal_simtask:
                    recal_aggtask = a
                Task.logger.info(f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs")
                tasks.append(a)

        return tasks
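
The reordering above, shown in isolation: the recalibration sim is popped and reinserted at the front so its aggregator is created before any task that depends on it (names are illustrative):

    sim_tasks = ["SIM_A", "SIM_RECAL", "SIM_B"]
    idx = sim_tasks.index("SIM_RECAL")
    sim_tasks.insert(0, sim_tasks.pop(idx))
    print(sim_tasks)  # ['SIM_RECAL', 'SIM_A', 'SIM_B']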
Example 15
    def __init__(self, name, output_dir, create_cov_tasks, config, options,
                 global_config):
        # First check if all required options exist
        # In this case, WFITOPTS must exist with at least 1 entry

        self.wfitopts = options.get("WFITOPTS")
        if self.wfitopts is None:
            Task.fail_config(
                f"You have not specified any WFITOPTS for task {name}")
        Task.logger.debug(f"WFITOPTS for task {name}: {self.wfitopts}")
        if len(self.wfitopts) == 0:
            Task.fail_config(
                f"WFITOPTS for task {name} does not have any options!")

        base_file = get_data_loc("wfit/input_file.INPUT")
        super().__init__(name,
                         output_dir,
                         config,
                         base_file,
                         default_assignment=": ",
                         dependencies=create_cov_tasks)
        self.num_jobs = len(self.wfitopts)

        self.create_cov_tasks = create_cov_tasks
        self.logger.debug("CreateCov tasks: {self.create_cov_tasks}")
        self.create_cov_dirs = [
            os.path.join(t.output_dir, "output") for t in self.create_cov_tasks
        ]
        self.logger.debug("CreateCov directories: {self.create_cov_dirs}")
        self.options = options
        self.global_config = global_config
        self.done_file = os.path.join(self.output_dir, "output", "ALL.DONE")

        self.job_name = os.path.basename(
            Path(output_dir).parents[1]) + "_WFIT_" + name
        self.logfile = os.path.join(self.output_dir, "output.log")
        self.input_name = f"{self.job_name}.INPUT"
        self.input_file = os.path.join(self.output_dir, self.input_name)
Example 16
 def resolve_merged_fitres_files(name, classifier_name):
     task = [
         m for m in merge_tasks if m.output["lcfit_name"] == name
     ]
     if len(task) == 0:
         valid = [m.output["lcfit_name"] for m in merge_tasks]
         message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
         Task.fail_config(message)
     elif len(task) > 1:
         message = f"Resolved multiple merge tasks {task} for name {name}"
         Task.fail_config(message)
     else:
          if (classifier_name is not None and
                  classifier_name not in task[0].output["classifier_names"]):
              if prob_cols[classifier_name] not in [
                      prob_cols[n]
                      for n in task[0].output["classifier_names"]
              ]:
                 Task.logger.warning(
                     f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                     f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                 )
         return task[0]
Example 17
    def __init__(self,
                 name,
                 output_dir,
                 config,
                 options,
                 global_config,
                 dependencies=None):
        super().__init__(name,
                         output_dir,
                         config=config,
                         dependencies=dependencies)
        self.options = options
        self.global_config = get_config()

        self.logfile = os.path.join(self.output_dir, "output.log")
        self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
        self.path_to_task = output_dir

        self.unparsed_raw = self.options.get("RAW_DIR")
        self.raw_dir = get_data_loc(self.unparsed_raw)
        if self.raw_dir is None:
            Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")

        self.genversion = os.path.basename(self.raw_dir)
        self.data_path = os.path.dirname(self.raw_dir)
        if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
            self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
            self.data_path = ""
        self.job_name = os.path.basename(
            Path(output_dir).parents[1]) + "_DATAPREP_" + self.name

        self.output_info = os.path.join(self.output_dir,
                                        f"{self.genversion}.YAML")
        self.output["genversion"] = self.genversion
        self.opt_setpkmjd = options.get("OPT_SETPKMJD", 16)
        self.photflag_mskrej = options.get("PHOTFLAG_MSKREJ", 1016)
        self.output["data_path"] = self.data_path
        self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
        self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
        self.output["raw_dir"] = self.raw_dir
        self.clump_file = os.path.join(self.output_dir,
                                       self.genversion + ".SNANA.TEXT")
        self.output["clump_file"] = self.clump_file
        self.output["ranseed_change"] = False
        is_sim = options.get("SIM", False)
        self.output["is_sim"] = is_sim
        self.output["blind"] = options.get("BLIND", True)

        self.types_dict = options.get("TYPES")
        if self.types_dict is None:
            self.types_dict = {
                "IA": [1],
                "NONIA": [
                    2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80,
                    81
                ]
            }
        else:
            for key in self.types_dict.keys():
                self.types_dict[key] = [int(c) for c in self.types_dict[key]]

        self.batch_file = self.options.get("BATCH_FILE")
        if self.batch_file is not None:
            self.batch_file = get_data_loc(self.batch_file)
        self.batch_replace = self.options.get("BATCH_REPLACE", {})

        self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
        self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
        self.output["types_dict"] = self.types_dict
        self.types = OrderedDict()
        for n in self.types_dict["IA"]:
            self.types.update({n: "Ia"})
        for n in self.types_dict["NONIA"]:
            self.types.update({n: "II"})
        self.output["types"] = self.types

        self.slurm = """{sbatch_header}
        {task_setup}"""

        self.clump_command = """#
Example 18
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        merge_tasks = Task.get_task_of_type(prior_tasks, Merger)
        prob_cols = {
            k: v
            for d in [t.output["classifier_merge"] for t in merge_tasks]
            for k, v in d.items()
        }
        classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier)
        tasks = []

        def _get_biascor_output_dir(base_output_dir, stage_number,
                                    biascor_name):
            return f"{base_output_dir}/{stage_number}_BIASCOR/{biascor_name}"

        for name in c.get("BIASCOR", []):
            gname = name
            config = c["BIASCOR"][name]
            options = config.get("OPTS", {})
            deps = []

            # Create dict but swap out the names for tasks
            # do this for key 0 and for muopts
            # modify config directly
            # create copy to start with to keep labels if needed
            config_copy = copy.deepcopy(config)

            # Should return a single classifier task which maps to the desired prob column
            def resolve_classifiers(names):
                task = [c for c in classifier_tasks if c.name in names]
                if len(task) == 0:
                    if len(names) > 1:
                        Task.fail_config(
                            f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!"
                        )
                    Task.logger.info(
                        f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead."
                    )
                    task = [
                        c for c in classifier_tasks
                        if prob_cols[c.name] in names
                    ]
                    if len(task) == 0:
                        choices = [prob_cols[c.name] for c in classifier_tasks]
                        message = f"Unable to resolve classifiers {names} from list of classifiers {classifier_tasks} using either name or prob columns {choices}"
                        Task.fail_config(message)
                    else:
                        task = [task[0]]
                elif len(task) > 1:
                    choices = list(set([prob_cols[c.name] for c in task]))
                    if len(choices) == 1:
                        task = [task[0]]
                    else:
                        Task.fail_config(
                            f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}"
                        )
                return task[0]  # We only care about the prob column name

            def resolve_merged_fitres_files(name, classifier_name):
                task = [
                    m for m in merge_tasks if m.output["lcfit_name"] == name
                ]
                if len(task) == 0:
                    valid = [m.output["lcfit_name"] for m in merge_tasks]
                    message = f"Unable to resolve merge {name} from list of merge_tasks. There are valid options: {valid}"
                    Task.fail_config(message)
                elif len(task) > 1:
                    message = f"Resolved multiple merge tasks {task} for name {name}"
                    Task.fail_config(message)
                else:
                    if (classifier_name is not None and classifier_name
                            not in task[0].output["classifier_names"]):
                        if prob_cols[classifier_name] not in [
                                prob_cols[n]
                                for n in task[0].output["classifier_names"]
                        ]:
                            Task.logger.warning(
                                f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. "
                                f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this."
                            )
                    return task[0]

            # Ensure classifiers point to the same prob column
            def validate_classifiers(classifier_names):
                prob_col = []
                for name in classifier_names:
                    col = prob_cols.get(name)
                    if col is None:
                        # Check whether it is instead the prob_col name
                        if name in prob_cols.values():
                            prob_col.append(name)
                        else:
                            Task.fail_config(
                                f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!"
                            )
                    else:
                        prob_col.append(col)
                if len(set(prob_col)) > 1:
                    Task.fail_config(
                        f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage."
                    )
                else:
                    Task.logger.debug(
                        f"Classifiers {classifier_names} map to {prob_col[0]}")

            def resolve_conf(subdict, default=None):
                """ Resolve the sub-dictionary and keep track of all the dependencies """
                deps = []

                # If this is a muopt, allow access to the base config's resolution
                is_default = default is None
                if is_default:
                    default = {}

                # Get the specific classifier
                classifier_names = subdict.get(
                    "CLASSIFIER")  # Specific classifier name
                if classifier_names is not None:
                    classifier_names = ensure_list(classifier_names)
                    validate_classifiers(classifier_names)
                #Task.logger.debug(f"XXX names: {classifier_names}")
                # Only if all classifiers point to the same prob_column should you continue
                classifier_task = None
                if classifier_names is not None:
                    classifier_task = resolve_classifiers(classifier_names)
                #Task.logger.debug(f"XXX tasks: {classifier_task}")
                classifier_dep = classifier_task or default.get(
                    "CLASSIFIER")  # For resolving merge tasks
                if classifier_dep is not None:
                    classifier_dep = classifier_dep.name
                #Task.logger.debug(f"XXX deps: {classifier_dep}")
                if "CLASSIFIER" in subdict:
                    subdict["CLASSIFIER"] = classifier_task
                    if classifier_task is not None:
                        deps.append(classifier_task)
                #Task.logger.debug(f"XXX global deps: {deps}")

                # Get the Ia sims
                simfile_ia = subdict.get("SIMFILE_BIASCOR")
                if is_default and simfile_ia is None:
                    Task.fail_config(
                        f"You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output"
                    )
                if simfile_ia is not None:
                    simfile_ia = ensure_list(simfile_ia)
                    simfile_ia_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_ia
                    ]
                    deps += simfile_ia_tasks
                    subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

                # Resolve the cc sims
                simfile_cc = subdict.get("SIMFILE_CCPRIOR")
                if is_default and simfile_cc is None:
                    message = "No SIMFILE_CCPRIOR specified. Hope you're doing a Ia-only analysis"
                    Task.logger.warning(message)
                if simfile_cc is not None:
                    simfile_cc = ensure_list(simfile_cc)
                    simfile_cc_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_cc
                    ]
                    deps += simfile_cc_tasks
                    subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

                return deps  # Changes to dict are by ref, will modify original

            deps += resolve_conf(config)
            # Resolve the data section
            data_names = config.get("DATA")
            if data_names is None:
                Task.fail_config(
                    "For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task"
                )
            data_names = ensure_list(data_names)
            class_task = config.get("CLASSIFIER")
            class_name = class_task.name if class_task is not None else None
            data_tasks = [
                resolve_merged_fitres_files(s, class_name) for s in data_names
            ]
            deps += data_tasks
            config["DATA"] = data_tasks

            config["PROB_COLS"] = prob_cols

            # Resolve every MUOPT
            muopts = config.get("MUOPTS", {})
            for label, mu_conf in muopts.items():
                deps += resolve_conf(mu_conf, default=config)

            task = BiasCor(
                name,
                _get_biascor_output_dir(base_output_dir, stage_number, name),
                config, deps, options, global_config)
            Task.logger.info(
                f"Creating aggregation task {name} with {task.num_jobs}")
            tasks.append(task)

        return tasks
Example 19
    def __init__(self, name, output_dir, sim_task, config, global_config):

        self.config = config
        self.global_config = global_config

        base = config.get("BASE")
        if base is None:
            Task.fail_config(
                f"You have not specified a BASE nml file for task {name}")
        self.base_file = get_data_loc(base)
        if self.base_file is None:
            Task.fail_config(
                f"Base file {base} cannot be found for task {name}")

        super().__init__(name,
                         output_dir,
                         self.base_file,
                         " = ",
                         dependencies=[sim_task])

        self.sim_task = sim_task
        self.sim_version = sim_task.output["genversion"]
        self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
        self.lc_output_dir = os.path.join(self.output_dir, "output")
        self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
        self.fitres_dirs = [
            os.path.join(self.lc_output_dir, os.path.basename(s))
            for s in self.sim_task.output["sim_folders"]
        ]

        self.logging_file = self.config_path.replace(".nml", ".nml_log")
        self.done_file = f"{self.output_dir}/FINISHED.DONE"
        secondary_log = os.path.join(self.lc_log_dir, "MERGELOGS/MERGE2.LOG")

        self.log_files = [self.logging_file, secondary_log]
        self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
        self.display_threshold = 8
        self.output["fitres_dirs"] = self.fitres_dirs
        self.output["nml_file"] = self.config_path
        self.output["genversion"] = self.sim_version
        self.output["sim_name"] = sim_task.output["name"]
        self.output["blind"] = sim_task.output["blind"]
        self.output["lc_output_dir"] = self.lc_output_dir
        self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

        is_data = False
        for d in self.dependencies:
            if isinstance(d, DataPrep):
                is_data = not d.output["is_sim"]
        self.output["is_data"] = is_data

        # Loading fitopts
        fitopts = config.get("FITOPTS", [])
        if isinstance(fitopts, str):
            fitopts = [fitopts]

        self.logger.debug("Loading fitopts")
        self.fitopts = []
        for f in fitopts:
            potential_path = get_data_loc(f)
            if os.path.exists(potential_path):
                self.logger.debug(f"Loading in fitopts from {potential_path}")
                with open(potential_path) as fin:
                    new_fitopts = list(fin.read().splitlines())
                    self.fitopts += new_fitopts
                    self.logger.debug(
                        f"Loaded {len(new_fitopts)} fitopts file from {potential_path}"
                    )
            else:
                assert "[" in f and "]" in f, f"Manual fitopt {f} for lcfit {self.name} should specify a label in square brackets"
                if not f.startswith("FITOPT:"):
                    f = "FITOPT: " + f
                self.logger.debug(f"Adding manual fitopt {f}")
                self.fitopts.append(f)
        # Map the fitopt outputs
        mapped = {"DEFAULT": "FITOPT000.FITRES"}
        mapped2 = {0: "DEFAULT"}
        for i, line in enumerate(self.fitopts):
            label = line.split("[")[1].split("]")[0]
            mapped[line] = f"FITOPT{i + 1:3d}.FITRES"
            mapped2[i] = label
        self.output["fitopt_map"] = mapped
        self.output["fitopt_index"] = mapped
        self.output["fitres_file"] = os.path.join(self.fitres_dirs[0],
                                                  mapped["DEFAULT"])

        self.options = self.config.get("OPTS", {})
        # Try to determine how many jobs will be put in the queue
        try:
            property = self.options.get("BATCH_INFO") or self.get_property(
                "BATCH_INFO", assignment=": ")
            self.num_jobs = int(property.split()[-1])
        except Exception:
            self.num_jobs = 10
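
The fitopt label and file-name conventions assumed above, sketched with a made-up line:

    line = "FITOPT: [NOREJECT] CUTWIN_SNRMAX 5 999"  # hypothetical fitopt
    label = line.split("[")[1].split("]")[0]
    print(label)                    # NOREJECT
    print(f"FITOPT{1:03d}.FITRES")  # FITOPT001.FITRES (zero-padded index)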
Example 20
            def resolve_conf(subdict, default=None):
                """ Resolve the sub-dictionary and keep track of all the dependencies """
                deps = []

                # If this is a muopt, allow access to the base config's resolution
                is_default = default is None
                if is_default:
                    default = {}

                # Get the specific classifier
                classifier_names = subdict.get(
                    "CLASSIFIER")  # Specific classifier name
                if classifier_names is not None:
                    classifier_names = ensure_list(classifier_names)
                    validate_classifiers(classifier_names)
                #Task.logger.debug(f"XXX names: {classifier_names}")
                # Only if all classifiers point to the same prob_column should you continue
                classifier_task = None
                if classifier_names is not None:
                    classifier_task = resolve_classifiers(classifier_names)
                #Task.logger.debug(f"XXX tasks: {classifier_task}")
                classifier_dep = classifier_task or default.get(
                    "CLASSIFIER")  # For resolving merge tasks
                if classifier_dep is not None:
                    classifier_dep = classifier_dep.name
                #Task.logger.debug(f"XXX deps: {classifier_dep}")
                if "CLASSIFIER" in subdict:
                    subdict["CLASSIFIER"] = classifier_task
                    if classifier_task is not None:
                        deps.append(classifier_task)
                #Task.logger.debug(f"XXX global deps: {deps}")

                # Get the Ia sims
                simfile_ia = subdict.get("SIMFILE_BIASCOR")
                if is_default and simfile_ia is None:
                    Task.fail_config(
                        f"You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output"
                    )
                if simfile_ia is not None:
                    simfile_ia = ensure_list(simfile_ia)
                    simfile_ia_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_ia
                    ]
                    deps += simfile_ia_tasks
                    subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks

                # Resolve the cc sims
                simfile_cc = subdict.get("SIMFILE_CCPRIOR")
                if is_default and simfile_cc is None:
                    message = "No SIMFILE_CCPRIOR specified. Hope you're doing a Ia-only analysis"
                    Task.logger.warning(message)
                if simfile_cc is not None:
                    simfile_cc = ensure_list(simfile_cc)
                    simfile_cc_tasks = [
                        resolve_merged_fitres_files(s, classifier_dep)
                        for s in simfile_cc
                    ]
                    deps += simfile_cc_tasks
                    subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks

                return deps  # Changes to dict are by ref, will modify original
Example 21
    def __init__(self,
                 name,
                 output_dir,
                 config,
                 global_config,
                 combine="combine.input"):
        self.data_dirs = global_config["DATA_DIRS"]
        base_file = get_data_loc(combine)
        super().__init__(name, output_dir, config, base_file, ": ")

        # Check for any replacements
        path_sndata_sim = get_config().get("SNANA").get("sim_dir")
        self.logger.debug(f"Setting PATH_SNDATA_SIM to {path_sndata_sim}")
        self.yaml["CONFIG"]["PATH_SNDATA_SIM"] = path_sndata_sim

        self.genversion = self.config["GENVERSION"]
        if len(self.genversion) < 30:
            self.genprefix = self.genversion
        else:
            hash = get_hash(self.genversion)[:5]
            self.genprefix = self.genversion[:25] + hash

        self.options = self.config.get("OPTS", {})

        self.reserved_keywords = ["BASE"]
        self.reserved_top = ["GENVERSION", "GLOBAL", "OPTS", "EXTERNAL"]
        self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name
        self.global_config = global_config

        self.sim_log_dir = f"{self.output_dir}/LOGS"
        self.total_summary = os.path.join(self.sim_log_dir, "MERGE.LOG")
        self.done_file = f"{self.output_dir}/LOGS/ALL.DONE"
        self.logging_file = self.config_path.replace(".input", ".LOG")
        self.kill_file = self.config_path.replace(".input", "_KILL.LOG")

        if "EXTERNAL" not in self.config.keys():
            # Determine the type of each component
            keys = [
                k for k in self.config.keys() if k not in self.reserved_top
            ]
            self.base_ia = []
            self.base_cc = []
            types = {}
            types_dict = {"IA": [], "NONIA": []}
            for k in keys:
                d = self.config[k]
                base_file = d.get("BASE")
                if base_file is None:
                    Task.fail_config(
                        f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file"
                    )
                base_path = get_data_loc(base_file)
                if base_path is None:
                    Task.fail_config(
                        f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}"
                    )

                gentype, genmodel = None, None
                with open(base_path) as f:
                    for line in f.read().splitlines():
                        if line.upper().strip().startswith("GENTYPE:"):
                            gentype = line.upper().split(":")[1].strip()
                        if line.upper().strip().startswith("GENMODEL:"):
                            genmodel = line.upper().split(":")[1].strip()

                gentype = gentype or d.get("GENTYPE")
                if gentype is None:
                    Task.fail_config(
                        f"The simulation component {k} needs to specify a GENTYPE in its input file"
                    )
                gentype = int(gentype)
                genmodel = genmodel or d.get("GENMODEL")

                if not gentype:
                    Task.fail_config(
                        f"Cannot find GENTYPE for component {k} and base file {base_path}"
                    )
                if not genmodel:
                    Task.fail_config(
                        f"Cannot find GENMODEL for component {k} and base file {base_path}"
                    )

                type2 = 100 + gentype
                if "SALT2" in genmodel:
                    self.base_ia.append(base_file)
                    types[gentype] = "Ia"
                    types[type2] = "Ia"
                    types_dict["IA"].append(gentype)
                    types_dict["IA"].append(type2)
                else:
                    self.base_cc.append(base_file)
                    types[gentype] = "II"
                    types[type2] = "II"
                    types_dict["NONIA"].append(gentype)
                    types_dict["NONIA"].append(type2)

            sorted_types = dict(sorted(types.items()))
            self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
            self.output["types_dict"] = types_dict
            self.output["types"] = sorted_types

            rankeys = [
                r for r in self.config.get("GLOBAL", {}).keys()
                if r.startswith("RANSEED_")
            ]
            value = (int(self.config["GLOBAL"][rankeys[0]].split(" ")[0])
                     if rankeys else 1)
            self.set_num_jobs(2 * value)

            self.output["blind"] = self.options.get("BLIND", False)
            self.derived_batch_info = None

            # Determine if all the top level input files exist
            if len(self.base_ia + self.base_cc) == 0:
                Task.fail_config(
                    "Your sim has no components specified! Please add something to simulate!"
                )

            # Try to determine how many jobs will be put in the queue
            # First see if it's been explicitly set
            num_jobs = self.options.get("NUM_JOBS")
            if num_jobs is not None:
                self.num_jobs = num_jobs
                self.logger.debug(
                    f"Num jobs set by NUM_JOBS option to {self.num_jobs}")
            else:
                try:
                    # If BATCH_INFO is set, we'll use that
                    batch_info = self.config.get("GLOBAL",
                                                 {}).get("BATCH_INFO")
                    default_batch_info = self.yaml["CONFIG"].get("BATCH_INFO")

                    # If its not set, lets check for ranseed_repeat or ranseed_change
                    if batch_info is None:
                        ranseed_repeat = self.config.get(
                            "GLOBAL", {}).get("RANSEED_REPEAT")
                        ranseed_change = self.config.get(
                            "GLOBAL", {}).get("RANSEED_CHANGE")
                        default = self.yaml.get("CONFIG",
                                                {}).get("RANSEED_REPEAT")
                        ranseed = ranseed_repeat or ranseed_change or default

                        if ranseed:
                            num_jobs = int(ranseed.strip().split()[0])
                            self.logger.debug(
                                f"Found a randseed with {num_jobs}, deriving batch info"
                            )
                            comps = default_batch_info.strip().split()
                            comps[-1] = str(num_jobs)
                            self.derived_batch_info = " ".join(comps)
                            self.num_jobs = num_jobs
                            self.logger.debug(
                                f"Num jobs set by RANSEED to {self.num_jobs}")
                    else:
                        # self.logger.debug(f"BATCH INFO property detected as {property}")
                        self.num_jobs = int(batch_info.split()[-1])
                        self.logger.debug(
                            f"Num jobs set by BATCH_INFO to {self.num_jobs}")
                except Exception:
                    self.logger.warning(
                        f"Unable to determine how many jobs simulation {self.name} has"
                    )
                    self.num_jobs = 1

            self.output["genversion"] = self.genversion
            self.output["genprefix"] = self.genprefix

            self.ranseed_change = self.config.get("GLOBAL",
                                                  {}).get("RANSEED_CHANGE")
            base = os.path.expandvars(self.global_config["SNANA"]["sim_dir"])
            self.output["ranseed_change"] = self.ranseed_change is not None
            self.output["ranseed_change_val"] = self.ranseed_change
            self.get_sim_folders(base, self.genversion)
            self.output["sim_folders"] = self.sim_folders
        else:
            self.sim_folders = self.output["sim_folders"]
Example 22
    def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix,
                  global_config):
        from pippin.classifiers.factory import ClassifierFactory

        def _get_clas_output_dir(base_output_dir,
                                 stage_number,
                                 sim_name,
                                 fit_name,
                                 clas_name,
                                 index=None,
                                 extra=None):
            sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name
            fit_name = "" if fit_name is None else "_" + fit_name
            extra_name = "" if extra is None else "_" + extra
            index = "" if index is None else f"_{index}"
            return f"{base_output_dir}/{stage_number}_CLAS/{clas_name}{index}{sim_name}{fit_name}{extra_name}"

        def get_num_ranseed(sim_task, lcfit_task):
            if sim_task is not None:
                return len(sim_task.output["sim_folders"])
            if lcfit_task is not None:
                return len(lcfit_task.output["fitres_dirs"])
            raise ValueError(
                "Classifier dependency has no sim_task or lcfit_task?")

        tasks = []
        lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit)
        sim_tasks = Task.get_task_of_type(prior_tasks, DataPrep,
                                          SNANASimulation)
        for clas_name in c.get("CLASSIFICATION", []):
            config = c["CLASSIFICATION"][clas_name]
            name = config["CLASSIFIER"]
            cls = ClassifierFactory.get(name)
            options = config.get("OPTS", {})
            if "MODE" not in config:
                Task.fail_config(
                    f"Classifier task {clas_name} needs to specify MODE as train or predict"
                )
            mode = config["MODE"].lower()
            assert mode in ["train", "predict"
                            ], "MODE should be either train or predict"
            if mode == "train":
                mode = Classifier.TRAIN
            else:
                mode = Classifier.PREDICT

            # Validate that train is not used on certain classifiers
            if mode == Classifier.TRAIN:
                assert name not in [
                    "PerfectClassifier", "UnityClassifier", "FitProbClassifier"
                ], f"Can not use train mode with {name}"

            needs_sim, needs_lc = cls.get_requirements(options)

            runs = []
            if needs_sim and needs_lc:
                runs = [(l.dependencies[0], l) for l in lcfit_tasks]
            elif needs_sim:
                runs = [(s, None) for s in sim_tasks]
            elif needs_lc:
                runs = [(l.dependencies[0], l) for l in lcfit_tasks]
            else:
                Task.logger.warning(
                    f"Classifier {name} needs neither sims nor light curve fits; nothing to run on.")

            num_gen = 0
            mask = config.get("MASK", "")
            mask_sim = config.get("MASK_SIM", "")
            mask_fit = config.get("MASK_FIT", "")
            for s, l in runs:

                sim_name = s.name if s is not None else None
                fit_name = l.name if l is not None else None
                matched_sim = True
                matched_fit = True
                if mask:
                    matched_sim = matched_sim and mask in sim_name
                if mask_sim:
                    matched_sim = matched_sim and mask_sim in sim_name
                if mask and fit_name is not None:
                    matched_fit = matched_fit and mask in fit_name
                if mask_fit and fit_name is not None:
                    matched_fit = matched_fit and mask_fit in fit_name
                if not matched_fit or not matched_sim:
                    continue
                deps = []
                if s is not None:
                    deps.append(s)
                if l is not None:
                    deps.append(l)

                model = options.get("MODEL")

                # Validate to make sure training samples only have one sim.
                if mode == Classifier.TRAIN:
                    if s is not None:
                        folders = s.output["sim_folders"]
                        assert (
                            len(folders) == 1
                        ), f"Training requires one version of the sim, you have {len(folders)} for sim task {s}. Make sure your training sim doesn't set RANSEED_CHANGE"
                    if l is not None:
                        folders = l.output["fitres_dirs"]
                        assert (
                            len(folders) == 1
                        ), f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {l}. Make sure your training sim doesn't set RANSEED_CHANGE"
                if model is not None:
                    if "/" in model or "." in model:
                        potential_path = get_output_loc(model)
                        if os.path.exists(potential_path):
                            extra = os.path.basename(
                                os.path.dirname(potential_path))

                            # Nasty duplicate code, TODO fix this
                            indexes = get_num_ranseed(s, l)
                            for i in range(indexes):
                                num = i + 1 if indexes > 1 else None
                                clas_output_dir = _get_clas_output_dir(
                                    base_output_dir,
                                    stage_number,
                                    sim_name,
                                    fit_name,
                                    clas_name,
                                    index=num,
                                    extra=extra)
                                cc = cls(clas_name,
                                         clas_output_dir,
                                         config,
                                         deps,
                                         mode,
                                         options,
                                         index=i,
                                         model_name=extra)
                                Task.logger.info(
                                    f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                                )
                                num_gen += 1
                                tasks.append(cc)

                        else:
                            Task.fail_config(
                                f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}"
                            )
                    else:
                        for t in tasks:
                            if model == t.name:
                                # deps.append(t)
                                extra = t.get_unique_name()

                                assert t.__class__ == cls, f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!"

                                indexes = get_num_ranseed(s, l)
                                for i in range(indexes):
                                    num = i + 1 if indexes > 1 else None
                                    clas_output_dir = _get_clas_output_dir(
                                        base_output_dir,
                                        stage_number,
                                        sim_name,
                                        fit_name,
                                        clas_name,
                                        index=num,
                                        extra=extra)
                                    cc = cls(clas_name,
                                             clas_output_dir,
                                             config,
                                             deps + [t],
                                             mode,
                                             options,
                                             index=i)
                                    Task.logger.info(
                                        f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                                    )
                                    num_gen += 1
                                    tasks.append(cc)
                else:

                    indexes = get_num_ranseed(s, l)
                    for i in range(indexes):
                        num = i + 1 if indexes > 1 else None
                        clas_output_dir = _get_clas_output_dir(base_output_dir,
                                                               stage_number,
                                                               sim_name,
                                                               fit_name,
                                                               clas_name,
                                                               index=num)
                        cc = cls(clas_name,
                                 clas_output_dir,
                                 config,
                                 deps,
                                 mode,
                                 options,
                                 index=i)
                        Task.logger.info(
                            f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}"
                        )
                        num_gen += 1
                        tasks.append(cc)

            if num_gen == 0:
                Task.fail_config(
                    f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits"
                )
        return tasks
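For orientation, here is a hypothetical parsed CLASSIFICATION block of the kind this get_tasks consumes, written as the Python dict it receives. The key names (CLASSIFIER, MODE, MASK, OPTS) follow the code above; the classifier, mask, and model values are illustrative only:

    # Hypothetical input config for the loop above; values are illustrative only.
    c = {
        "CLASSIFICATION": {
            "SNN_TEST": {
                "CLASSIFIER": "SuperNNovaClassifier",  # resolved via ClassifierFactory.get
                "MODE": "predict",                     # must be "train" or "predict"
                "MASK": "DES",                         # substring-matched against sim/fit names
                "OPTS": {"MODEL": "SNN_TRAIN"},        # a path, or the name of a training task
            }
        }
    }
    for clas_name in c.get("CLASSIFICATION", []):
        config = c["CLASSIFICATION"][clas_name]
        print(clas_name, config["CLASSIFIER"], config["MODE"].lower())
    # SNN_TEST SuperNNovaClassifier predict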
Example n. 23
0
    def __init__(self,
                 name,
                 output_dir,
                 genversion,
                 config,
                 global_config,
                 combine="combine.input"):
        self.data_dirs = global_config["DATA_DIRS"]
        base_file = get_data_loc(combine)
        super().__init__(name, output_dir, base_file, ": ")

        self.genversion = genversion
        if len(genversion) < 30:
            self.genprefix = self.genversion
        else:
            ghash = get_hash(self.genversion)[:5]
            self.genprefix = self.genversion[:25] + ghash

        self.config = config
        self.options = config.get("OPTS", {})
        self.reserved_keywords = ["BASE"]
        self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name

        # Determine the type of each component
        keys = [k for k in config.keys() if k != "GLOBAL" and k != "OPTS"]
        self.base_ia = []
        self.base_cc = []
        types = {}
        types_dict = {"IA": [], "NONIA": []}
        for k in keys:
            d = config[k]
            base_file = d.get("BASE")
            if base_file is None:
                Task.fail_config(
                    f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file"
                )
            base_path = get_data_loc(base_file)
            if base_path is None:
                Task.fail_config(
                    f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}"
                )

            gentype, genmodel = None, None
            with open(base_path) as f:
                for line in f.read().splitlines():
                    if line.upper().strip().startswith("GENTYPE:"):
                        gentype = line.upper().split(":")[1].strip()
                    if line.upper().strip().startswith("GENMODEL:"):
                        genmodel = line.upper().split(":")[1].strip()
            gentype = gentype or d.get("GENTYPE")
            genmodel = genmodel or d.get("GENMODEL")

            if not gentype:
                Task.fail_config(
                    f"Cannot find GENTYPE for component {k} and base file {base_path}"
                )
            if not genmodel:
                Task.fail_config(
                    f"Cannot find GENMODEL for component {k} and base file {base_path}"
                )

            type2 = "1" + f"{int(gentype):02d}"
            if "SALT2" in genmodel:
                self.base_ia.append(base_file)
                types[gentype] = "Ia"
                types[type2] = "Ia"
                types_dict["IA"].append(int(gentype))
                types_dict["IA"].append(int(type2))
            else:
                self.base_cc.append(base_file)
                types[gentype] = "II"
                types[type2] = "II"
                types_dict["NONIA"].append(int(gentype))
                types_dict["NONIA"].append(int(type2))

        sorted_types = collections.OrderedDict(sorted(types.items()))
        self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
        self.output["types_dict"] = types_dict
        self.output["types"] = sorted_types
        self.global_config = global_config

        rankeys = [
            r for r in config["GLOBAL"].keys() if r.startswith("RANSEED_")
        ]
        value = int(
            config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1
        self.set_num_jobs(2 * value)

        self.sim_log_dir = f"{self.output_dir}/LOGS"
        self.total_summary = os.path.join(self.sim_log_dir,
                                          "TOTAL_SUMMARY.LOG")
        self.done_file = f"{self.output_dir}/FINISHED.DONE"
        self.logging_file = self.config_path.replace(".input", ".LOG")
        self.output["blind"] = self.options.get("BLIND", False)
        self.derived_batch_info = None

        # Determine if all the top level input files exist
        if len(self.base_ia + self.base_cc) == 0:
            Task.fail_config(
                "Your sim has no components specified! Please add something to simulate!"
            )

        # Try to determine how many jobs will be put in the queue
        try:
            # If BATCH_INFO is set, we'll use that
            batch_info = self.config.get("GLOBAL", {}).get("BATCH_INFO")
            default_batch_info = self.get_property("BATCH_INFO",
                                                   assignment=": ")

            # If it's not set, let's check for ranseed_repeat or ranseed_change
            if batch_info is None:
                ranseed_repeat = self.config.get("GLOBAL",
                                                 {}).get("RANSEED_REPEAT")
                ranseed_change = self.config.get("GLOBAL",
                                                 {}).get("RANSEED_CHANGE")
                ranseed = ranseed_repeat or ranseed_change

                if ranseed:
                    num_jobs = int(ranseed.strip().split()[0])
                    self.logger.debug(
                        f"Found a randseed with {num_jobs}, deriving batch info"
                    )
                    comps = default_batch_info.strip().split()
                    comps[-1] = str(num_jobs)
                    self.derived_batch_info = " ".join(comps)
                    self.num_jobs = num_jobs
            else:
                # self.logger.debug(f"BATCH INFO property detected as {property}")
                self.num_jobs = int(default_batch_info.split()[-1])
        except Exception:
            self.logger.warning(
                f"Unable to determine how many jobs simulation {self.name} has"
            )
            self.num_jobs = 10

        self.output["genversion"] = self.genversion
        self.output["genprefix"] = self.genprefix

        ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
        base = os.path.expandvars(
            f"{self.global_config['SNANA']['sim_dir']}/{self.genversion}")
        if ranseed_change:
            num_sims = int(ranseed_change.split()[0])
            self.logger.debug(
                "Detected randseed change with {num_sims} sims, updating sim_folders"
            )
            self.sim_folders = [
                base + f"-{i + 1:04d}" for i in range(num_sims)
            ]
        else:
            self.sim_folders = [base]
        self.output["ranseed_change"] = ranseed_change is not None
        self.output["sim_folders"] = self.sim_folders
Example n. 24
0
    def __init__(self, name, output_dir, config, dependencies, options, recal_aggtask):
        super().__init__(name, output_dir, config=config, dependencies=dependencies)
        self.passed = False
        self.classifiers = [d for d in dependencies if isinstance(d, Classifier)]
        self.lcfit_deps = [c.get_fit_dependency(output=False) for c in self.classifiers]
        self.lcfit_names = list(set([l.output["name"] for l in self.lcfit_deps if l is not None]))
        self.output["lcfit_names"] = self.lcfit_names
        if not self.lcfit_names:
            self.logger.debug("No jobs depend on the LCFIT, so adding a dummy one")
            self.lcfit_names = [""]

        self.sim_task = self.get_underlying_sim_task()
        self.output["sim_name"] = self.sim_task.name
        self.recal_aggtask = recal_aggtask
        self.num_versions = len(self.sim_task.output["sim_folders"])

        self.output_dfs = [os.path.join(self.output_dir, f"merged_{i}.csv") for i in range(self.num_versions)]
        self.output_dfs_key = [[os.path.join(self.output_dir, f"merged_{l}_{i}.key") for l in self.lcfit_names] for i in range(self.num_versions)]
        self.output_cals = [os.path.join(self.output_dir, f"calibration_{i}.csv") for i in range(self.num_versions)]

        self.id = "CID"
        self.type_name = "SNTYPE"
        self.options = options
        self.include_type = bool(options.get("INCLUDE_TYPE", False))
        self.plot = options.get("PLOT", False)
        self.plot_all = options.get("PLOT_ALL", False)
        self.output["classifier_names"] = [c.name for c in self.classifiers]
        self.output["classifier_indexes"] = [c.index for c in self.classifiers]
        self.output["calibration_files"] = self.output_cals
        self.output["empty_agg"] = False
        if isinstance(self.plot, bool):
            self.python_file = os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py"
        else:
            self.python_file = self.plot
        self.python_file = get_output_loc(self.python_file)

        if not os.path.exists(self.python_file):
            Task.fail_config(f"Attempting to find python file {self.python_file} but it's not there!")

        merge_classifiers = self.config.get("MERGE_CLASSIFIERS")
        self.classifier_merge = {c.output['name']: c.get_prob_column_name() for c in self.classifiers}
        if merge_classifiers is not None:
            self.classifier_merge = dict()
            for c in self.classifiers:
                prob_col = []
                for prob_col_name in merge_classifiers.keys():
                    mask_list = ensure_list(merge_classifiers[prob_col_name])
                    if any(m in c.output['name'] for m in mask_list):
                        if not prob_col_name.startswith("PROB_"):
                            prob_col_name = "PROB_" + prob_col_name
                        prob_col.append(prob_col_name)
                if len(prob_col) == 1:
                    self.classifier_merge[c.output['name']] = prob_col[0]
                elif len(prob_col) == 0:
                    self.classifier_merge[c.output['name']] = c.get_prob_column_name()
                else:
                    Task.fail_config(f"Classifier task {c.output['name']} matched multiple MERGE_CLASSIFIERS keys: {prob_col}. Please provide more specific keys")
        self.logger.debug(f"Classifier merge = {self.classifier_merge}")
        self.output["classifier_merge"] = self.classifier_merge
Example n. 25
0
    def __init__(self,
                 name,
                 output_dir,
                 options,
                 global_config,
                 dependencies=None):
        super().__init__(name, output_dir, dependencies=dependencies)
        self.options = options
        self.global_config = get_config()

        self.logfile = os.path.join(self.output_dir, "output.log")
        self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
        self.path_to_task = output_dir

        self.unparsed_raw = self.options.get("RAW_DIR")
        self.raw_dir = get_data_loc(self.unparsed_raw)
        if self.raw_dir is None:
            Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")

        self.genversion = os.path.basename(self.raw_dir)
        self.data_path = os.path.dirname(self.raw_dir)
        if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
            self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
            self.data_path = ""
        self.job_name = os.path.basename(
            Path(output_dir).parents[1]) + "_DATAPREP_" + self.name

        self.output["genversion"] = self.genversion
        self.output["data_path"] = self.data_path
        self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
        self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
        self.output["raw_dir"] = self.raw_dir
        self.clump_file = os.path.join(self.output_dir,
                                       self.genversion + ".SNANA.TEXT")
        self.output["clump_file"] = self.clump_file
        self.output["ranseed_change"] = False
        is_sim = options.get("SIM", False)
        self.output["is_sim"] = is_sim
        self.output["blind"] = options.get("BLIND", not is_sim)

        self.types_dict = options.get("TYPES")
        if self.types_dict is None:
            self.types_dict = {
                "IA": [1],
                "NONIA": [
                    2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43,
                    80, 81
                ]
            }
        else:
            for key in self.types_dict.keys():
                self.types_dict[key] = [int(c) for c in self.types_dict[key]]

        self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
        self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
        self.output["types_dict"] = self.types_dict
        self.types = OrderedDict()
        for n in self.types_dict["IA"]:
            self.types.update({n: "Ia"})
        for n in self.types_dict["NONIA"]:
            self.types.update({n: "II"})
        self.output["types"] = self.types

        self.slurm = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --time=0:20:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=broadwl
#SBATCH --output={log_file}
#SBATCH --account=pi-rkessler
#SBATCH --mem=2GB

cd {path_to_task}
snana.exe clump.nml
if [ $? -eq 0 ]; then
    echo SUCCESS > {done_file}
else
    echo FAILURE > {done_file}
fi
"""
        self.clump_command = """#
Example n. 26
0
    def __init__(self, name, output_dir, sim_task, config, global_config):

        self.config = config
        self.global_config = global_config

        base = config.get("BASE")
        if base is None:
            Task.fail_config(f"You have not specified a BASE nml file for task {name}")
        self.base_file = get_data_loc(base)
        if self.base_file is None:
            Task.fail_config(f"Base file {base} cannot be found for task {name}")

        super().__init__(name, output_dir, config, self.base_file, " = ", dependencies=[sim_task])

        self.sim_task = sim_task
        self.sim_version = sim_task.output["genversion"]
        self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
        self.lc_output_dir = os.path.join(self.output_dir, "output")
        self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
        self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]]

        self.logging_file = self.config_path.replace(".nml", ".LOG")
        self.kill_file = self.config_path.replace(".nml", "_KILL.LOG")

        self.done_file = f"{self.lc_output_dir}/ALL.DONE"

        self.merge_log = os.path.join(self.lc_output_dir, "MERGE.LOG")

        self.log_files = [self.logging_file]
        self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
        self.display_threshold = 8
        self.output["fitres_dirs"] = self.fitres_dirs
        self.output["base_file"] = self.base_file
        self.output["nml_file"] = self.config_path
        self.output["genversion"] = self.sim_version
        self.output["sim_name"] = sim_task.output["name"]
        self.output["blind"] = sim_task.output["blind"]
        self.output["lc_output_dir"] = self.lc_output_dir
        self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

        self.validate_fitopts(config)

        is_data = False
        for d in self.dependencies:
            if isinstance(d, DataPrep):
                is_data = not d.output["is_sim"]
        self.output["is_data"] = is_data

        self.options = self.config.get("OPTS", {})
        # Try to determine how many jobs will be put in the queue
        # First see if it's been explicitly set
        num_jobs = self.options.get("NUM_JOBS")
        if num_jobs is not None:
            self.num_jobs = num_jobs
            self.logger.debug("Num jobs set by NUM_JOBS option")
        else:
            try:
                batch_info = self.options.get("BATCH_INFO") or self.yaml["CONFIG"].get("BATCH_INFO")
                self.num_jobs = int(batch_info.split()[-1])
                self.logger.debug("Num jobs set by BATCH_INFO")
            except Exception:
                self.logger.warning("Could not determine BATCH_INFO for job, setting num_jobs to 10")
                self.num_jobs = 10
                self.logger.debug("Num jobs set to default")