class GetFilterNamesFromMenuWrapper(Task, law.WrapperTask):

    hlt_menus = law.CSVParameter(description="hlt menus (no patterns!) to query")
    hlt_paths = law.CSVParameter(description="hlt paths (no patterns!) to query")

    check_for_patterns = ["hlt_menus", "hlt_paths"]

    def requires(self):
        return [
            GetFilterNamesFromMenu.req(self, hlt_menu=m, hlt_path=p)
            for m, p in itertools.product(self.hlt_menus, self.hlt_paths)
        ]
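
# Illustrative sketch (not part of the task code): the wrapper above turns its two CSV
# parameters into one required GetFilterNamesFromMenu task per (menu, path) combination.
# The menu and path names below are hypothetical placeholders.
def _example_menu_path_combinations():
    import itertools

    hlt_menus = ["/cdaq/physics/Run2018/example/HLT/V1"]  # hypothetical menu name
    hlt_paths = ["HLT_IsoMu24_v11", "HLT_Mu50_v11"]  # hypothetical path names

    # two combinations -> two required tasks
    return list(itertools.product(hlt_menus, hlt_paths))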
class GetFilterNamesFromRunWrapper(Task, law.WrapperTask):

    hlt_paths = law.CSVParameter(description="hlt paths (no patterns!) to query")
    run_numbers = law.CSVParameter(cls=luigi.IntParameter, description="run numbers to query")

    check_for_patterns = ["hlt_paths"]

    def requires(self):
        return [
            GetFilterNamesFromRun.req(self, run_number=r, hlt_path=p)
            for r, p in itertools.product(self.run_numbers, self.hlt_paths)
        ]
class GetMenusFromDatasetWrapper(Task, law.WrapperTask):

    datasets = law.CSVParameter(default=law.config.options("hltp_mc_datasets"),
        description="datasets (no patterns!) to query, default: config hltp_mc_datasets")
    file_indices = law.CSVParameter(default=[0], description="indices of dataset files to query, "
        "default: [0]")

    check_for_patterns = ["datasets"]

    def requires(self):
        return [
            GetMenusFromDataset.req(self, dataset=d, file_index=f)
            for d, f in itertools.product(self.datasets, self.file_indices)
        ]
class GetDatasetLFNsWrapper(Task, law.WrapperTask):

    datasets = law.CSVParameter(default=law.config.options("hltp_mc_datasets"),
        description="datasets to query, default: config hltp_mc_datasets")

    def requires(self):
        return [GetDatasetLFNs.req(self, dataset=d) for d in self.datasets]
class GetLumiDataWrapper(Task, law.WrapperTask):

    lumi_file = GetLumiData.lumi_file
    normtag_file = GetLumiData.normtag_file
    hlt_paths = law.CSVParameter(default=law.config.options("hltp_paths"),
        description="hlt paths (can be patterns) to query, default: config hltp_paths")

    def requires(self):
        return [GetLumiData.req(self, hlt_path=p) for p in self.hlt_paths]
class HTCondorWorkflow(law.htcondor.HTCondorWorkflow):

    htcondor_pool = luigi.Parameter(default="", significant=False, description="target "
        "htcondor pool")
    htcondor_scheduler = luigi.Parameter(default="", significant=False, description="target "
        "htcondor scheduler")
    htcondor_ce = law.CSVParameter(default=(), significant=False, description="target htcondor "
        "computing element(s), default: ()")

    def htcondor_use_local_scheduler(self):
        return True
class PlotShiftedScaleFactorWrapper(AnalysisTask, law.WrapperTask):

    shifts = law.CSVParameter(default=[], description="shifts to require")
    skip_shifts = law.CSVParameter(default=[], description="shifts to skip, supports patterns")

    wrapped_task = PlotScaleFactor

    def __init__(self, *args, **kwargs):
        super(PlotShiftedScaleFactorWrapper, self).__init__(*args, **kwargs)

        if not self.shifts:
            # strip the up/down direction from the shifts and remove the nominal variation
            shifts = MeasureScaleFactors.shifts
            self.shifts = list(set(
                shift.rsplit("_", 1)[0] for shift in shifts if shift != "nominal"
            ))
        if self.skip_shifts:
            filter_fn = lambda d: not law.util.multi_match(d, self.skip_shifts)
            self.shifts = list(filter(filter_fn, self.shifts))

    def requires(self):
        def req(shift):
            return self.wrapped_task.req(self, shifts=[shift + "_up", shift + "_down"])

        return OrderedDict([(shift, req(shift)) for shift in self.shifts])
class PlotShiftsFromCSV(AnalysisTask, law.WrapperTask):

    shifts = law.CSVParameter(default=[], description="shifts to require")
    skip_shifts = law.CSVParameter(default=[], description="shifts to skip, supports patterns")
    flavor = PlotFromCSV.flavor
    norm_to_nominal = PlotFromCSV.norm_to_nominal

    wrapped_task = PlotFromCSV

    def __init__(self, *args, **kwargs):
        super(PlotShiftsFromCSV, self).__init__(*args, **kwargs)

        jes_sources = self.config_inst.get_aux("jes_sources")
        if not self.shifts:
            self.shifts = []
            if self.flavor == "c":
                self.shifts = ["cferr1", "cferr2"]
            else:
                self.shifts.extend([
                    "jes{}".format(jes_source) for jes_source in jes_sources
                    if jes_source != "Total"
                ])
                self.shifts.extend([
                    "{}{}".format(region, type)
                    for region, type in itertools.product(["lf", "hf"], ["", "stats1", "stats2"])
                ])
        if self.skip_shifts:
            filter_fn = lambda d: not law.util.multi_match(d, self.skip_shifts)
            self.shifts = list(filter(filter_fn, self.shifts))

    def requires(self):
        def req(shift):
            return self.wrapped_task.req(self, shift=shift)

        return OrderedDict([(shift, req(shift)) for shift in self.shifts])
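
# Illustrative sketch (not part of the task code): for non-"c" flavors the default shift list
# built above consists of per-source JES shifts plus lf/hf (stats) shifts. The JES source names
# used here are hypothetical config values.
def _example_default_csv_shifts():
    import itertools

    jes_sources = ["AbsoluteStat", "Total"]  # hypothetical "jes_sources" aux entry
    shifts = ["jes{}".format(s) for s in jes_sources if s != "Total"]
    shifts += [
        "{}{}".format(region, type)
        for region, type in itertools.product(["lf", "hf"], ["", "stats1", "stats2"])
    ]
    # -> ["jesAbsoluteStat", "lf", "lfstats1", "lfstats2", "hf", "hfstats1", "hfstats2"]
    return shifts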
class GridWorkflow(AnalysisTask, law.glite.GLiteWorkflow, law.arc.ARCWorkflow, HTCondorWorkflow):

    glite_ce_map = {
        "CNAF": [
            "ce04-lcg.cr.cnaf.infn.it:8443/cream-lsf-cms",
            "ce05-lcg.cr.cnaf.infn.it:8443/cream-lsf-cms",
            "ce06-lcg.cr.cnaf.infn.it:8443/cream-lsf-cms",
        ],
        "IRFU": "node74.datagrid.cea.fr:8443/cream-pbs-cms",
        "IIHE": "cream02.iihe.ac.be:8443/cream-pbs-cms",
        "CIEMAT": [
            "creamce02.ciemat.es:8443/cream-pbs-medium",
            "creamce03.ciemat.es:8443/cream-pbs-medium",
        ],
    }
    arc_ce_map = {
        "DESY": "grid-arcce0.desy.de",
        "KIT": ["arc-{}-kit.gridka.de".format(i) for i in range(1, 6 + 1)],
    }
    htcondor_ce_map = {
        "RWTH": "grid-ce-1-rwth.gridka.de grid-ce-1-rwth.gridka.de:9619",  # input to grid_resource
    }
    sl_distribution_map = collections.defaultdict(lambda: "slc7")

    grid_ce = law.CSVParameter(default=["RWTH"], significant=False, description="target computing "
        "element(s)")

    req_sandbox = "slc7"  # sandbox key
    sandbox = "singularity::/cvmfs/singularity.opensciencegrid.org/cmssw/cms:rhel7-m20200612"

    exclude_params_branch = {"grid_ce"}

    @classmethod
    def modify_param_values(cls, params):
        params = AnalysisTask.modify_param_values(params)
        if "workflow" in params and law.is_no_param(params["workflow"]):
            grid_ce = params["grid_ce"]

            # figure out the ce type from the first given ce
            if grid_ce[0] in cls.arc_ce_map:
                workflow = "arc"
            elif grid_ce[0] in cls.glite_ce_map:
                workflow = "glite"
            elif grid_ce[0] in cls.htcondor_ce_map:
                workflow = "htcondor"
            else:
                raise ValueError("Unknown computing element type {}".format(grid_ce[0]))

            ces = []
            for ce in grid_ce:
                ces.append(getattr(cls, workflow + "_ce_map").get(ce, ce))
            params[workflow + "_ce"] = tuple(law.util.flatten(ces))
            params["workflow"] = workflow
        return params

    def _setup_workflow_requires(self, reqs):
        if not len(set([self.sl_distribution_map[ce] for ce in self.grid_ce])) == 1:
            raise Exception("Cannot submit to multiple CEs running different distributions.")
        self.remote_sl_dist_version = self.sl_distribution_map[self.grid_ce[0]]

        reqs["cmssw"] = UploadCMSSW.req(self, replicas=10, _prefer_cli=["replicas"],
            sandbox=self.sandbox)
        reqs["software"] = UploadSoftware.req(self, replicas=10, _prefer_cli=["replicas"],
            sandbox=self.config_inst.get_aux("sandboxes")[self.remote_sl_dist_version])
        reqs["sandbox_software"] = UploadSoftware.req(self, replicas=10, _prefer_cli=["replicas"],
            sandbox=self.sandbox)
        reqs["repo"] = UploadRepo.req(self, replicas=10, _prefer_cli=["replicas"])

    def _setup_render_variables(self, config, reqs):
        config.render_variables["jtsf_grid_user"] = os.getenv("JTSF_GRID_USER")
        config.render_variables["jtsf_cmssw_setup"] = os.getenv("JTSF_CMSSW_SETUP")
        config.render_variables["sandbox_jtsf_dist_version"] = self.req_sandbox
        config.render_variables["cmssw_base_url"] = reqs["cmssw"].output().dir.uri()
        config.render_variables["sandbox_cmssw_version"] = os.getenv("CMSSW_VERSION")
        config.render_variables["software_base_url"] = reqs["software"].output().dir.uri()
        config.render_variables["sandbox_software_base_url"] = \
            reqs["sandbox_software"].output().dir.uri()
        config.render_variables["repo_checksum"] = reqs["repo"].checksum
        config.render_variables["repo_base"] = reqs["repo"].output().dir.uri()

    # GLITE

    def glite_workflow_requires(self):
        reqs = law.glite.GLiteWorkflow.glite_workflow_requires(self)
        self._setup_workflow_requires(reqs)
        return reqs

    def glite_output_directory(self):
        return law.wlcg.WLCGDirectoryTarget(self.wlcg_path())

    def glite_output_uri(self):
        return self.glite_output_directory().uri(cmd="listdir")

    def glite_bootstrap_file(self):
        return law.util.rel_path(__file__, "files", "grid_bootstrap.sh")

    def glite_job_config(self, config, job_num, branches):
        config = law.glite.GLiteWorkflow.glite_job_config(self, config, job_num, branches)
        self._setup_render_variables(config, self.glite_workflow_requires())
        config.vo = "cms:/cms/dcms"
        return config

    # ARC

    def arc_workflow_requires(self):
        reqs = law.arc.ARCWorkflow.arc_workflow_requires(self)
        self._setup_workflow_requires(reqs)
        return reqs

    def arc_output_directory(self):
        return self.glite_output_directory()

    def arc_output_uri(self):
        return self.glite_output_uri()

    def arc_bootstrap_file(self):
        return self.glite_bootstrap_file()

    def arc_job_config(self, config, job_num, branches):
        self._setup_render_variables(config, self.arc_workflow_requires())
        return config

    def arc_stageout_file(self):
        return law.util.rel_path(__file__, "files", "arc_stageout.sh")

    # HTCONDOR

    def htcondor_workflow_requires(self):
        reqs = law.htcondor.HTCondorWorkflow.htcondor_workflow_requires(self)
        self._setup_workflow_requires(reqs)
        return reqs

    def htcondor_output_directory(self):
        return self.glite_output_directory()

    def htcondor_output_uri(self):
        return self.glite_output_uri()

    def htcondor_bootstrap_file(self):
        return self.glite_bootstrap_file()

    def htcondor_job_config(self, config, job_num, branches):
        self._setup_render_variables(config, self.htcondor_workflow_requires())
        config.render_variables["output_uri"] = self.htcondor_output_uri()
        config.universe = "grid"
        # config.stdout = "out.txt"
        # config.stderr = "err.txt"
        # config.log = "log.txt"
        config.custom_content.append(("grid_resource", "condor {}".format(self.htcondor_ce[0])))
        config.custom_content.append(("use_x509userproxy", "true"))
        config.custom_content.append(("transfer_output_files", '""'))
        return config

    def htcondor_stageout_file(self):
        return self.arc_stageout_file()
class WrapperTask(AnalysisTask, law.WrapperTask):

    datasets = law.CSVParameter(default=[], description="datasets to require")
    shifts = law.CSVParameter(default=[], description="shifts to require")
    skip_datasets = law.CSVParameter(default=[], description="datasets to skip, supports patterns")
    skip_shifts = law.CSVParameter(default=[], description="shifts to skip, supports patterns")
    grid_ces = law.CSVParameter(default=[], description="grid CEs to submit to, chosen randomly")

    exclude_db = True

    def __init__(self, *args, **kwargs):
        super(WrapperTask, self).__init__(*args, **kwargs)

        if not self.datasets:
            self.datasets = self.get_default_datasets()
        if not self.shifts:
            self.shifts = self.get_default_shifts()

        if self.skip_datasets:
            filter_fn = lambda d: not law.util.multi_match(d, self.skip_datasets)
            self.datasets = list(filter(filter_fn, self.datasets))
        if self.skip_shifts:
            filter_fn = lambda d: not law.util.multi_match(d, self.skip_shifts)
            self.shifts = list(filter(filter_fn, self.shifts))

    @abc.abstractproperty
    def wrapped_task(self):
        return

    def get_default_datasets(self):
        if issubclass(self.wrapped_task, DatasetTask):
            return [dataset.name for dataset in self.config_inst.datasets]
        else:
            return [None]

    def get_default_shifts(self):
        if issubclass(self.wrapped_task, ShiftTask):
            return self.wrapped_task.shifts
        else:
            return [None]

    def requires(self):
        cls = self.wrapped_task

        def req(dataset, shift):
            kwargs = {}
            if dataset is not None:
                kwargs["dataset"] = dataset
            if shift is not None:
                kwargs["shift"] = shift
            if issubclass(cls, GridWorkflow) and self.grid_ces:
                kwargs["grid_ce"] = [random.choice(self.grid_ces)]
                kwargs["_prefer_cli"] = ["grid_ce"]
            return cls.req(self, **kwargs)

        # build the parameter combinations, requiring shifts only for MC
        params_list = []
        for dataset in self.datasets:
            for shift in self.shifts:
                if dataset is not None and shift is not None:
                    # require shifts only for MC
                    if self.config_inst.get_dataset(dataset).is_data and shift != "nominal":
                        continue
                params_list.append((dataset, shift))

        return collections.OrderedDict([(params, req(*params)) for params in params_list])
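
# Illustrative sketch (not part of the task code): the skip_datasets / skip_shifts filtering
# above keeps only entries that match none of the given patterns, via law.util.multi_match.
# The dataset names and patterns below are hypothetical; fnmatch stands in for multi_match.
def _example_skip_filtering():
    import fnmatch

    def multi_match(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    datasets = ["data_mu_a", "data_mu_b", "tt_dl", "dy_lep"]
    skip_patterns = ["data_*"]
    return [d for d in datasets if not multi_match(d, skip_patterns)]  # -> ["tt_dl", "dy_lep"]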
class GatherDataFilters(TaskWithSummary):

    hlt_menus = law.CSVParameter(default=law.config.options("hltp_data_menus"),
        description="hlt menus (can be patterns) to query")
    hlt_paths = GetLumiDataWrapper.hlt_paths
    lumi_file = GetLumiData.lumi_file
    show_menus = luigi.BoolParameter(default=False, significant=False, description="if set, show "
        "an additional 'HLT menu(s)' column in the summary table, default: False")
    verbose_runs = luigi.BoolParameter(default=False, significant=False, description="if set, "
        "print the full list of run numbers in the summary table, default: False")
    table_format = GatherMCFilters.table_format

    def output(self):
        return self.local_target("filters.json")

    @law.decorator.notify
    def run(self):
        # strategy:
        # 1. Get the list of valid run numbers from the lumi file.
        # 2. Get all menus and associated runs, and filter the latter by (1).
        # 3. Filter menus using the provided menu patterns.
        # 4. For all menus, get the list of paths and filter them using the provided path patterns.
        # 5. Get filter names for each menu and path combination.
        # 6. Save the data.

        # coloring and colored formatter helpers
        col = lambda s: law.util.colored(s, color="light_blue", style="bright")
        fmt = lambda s, *args: s.format(*(col(arg) for arg in args))

        # 1
        lumi_data = law.LocalFileTarget(self.lumi_file).load(formatter="json")
        valid_runs = [
            int(run) for run, section in six.iteritems(lumi_data)
            if law.util.flatten(section)
        ]
        self.publish_message(fmt("found {} valid runs in lumi file", len(valid_runs)))

        # 2
        all_menu_runs = (yield GetMenusInData.req(self)).load(formatter="json")
        menu_runs = {
            menu: [run for run in data["runs"] if run in valid_runs]
            for menu, data in six.iteritems(all_menu_runs)
        }

        # 3
        menu_runs = {
            menu: runs for menu, runs in six.iteritems(menu_runs)
            if runs and law.util.multi_match(menu, self.hlt_menus, mode=any)
        }
        self.publish_message(fmt("found a total of {} valid runs in {} menus:\n{}",
            sum(len(runs) for runs in six.itervalues(menu_runs)), len(menu_runs),
            "\n".join(menu_runs.keys())))

        # 4
        paths_inputs = yield {
            menu: GetPathsFromMenu.req(self, hlt_menu=menu)
            for menu in menu_runs
        }
        menu_paths = {
            menu: [
                p for p in inp.load(formatter="json")
                if law.util.multi_match(p, self.hlt_paths)
            ]
            for menu, inp in six.iteritems(paths_inputs)
        }

        # 5
        menu_path_pairs = sum((
            [(menu, path) for path in paths]
            for menu, paths in six.iteritems(menu_paths)
        ), [])
        filter_inputs = yield {
            (menu, path): GetFilterNamesFromMenu.req(self, hlt_menu=menu, hlt_path=path)
            for menu, path in menu_path_pairs
        }
        filter_names = {
            (menu, path): [d["name"] for d in inps["filters"].load(formatter="json")]
            for (menu, path), inps in six.iteritems(filter_inputs)
        }

        # 6
        data = []
        for menu, runs in six.iteritems(menu_runs):
            data.append(dict(
                menu=menu,
                runs=runs,
                paths=[
                    dict(
                        name=path,
                        filters=filter_names[(menu, path)],
                    )
                    for path in menu_paths[menu]
                ],
            ))

        # save the output and print the summary
        output = self.output()
        output.parent.touch()
        output.dump(data, indent=4, formatter="json")
        self.summary()

    def summary(self):
        with self.summary_lock():
            super(GatherDataFilters, self).summary()

            # read data
            data = self.output().load(formatter="json")

            # get a menu -> runs map
            menu_runs = {entry["menu"]: entry["runs"] for entry in data}

            # get a flat (menu, path) -> filters map
            filter_map = {}
            for entry in data:
                for path_entry in entry["paths"]:
                    filter_map[(entry["menu"], path_entry["name"])] = path_entry["filters"]

            # helper to compress run numbers into a readable string
            def compress_runs(runs):
                if len(runs) == 1:
                    return str(runs[0])
                else:
                    runs = sorted(runs)
                    return "{}-{}".format(runs[0], runs[-1])

            # define the table
            headers = ["HLT path(s)", "Runs", "HLT menu(s)", "Filter names"]
            rows = []

            # when multiple paths have exactly the same filter names associated, we want to
            # reflect that in the printed table, so use a while loop and a reduction pattern
            keys = [(str(menu), str(path)) for menu, path in filter_map]
            while keys:
                key = keys.pop(0)
                menu, path = key

                # prepare paths, menus, runs and filter names
                paths = [path]
                menus = [menu]
                runs = [sorted(menu_runs[menu])]
                filters = tuple(filter_map[key])

                # try to look ahead to check if there is another entry with the same names
                for _key in list(keys):
                    _menu, _path = _key
                    if tuple(filter_map[_key]) == filters:
                        keys.remove(_key)
                        paths.append(_path)
                        menus.append(_menu)
                        runs.append(sorted(menu_runs[_menu]))

                # create string entries
                if self.verbose_runs:
                    paths_str = "\n".join(sorted(list(set(paths))))
                    unique_runs = list(set(law.util.flatten(runs)))
                    runs_str = ",\n".join(
                        ",".join(str(r) for r in chunk)
                        for chunk in law.util.iter_chunks(sorted(unique_runs), 5)
                    )
                else:
                    # some additional sorting
                    compressed_runs = [compress_runs(_runs) for _runs in runs]
                    indices = list(range(len(paths)))
                    sorted_indices = sorted(indices, key=lambda i: (paths[i], compressed_runs[i]))
                    paths_str = "\n".join(paths[i] for i in sorted_indices)
                    runs_str = "\n".join(compressed_runs[i] for i in sorted_indices)
                menus_str = "\n".join(sorted(list(set(menus)), key=str.lower))
                filters_str = "\n".join(filters)

                # append a new row
                rows.append([paths_str, runs_str, menus_str, filters_str])

            # sort rows by the first path
            rows = sorted(rows, key=lambda row: row[0].split("\n", 1)[0])

            # remove the menu column if requested
            if not self.show_menus:
                headers.pop(2)
                for row in rows:
                    row.pop(2)

            print(tabulate.tabulate(rows, headers=headers, tablefmt=self.table_format))
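
# Illustrative sketch (not part of the task code): the compress_runs helper in the summary
# above renders a list of runs either as a single number or as an inclusive "first-last"
# range; the run numbers used here are hypothetical.
def _example_compress_runs():
    def compress_runs(runs):
        if len(runs) == 1:
            return str(runs[0])
        runs = sorted(runs)
        return "{}-{}".format(runs[0], runs[-1])

    # -> ("316000", "316001-316003")
    return compress_runs([316000]), compress_runs([316003, 316001, 316002])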
class GetPathsFromRuns(TaskWithSummary):

    run_numbers = law.CSVParameter(default=[], description="run numbers to query, can also be "
        "ranges denoted by 'start-end' (both inclusive), default: runs from lumi file")
    lumi_file = GetLumiData.lumi_file
    hlt_paths = law.CSVParameter(default=[], description="when set, hlt paths (patterns allowed) "
        "that are used to filter the obtained paths, default: []")
    table_format = luigi.Parameter(default="grid", significant=False, description="the tabulate "
        "table format for the summary, default: grid")

    def requires(self):
        return GetMenusInData.req(self)

    def output(self):
        postfix = law.util.create_hash([self.run_numbers, self.lumi_file, self.hlt_paths])
        return self.local_target("paths_{}.json".format(postfix))

    @law.decorator.notify
    def run(self):
        # read menu data
        menu_data = self.input().load(formatter="json")

        # expand run numbers
        if self.run_numbers:
            run_numbers = set()
            for r in self.run_numbers:
                if r.count("-") == 1:
                    start, end = [int(s) for s in r.split("-")]
                    run_numbers |= set(range(start, end + 1))
                else:
                    run_numbers.add(int(r))
            run_numbers = sorted(list(run_numbers))
        else:
            lumi_data = law.LocalFileTarget(self.lumi_file).load(formatter="json")
            run_numbers = [int(r) for r in lumi_data.keys()]

        # reduce menu data to a simple mapping menu -> valid runs
        menu_runs = {
            menu: [r for r in data["runs"] if r in run_numbers]
            for menu, data in six.iteritems(menu_data)
        }
        menu_runs = {menu: runs for menu, runs in six.iteritems(menu_runs) if runs}
        self.publish_message("found {} trigger menus".format(len(menu_runs)))

        # get all paths for all menus
        paths_inputs = yield {
            menu: GetPathsFromMenu.req(self, hlt_menu=menu)
            for menu in menu_runs
        }
        menu_paths = {
            menu: sorted(inp.load(formatter="json"))
            for menu, inp in six.iteritems(paths_inputs)
        }

        # filter by given hlt path patterns
        if self.hlt_paths:
            menu_paths = {
                menu: [p for p in paths if law.util.multi_match(p, self.hlt_paths, mode=any)]
                for menu, paths in six.iteritems(menu_paths)
            }
            menu_paths = {menu: paths for menu, paths in six.iteritems(menu_paths) if paths}

        # merge output data
        data = {
            menu: dict(runs=menu_runs[menu], paths=paths)
            for menu, paths in six.iteritems(menu_paths)
        }

        # save the output and print the summary
        output = self.output()
        output.parent.touch()
        output.dump(data, indent=4, formatter="json")
        self.summary()

    def summary(self):
        with self.summary_lock():
            super(GetPathsFromRuns, self).summary()

            data = self.output().load(formatter="json")

            headers = ["HLT menu", "Runs", "Matching HLT path(s)"]
            rows = []
            for menu, entry in six.iteritems(data):
                rows.append([
                    menu,
                    ",\n".join(
                        ",".join(str(r) for r in chunk)
                        for chunk in law.util.iter_chunks(sorted(entry["runs"]), 5)
                    ),
                    "\n".join(entry["paths"]),
                ])

            # sort rows by first run number
            rows = sorted(rows, key=lambda row: int(row[1].split(",", 1)[0]))

            print(tabulate.tabulate(rows, headers=headers, tablefmt=self.table_format))
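
# Illustrative sketch (not part of the task code): the run_numbers parameter above accepts both
# single runs and inclusive "start-end" ranges, expanded as shown here with hypothetical runs.
def _example_expand_run_numbers():
    run_numbers = set()
    for r in ["316000", "316002-316004"]:
        if r.count("-") == 1:
            start, end = [int(s) for s in r.split("-")]
            run_numbers |= set(range(start, end + 1))
        else:
            run_numbers.add(int(r))
    return sorted(run_numbers)  # -> [316000, 316002, 316003, 316004]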
class GridWorkflow(law.ARCWorkflow):
    """
    Here, we need to configure the default law.ARCWorkflow to a minimal extent in order to send
    our bootstrap file and some variables to remote jobs. Law does not aim to auto-magically do
    this in a multi-purpose manner for all possible cases, but rather provides a simple interface
    to steer the exact behavior you want in your grid jobs.
    """

    arc_ce_map = {
        "DESY": "grid-arcce0.desy.de",
        "KIT": ["arc-{}-kit.gridka.de".format(i) for i in range(1, 6 + 1)],
    }

    grid_ce = law.CSVParameter(default=["KIT"], significant=False, description="target computing "
        "element(s)")

    exclude_params_branch = {"grid_ce"}

    @classmethod
    def modify_param_values(cls, params):
        # the law.ARCWorkflow requires a full computing element (or a list of them), however, we
        # want to steer it with simple names like "DESY" or "KIT", hence we use the
        # modify_param_values hook to modify parameters before the task is actually instantiated
        params = super(GridWorkflow, cls).modify_param_values(params)
        if "workflow" in params and params["workflow"] in (law.NO_STR, "arc"):
            ces = []
            for ce in params["grid_ce"]:
                ces.append(cls.arc_ce_map.get(ce, ce))
            params["arc_ce"] = tuple(law.util.flatten(ces))
            params["workflow"] = "arc"
        return params

    def arc_output_directory(self):
        # the directory where submission meta data should be stored
        return law.WLCGDirectoryTarget(self.wlcg_path())

    def arc_create_job_file_factory(self):
        # tell the factory, which is responsible for creating our job files,
        # that the files are not temporary, i.e., it should not delete them after submission
        factory = super(GridWorkflow, self).arc_create_job_file_factory()
        factory.is_tmp = False
        return factory

    def arc_bootstrap_file(self):
        # each job can define a bootstrap file that is executed prior to the actual job
        # in order to set up software and environment variables
        return law.util.rel_path(__file__, "arc_bootstrap.sh")

    def arc_job_config(self, config, job_num, branches):
        # render_data is rendered into all files sent with a job, such as the arc_bootstrap file
        config.render_variables["grid_user"] = os.getenv("WLCG_EXAMPLE_GRID_USER")
        return config

    def arc_workflow_requires(self):
        # requirements of the arc workflow, i.e., upload the software stack and the example repo
        # with 2 replicas
        reqs = super(GridWorkflow, self).arc_workflow_requires()
        reqs["software"] = UploadSoftware.req(self, replicas=2)
        reqs["repo"] = UploadRepo.req(self, replicas=2)
        return reqs