class DstDataTask(DataTask):
    """Locate the full DST output file for one (release, prod, database) configuration.

    Extends the parameter set of :class:`DataTask` with the production
    metadata needed by ``_build_data_path`` to resolve the file location.
    """

    # Software release the data was processed with.
    release = b2luigi.Parameter()
    # Production and database identifiers of the processing campaign.
    prod = b2luigi.IntParameter()
    database = b2luigi.IntParameter()

    def output(self):
        """Yield the local target for the DST file resolved from this task's parameters."""
        dst_target = b2luigi.LocalTarget(_build_data_path(self))
        yield {"full_output.root": dst_target}
class Basf2PathTask(Basf2Task):
    """Base task that builds and executes a basf2 processing path.

    Subclasses implement :meth:`create_path`; :meth:`process` then runs the
    returned path with the configured multiprocessing and event-limit settings.
    """

    # Number of basf2 worker processes; 0 keeps basf2's single-process default.
    # Marked not significant: it changes only how fast the data is produced,
    # not the data itself, so it does not enter the task hash.
    num_processes = b2luigi.IntParameter(significant=False, default=0)
    # Hard cap on the number of events to process; 0 means "no override".
    max_event = b2luigi.IntParameter(significant=False, default=0)

    def create_path(self):
        """Return the basf2 path to process. Must be implemented by subclasses."""
        raise NotImplementedError()

    @b2luigi.on_temporary_files
    def process(self):
        """Build and run the basf2 path, writing outputs via temporary files.

        Raises:
            ImportError: if basf2 or ROOT cannot be imported in this environment.
            AssertionError: if the current basf2 git hash differs from the one
                recorded when the task was scheduled.
        """
        # Guard against silently running with a different basf2 build than
        # the one this task graph was scheduled with.
        assert get_basf2_git_hash() == self.git_hash
        try:
            # Imported lazily: the scheduler process may not have basf2/ROOT set up.
            import basf2
            import ROOT
        except ImportError:
            raise ImportError(
                "Can not find ROOT or basf2. Can not use the basf2 task.")
        if self.num_processes:
            basf2.set_nprocesses(self.num_processes)
        if self.max_event:
            ROOT.Belle2.Environment.Instance().setNumberEventsOverride(
                self.max_event)
        path = self.create_path()
        # Progress module prints periodic event-count heartbeats to the log.
        path.add_module("Progress")
        basf2.print_path(path)
        basf2.process(path)
        print(basf2.statistics)
class SimulationTask(Basf2PathTask):
    """Generate and simulate events of the requested type into a full root output file."""

    # Number of events to generate.
    n_events = luigi.IntParameter()
    # Which physics sample to generate (Y(4S) signal or continuum).
    event_type = luigi.EnumParameter(enum=SimulationType)

    def create_path(self):
        """Build the generation + simulation path, ending in a RootOutput module.

        Raises:
            ValueError: if ``event_type`` is neither ``y4s`` nor ``continuum``.
        """
        path = basf2.create_path()
        modularAnalysis.setupEventInfo(self.n_events, path)
        # Select the EvtGen decay file matching the requested event type.
        if self.event_type == SimulationType.y4s:
            dec_file = Belle2.FileSystem.findFile(
                'analysis/examples/tutorials/B2A101-Y4SEventGeneration.dec')
        elif self.event_type == SimulationType.continuum:
            dec_file = Belle2.FileSystem.findFile(
                'analysis/examples/simulations/B2A102-ccbarEventGeneration.dec'
            )
        else:
            raise ValueError(
                f"Event type {self.event_type} is not valid. It should be either 'Y(4S)' or 'Continuum'!"
            )
        generators.add_evtgen_generator(path, 'signal', dec_file)
        modularAnalysis.loadGearbox(path)
        simulation.add_simulation(path)
        # Write the full simulated event record to this task's output target.
        path.add_module('RootOutput',
                        outputFileName=self.get_output_file_name(
                            'simulation_full_output.root'))
        return path

    def output(self):
        """Declare the simulated root file as this task's output."""
        yield self.add_to_output("simulation_full_output.root")
class FullTimingTask(luigi.Task):
    """Run tsfresh with all calculators for comparison.

    Reads the ``data.csv`` produced upstream, extracts the full default
    tsfresh feature set, and records the wall-clock time together with the
    run configuration in ``result.json``.
    """

    # Number of distinct time-series ids in the input data.
    # FIX: the original body read ``self.num_ids`` without ever declaring it,
    # which raises AttributeError at run time. Declared here with the same
    # default as DataCreationTask; adding a defaulted parameter is
    # backward-compatible for existing callers.
    num_ids = luigi.IntParameter(default=100)
    # Parallelism handed to tsfresh's extract_features.
    n_jobs = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]
        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time",
                         n_jobs=self.n_jobs,
                         disable_progressbar=True)
        end_time = time()

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            # All series share one length; measure it on the first id.
            "time_series_length": int((df["id"] == 0).sum()),
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)
class Resample(b2luigi.Task):
    """Resample the train sample and store it to a root file.

    Parameters:
        ntuple_file (str): Path to the file
        train_size (float): between 0 and 1, size of train sample
        test_size (float): size of test sample,
        random_seed (int): random seed to generate a resampled sample

    Output:
        train.root
    """

    random_seed = b2luigi.IntParameter()
    queue = "sx"

    def output(self):
        yield self.add_to_output("train.root")

    def run(self):
        input_paths = self.get_input_file_names('train.root')
        dataframe = root_pandas.read_root(*input_paths, key=self.tree_name)

        # Bootstrap resampling, reproducible through the random seed.
        bootstrapped = resample(dataframe, random_state=self.random_seed)

        # Persist the resampled frame under the same tree name.
        destination = self.get_output_file_name('train.root')
        root_pandas.to_root(bootstrapped, destination, key=self.tree_name)
class MyNumberTask(b2luigi.Task):
    """Write a single uniformly distributed random number to ``output_file.txt``."""

    # Distinguishes otherwise identical task instances.
    some_parameter = b2luigi.IntParameter()

    def output(self):
        yield self.add_to_output("output_file.txt")

    def run(self):
        drawn_value = random.random()
        target_path = self.get_output_file_name("output_file.txt")
        with open(target_path, "w") as out_file:
            out_file.write(f"{drawn_value}\n")
class MyNumberTask(b2luigi.Task):
    """Write a random number into a results file named after the parameter."""

    some_parameter = b2luigi.IntParameter()

    def output(self):
        """Return one local target; the file name encodes the parameter value."""
        target_path = f"results/output_file_{self.some_parameter}.txt"
        return b2luigi.LocalTarget(target_path)

    def run(self):
        drawn_value = random.random()
        with self.output().open("w") as out_file:
            out_file.write(f"{drawn_value}\n")
class TaskA(luigi.Task):
    """Write 1000 gaussian-drawn numbers around ``central_value`` to a text file."""

    # Mean of the generated numbers.
    central_value = luigi.FloatParameter()
    # Distinguishes otherwise identical task instances (not read in the body).
    index = luigi.IntParameter()

    def run(self):
        # NOTE(review): sigma is 0.0, so random.gauss always returns exactly
        # central_value and all 1000 lines are identical — confirm this is
        # intended and not a placeholder for a real spread.
        # NOTE(review): get_output_file_name/add_to_output are b2luigi task
        # helpers; presumably ``luigi`` here is the b2luigi drop-in — verify
        # against the file's imports.
        with open(self.get_output_file_name("random_numbers.txt"), "w") as f:
            for _ in range(1000):
                f.write(f"{random.gauss(self.central_value, 0.0)}\n")

    def output(self):
        yield self.add_to_output("random_numbers.txt")
class DataCreationTask(luigi.Task):
    """Create random data for testing."""

    # How many independent time series to generate.
    num_ids = luigi.IntParameter(default=100)
    # Length of every generated series.
    time_series_length = luigi.IntParameter()
    # Seed for numpy's RNG, making the sample reproducible.
    random_seed = luigi.IntParameter()

    def output(self):
        yield self.add_to_output("data.csv")

    def run(self):
        # Seed once so each task instance produces a reproducible sample.
        np.random.seed(self.random_seed)

        length = self.time_series_length
        frames = []
        for series_id in range(self.num_ids):
            frames.append(pd.DataFrame({
                "id": [series_id] * length,
                "time": range(length),
                "value": np.random.randn(length),
            }))
        combined = pd.concat(frames)

        with self._get_output_target("data.csv").open("w") as f:
            combined.to_csv(f)
class TaskB(b2luigi.Task):
    """Collect the outputs of ten TaskA clones into one ``out.dat``."""

    another_parameter = b2luigi.IntParameter()

    def requires(self):
        """One TaskA clone per value of ``some_other_parameter`` in 0..9."""
        for parameter_value in range(10):
            yield self.clone(TaskA, some_other_parameter=parameter_value)

    def run(self):
        # somehow merge the output of TaskA to create "out.dat"
        pass

    def output(self):
        yield self.add_to_output("out.dat")
class TimingTask(luigi.Task):
    """Run tsfresh with the given parameters.

    Extracts a single configured feature from the upstream ``data.csv`` and
    records timing and configuration details in ``result.json``.
    """

    # Single-entry mapping {feature_name: parameters} handed to tsfresh.
    feature_parameter = luigi.DictParameter(hashed=True)
    # Parallelism handed to tsfresh's extract_features.
    n_jobs = luigi.IntParameter()
    # Repetition counter so identical configurations can be timed several times.
    try_number = luigi.IntParameter()
    # Number of distinct time-series ids in the input data.
    # FIX: the original body read ``self.num_ids`` without ever declaring it,
    # which raises AttributeError at run time. Declared (at the end, to keep
    # positional parameter order) with the same default as DataCreationTask.
    num_ids = luigi.IntParameter(default=100)

    def output(self):
        yield self.add_to_output("result.json")

    def run(self):
        input_file = self._get_input_targets("data.csv")[0]
        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time",
                         n_jobs=self.n_jobs,
                         default_fc_parameters=self.feature_parameter,
                         disable_progressbar=True)
        end_time = time()

        # feature_parameter holds exactly one feature; unpack its name/params.
        single_parameter_name = list(self.feature_parameter.keys())[0]
        single_parameter_params = self.feature_parameter[single_parameter_name]

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": single_parameter_name,
            "number_parameters": len(single_parameter_params) if single_parameter_params else 0,
            # All series share one length; measure it on the first id.
            "time_series_length": int((df["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)
class MyNumberTask(b2luigi.Task):
    """Write a random number to a file; deliberately fails for parameter value 3."""

    some_parameter = b2luigi.IntParameter()

    # Resources requested from the HTCondor batch system for this task.
    htcondor_settings = {"request_cpus": 1, "request_memory": "100 MB"}

    def output(self):
        yield self.add_to_output("output_file.txt")

    def run(self):
        print("I am now starting a task")

        drawn_value = random.random()

        # Intentional failure path, e.g. to demonstrate error handling.
        if self.some_parameter == 3:
            raise ValueError

        target_path = self.get_output_file_name("output_file.txt")
        with open(target_path, "w") as out_file:
            out_file.write(f"{drawn_value}\n")
class AnalysisTask(Basf2PathTask):
    """Run the example mdst analysis as a gbasf2 project on the grid."""

    # set the batch_system property to use the gbasf2 wrapper batch process for this task
    batch_system = "gbasf2"
    # Must define a prefix for the gbasf2 project name to submit to the grid.
    # b2luigi will then add a hash derived from the luigi parameters to create a unique project name.
    gbasf2_project_name_prefix = b2luigi.Parameter()
    gbasf2_input_dataset = b2luigi.Parameter(hashed=True)
    # Example luigi cut parameter to facilitate starting multiple projects for different cut values
    mbc_lower_cut = b2luigi.IntParameter()

    def create_path(self):
        """Build the analysis path with the configured lower Mbc bound (upper bound fixed at 5.3)."""
        return example_mdst_analysis.create_analysis_path(
            d_ntuple_filename="D_ntuple.root",
            b_ntuple_filename="B_ntuple.root",
            mbc_range=(self.mbc_lower_cut, 5.3))

    def output(self):
        """Both analysis ntuples are outputs of this task."""
        for ntuple_name in ("D_ntuple.root", "B_ntuple.root"):
            yield self.add_to_output(ntuple_name)
class BootstrapTraining(b2luigi.Task):
    """Start a training with a resampled train sample.

    See also `Training`.

    Parameters:
        random_seed (int): random seed of the resampled train sample
        off_res_files (list): List with paths to off-res. files
        tree_name (str): name of the tree in the root file
        training_variables (list): list of training variables used for training
        training_parameters (dict): train- and test size, the following BDT
            hyper-parameters (optional): "nTrees", "shrinkage" and "nLevels".

    Output:
        bdt.xml
    """

    random_seed = b2luigi.IntParameter()
    off_res_files = b2luigi.ListParameter(hashed=True)
    # FIX: was declared as b2luigi.ListParameter although the docstring
    # documents it as a single string and Resample consumes it as one tree
    # name (``key=self.tree_name``) — a plain Parameter matches that usage.
    tree_name = b2luigi.Parameter()
    training_variables = b2luigi.ListParameter(hashed=True)
    training_parameters = b2luigi.DictParameter(hashed=True)

    def requires(self):
        """One Resample task per off-resonance file, all sharing this task's seed."""
        train_size = self.training_parameters["train_size"]
        test_size = self.training_parameters["test_size"]
        for ntuple_file in self.off_res_files:
            yield self.clone(Resample,
                             ntuple_file=ntuple_file,
                             train_size=train_size,
                             test_size=test_size,
                             random_seed=self.random_seed)

    def output(self):
        yield self.add_to_output('bdt.xml')

    def run(self):
        # Reuse the plain Training implementation on the resampled inputs.
        Training.run(self)
class DataTask(b2luigi.ExternalTask):
    """External task identifying an already-existing data file.

    Only declares the identifying parameters; subclasses are expected to
    provide :meth:`output` pointing at the actual file.
    """

    # Data-taking mode of the file.
    data_mode = b2luigi.EnumParameter(enum=DataMode)
    # Experiment and run numbers identifying the data-taking period.
    experiment_number = b2luigi.IntParameter()
    run_number = b2luigi.IntParameter()
    # Path prefix and file name used to locate the file on disk.
    prefix = b2luigi.Parameter()
    file_name = b2luigi.Parameter()
class AggregatorTask(Basf2nTupleMergeTask):
    """Merge the analysis ntuples produced for every simulation type."""

    # Number of events each upstream simulation generates.
    n_events = luigi.IntParameter()

    def requires(self):
        """One AnalysisTask clone per member of SimulationType."""
        for simulation_type in SimulationType:
            yield self.clone(AnalysisTask, event_type=simulation_type)
class TaskB(b2luigi.Task):
    """Task producing a single ``out.dat`` output file."""

    another_parameter = b2luigi.IntParameter()

    def output(self):
        """Declare the merged data file as this task's only output."""
        target_name = "out.dat"
        yield self.add_to_output(target_name)
class TaskA(b2luigi.Task):
    """Task producing two output files, ``file_a`` and ``file_b``."""

    some_parameter = b2luigi.IntParameter()

    def output(self):
        """Declare both files as outputs of this task."""
        for target_name in ("file_a", "file_b"):
            yield self.add_to_output(target_name)
class TaskA(b2luigi.Task):
    """Task parameterised by two integers, producing ``test.txt``."""

    some_parameter = b2luigi.IntParameter()
    some_other_parameter = b2luigi.IntParameter()

    def output(self):
        """Declare the single text file output."""
        target_name = "test.txt"
        yield self.add_to_output(target_name)