def test_hash_consistency(self):
    """Check that hashed serialization is opt-in, repeatable and value-sensitive."""
    # A parameter created without ``hashed=True`` must not offer the hook.
    plain_param = b2luigi.DictParameter()
    self.assertFalse(hasattr(plain_param, "serialize_hashed"))

    hashed_param = b2luigi.DictParameter(hashed=True)
    self.assertTrue(hasattr(hashed_param, "serialize_hashed"))

    # Serializing two equal dictionaries must give the same, pinned digest.
    first_digest, repeat_digest = (
        hashed_param.serialize_hashed({"key1": "value1", "key2": 12345})
        for _ in range(2)
    )
    self.assertEqual(first_digest, repeat_digest)
    self.assertEqual(first_digest, "hashed_df6221c515cbb93735f9478cb05a00e4")

    # Inputs that differ in a single element must give different digests.
    digest_456, digest_457 = (
        hashed_param.serialize_hashed([1, "test", number, {"hello": "bye"}])
        for number in (456, 457)
    )
    self.assertNotEqual(digest_456, digest_457)
    self.assertEqual(digest_456, "hashed_7816c14282fd03e3dc4e398f28aa5a30")
class BootstrapTraining(b2luigi.Task):
    """Train a BDT on resampled train samples.

    Requires one ``Resample`` task per off-resonance file and then executes
    the same training code as `Training` (its ``run`` is reused directly).

    Parameters:
        random_seed (int): random seed handed to every ``Resample`` task
        off_res_files (list): paths to the off-resonance files
        tree_name (str): name of the tree in the root file
        training_variables (list): training variables used for training
        training_parameters (dict): "train_size" and "test_size" (both are
            read unconditionally) plus the optional BDT hyper-parameters
            "nTrees", "shrinkage" and "nLevels".

    Output:
        bdt.xml
    """

    random_seed = b2luigi.IntParameter()
    off_res_files = b2luigi.ListParameter(hashed=True)
    # NOTE(review): documented as a str but declared as a ListParameter --
    # confirm the intended parameter type.
    tree_name = b2luigi.ListParameter()
    training_variables = b2luigi.ListParameter(hashed=True)
    training_parameters = b2luigi.DictParameter(hashed=True)

    def requires(self):
        """Yield one ``Resample`` task for each off-resonance file."""
        sample_sizes = {
            "train_size": self.training_parameters["train_size"],
            "test_size": self.training_parameters["test_size"],
        }
        for path in self.off_res_files:
            yield self.clone(
                Resample,
                ntuple_file=path,
                **sample_sizes,
                random_seed=self.random_seed,
            )

    def output(self):
        """Declare the BDT weight file as the only output."""
        yield self.add_to_output('bdt.xml')

    def run(self):
        """Run the training by calling ``Training.run`` on this task."""
        Training.run(self)
class TimingTask(luigi.Task):
    """Time a single tsfresh ``extract_features`` call and record the result."""

    feature_parameter = luigi.DictParameter(hashed=True)
    n_jobs = luigi.IntParameter()
    try_number = luigi.IntParameter()

    def output(self):
        """Declare ``result.json`` as the only output of this task."""
        yield self.add_to_output("result.json")

    def run(self):
        """Load ``data.csv``, time the extraction and dump a JSON summary."""
        with self._get_input_targets("data.csv")[0].open("r") as csv_handle:
            df = pd.read_csv(csv_handle)

        # Only the feature extraction itself is inside the timed region.
        started = time()
        extract_features(
            df,
            column_id="id",
            column_sort="time",
            n_jobs=self.n_jobs,
            default_fc_parameters=self.feature_parameter,
            disable_progressbar=True,
        )
        elapsed = time() - started

        # Only the first key of the feature dictionary is reported.
        feature_name = list(self.feature_parameter)[0]
        feature_settings = self.feature_parameter[feature_name]

        summary = {
            "time": elapsed,
            # NOTE(review): ``num_ids`` is not declared on this class --
            # presumably supplied by a required task outside this view; confirm.
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": feature_name,
            "number_parameters": 0 if not feature_settings else len(feature_settings),
            # Number of rows whose id equals 0.
            "time_series_length": int((df["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as json_handle:
            json.dump(summary, json_handle)
class Training(b2luigi.Task):
    """Train a FastBDT on the train samples of the given off-resonance files.

    The BDT weight file is written to ``bdt.xml``.

    Parameters:
        off_res_files (list): off-resonance files to be used for training
        tree_name (str): name of the tree in the root file
        training_variables (list): variables used for training. If you have
            multiple candidates in your selection, be aware that only the
            0th candidate is used for training. This has no effect if you
            only use event-based variables for training.
        training_parameters (dict): "train_size" and "test_size" (both are
            read unconditionally) plus the optional BDT hyper-parameters
            "nTrees", "shrinkage", "nLevels" and "nCuts".
    """

    off_res_files = b2luigi.ListParameter(hashed=True)
    # NOTE(review): documented as a str but declared as a ListParameter --
    # confirm the intended parameter type.
    tree_name = b2luigi.ListParameter()
    training_variables = b2luigi.ListParameter(hashed=True)
    training_parameters = b2luigi.DictParameter(hashed=True)
    # NOTE(review): presumably the batch queue name -- confirm against the
    # batch system configuration.
    queue = "sx"

    def requires(self):
        """Yield one ``SplitSample`` task for each off-resonance file."""
        sample_sizes = {
            "train_size": self.training_parameters["train_size"],
            "test_size": self.training_parameters["test_size"],
        }
        for path in self.off_res_files:
            yield self.clone(SplitSample, ntuple_file=path, **sample_sizes)

    def output(self):
        """Declare the BDT weight file as the only output."""
        yield self.add_to_output('bdt.xml')

    def run(self):
        """Configure the basf2 MVA options and start the teacher.

        Everything is kept inside this method because it is also invoked as
        ``Training.run(other_task)`` on tasks that are not ``Training``
        instances, so it must not rely on further methods of this class.
        """
        weightfile = self.get_output_file_name('bdt.xml')
        train_files = self.get_input_file_names('train.root')

        # General options: input files, weight file, tree, variables, target.
        general = basf2_mva.GeneralOptions()
        general.m_datafiles = basf2_mva.vector(*train_files)
        general.m_identifier = weightfile
        general.m_treename = self.tree_name
        general.m_variables = basf2_mva.vector(*self.training_variables)
        general.m_target_variable = "EventType"

        # FastBDT options: override only the hyper-parameters that are set;
        # missing or ``None`` entries keep the FastBDTOptions defaults.
        fastbdt = basf2_mva.FastBDTOptions()
        hyper_parameters = self.training_parameters
        for key in ("nTrees", "shrinkage", "nLevels", "nCuts"):
            value = hyper_parameters.get(key)
            if value is not None:
                setattr(fastbdt, "m_" + key, value)

        # Teacher: runs the actual training.
        basf2_mva.teacher(general, fastbdt)