Beispiel #1
0
    def test_hash_consistency(self):
        """Verify that hashed serialization exists only on request and is stable.

        A ``DictParameter`` created without ``hashed=True`` must not expose
        ``serialize_hashed``; one created with ``hashed=True`` must. For the
        hashed parameter, equal inputs have to serialize to the same string,
        different inputs to different strings, and the results are pinned to
        known reference values.
        """
        plain_parameter = b2luigi.DictParameter()
        self.assertFalse(hasattr(plain_parameter, "serialize_hashed"))

        hashed_parameter = b2luigi.DictParameter(hashed=True)
        self.assertTrue(hasattr(hashed_parameter, "serialize_hashed"))

        # Two independently built, equal dictionaries must hash identically.
        dict_digest, dict_digest_again = (
            hashed_parameter.serialize_hashed({"key1": "value1", "key2": 12345})
            for _ in range(2)
        )

        self.assertEqual(dict_digest, dict_digest_again)
        self.assertEqual(dict_digest,
                         "hashed_df6221c515cbb93735f9478cb05a00e4")

        # Lists that differ in a single element (456 vs. 457) must not collide.
        list_digest, altered_list_digest = (
            hashed_parameter.serialize_hashed(
                [1, "test", number, {"hello": "bye"}])
            for number in (456, 457)
        )

        self.assertNotEqual(list_digest, altered_list_digest)
        self.assertEqual(list_digest,
                         "hashed_7816c14282fd03e3dc4e398f28aa5a30")
Beispiel #2
0
class BootstrapTraining(b2luigi.Task):
    """Run a training on resampled train samples. See also `Training`.

    One `Resample` task is required per entry of ``off_res_files``; the
    actual training step is delegated to `Training.run`.

    Parameters:
        random_seed (int): random seed of the resampled train sample,
            forwarded to every required `Resample` task
        off_res_files (list): list with paths to off-res. files
        tree_name (str): name of the tree in the root file
        training_variables (list): list of training variables used for training
        training_parameters (dict): must provide "train_size" and
            "test_size"; may additionally carry the BDT hyper-parameters
            "nTrees", "shrinkage" and "nLevels".

    Output:
        bdt.xml
    """
    random_seed = b2luigi.IntParameter()
    off_res_files = b2luigi.ListParameter(hashed=True)
    # NOTE(review): documented as a str above but declared as a ListParameter
    # -- confirm which one is intended before changing the parameter type.
    tree_name = b2luigi.ListParameter()
    training_variables = b2luigi.ListParameter(hashed=True)
    training_parameters = b2luigi.DictParameter(hashed=True)

    def requires(self):
        """Yield one `Resample` requirement for each off-resonance file."""
        n_train = self.training_parameters["train_size"]
        n_test = self.training_parameters["test_size"]

        yield from (
            self.clone(
                Resample,
                ntuple_file=path,
                train_size=n_train,
                test_size=n_test,
                random_seed=self.random_seed,
            )
            for path in self.off_res_files
        )

    def output(self):
        """Declare ``bdt.xml`` as the single output of this task."""
        weightfile_target = self.add_to_output("bdt.xml")
        yield weightfile_target

    def run(self):
        """Execute `Training.run` with this task standing in as ``self``."""
        # This class does not inherit from Training, so the method is called
        # through the class and handed this instance explicitly.
        Training.run(self)
Beispiel #3
0
class TimingTask(luigi.Task):
    """Run tsfresh with the given parameters and record how long it took.

    The wall-clock duration of a single ``extract_features`` call on the
    input ``data.csv`` is written, together with the task settings, to
    ``result.json``.
    """
    feature_parameter = luigi.DictParameter(hashed=True)
    n_jobs = luigi.IntParameter()
    try_number = luigi.IntParameter()

    def output(self):
        """Declare ``result.json`` as the single output of this task."""
        result_target = self.add_to_output("result.json")
        yield result_target

    def run(self):
        """Time the feature extraction and dump the measurement as JSON."""
        csv_target = self._get_input_targets("data.csv")[0]

        with csv_target.open("r") as csv_handle:
            frame = pd.read_csv(csv_handle)

        # Only the extract_features call itself lies inside the timed window.
        started = time()
        extract_features(
            frame,
            column_id="id",
            column_sort="time",
            n_jobs=self.n_jobs,
            default_fc_parameters=self.feature_parameter,
            disable_progressbar=True,
        )
        elapsed = time() - started

        # Only the first key of the feature dictionary is reported.
        feature_name = list(self.feature_parameter.keys())[0]
        feature_settings = self.feature_parameter[feature_name]
        if feature_settings:
            settings_count = len(feature_settings)
        else:
            settings_count = 0

        # NOTE(review): ``num_ids`` is not declared in this class body; it is
        # presumably supplied from elsewhere -- confirm against the full file.
        measurement = {
            "time": elapsed,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": feature_name,
            "number_parameters": settings_count,
            # Number of rows that belong to id 0.
            "time_series_length": int((frame["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as json_handle:
            json.dump(measurement, json_handle)
Beispiel #4
0
class Training(b2luigi.Task):
    """Train a fastBDT on train samples of the given off-resonance files and
    save the BDT weightfile to `bdt.xml`.

    One `SplitSample` task is required per off-resonance file; the
    `train.root` inputs are handed to the basf2 MVA teacher with
    "EventType" as target variable.

    Parameters:
        off_res_files (list): list of off-resonance files to be used for
            training,
        tree_name (str): name of the tree in the root file,
        training_variables (list): variables used for training,
            If you have multiple candidates in your selection, be aware that
            only the 0th candidate is used for training.
            This does not have any effect, if you only use event-based
            variables for training.
        training_parameters (dict): must provide "train_size" and
            "test_size"; the BDT hyper-parameters "nTrees", "shrinkage",
            "nLevels" and "nCuts" are optional and only applied when present
            and not None.
    """
    off_res_files = b2luigi.ListParameter(hashed=True)
    # NOTE(review): documented as a str above but declared as a ListParameter
    # -- confirm which one is intended before changing the parameter type.
    tree_name = b2luigi.ListParameter()
    training_variables = b2luigi.ListParameter(hashed=True)
    training_parameters = b2luigi.DictParameter(hashed=True)
    queue = "sx"

    def requires(self):
        """Yield one `SplitSample` requirement for each off-resonance file."""
        n_train = self.training_parameters["train_size"]
        n_test = self.training_parameters["test_size"]

        yield from (
            self.clone(
                SplitSample,
                ntuple_file=path,
                train_size=n_train,
                test_size=n_test,
            )
            for path in self.off_res_files
        )

    def output(self):
        """Declare ``bdt.xml`` as the single output of this task."""
        weightfile_target = self.add_to_output("bdt.xml")
        yield weightfile_target

    def run(self):
        """Configure the general and FastBDT options and start the teacher."""
        weightfile = self.get_output_file_name("bdt.xml")
        train_files = self.get_input_file_names("train.root")

        # General options: input files, weightfile identifier, tree, variables
        # and the target variable.
        general = basf2_mva.GeneralOptions()
        general.m_datafiles = basf2_mva.vector(*train_files)
        general.m_identifier = weightfile
        general.m_treename = self.tree_name
        general.m_variables = basf2_mva.vector(*self.training_variables)
        general.m_target_variable = "EventType"

        # FastBDT options: an attribute ``m_<key>`` is only overwritten when
        # the hyper-parameter ``<key>`` is given and not None.
        fastbdt = basf2_mva.FastBDTOptions()
        hyper_parameters = self.training_parameters
        for key in ("nTrees", "shrinkage", "nLevels", "nCuts"):
            value = hyper_parameters.get(key)
            if value is not None:
                setattr(fastbdt, "m_" + key, value)

        # teacher
        basf2_mva.teacher(general, fastbdt)