def test_derived_variable(self, tmp_root_datafile):
     """Check that derived variables are calculated and added as columns.

     Two derived variables are requested through ``calc_vars_dict``: the sum
     of 'testvar1' and 'testvar2' (tagged with 'tree1') and twice 'testvar4'
     (tagged with 'tree2'). The output must contain the derived columns as
     well as the columns they were calculated from.
     """
     derived_vars = {
         'dev_var1': {
             'var_args': ['testvar1', 'testvar2'],
             'tree': 'tree1',
             'func': lambda x, y: x + y,
         },
         'dev_var2': {
             'var_args': ['testvar4'],
             'tree': 'tree2',
             'func': lambda x: 2 * x,
         },
     }
     # ``|`` already builds a new set, so the class-level set is not mutated
     vars_to_cut = self.test_vars_to_cut | {'dev_var1', 'dev_var2'}

     # start from the default expectation and add the extra columns
     expected_output = self.expected_output.copy()
     expected_output['testvar2'] = np.arange(1000) * 1.1
     expected_output['testvar4'] = np.arange(1000) * -1
     expected_output['dev_var1'] = (expected_output['testvar1'] +
                                    expected_output['testvar2'])
     expected_output['dev_var2'] = 2 * expected_output['testvar4']

     output = Dataset._build_dataframe(tmp_root_datafile,
                                       TTree_name=self.default_TTree,
                                       cut_list_dicts=self.test_cut_dicts,
                                       vars_to_cut=vars_to_cut,
                                       calc_vars_dict=derived_vars)

     # test column names are the same
     assert set(output.columns) == set(expected_output.columns)
     # test contents are the same; report the offending column on failure
     for col in output.columns:
         assert np.array_equal(output[col], expected_output[col]), \
             f"Dataframe builder failed in column {col};\n" \
             f"Expected: \n{expected_output[col]},\n" \
             f"Got: \n{output[col]}"
 def test_alt_trees(self, tmp_root_datafile):
     """Check dataframe building when a cut is tagged with another tree.

     An extra cut on 'testvar4', tagged with tree 'tree2', is appended to a
     copy of the default cuts. The expected frame is the default expectation
     with the 'testvar4' and 'eventNumber' columns set.
     """
     extra_cut = dict(
         name='cut 3',
         cut_var='testvar4',
         relation='<',
         cut_val=-10,
         group='var4cut',
         is_symmetric=False,
         tree='tree2',
     )
     # work on a copy so the shared default cut list stays untouched
     cuts = self.test_cut_dicts.copy()
     cuts.append(extra_cut)

     expected = self.expected_output.copy()
     expected['testvar4'] = np.arange(1000) * -1
     expected['eventNumber'] = np.arange(1000)

     result = Dataset._build_dataframe(
         tmp_root_datafile,
         TTree_name=self.default_TTree,
         cut_list_dicts=cuts,
         vars_to_cut=self.test_vars_to_cut,
     )

     # same set of columns
     assert set(result.columns) == set(expected.columns)
     # same contents, column by column
     for column in result.columns:
         wanted = expected[column]
         got = result[column]
         assert np.array_equal(got, wanted), (
             f"Dataframe builder failed in column {column};\n"
             f"Expected: \n{wanted},\n"
             f"Got: \n{got}"
         )
 def test_missing_branch(self, tmp_root_datafile):
     """Requesting 'missing1'/'missing2' must raise a ``ValueError``.

     The error message has to match the "Missing TBranch(es)" pattern that
     names TTree 'tree1'.
     """
     requested = {'missing1', 'missing2'}
     pattern = r"Missing TBranch\(es\) .* in TTree 'tree1' of file .*"
     # ``match`` is searched (re.search) in the string form of the exception
     with pytest.raises(ValueError, match=pattern):
         Dataset._build_dataframe(
             tmp_root_datafile,
             TTree_name=self.default_TTree,
             cut_list_dicts=self.test_cut_dicts,
             vars_to_cut=requested,
         )
 def test_missing_tree(self, tmp_root_datafile):
     """Passing the TTree name 'missing' must raise a ``ValueError``.

     The exception text is compared exactly, including the file path.
     """
     with pytest.raises(ValueError) as excinfo:
         Dataset._build_dataframe(
             tmp_root_datafile,
             TTree_name='missing',
             cut_list_dicts=self.test_cut_dicts,
             vars_to_cut=self.test_vars_to_cut,
         )

     actual_message = str(excinfo.value)
     wanted_message = f"TTree(s) 'missing' not found in file {tmp_root_datafile}"
     assert actual_message == wanted_message
 def test_duplicate_events_no_alt_tree(self,
                                       tmp_root_datafile_duplicate_events):
     """Building from the duplicate-events fixture must raise.

     Only the default cuts are used here; the exception text is compared
     exactly and reports 1000 duplicate events in the given file.
     """
     datafile = tmp_root_datafile_duplicate_events
     with pytest.raises(Exception) as excinfo:
         Dataset._build_dataframe(
             datafile,
             TTree_name=self.default_TTree,
             cut_list_dicts=self.test_cut_dicts,
             vars_to_cut=self.test_vars_to_cut,
         )

     actual_message = str(excinfo.value)
     wanted_message = f"Found 1000 duplicate events in datafile {tmp_root_datafile_duplicate_events}."
     assert actual_message == wanted_message
 def test_normal_input(self, tmp_root_datafile):
     """Build a dataframe from the default inputs and compare it, column by
     column, against ``self.expected_output``."""
     result = Dataset._build_dataframe(
         tmp_root_datafile,
         TTree_name=self.default_TTree,
         cut_list_dicts=self.test_cut_dicts,
         vars_to_cut=self.test_vars_to_cut,
     )
     reference = self.expected_output

     # same set of columns
     assert set(result.columns) == set(reference.columns)
     # same contents
     for column in result.columns:
         assert np.array_equal(result[column], reference[column]), \
             f"In column {column}\n Expected: \n{reference[column]}\n Got: \n{result[column]}"
 def test_multifile(self, tmp_root_datafiles):
     """Check the dataframe built from the ``tmp_root_datafiles`` fixture.

     The expected frame consists of three consecutive segments of 3000, 2000
     and 1000 rows. 'totalEventsWeighted' holds, for every row of a segment,
     the sum of that segment's 'weight_mc' values.
     """
     # 'weight_mc' per segment: +1 everywhere except a trailing run of -1.
     # Defined once and reused for 'weight_mc' and 'totalEventsWeighted'.
     segment_weights = (
         np.append(np.ones(2970), -1 * np.ones(30)),
         np.append(np.ones(1980), -1 * np.ones(20)),
         np.append(np.ones(990), -1 * np.ones(10)),
     )
     expected_output = pd.DataFrame({
         'testvar1':
         np.concatenate(
             (np.arange(3000), np.arange(2000), np.arange(1000))),
         'testvar3':
         np.concatenate(
             (np.arange(3000) * 3, np.arange(2000) * 2, np.arange(1000))),
         'weight_mc':
         np.concatenate(segment_weights),
         'eventNumber':
         np.concatenate(
             (np.arange(3000, 6000), np.arange(1000, 3000),
              np.arange(1000))),
         'totalEventsWeighted':
         np.concatenate(
             [np.full(len(w), w.sum()) for w in segment_weights]),
         # NOTE(review): test_mass_slices in this file uses the same fixture
         # and the same call but expects DSID (3, 2, 1) for these segments;
         # both expectations cannot hold at once. Confirm which DSIDs the
         # fixture actually writes and align the two tests.
         'DSID':
         np.concatenate(
             (np.full(3000, 1), np.full(2000, 2), np.full(1000, 1))),
         'weight_pileup':
         np.ones(6000),
     })
     output = Dataset._build_dataframe(tmp_root_datafiles,
                                       TTree_name=self.default_TTree,
                                       cut_list_dicts=self.test_cut_dicts,
                                       vars_to_cut=self.test_vars_to_cut)
     # test column names are the same
     assert set(output.columns) == set(expected_output.columns)
     # test contents are the same
     for col in output.columns:
         assert np.array_equal(output[col], expected_output[col]), \
             f"Dataframe builder failed in column {col};\n" \
             f"Expected: \n{expected_output[col]},\n" \
             f"Got: \n{output[col]}"
 def test_duplicate_events_alt_tree(self,
                                    tmp_root_datafile_duplicate_events):
     """Building from the duplicate-events fixture with an extra cut tagged
     with tree 'tree2' must raise.

     The exception text is compared exactly and names the 'tree1' TTree.
     """
     # Prepare the inputs outside ``pytest.raises`` so that an error while
     # building them surfaces as itself instead of being captured as the
     # expected exception.
     newcut = {
         'name': 'cut 3',
         'cut_var': 'testvar4',
         'relation': '<',
         'cut_val': -10,
         'group': 'var4cut',
         'is_symmetric': False,
         'tree': 'tree2'
     }
     # new list: the shared default cut list is left untouched
     newduplist = [*self.test_cut_dicts, newcut]

     with pytest.raises(Exception) as e:
         Dataset._build_dataframe(tmp_root_datafile_duplicate_events,
                                  TTree_name=self.default_TTree,
                                  cut_list_dicts=newduplist,
                                  vars_to_cut=self.test_vars_to_cut)
     assert str(e.value) == "Duplicated events in 'tree1' TTree"
 def test_mass_slices(self, tmp_root_datafiles):
     """Test input as 'mass slices'.

     The expected frame is three consecutive segments of 3000, 2000 and 1000
     rows, each with its own dataset ID and sum of weights.
     """
     sizes = (3000, 2000, 1000)

     # +1 weights with a trailing run of -1 (one in every hundred rows)
     weights = [
         np.append(np.ones(n - n // 100), -1 * np.ones(n // 100))
         for n in sizes
     ]
     expected = pd.DataFrame({
         'testvar1': np.concatenate([np.arange(n) for n in sizes]),
         'testvar3': np.concatenate(
             [np.arange(n) * factor for n, factor in zip(sizes, (3, 2, 1))]),
         'weight_mc': np.concatenate(weights),
         'eventNumber': np.concatenate(
             [np.arange(3000, 6000), np.arange(1000, 3000), np.arange(1000)]),
         'weight_pileup': np.ones(sum(sizes)),
         # dataset IDs
         'DSID': np.repeat([3, 2, 1], sizes),
         # sum of weights for events with same dataset IDs
         'totalEventsWeighted': np.repeat([2940, 1960, 980], sizes),
     })

     result = Dataset._build_dataframe(
         tmp_root_datafiles,
         TTree_name=self.default_TTree,
         cut_list_dicts=self.test_cut_dicts,
         vars_to_cut=self.test_vars_to_cut,
     )

     # same set of columns
     assert set(result.columns) == set(expected.columns)
     # same contents, column by column
     for column in result.columns:
         wanted = expected[column]
         got = result[column]
         assert np.array_equal(got, wanted), (
             f"Dataframe builder failed in column {column};\n"
             f"Expected: \n{wanted},\n"
             f"Got: \n{got}"
         )