def test_derived_variable(self, tmp_root_datafile):
    """Check that derived variables show up as columns with the right values.

    Two derived variables are requested through ``calc_vars_dict``:
    ``dev_var1`` (``testvar1 + testvar2``, tree ``tree1``) and ``dev_var2``
    (``2 * testvar4``, tree ``tree2``). The expected frame contains both the
    derived columns and the variables they are computed from.
    """
    derived_vars = {
        'dev_var1': {
            'var_args': ['testvar1', 'testvar2'],
            'tree': 'tree1',
            'func': lambda x, y: x + y,
        },
        'dev_var2': {
            'var_args': ['testvar4'],
            'tree': 'tree2',
            'func': lambda x: 2 * x,
        },
    }
    # The derived variable names are requested alongside the default ones.
    vars_to_cut = self.test_vars_to_cut.copy() | {'dev_var1', 'dev_var2'}

    # Work on a copy so the shared expected frame is not modified.
    expected_output = self.expected_output.copy()
    expected_output['testvar2'] = np.arange(1000) * 1.1
    expected_output['testvar4'] = np.arange(1000) * -1
    expected_output['dev_var1'] = (expected_output['testvar1']
                                   + expected_output['testvar2'])
    expected_output['dev_var2'] = 2 * expected_output['testvar4']

    output = Dataset._build_dataframe(tmp_root_datafile,
                                      TTree_name=self.default_TTree,
                                      cut_list_dicts=self.test_cut_dicts,
                                      vars_to_cut=vars_to_cut,
                                      calc_vars_dict=derived_vars)

    # test column names are the same (order is not compared)
    assert set(output.columns) == set(expected_output.columns)

    # test contents are the same; report the offending column on failure,
    # in the same format as the other dataframe-builder tests
    for col in output.columns:
        assert np.array_equal(output[col], expected_output[col]), \
            f"Dataframe builder failed in column {col};\n" \
            f"Expected: \n{expected_output[col]},\n" \
            f"Got: \n{output[col]}"
def test_alt_trees(self, tmp_root_datafile):
    """Check the built dataframe when one cut uses a variable from ``tree2``.

    An extra cut on ``testvar4`` (tree ``tree2``) is appended to the default
    cut list. The expected frame is the default one with ``testvar4`` and
    ``eventNumber`` columns set explicitly.
    """
    extra_cut = dict(
        name='cut 3',
        cut_var='testvar4',
        relation='<',
        cut_val=-10,
        group='var4cut',
        is_symmetric=False,
        tree='tree2',
    )
    # New list, so the shared default cut list is left untouched.
    cut_dicts = [*self.test_cut_dicts, extra_cut]

    expected = self.expected_output.copy()
    expected['testvar4'] = np.arange(1000) * -1
    expected['eventNumber'] = np.arange(1000)

    output = Dataset._build_dataframe(
        tmp_root_datafile,
        TTree_name=self.default_TTree,
        cut_list_dicts=cut_dicts,
        vars_to_cut=self.test_vars_to_cut,
    )

    # Column names must match as a set; order is not compared.
    assert set(output.columns) == set(expected.columns)

    # Column contents must match element by element.
    for col in output.columns:
        assert np.array_equal(output[col], expected[col]), (
            f"Dataframe builder failed in column {col};\n"
            f"Expected: \n{expected[col]},\n"
            f"Got: \n{output[col]}"
        )
def test_missing_branch(self, tmp_root_datafile):
    """Check that requesting the branches 'missing1'/'missing2' raises.

    The call is expected to raise ``ValueError`` with a message matching
    the "Missing TBranch(es) ... in TTree 'tree1' of file ..." pattern.
    """
    absent_branches = {'missing1', 'missing2'}

    with pytest.raises(ValueError) as exc_info:
        Dataset._build_dataframe(
            tmp_root_datafile,
            TTree_name=self.default_TTree,
            cut_list_dicts=self.test_cut_dicts,
            vars_to_cut=absent_branches,
        )

    # Checked after the ``with`` block: the captured exception is only
    # available once the context manager has exited.
    assert exc_info.match(
        r"Missing TBranch\(es\) .* in TTree 'tree1' of file .*")
def test_missing_tree(self, tmp_root_datafile):
    """Check that asking for a TTree named 'missing' raises ``ValueError``.

    The exception message is compared exactly, including the path of the
    data file supplied by the fixture.
    """
    with pytest.raises(ValueError) as exc_info:
        Dataset._build_dataframe(
            tmp_root_datafile,
            TTree_name='missing',
            cut_list_dicts=self.test_cut_dicts,
            vars_to_cut=self.test_vars_to_cut,
        )

    expected_message = (
        f"TTree(s) 'missing' not found in file {tmp_root_datafile}")
    assert str(exc_info.value) == expected_message
def test_duplicate_events_no_alt_tree(self,
                                      tmp_root_datafile_duplicate_events):
    """Check the error raised for duplicate events with the default cuts.

    Building a dataframe from the duplicate-events fixture is expected to
    raise, with a message reporting 1000 duplicate events and the file path.
    """
    with pytest.raises(Exception) as exc_info:
        Dataset._build_dataframe(
            tmp_root_datafile_duplicate_events,
            TTree_name=self.default_TTree,
            cut_list_dicts=self.test_cut_dicts,
            vars_to_cut=self.test_vars_to_cut,
        )

    expected_message = (
        f"Found 1000 duplicate events in datafile {tmp_root_datafile_duplicate_events}.")
    assert str(exc_info.value) == expected_message
def test_normal_input(self, tmp_root_datafile):
    """Check the dataframe built with the default tree, cuts and variables.

    The result is compared against ``self.expected_output``: same set of
    column names and identical contents in every column.
    """
    output = Dataset._build_dataframe(
        tmp_root_datafile,
        TTree_name=self.default_TTree,
        cut_list_dicts=self.test_cut_dicts,
        vars_to_cut=self.test_vars_to_cut,
    )
    reference = self.expected_output

    # Column names are compared as sets, so their order is irrelevant.
    assert set(output.columns) == set(reference.columns)

    # Every column must match the reference element by element.
    for col in output.columns:
        assert np.array_equal(output[col], reference[col]), (
            f"In column {col}\n Expected: \n{reference[col]}\n Got: \n{output[col]}"
        )
def test_multifile(self, tmp_root_datafiles):
    """Check the dataframe built from the ``tmp_root_datafiles`` fixture.

    The expected frame has 6000 rows made of three consecutive groups of
    3000, 2000 and 1000 events (presumably one group per input file --
    confirm against the fixture). Column names are compared as a set and
    column contents element by element.
    """
    # MC weights per group: +1 everywhere except the trailing 1% at -1.
    weights_3000 = np.append(np.ones(2970), -1 * np.ones(30))
    weights_2000 = np.append(np.ones(1980), -1 * np.ones(20))
    weights_1000 = np.append(np.ones(990), -1 * np.ones(10))

    expected_output = pd.DataFrame({
        'testvar1': np.concatenate(
            (np.arange(3000), np.arange(2000), np.arange(1000))),
        'testvar3': np.concatenate(
            (np.arange(3000) * 3, np.arange(2000) * 2, np.arange(1000))),
        'weight_mc': np.concatenate(
            (weights_3000, weights_2000, weights_1000)),
        'eventNumber': np.concatenate(
            (np.arange(3000, 6000), np.arange(1000, 3000), np.arange(1000))),
        # every event carries the summed MC weight of its own group
        'totalEventsWeighted': np.concatenate(
            (np.full(3000, sum(weights_3000)),
             np.full(2000, sum(weights_2000)),
             np.full(1000, sum(weights_1000)))),
        # NOTE(review): DSID 1 is used for both the first and the last
        # group while their totalEventsWeighted values differ -- confirm
        # this matches what the fixture and the builder are meant to do.
        'DSID': np.concatenate(
            (np.full(3000, 1), np.full(2000, 2), np.full(1000, 1))),
        'weight_pileup': np.ones(6000),
    })

    output = Dataset._build_dataframe(tmp_root_datafiles,
                                      TTree_name=self.default_TTree,
                                      cut_list_dicts=self.test_cut_dicts,
                                      vars_to_cut=self.test_vars_to_cut)

    # test column names are the same
    assert set(output.columns) == set(expected_output.columns)

    # test contents are the same; the failure message already reports the
    # offending column, so no extra printing is done here
    for col in output.columns:
        assert np.array_equal(output[col], expected_output[col]), \
            f"Dataframe builder failed in column {col};\n" \
            f"Expected: \n{expected_output[col]},\n" \
            f"Got: \n{output[col]}"
def test_duplicate_events_alt_tree(self, tmp_root_datafile_duplicate_events):
    """Check the error raised for duplicate events when a cut uses ``tree2``.

    An extra cut on ``testvar4`` (tree ``tree2``) is appended to the default
    cut list. Building a dataframe from the duplicate-events fixture is then
    expected to raise with the message
    "Duplicated events in 'tree1' TTree".
    """
    newcut = {
        'name': 'cut 3',
        'cut_var': 'testvar4',
        'relation': '<',
        'cut_val': -10,
        'group': 'var4cut',
        'is_symmetric': False,
        'tree': 'tree2',
    }
    # New list, so the shared default cut list is left untouched.
    newduplist = [*self.test_cut_dicts, newcut]

    # Only the call under test lives inside ``pytest.raises``: with a catch
    # as broad as ``Exception``, an error in the setup above must not be
    # captured as if it were the expected failure.
    with pytest.raises(Exception) as e:
        Dataset._build_dataframe(tmp_root_datafile_duplicate_events,
                                 TTree_name=self.default_TTree,
                                 cut_list_dicts=newduplist,
                                 vars_to_cut=self.test_vars_to_cut)

    assert str(e.value) == "Duplicated events in 'tree1' TTree"
def test_mass_slices(self, tmp_root_datafiles):
    """Test input as 'mass slices'.

    The expected frame has 6000 rows built from three consecutive slices of
    3000, 2000 and 1000 events, each with its own dataset ID and its own
    sum of weights.
    """
    # One row per slice:
    # (number of events, testvar3 scale, first eventNumber, DSID, sum of weights)
    slices = (
        (3000, 3, 3000, 3, 2940),
        (2000, 2, 1000, 2, 1960),
        (1000, 1, 0, 1, 980),
    )

    def signed_weights(n_events):
        # +1 for every event except the trailing 1%, which get -1
        n_negative = n_events // 100
        return np.append(np.ones(n_events - n_negative),
                         -1 * np.ones(n_negative))

    expected = pd.DataFrame({
        'testvar1': np.concatenate(
            [np.arange(n) for n, *_ in slices]),
        'testvar3': np.concatenate(
            [np.arange(n) * scale for n, scale, *_ in slices]),
        'weight_mc': np.concatenate(
            [signed_weights(n) for n, *_ in slices]),
        'eventNumber': np.concatenate(
            [np.arange(first, first + n) for n, _, first, *_ in slices]),
        'weight_pileup': np.ones(6000),
        # dataset IDs
        'DSID': np.concatenate(
            [np.full(n, dsid) for n, _, _, dsid, _ in slices]),
        # sum of weights for events with same dataset IDs
        'totalEventsWeighted': np.concatenate(
            [np.full(n, sum_w) for n, *_, sum_w in slices]),
    })

    output = Dataset._build_dataframe(
        tmp_root_datafiles,
        TTree_name=self.default_TTree,
        cut_list_dicts=self.test_cut_dicts,
        vars_to_cut=self.test_vars_to_cut,
    )

    # Column names must agree as a set; order is not compared.
    assert set(output.columns) == set(expected.columns)

    # Column contents must agree element by element.
    for col in output.columns:
        assert np.array_equal(output[col], expected[col]), (
            f"Dataframe builder failed in column {col};\n"
            f"Expected: \n{expected[col]},\n"
            f"Got: \n{output[col]}"
        )