def convertTree(inputs, outFolder, name, treeName, indexVars):
    """
    Build a Dataset from the ROOT files listed in text files and process it.

    Args:
      inputs : Iterable of paths to text files. Every line of such a file that
               contains ".root" is passed on as an input file.
      outFolder : Output folder; passed to checkNcreateFolder and to the Dataset
      name : Name passed to the Dataset constructor
      treeName : Tree name passed to the Dataset constructor
      indexVars : Value assigned to the outputIndex attribute of the Dataset
    """
    logging.info("Starting conversion")
    checkNcreateFolder(outFolder)
    dataset = Dataset(name, outFolder, treeName)

    # Gather all lines mentioning a .root file from every list file.
    # Lines are taken verbatim (split on "\n" only, no stripping).
    rootFiles = []
    for listPath in inputs:
        with open(listPath, "r") as listFile:
            content = listFile.read()
        rootFiles.extend(entry for entry in content.split("\n") if ".root" in entry)

    logging.info("Setting files")
    dataset.addFiles(rootFiles)

    logging.info("Setting output branches")
    dataset.setOutputBranches("*")

    logging.debug("Setting indexing branches: %s", indexVars)
    dataset.outputIndex = indexVars

    logging.info("Starting processing dataset")
    # NOTE(review): the huge argument presumably acts as "no event limit";
    # confirm against Dataset.process
    dataset.process(999999999999999999999)
    logging.info("Finished processing")
def convertTree(config, treeName, category):
    """
    Set up a preprocessing Dataset from the configuration and process it.

    The dataset name is built as <outputPrefix>_<sampleName>_<category name>.

    Args:
      config : Configuration object. Read attributes: outputFolder, outputPrefix,
               sampleName, categories, sampleSelection, excludeBranches, files,
               outputVariables, indexVariables, addRatio, sampleSF, maxEvents
      treeName : Tree name passed to the Dataset constructor
      category : Key into config.categories selecting the category to process
    """
    logging.info("Starting conversion")
    checkNcreateFolder(config.outputFolder)

    categoryCfg = config.categories[category]
    datasetName = "_".join((config.outputPrefix, config.sampleName, categoryCfg.name))
    dataset = Dataset(datasetName, config.outputFolder, treeName)

    logging.info("Setting sample selection: %s", config.sampleSelection)
    dataset.sampleSelection = config.sampleSelection

    categorySelection = config.categories[category].selection
    logging.info("Setting category selection: %s", categorySelection)
    dataset.selection = categorySelection

    # Branches to ignore are optional in the configuration
    if config.excludeBranches is not None:
        dataset.ignoreBranches = config.excludeBranches

    logging.info("Setting files")
    dataset.addFiles(config.files)

    logging.info("Setting output branches")
    dataset.setOutputBranches(config.outputVariables)

    logging.debug("Setting indexing branches: %s", config.indexVariables)
    dataset.outputIndex = config.indexVariables

    # Optionally register the sample scale factor under the name "sampleRatio"
    if config.addRatio:
        dataset.setSF(config.sampleSF, "sampleRatio")

    logging.info("Starting processing dataset")
    dataset.process(config.maxEvents)
    logging.info("Finished processing")
def convertTreeMulti(config, treeName, category):
    """
    Process several samples into one combined output.

    One Dataset is created per entry in config.samples and processed with its
    own output disabled (skipOutput=True). The resulting dataframes are
    collected, concatenated and written through a copy of the Dataset of the
    first sample. The first Dataset is named with config.sampleName, all
    following ones with the name of the respective sample.

    Args:
      config : Configuration object. Read attributes: outputFolder, maxEvents,
               samples, outputPrefix, sampleName, sampleInfo, categories,
               excludeBranches, outputVariables, indexVariables, addRatio
      treeName : Tree name passed to every Dataset constructor
      category : Key into config.categories selecting the category to process
    """
    logging.info("Starting conversion using multi method")
    checkNcreateFolder(config.outputFolder)

    # NOTE(review): the remaining event budget is reduced by the length of each
    # processed dataframe and passed on as-is, even once it reaches zero or
    # below; how Dataset.process treats such values should be confirmed.
    eventsLeft = config.maxEvents
    sampleFrames = []
    baseDataset = None

    for iSample, sample in enumerate(config.samples):
        logging.info("Processing sample %s", sample)
        isFirst = iSample == 0

        datasetName = "_".join((
            config.outputPrefix,
            config.sampleName if isFirst else config.sampleInfo[sample].name,
            config.categories[category].name,
        ))
        dataset = Dataset(datasetName, config.outputFolder, treeName)

        info = config.sampleInfo[sample]
        logging.info("Setting sample selection: %s", info.selection)
        dataset.sampleSelection = info.selection

        categorySelection = config.categories[category].selection
        logging.info("Setting category selection: %s", categorySelection)
        dataset.selection = categorySelection

        # Branches to ignore are optional in the configuration
        if config.excludeBranches is not None:
            dataset.ignoreBranches = config.excludeBranches

        logging.info("Setting files")
        dataset.addFiles(config.sampleInfo[sample].files)

        logging.info("Setting output branches")
        dataset.setOutputBranches(config.outputVariables)

        logging.debug("Setting indexing branches: %s", config.indexVariables)
        dataset.outputIndex = config.indexVariables

        # Optionally register the per-sample scale factor as "sampleRatio"
        if config.addRatio:
            dataset.setSF(config.sampleInfo[sample].addSF, "sampleRatio")

        logging.info("Starting processing dataset")
        frame = dataset.process(eventsLeft, skipOutput=True)
        eventsLeft -= len(frame)
        sampleFrames.append(frame)

        # Keep a copy of the first, fully processed dataset; it writes the merged output
        if isFirst:
            baseDataset = copy(dataset)

    baseDataset.makeOutput(pd.concat(sampleFrames))
    logging.info("Finished processing")
def test_Dataset_process(mockTree, mocker):
    """
    Check Dataset.process for two mocked input files with a selection set.

    uproot.open is patched so that "file1.root" and any other file name yield
    differently modified copies of the mockTree fixture. The expected frame is
    the concatenation of both tree dataframes after applying the selection,
    with "branch2" dropped (it is not listed in the output branches).
    """
    selection = "branch1 >= 7 and branch2 >=2"

    newDataset = Dataset("someName")

    # Two independent trees that differ from the fixture in one branch each
    treeFile1 = copy.deepcopy(mockTree)
    treeFile2 = copy.deepcopy(mockTree)
    treeFile1.dataframe.update(pd.DataFrame({'branch2': list(range(11, 1, -1))}))
    treeFile1.setDF()
    treeFile2.dataframe.update(pd.DataFrame({'branch1': list(range(9, -1, -1))}))
    treeFile2.setDF()

    # Put the dataset into the state it would have after adding files and
    # setting the output branches
    newDataset.filesAdded = True
    newDataset.files = ["file1.root", "file2.root"]
    newDataset.branches = ["branch1", "branch2", "branch3"]
    newDataset.outputBranchesSet = True
    newDataset.outputBranches = ["branch1", "branch3"]

    def fakeOpen(*args, **kwargs):
        # Stand-in for uproot.open: the returned object is usable in a with
        # statement and hands out a fresh copy of the tree for the file
        handle = mocker.MagicMock()
        tree = treeFile1 if args[0] == "file1.root" else treeFile2
        handle.__enter__ = mocker.Mock(
            return_value={newDataset.treeName: copy.deepcopy(tree)}
        )
        return handle

    mocker.patch("uproot.open", mocker.MagicMock(side_effect=fakeOpen), create=True)

    newDataset.selection = selection

    selectedFrames = [
        treeFile1.dataframe.query(selection),
        treeFile2.dataframe.query(selection),
    ]
    expected = pd.concat(selectedFrames).drop(columns=["branch2"])

    outputDF = newDataset.process(skipOutput=True)

    assert outputDF.equals(expected)