Example #1
    def run(self):
        CreateDirectory(self.download_dir(), False)
        CreateDirectory(self.final_directory(), False)
        #self.__datasets = sorted(self.__datasets, key=lambda x: GetPRW_datasetID(x))
        DownloadList = [
            'rucio download --ndownloader 5 --dir %s %s' %
            (self.download_dir(), ds) for ds in self.__datasets
            if ds not in self.__already_on_disk
        ]
        ExecuteCommands(ListOfCmds=DownloadList, MaxCurrent=16)
        self.clearFromDuplicates(self.download_dir())
        Files = []
        for dir in os.listdir(self.download_dir()):
            dir_path = "%s/%s" % (self.download_dir(), dir)
            if not os.path.isdir(dir_path): continue
            if not self.hasDataset(dir): continue
            Files += [
                "%s/%s" % (dir_path, F) for F in os.listdir(dir_path)
                if IsROOTFile(F)
            ]

        WriteList(sorted(Files), "%s/temp_in.txt" % (self.download_dir()))
        #   only 1 entry in the MCPileupReweighting tree per Channel/RunNumber combination is actually needed
        #   thus, remove all others but one in order to significantly reduce the file size!
        #   This is done by the SlimPRWFile macro in XAMPPbase/utils/
        MergeCmd = "SlimPRWFile --InList %s/temp_in.txt --outFile %s" % (
            self.download_dir(), self.final_file())
        print MergeCmd
        os.system(MergeCmd)
        print "INFO: Clean up the temporary file"
        os.system("rm %s/temp_in.txt " % (self.download_dir()))

        self.standaloneCheck()

        print "INFO: Done"
Example #2
 def events_in_prwFile(self, directory, ds):
     prw_configs = [
         "%s/%s/%s" % (directory, ds, f)
         for f in os.listdir("%s/%s" % (directory, ds)) if IsROOTFile(f)
     ]
     prw_helper = self.__setup_prw_helper(config_files=prw_configs)
     prw_period = prw_helper.getPRWperiods_fullsim()[0]
     return prw_helper.nEventsPerPRWperiod_full(GetPRW_datasetID(ds),
                                                prw_period)
Example #3
def OpenFiles(MyList):
    ROOTFiles = []
    for Entry in MyList:
        if IsROOTFile(Entry): ROOTFiles.append(ROOT.TFile.Open(Entry))
        elif IsTextFile(Entry):
            #### Adapt for the possibility that someone passes a XAMPPplotting config
            if Entry.endswith(".conf"):
                ROOTFiles += [ROOT.TFile.Open(File) for File in readXAMPPplottingInputConfig(Entry)]
            else:
                ROOTFiles += [ROOT.TFile.Open(Line) for Line in ReadListFromFile(Entry)]
    return ROOTFiles
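
OpenFiles and most of the other snippets rely on the IsROOTFile and IsTextFile predicates imported from the utility modules. Their real implementations are not reproduced here; a plausible sketch, assuming they are simple file-extension checks, would be:

import os

def is_root_file_sketch(path):
    ### Treat anything ending in .root (or .root.N as produced by rucio) as a ROOT file
    base = os.path.basename(path)
    return base.endswith(".root") or ".root." in base

def is_text_file_sketch(path):
    ### Plain-text input lists are typically .txt, .list or .conf files
    return os.path.basename(path).endswith((".txt", ".list", ".conf"))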
Example #4
    def __setup_prw_helper(self, config_files=[]):
        prw_helper = ROOT.XAMPP.PileupHelper(id_generator(24))
        prw_config_files = ROOT.std.vector(str)()
        for f in config_files:
            if IsROOTFile(f): prw_config_files.push_back(f)

        prw_helper.loadPRWperiod_fullsim(prw_config_files)

        if len(prw_helper.getPRWperiods_fullsim()) != 1:
            print "ERROR: Expected exactly one PRW period in the config files"
            exit(1)
        return prw_helper
Example #5
def ExecuteAthena(RunOptions, AthenaArgs):
    """
    @brief      Execute athena with options specified in run options and athena arguments
    
    @param      RunOptions  The run options (these are modified to satisfy athena style)
    @param      AthenaArgs  The athena arguments (are directly joined to the athena command)
    """
    ExeCmd = "athena.py %s %s" % (BringToAthenaStyle(
        RunOptions.jobOptions), " ".join(AthenaArgs))
    if RunOptions.outFile.find("/") != -1:
        print("INFO: Will execute Athena in directory " +
              RunOptions.outFile.rsplit("/", 1)[0])
        CreateDirectory(RunOptions.outFile.rsplit("/", 1)[0], False)
        os.chdir(RunOptions.outFile.rsplit("/", 1)[0])
    if RunOptions.outFile.find("/") == len(
            RunOptions.outFile) - 1 or not IsROOTFile(RunOptions.outFile):
        print("ERROR: Please give a file to save not only the directory")
        exit(1)

    # options to run with valgrind
    # ----------------------------------------------------------------------------------------------------
    if RunOptions.valgrind:
        if not any(
                os.access(os.path.join(path, 'valgrind'), os.X_OK)
                for path in os.environ["PATH"].split(os.pathsep)):
            print(
                "ERROR: valgrind not avaliable - you should set up an ATLAS release that contains it or install it manually"
            )
            exit(1)
    if RunOptions.valgrind == "callgrind":
        ExeCmd = "valgrind --suppressions=${ROOTSYS}/etc/valgrind-root.supp  --tool=callgrind --smc-check=all --num-callers=50  --trace-children=yes " + ExeCmd
        print "INFO: You are running with valgrind's callgrind! Execute command modified to:"
        print ExeCmd
    elif RunOptions.valgrind == "memcheck":
        ExeCmd += " --config-only=rec.pkl --stdcmalloc"
        print "INFO: You are running with valgrind's memcheck. First will create picke file with Athena configuration, then execute valgrind."
        print "Creating pickle file ..."
        print ExeCmd
        if os.system(ExeCmd):
            print("ERROR: Creating python pickle file with Athena has failed")
        print("Running valgrind and storing output in valgrind.log ...")
        print ExeCmd
        ExeCmd = "valgrind --suppressions=${ROOTSYS}/etc/valgrind-root.supp --leak-check=yes --trace-children=yes --num-callers=50 --show-reachable=yes --track-origins=yes --smc-check=all `which python` `which athena.py` --stdcmalloc rec.pkl 2>&1 | tee valgrind.log"
        print(
            "Explanation of output: https://twiki.cern.ch/twiki/bin/view/AtlasComputing/UsingValgrind"
        )
    # ----------------------------------------------------------------------------------------------------
    if os.system(ExeCmd):
        print("ERROR: Athena execeution failed")
        os.system("rm %s" % (RunOptions.outFile))
        exit(1)
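
ExecuteAthena only needs a RunOptions object carrying jobOptions, outFile and valgrind attributes plus the list of raw athena arguments. A hypothetical invocation (all values below are illustrative, not taken from the original scripts):

import argparse

### Hypothetical option values; in the real scripts they come from the XAMPPbase parsers
opts = argparse.Namespace(jobOptions="XAMPPbase/runXAMPPbase.py",
                          outFile="run/MyAnalysis.root",
                          valgrind=None)
### Everything after the standalone "-" is forwarded by athena to the user code
ExecuteAthena(opts, ["--evtMax 100", "-", "--noSyst"])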
Example #6
 def __extract_root_files(self, file_list=""):
     content = ReadListFromFile(file_list)
     if len(content) == 0:
         print "ERROR: The file %s is empty" % (in_ds)
         return []
     n_files_in_cont = len(content) - len(
         [c for c in content if IsROOTFile(c)])
     ### The list contains only ROOT files
     if n_files_in_cont == 0:
         return content
     ### It's a mixture
     elif n_files_in_cont != len(content):
         print "ERROR: You've a mixture of ROOT files and other stuff in %s" % (
             file_list)
         return []
     root_files = []
     for ds in content:
         root_files += self.__find_on_dcache(ds)
     return root_files
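
__extract_root_files reads its input through ReadListFromFile from ClusterSubmission.Utils, which is not reproduced on this page. A minimal sketch, assuming the helper simply returns the non-empty, non-comment lines of a text file:

def read_list_from_file_sketch(path):
    ### Return the stripped, non-empty, non-comment lines of a plain-text file list
    with open(path) as in_file:
        return [line.strip() for line in in_file
                if line.strip() and not line.strip().startswith("#")]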
Example #7
def main():
    parser = setupSubmitParser()
    options = parser.parse_args()
    cluster_engine = setup_engine(options)

    Spared_Files = []
    #### The previous round on the cluster screwed up but produced some results. There is no
    #### reason to reprocess them, so files that already succeeded are not submitted twice
    if len(options.SpareWhatsProcessedIn) > 0:
        print "INFO: Cluster did not perform so well last time? This little.. buttefingered.."
        for dirToSpare in options.SpareWhatsProcessedIn:
            if not os.path.isdir(dirToSpare):
                print "ERROR: I need a directory to look up %s" % (dirToSpare)
                exit(1)
            for finished in os.listdir(dirToSpare):
                if not IsROOTFile(finished): continue
                print "INFO: Yeah... %s has already beeen processed. Let's skip it.." % (
                    finished)
                Spared_Files.append(finished[:finished.rfind(".root")])

    Submit_Class = NtupleMakerSubmit(
        cluster_engine=cluster_engine,
        jobOptions=options.jobOptions.replace("share/", ""),
        input_ds=ClearFromDuplicates([
            ds for ds in options.inputDS
            if ds[:ds.rfind(".")] not in Spared_Files
            #  or ds not in  Spared_Files
        ]),
        run_time=options.RunTime,
        dcache_dir="%s/GroupDiskLists/%s" % (options.BaseProject, options.RSE),
        alg_opt=AssembleRemoteRunCmd(
            options,
            parser),  ### Extra options of the algorithm like noSyst... etc
        vmem=options.vmem,
        events_per_job=options.EventsPerJob,
        hold_jobs=options.HoldJob,
        files_per_merge=options.FilesPerMergeJob,
        final_split=options.FinalSplit,
    )
    Submit_Class.submit_job()
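
The list comprehension passed as input_ds keeps only the datasets whose name, stripped of its last extension, does not match the stem of an already produced ROOT file. A small illustration of the two string manipulations involved (the names are made up):

finished = "user.jdoe.mc16_13TeV.364100.myNtuple.root"
finished[:finished.rfind(".root")]    # -> 'user.jdoe.mc16_13TeV.364100.myNtuple'

ds = "user.jdoe.mc16_13TeV.364100.myNtuple.txt"
ds[:ds.rfind(".")]                    # -> 'user.jdoe.mc16_13TeV.364100.myNtuple'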
Example #8
def AssembleIO():
    #--------------------------------------------------------------
    # Reduce the event loop spam a bit
    #--------------------------------------------------------------
    from AthenaCommon.Logging import logging
    recoLog = logging.getLogger('MuonAnalysis I/O')
    recoLog.info('****************** STARTING the job *****************')

    if os.path.exists("%s/athfile-cache.ascii.gz" % (os.getcwd())):
        recoLog.info(
            "Old athfile-cache found. Will delete it, otherwise athena just freaks out."
        )
        os.system("rm %s/athfile-cache.ascii.gz" % (os.getcwd()))
    from GaudiSvc.GaudiSvcConf import THistSvc
    from AthenaCommon.JobProperties import jobproperties
    import AthenaPoolCnvSvc.ReadAthenaPool
    from AthenaCommon.AthenaCommonFlags import athenaCommonFlags as acf
    from AthenaServices.AthenaServicesConf import AthenaEventLoopMgr
    from AthenaCommon.AppMgr import ServiceMgr
    from ClusterSubmission.Utils import ReadListFromFile, ResolvePath, IsROOTFile
    from MuonAnalysis.Utils import IsTextFile
    ServiceMgr += AthenaEventLoopMgr(EventPrintoutInterval=1000000)

    ServiceMgr += THistSvc()
    OutFileName = "AnalysisOutput.root" if not "outFile" in globals(
    ) else outFile
    ServiceMgr.THistSvc.Output += [
        "MuonAnalysis DATAFILE='{}' OPT='RECREATE'".format(OutFileName)
    ]
    recoLog.info("Will save the job's output to " + OutFileName)
    ROOTFiles = []

    if "inputFile" in globals():
        recoLog.info("Use the following %s as input" % (inputFile))
        ROOTFiles = []
        ResolvedInFile = ResolvePath(inputFile)

        if inputFile.startswith('root://'):
            ROOTFiles.append(inputFile)

        elif ResolvedInFile and os.path.isfile(ResolvedInFile):
            if IsTextFile(ResolvedInFile):
                ROOTFiles = ReadListFromFile(ResolvedInFile)
            else:
                ROOTFiles.append(ResolvedInFile)

        elif ResolvedInFile and os.path.isdir(ResolvedInFile):
            for DirEnt in os.listdir(ResolvedInFile):
                if IsROOTFile(DirEnt):
                    if DirEnt.find(ResolvedInFile) != -1:
                        ROOTFiles.append(DirEnt)
                    else:
                        ROOTFiles.append("%s/%s" % (ResolvedInFile, DirEnt))
        else:
            raise RuntimeError("Invalid input " + inputFile)
        if len(ROOTFiles) == 0:
            raise RuntimeError("No ROOT files could be loaded as input")
        ServiceMgr.EventSelector.InputCollections = ROOTFiles
        acf.FilesInput = ROOTFiles

    if "nevents" in globals():
        recoLog.info("Only run on %i events" % (int(nevents)))
        theApp.EvtMax = int(nevents)
    if "nskip" in globals():
        recoLog.info("Skip the first %i events" % (int(nskip)))
        ServiceMgr.EventSelector.SkipEvents = int(nskip)
    """if isData(): recoLog.info("We're running over data today")
Example #9
    def __prepare_input(self, in_ds=""):
        print "INFO <_prepare_input>: Assemble configuration for %s" % (in_ds)
        ### Name to be piped to the job
        out_name = in_ds[in_ds.rfind("/") + 1:in_ds.rfind(".")] if IsTextFile(
            in_ds) or IsROOTFile(in_ds) else in_ds
        split_dir = "%s/Datasets/%s" % (self.split_cfg_dir(), out_name)
        root_files = []
        ### Now we need to find the corresponding ROOT files
        ### 1) The dataset is a root file itself
        if IsROOTFile(in_ds):
            root_files += [in_ds]
        ### 2) The given dataset is a .txt file
        elif IsTextFile(in_ds):
            ### Find the root files from there
            root_files = self.__extract_root_files(in_ds)
            if len(root_files) == 0: return False
        ### 3) The given dataset is a directory
        elif os.path.isdir(in_ds):
            if in_ds.endswith("/"):
                in_ds = in_ds[:in_ds.rfind("/")]
                out_name = in_ds[in_ds.rfind("/") + 1:]
            split_dir = "%s/Directory/%s" % (self.split_cfg_dir(), out_name)
            root_files = [
                "%s/%s" % (in_ds, F) for F in os.listdir(in_ds)
                if IsROOTFile(F)
            ]
        ### 4) It's a logical dataset stored on d-cache
        else:
            root_files = self.__find_on_dcache(in_ds)
        if len(root_files) == 0:
            print "ERROR: Could not associate anything to %s" % (in_ds)
            return False
        if len(out_name) == 0:
            print "ERROR: How should the output be called %s" % (in_ds)
            return False

        ### Assemble the splitting of the jobs
        main_list = "%s/AllROOTFiles.main" % (split_dir)
        files_in_main = ReadListFromFile(main_list) if os.path.exists(
            main_list) else []
        ### The list is unknown or the content of ROOT files has changed
        ### Redo the splitting again ;-)
        if len(files_in_main) != len(root_files) or not IsListIn(
                files_in_main, root_files):
            print "INFO: Assemble new split for %s" % (in_ds)
            CreateDirectory(split_dir, True)
            WriteList(root_files, main_list)
            os.system("CreateBatchJobSplit -I %s -O %s -EpJ %i" %
                      (main_list, split_dir, self.__events_per_job))
        ### Each of the lists contains the ROOT files to process per each sub job
        split_lists = [
            "%s/%s" % (split_dir, F) for F in os.listdir(split_dir)
            if IsTextFile(F)
        ]
        n_jobs = len(split_lists)
        subjob_outs = [
            "%s/%s_%d.root" % (self.engine().tmp_dir(), out_name, d)
            for d in range(n_jobs)
        ]

        assembled_in = [] if not os.path.exists(
            self.job_input()) else ReadListFromFile(self.job_input())
        assembled_out = [] if not os.path.exists(
            self.job_out_names()) else ReadListFromFile(self.job_out_names())
        start_reg = len(assembled_in)

        ### Write what we've assembled so far
        WriteList(assembled_in + split_lists, self.job_input())
        WriteList(assembled_out + subjob_outs, self.job_out_names())
        #### Submit the merge jobs
        self.__merge_interfaces += [
            self.engine().create_merge_interface(
                out_name=out_name,
                files_to_merge=subjob_outs,
                hold_jobs=[(self.engine().job_name(),
                            [start_reg + i + 1 for i in range(n_jobs)])],
                files_per_job=self.__files_per_merge_itr,
                final_split=self.__final_split)
        ]
        self.__nsheduled += n_jobs
        return True
Example #10
def AssembleAthenaOptions(RunOptions, Parser=None, IsRemote=False):
    """
    @brief      Assemble athena options from run options and argument parser.
                The athena arguments work like this (as documented here:
                https://gitlab.cern.ch/atlas/athena/blob/21.2/Control/AthenaCommon/python/AthArgumentParser.py#L2)
    
                The command line arguments in the athena call are first passed
                to athena. Every argument that should be passed to the user code
                needs to be prepended by a single additional `-`.
    
                Example:
    
                athena.py XAMPPbase/runXAMPPbase.py  --maxEvt 100 - --noSys
                -----------------------------------------------------------
                         | job option              | athena arg  | user arg
    
    @param      RunOptions  The run options
    @param      Parser      The parser
    @param      IsRemote    Flag to toggle option parsing for pathena instead of
                            athena for running on the grid
    
    @return     List with athena command line options
    """
    Options = []
    if not IsRemote and RunOptions.testJob:
        RunOptions.noSyst = True
        RunOptions.parseFilesForPRW = True
    athena_args = ["skipEvents", "evtMax", "filesInput"]
    local_only = ["outFile", "parseFilesForPRW"] + athena_args
    from XAMPPbase.SubmitToBatch import exclusiveBatchOpt
    from XAMPPbase.SubmitToGrid import exclusiveGridOpts

    black_listed = ["jobOptions", "valgrind"] + [
        x.dest for x in exclusiveBatchOpt()._actions
    ] + [x.dest for x in exclusiveGridOpts()._actions]
    attributes = [
        att for att in dir(RunOptions)
        if not att.startswith("_") and att not in black_listed
    ]
    attributes.sort(key=lambda x: x not in athena_args)
    ath_delimiter = False
    l_delim = -1
    for att in attributes:
        if ath_delimiter and att in athena_args: ath_delimiter = False
        if not ath_delimiter and not att in athena_args:
            ath_delimiter = True
            Options += ["-"]
            l_delim = len(Options)
        ### Skip all arguments which are default from the parser
        if IsArgumentDefault(getattr(RunOptions, att), att, Parser): continue
        if IsRemote and att in local_only: continue
        ### Attributes pointing to input files need to be expanded into an explicit file list
        if att == "filesInput" and (os.path.isfile(RunOptions.filesInput)
                                    and not IsROOTFile(RunOptions.filesInput)
                                    or os.path.isdir(RunOptions.filesInput)):
            Options += [
                "--%s '%s'" % (att, ",".join(
                    ReadListFromFile(RunOptions.filesInput)
                    if not os.path.isdir(RunOptions.filesInput) else [
                        "%s/%s" % (RunOptions.filesInput, item)
                        for item in os.listdir(RunOptions.filesInput)
                        if IsROOTFile(item)
                    ]))
            ]
        elif isinstance(getattr(RunOptions, att), bool):
            Options += ["--%s" % (att)]
        elif isinstance(getattr(RunOptions, att), list):
            Options += ["--%s %s" % (att, " ".join(getattr(RunOptions, att)))]
        else:
            Options += ["--%s %s" % (att, getattr(RunOptions, att))]
    ### No extra options were parsed. Get rid of the trailing -
    if len(Options) == l_delim:
        Options.pop()
    return Options
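
### Illustration with hypothetical values (not part of the original script): for a
### RunOptions namespace in which only evtMax=100 and the boolean noSyst=True differ
### from the parser defaults, the function returns the athena-level argument first,
### then the lone "-" delimiter, then the user argument:
###     ['--evtMax 100', '-', '--noSyst']
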
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog='CompareNTUP_PILEUP',
        description=
        'This script searches for NTUP_PILEUP derivations in rucio (or takes a given list) and sorts the datasets by their AMI-tags. Then it downloads and merges them accordingly.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--oldPRWDir', help='Path to the previous files', default=ResolvePath("XAMPPbase/PRWFiles"))
    parser.add_argument('--newPRWDir', help='Path to the new file', required=True)
    parser.add_argument('--uniteFiles',
                        help="Put everything which was in the old file also in the new one",
                        default=False,
                        action="store_true")
    RunOptions = parser.parse_args()

    files_in_old = [f for f in os.listdir(RunOptions.oldPRWDir) if IsROOTFile(f)]
    files_in_new = [f for f in os.listdir(RunOptions.newPRWDir) if IsROOTFile(f)]

    MyxSecDB = ROOT.SUSY.CrossSectionDB()
    for new in files_in_new:
        if not new in files_in_old:
            print "WARNING: Strange the file %s is new. Is it a new campaign?"
            continue

        chan_in_old = readPRWchannels("%s/%s" % (RunOptions.oldPRWDir, new))
        chan_in_new = readPRWchannels("%s/%s" % (RunOptions.newPRWDir, new))
        messages = []
        AnythingNew = False
        ### Compare the prw channels of both files
        for c in chan_in_new:
            if not c in chan_in_old:
Example #12
    def clearFromDuplicates(self, directory=""):
        print "INFO: Clear input of %s from duplicates and empty datasets" % (
            self.final_file())
        samples = []
        #### After downloading everything we need to clean it from the duplicates and remove
        #### all dataset containers which are empty
        #### To remove the empty datasets the fastest way is to check if the rucio download directory contains
        #### ROOT files
        for i in range(len(self.datasets())):
            ### We now pursue a new approach. It seems that extensions of a dataset are assigned
            ### to different NTUP_PILEUP p-tags so we need to download them all?
            ds_list = []
            dsid_to_check = GetPRW_datasetID(self.__datasets[i])
            tag = GetAMITagsMC(self.__datasets[i],
                               SkimPTag=True,
                               SkimSTag=False,
                               SkimETag=False)
            while i < len(self.datasets()) and (
                    GetPRW_datasetID(self.__datasets[i]) == dsid_to_check
                    and tag == GetAMITagsMC(self.__datasets[i],
                                            SkimPTag=True,
                                            SkimSTag=False,
                                            SkimETag=False)):
                ds = self.__datasets[i]
                smp_dir = "%s/%s" % (directory, ds)
                if os.path.isdir(smp_dir) and len(
                    [f for f in os.listdir(smp_dir) if IsROOTFile(f)]) > 0:
                    ds_list += [ds]
                i += 1

            if len(ds_list) == 0: continue
            if len(ds_list) > 1:
                ds_pairs = [(x, self.events_in_prwFile(directory, x))
                            for x in ds_list]
                ds_list = [
                    d[0]
                    for d in sorted(ds_pairs, key=lambda x: x[1], reverse=True)
                ]
            if self.__check_consistency:
                ### Setup the PileupHelper instance to read the prw config files
                ami_lookup = getAMIDataBase().getMCchannel(
                    dsid_to_check, "%s" % (self.campaign()))
                if not ami_lookup:
                    print "WARNING: The dataset %s does not exist in AMI at all. Interesting that we made prw files out of it" % (
                        ds)
                    continue

                config_file_tag = GetAMITagsMC(DS=ds_list[0],
                                               SkimPTag=True,
                                               SkimETag=False,
                                               SkimSTag=False)
                ev_in_ami = ami_lookup.getEvents(tag=config_file_tag)

                if ev_in_ami == -1:
                    print "WARNING: no AMI tag could be found for dataset %s " % (
                        ds)
                    for T in ami_lookup.getTags():
                        print "        --- %s: %d" % (
                            T, ami_lookup.getEvents(tag=T))
                    continue

                ds_to_add = []
                ev_in_prw = 0
                for ds in ds_list:
                    ev_in_ds = self.events_in_prwFile(directory, ds)
                    ev_in_prw += ev_in_ds
                    if ev_in_ds == ev_in_ami:
                        ds_to_add = [ds]
                        break
                    ### We still can add datasets
                    if ev_in_ami >= ev_in_prw:
                        ds_to_add += [ds]
                        if ev_in_ami == ev_in_prw: break

                if ev_in_prw != ev_in_ami:
                    print "WARNING: %s has different number of events in AMI (%d) vs. NTUP_PILEUP (%d)" % (
                        ds, ev_in_ami, ev_in_prw)
                    self.__inconsistent_log += [
                        "%s    %d  %d" % (ds, ev_in_ami, ev_in_prw)
                        for ds in ds_list
                    ]
                    ds_to_add = []
                ### Somehow we've got more events in the config file than in AMI... Definitely a candidate to blacklist
                if ev_in_ami < ev_in_prw: self.__to_black_list += ds_to_add

            samples += ds_list

        if self.__check_consistency:
            WriteList(samples, "%s/Finished.txt" % (self.download_dir()))
        new_dsids = ClearFromDuplicates(
            [GetPRW_datasetID(ds) for ds in samples])
        if len(self.dsids()) != len(new_dsids):
            self.__dsids = sorted(new_dsids)
            self.__purged = sorted([
                GetPRW_datasetID(ds) for ds in self.__datasets
                if GetPRW_datasetID(ds) not in self.dsids()
            ])
            print "INFO: %d dsids have been eliminated since all input files are invalid." % (
                len(self.purged()))

        #### for the removal the sorting is important
        #### 1) official vs. privately produced
        #### 2) Newer ptag vs old
        ##samples = sorted(samples, cmp=lambda x,y: PRWdatasetSorter(x,y))
        AOD_Samples = []
        for s in samples:
            AOD = "%d.%s" % (
                GetPRW_datasetID(s),
                GetAMITagsMC(s, SkimPTag=True, SkimETag=False, SkimSTag=False))
            if not AOD in AOD_Samples:
                self.__datasets += [s]
                AOD_Samples.append(AOD)

        print "INFO: Will merge %d files to %s" % (len(
            self.datasets()), self.final_file())
Example #13
 def _is_good_file(self, f):
     return IsROOTFile(f)