Example #1
def submitAnalysisToGrid(configFile = None, channel = None, samples = None,
                         outputFilePath = None, jobId = None,
                         samplesToAnalyze = None, samplesToSkip = None,
                         disableFactorization = False,
                         disableSysUncertainties = False,
			 disableZrecoilCorrections = False,
                         create = True, submit = True,
                         cfgdir = 'crab',
                         inputFileMap = None, outputFileMap = None,
                         enableEventDumps = False,
                         enableFakeRates = False,
                         processName = None,
                         savePlots = True, saveFinalEvents = False,
                         outsideCERN = False,
                         useCastor = True,
			 doApplyCfgOptions = True):
    """
    Submit analysis job (event selection, filling of histogram)
    via crab
    """

    # check that configFile, channel, samples, outputFilePath and jobId
    # parameters are defined and non-empty
    if configFile is None:
        raise ValueError("Undefined configFile Parameter !!")
    if channel is None:
        raise ValueError("Undefined channel Parameter !!")
    if samples is None:
        raise ValueError("Undefined samples Parameter !!")
    if outputFilePath is None:
        raise ValueError("Undefined outputFilePath Parameter !!")
    if jobId is None:
        raise ValueError("Undefined jobId Parameter !!")

    # Loop over the samples to be analyzed
    for sample in samples['SAMPLES_TO_ANALYZE']:
        # Skip submitting the crab job in case
        #  o a list of samples to analyze has been explicitly specified
        #    and this sample is not in it
        #  o this sample has explicitly been requested to be skipped
        if samplesToAnalyze:
            if sample not in samplesToAnalyze:
                print "Skipping", sample
                continue
        if samplesToSkip:
            if sample in samplesToSkip:
                print "Skipping", sample
                continue
        print "Submitting ", sample

        sample_info = samples['RECO_SAMPLES'][sample]

        # Make job info
        jobInfo = {
            'channel' : channel,
            'sample' : sample,
            'id' : jobId
        }

        #if outsideCERN:
        #    configFile = "%s_cfg.py"% (channel)

        newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId, label = "@Grid")

        # Check if we want to use a special set of input files for the produced cfg file.
        # The file map is a function that takes a sample name and returns a list of
        # files corresponding to that sample.  If it returns None, no change will be
        # made.
        input_files = None
        if inputFileMap is not None:
            input_files = inputFileMap(sample)
            if input_files is None:
                print "Warning: No special input files specified for sample%s, using default." % sample
        output_file = None
        if outputFileMap is not None:
            output_file = outputFileMap(channel, sample, jobId)

        #--------------------------------------------------------------------
        # CV: temporary "hack" for producing (ED)Ntuples/skims
        jobCustomizations = []
        jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
        jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % output_file)
        #jobCustomizations.append("print process.dumpPython()")
        #--------------------------------------------------------------------    

        prepareConfigFile(
          configFile = configFile, jobInfo = jobInfo, newConfigFile = newConfigFile,
          sample_infos = samples,
          disableFactorization = disableFactorization,
          disableSysUncertainties = disableSysUncertainties,
          disableZrecoilCorrections = disableZrecoilCorrections,
          input_files = input_files, output_file = output_file,
          enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
          processName = processName,
          saveFinalEvents = saveFinalEvents,
          customizations = jobCustomizations,
          doApplyOptions = doApplyCfgOptions)

        output_files = []
        if output_file is not None:
            output_files.append(output_file)

        # Always include the plot files
        if savePlots:
            output_files.append("%s_%s_%s_%s.root" % (
              PLOT_FILES_PREFIX, jobInfo['channel'],
              jobInfo['sample'], jobInfo['id']))

        # Add our final event skim as well
        if saveFinalEvents:
            output_files.append("final_events_%s_%s_%s.root" % (
              jobInfo['channel'], jobInfo['sample'], jobInfo['id']))

        # Check if we need to reformat the output file path
        if outputFilePath.startswith('/castor/cern.ch'):
            outputFilePath = outputFilePath.replace('/castor/cern.ch','')

        saveNtuple = 0
        if 'saveNtuple' in sample_info.keys() and sample_info['saveNtuple'] is True:
            saveNtuple = 1

        # Build crab options
        crabOptions = {
            'number_of_jobs' : _number_of_jobs(sample_info),
            'datasetpath' : sample_info['datasetpath'],
            'dbs_url' : sample_info['dbs_url'],
            'user_remote_dir' : outputFilePath,
            'output_file' : ", ".join(output_files),
            'get_edm_output' : saveNtuple,
            # data and embedded-data samples are split by luminosity sections, MC samples by events
            'split_type' : 'lumis' if sample_info['type'] in ('Data', 'embeddedData') else 'events',
            'lumi_mask' : sample_info['lumi_mask'],
            'runselection' : sample_info['runselection'],
            'SE_white_list' : sample_info['SE_white_list'],
            'SE_black_list' : sample_info['SE_black_list']
        }

        if outsideCERN:
            crabOptions['use_server'] = 0
            crabOptions['scheduler'] = 'condor'

        if not useCastor:
            crabOptions['return_data'] = 1
            crabOptions['copy_data'] = 0        

        submitToGrid(newConfigFile, jobInfo, crabOptions,
                     create=create, submit=submit, cfgdir=cfgdir)
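
A minimal usage sketch for submitAnalysisToGrid follows; it is not part of the original module. All names (channel, sample, dataset path, cfg file, castor path) are hypothetical placeholders, and the dictionary keys mirror what the function above actually reads from samples['RECO_SAMPLES'].

# --- hedged usage sketch (hypothetical names and paths) ---------------------
samples = {
    'SAMPLES_TO_ANALYZE' : [ 'Ztautau_powheg' ],
    'RECO_SAMPLES' : {
        'Ztautau_powheg' : {
            'datasetpath'   : '/DYToTauTau_M-20/Fall11-v1/AODSIM',  # hypothetical dataset
            'dbs_url'       : '',                                    # left to the site default
            'type'          : 'MC',
            'lumi_mask'     : '',
            'runselection'  : '',
            'SE_white_list' : '',
            'SE_black_list' : ''
        }
    }
}

def myOutputFileMap(channel, sample, jobId):
    # hypothetical naming convention; called as outputFileMap(channel, sample, jobId)
    return "analysis_%s_%s_%s.root" % (channel, sample, jobId)

submitAnalysisToGrid(
    configFile = 'runZtoMuTauAnalysis_cfg.py',                       # hypothetical cfg file
    channel = 'ZtoMuTau', samples = samples, jobId = '2011Oct30',
    outputFilePath = '/castor/cern.ch/user/s/someuser/ZtoMuTau',     # hypothetical castor path
    outputFileMap = myOutputFileMap)
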
def submitAnalysisToLocal(configFile = None, channel = None, samples = None, jobId = None,
                          samplesToAnalyze = None, samplesToSkip = None,
                          disableFactorization = False,
                          disableSysUncertainties = False,
                          disableZrecoilCorrections = False,
                          maxEvents = 25000, maxJobsConcurrently = 8,
                          cfgdir = 'local', submit = False, logFilePath = None,
                          inputFileMap = None, outputFileMap = None,
                          enableEventDumps = False,
                          enableFakeRates = False,
                          processName = None,
                          saveFinalEvents = False):
    
    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that the configFile, channel, samples, jobId and logFilePath
    # parameters are defined and non-empty
    if configFile is None:
        raise ValueError("Undefined 'configFile' Parameter !!")
    if channel is None:
        raise ValueError("Undefined 'channel' Parameter !!")
    if samples is None:
        raise ValueError("Undefined 'samples' Parameter !!")
    if jobId is None:
        raise ValueError("Undefined 'jobId' Parameter !!")
    if logFilePath is None:
        raise ValueError("Undefined 'logFilePath' Parameter !!")

    cfgFiles = []

    # Loop over the samples to be analyzed
    for sample in samples['SAMPLES_TO_ANALYZE']:
        # Skip submitting the job in case
        #  o a list of samples to analyze has been explicitly specified
        #    and this sample is not in it
        #  o this sample has explicitly been requested to be skipped
        if samplesToAnalyze:
            if sample not in samplesToAnalyze:
                print "Skipping", sample
                continue
        if samplesToSkip:
            if sample in samplesToSkip:
                print "Skipping", sample
                continue

        sample_info = samples['RECO_SAMPLES'][sample]
        
        # Make job info
        jobInfo = {
            'channel' : channel,
            'sample' : sample,
            'id' : jobId
        }

        newConfigFile = getNewConfigFileName(configFile, cfgdir, sample, jobId, label = "@local")

        fileInfos = getInputFiles(jobInfo, inputFileMap, maxEvents = maxEvents)
        if len(fileInfos) > 0:
            print("Submitting %s in %i part(s)" % (sample, len(fileInfos)))
        else:
            print("No local input files for %s found !!" % sample)

        for job, fileInfo in enumerate(fileInfos):
            print("job %i:" % (job + 1))
            
            sample_infos = copy.deepcopy(samples)

            sample_infos['RECO_SAMPLES'][sample].update({ 'maxEvents'  : fileInfo['maxEvents']  })
            sample_infos['RECO_SAMPLES'][sample].update({ 'skipEvents' : fileInfo['skipEvents'] })

            input_files = [ fileInfo['fileName'] ]

            output_file = outputFileMap(channel, sample, jobId)
            if len(fileInfos) > 1:
                output_file = output_file.replace(".root", "_%i.root" % (job + 1))
                #print(" output_file = %s" % output_file)

            jobConfigFile = newConfigFile
            if len(fileInfos) > 1:
                jobConfigFile = jobConfigFile.replace("_cfg.py", "_%i_cfg.py" % (job + 1))

            #--------------------------------------------------------------------
            # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
            jobCustomizations = []
            jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
            jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % output_file)
            jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
            jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % output_file)
            HLTprocessName = 'HLT'
            if 'hlt' in sample_infos['RECO_SAMPLES'][sample].keys():
                HLTprocessName = sample_infos['RECO_SAMPLES'][sample]['hlt'].getProcessName()
            jobCustomizations.append("if hasattr(process, 'hltMu'):")
            jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
            jobCustomizations.append("process.patTrigger.processName = '%s'" % HLTprocessName)
            jobCustomizations.append("process.patTriggerEvent.processName = '%s'" % HLTprocessName)
            if sample_infos['RECO_SAMPLES'][sample]['type'] == 'Data':
                jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                        + " and hasattr(process, 'prePatProductionSequenceGen'):")
                jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
            #jobCustomizations.append("print process.dumpPython()")
            #--------------------------------------------------------------------

            prepareConfigFile(
              configFile = configFile, jobInfo = jobInfo, newConfigFile = jobConfigFile,
              sample_infos = sample_infos,
              disableFactorization = disableFactorization,
              disableSysUncertainties = disableSysUncertainties,
              disableZrecoilCorrections = disableZrecoilCorrections,  
              input_files = input_files, output_file = output_file,
              enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
              processName = processName,
              saveFinalEvents = saveFinalEvents,
              customizations = jobCustomizations)

            cfgFiles.append(jobConfigFile)

    # Build a script to run all jobs locally

    script = '''
#!/usr/bin/env python

import os
import subprocess
import shlex
import threading
import Queue
    
class Worker(threading.Thread):

    def __init__(self, work_queue):
        super(Worker, self).__init__()
        self.work_queue = work_queue

    def run(self):
        while True:
            try:
                cfgFileName = self.work_queue.get()
                self.process(cfgFileName)
            finally:
                self.work_queue.task_done()
    '''
    script += "\n" 
    script += "    def process(self, cfgFileName):\n"
    script += "        logFilePath = '%s'\n" % logFilePath
    script += "        logFileName = os.path.basename(cfgFileName).replace('_cfg.py', '.log')\n"
    script += "        commandLine = 'rm -f %s' % os.path.join(logFilePath, logFileName)\n"
    script += "        commandLine = 'nice cmsRun %s >& %s' % (cfgFileName, os.path.join(logFilePath, logFileName))\n"
    script += "        print('calling %s...' % commandLine)\n"
    script += "        subprocess.call(commandLine, shell = True)\n"
    script += "\n"  
    script += "work_queue = Queue.Queue()\n"
    script += "\n"
    script += "for iThread in range(%i):\n" % maxJobsConcurrently
    script += "    worker = Worker(work_queue)\n"
    script += "    worker.daemon = True\n"
    script += "    worker.start()\n"
    script += "\n"
    for cfgFile in cfgFiles:
        script += "work_queue.put('%s')\n" % cfgFile
    script += "\n"
    script += "work_queue.join()\n"

    scriptFileName = 'local/runAnalysis_%s_%s.py' % (channel, jobId)
    scriptFile = open(scriptFileName, 'w')
    scriptFile.write(script)
    scriptFile.close()

    print("Built %s script" % scriptFileName)

    if submit:
        subprocess.call("python %s" % scriptFileName, shell = True)
def submitAnalysisToLXBatch(configFile=None,
                            channel=None,
                            samples=None,
                            samplesToAnalyze=None,
                            samplesToSkip=None,
                            disableFactorization=False,
                            disableSysUncertainties=False,
                            disableZrecoilCorrections=False,
                            script_directory=None,
                            cfgdir='lxbatch',
                            inputFileMap=None,
                            outputFileMap=None,
                            outputDirectory=None,
                            queue='1nd',
                            enableEventDumps=False,
                            enableFakeRates=False,
                            processName=None,
                            changeTauId=None,
                            saveFinalEvents=False,
                            jobExtention=''):
    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that the configFile, channel, samples and outputDirectory
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples", "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter!!" % param)

    jobId = reg.getJobId(channel)

    # If not specified take script directory from user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure our output file for the scripts is okay
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory)
                    if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting the batch job in case
            #  o a list of samples to analyze has been explicitly specified
            #    and this sample is not in it
            #  o this sample has explicitly been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {'channel': channel, 'sample': sample, 'id': jobId}

            # Now build the scripts to feed to bsub
            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))

            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" %
                      (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, file in enumerate(input_files):

                input_files = [file]
                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [(None, file) for file in input_files]
                # Need to prepend file:, and strip off the directory since we
                # always have bsub rfcp the input files to the working
                # directory.
                input_files_for_cfgOptions = [
                    'file:' + os.path.basename(file) for file in input_files
                ]

                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(input_files,
                                                      add_time=False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(
                    outputDirectory,
                    output_file.replace(
                        '.root',
                        '_' + str(job) + '_' + input_file_hash + '.root'))

                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                #    print " done; skipping", output_file
                #    continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(configFile,
                                                     cfgdir,
                                                     sample,
                                                     jobId,
                                                     index=job,
                                                     label="@lxbatch")

                write_comment_header(submit_file, " cfg: " + newConfigFile)
                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append(
                    "if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append(
                    "    process.ntupleOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                jobCustomizations.append(
                    "if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append(
                    "    process.patTupleOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                jobCustomizations.append(
                    "if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append(
                    "    process.skimOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in samples['RECO_SAMPLES'][sample].keys():
                    HLTprocessName = samples['RECO_SAMPLES'][sample][
                        'hlt'].getProcessName()
                    jobCustomizations.append("if hasattr(process, 'hltMu'):")
                    jobCustomizations.append(
                        "    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')"
                        % HLTprocessName)
                    jobCustomizations.append(
                        "if hasattr(process, 'patTrigger'):")
                    jobCustomizations.append(
                        "    process.patTrigger.processName = '%s'" %
                        HLTprocessName)
                    jobCustomizations.append(
                        "if hasattr(process, 'patTriggerEvent'):")
                    jobCustomizations.append(
                        "    process.patTriggerEvent.processName = '%s'" %
                        HLTprocessName)
                if samples['RECO_SAMPLES'][sample]['type'] == 'Data':
                    jobCustomizations.append(
                        "if hasattr(process, 'prePatProductionSequence')" +
                        " and hasattr(process, 'prePatProductionSequenceGen'):"
                    )
                    jobCustomizations.append(
                        "    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)"
                    )
                    jobCustomizations.append(
                        "if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'tauGenJets')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genJets'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genJets')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')"
                    )
                jobCustomizations.append(
                    "if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):"
                )
                jobCustomizations.append(
                    "    process.patDefaultSequence.replace(process.patTriggerEventSequence,"
                )
                jobCustomizations.append(
                    "                                       process.patTriggerSequence + process.patTriggerEventSequence)"
                )
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile=configFile,
                    jobInfo=jobInfo,
                    newConfigFile=newConfigFile,
                    sample_infos=samples,
                    disableFactorization=disableFactorization,
                    disableSysUncertainties=disableSysUncertainties,
                    disableZrecoilCorrections=disableZrecoilCorrections,
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files=input_files_for_cfgOptions,
                    output_file=os.path.basename(output_file),
                    enableEventDumps=enableEventDumps,
                    enableFakeRates=enableFakeRates,
                    processName=processName,
                    saveFinalEvents=saveFinalEvents,
                    changeTauId=changeTauId,
                    customizations=jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')

                def log_file_maker(job_hash):
                    return os.path.join(
                        'lxbatch_log',
                        "_".join(['run', channel, sample, jobId, job_hash]) +
                        '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file,
                    input_files_and_jobs,
                    log_file_maker,
                    "cmsRun %s" % newConfigFile,
                    pass_io_files=False)

                bsub_script_file = os.path.join(
                    script_directory, "_".join([
                        'analyze' + jobExtention, sample, 'job',
                        str(job), input_file_hash
                    ]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)
                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" %
                                  (queue, bsub_script_file))

        print len(tmp_files)
        garbage = tmp_files - relevant_files
        print len(garbage)
        if garbage:
            print "Found %i files not generated by this job!!" % len(garbage)
            print " You should really run:"
            print " cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
            with open('ana_garbage.txt', 'w') as garbage_script:
                for file in garbage:
                    garbage_script.write('%s\n' %
                                         os.path.join(outputDirectory, file))
        print "Run ./%s to submit jobs" % submit_file_name
        os.chmod(submit_file_name, 0755)

        return submit_file_name
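
For the LXBatch variant, a hedged usage sketch (hypothetical names, paths and queue). Here both file maps take (channel, sample, jobId), matching how the function above calls them, and the return value is the name of the generated submission script.

# --- hedged usage sketch for LXBatch submission (hypothetical names) --------
samples = {
    'SAMPLES_TO_ANALYZE' : [ 'Ztautau_powheg' ],
    'RECO_SAMPLES' : { 'Ztautau_powheg' : { 'type' : 'MC' } }        # minimal, hypothetical
}

def myBatchInputFileMap(channel, sample, jobId):
    # hypothetical castor locations of the skims to be analyzed
    return [ '/castor/cern.ch/user/s/someuser/skims/skim_%s_%s_1.root' % (sample, jobId) ]

def myBatchOutputFileMap(channel, sample, jobId):
    return "analysis_%s_%s_%s.root" % (channel, sample, jobId)

submit_file = submitAnalysisToLXBatch(
    configFile = 'runZtoMuTauAnalysis_cfg.py',                       # hypothetical cfg file
    channel = 'ZtoMuTau', samples = samples,
    inputFileMap = myBatchInputFileMap, outputFileMap = myBatchOutputFileMap,
    outputDirectory = '/castor/cern.ch/user/s/someuser/lxbatchOut',  # hypothetical castor directory
    queue = '8nh')                                                   # LSF queue; default above is '1nd'
# submit_file is 'submit_lxbatch_analysis_<jobId>.sh'; run it to send the jobs to bsub.
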
Example #4
def submitAnalysisToLocal(configFile=None,
                          channel=None,
                          samples=None,
                          jobId=None,
                          samplesToAnalyze=None,
                          samplesToSkip=None,
                          disableFactorization=False,
                          disableSysUncertainties=False,
                          disableZrecoilCorrections=False,
                          maxEvents=25000,
                          maxJobsConcurrently=8,
                          cfgdir='local',
                          submit=False,
                          logFilePath=None,
                          inputFileMap=None,
                          outputFileMap=None,
                          enableEventDumps=False,
                          enableFakeRates=False,
                          processName=None,
                          saveFinalEvents=False):
    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that the configFile, channel, samples, jobId and logFilePath
    # parameters are defined and non-empty
    if configFile is None:
        raise ValueError("Undefined 'configFile' Parameter !!")
    if channel is None:
        raise ValueError("Undefined 'channel' Parameter !!")
    if samples is None:
        raise ValueError("Undefined 'samples' Parameter !!")
    if jobId is None:
        raise ValueError("Undefined 'jobId' Parameter !!")
    if logFilePath is None:
        raise ValueError("Undefined 'logFilePath' Parameter !!")

    cfgFiles = []

    # Loop over the samples to be analyzed
    for sample in samples['SAMPLES_TO_ANALYZE']:
        # Skip submitting the job in case
        #  o a list of samples to analyze has been explicitly specified
        #    and this sample is not in it
        #  o this sample has explicitly been requested to be skipped
        if samplesToAnalyze:
            if sample not in samplesToAnalyze:
                print "Skipping", sample
                continue
        if samplesToSkip:
            if sample in samplesToSkip:
                print "Skipping", sample
                continue

        sample_info = samples['RECO_SAMPLES'][sample]

        # Make job info
        jobInfo = {'channel': channel, 'sample': sample, 'id': jobId}

        newConfigFile = getNewConfigFileName(configFile,
                                             cfgdir,
                                             sample,
                                             jobId,
                                             label="@local")

        fileInfos = getInputFiles(jobInfo, inputFileMap, maxEvents=maxEvents)
        if len(fileInfos) > 0:
            print("Submitting %s in %i part(s)" % (sample, len(fileInfos)))
        else:
            print("No local input files for %s found !!" % sample)

        for job, fileInfo in enumerate(fileInfos):
            print("job %i:" % (job + 1))

            sample_infos = copy.deepcopy(samples)

            sample_infos['RECO_SAMPLES'][sample].update(
                {'maxEvents': fileInfo['maxEvents']})
            sample_infos['RECO_SAMPLES'][sample].update(
                {'skipEvents': fileInfo['skipEvents']})

            input_files = [fileInfo['fileName']]

            output_file = outputFileMap(channel, sample, jobId)
            if len(fileInfos) > 1:
                output_file = output_file.replace(".root",
                                                  "_%i.root" % (job + 1))
                #print(" output_file = %s" % output_file)

            jobConfigFile = newConfigFile
            if len(fileInfos) > 1:
                jobConfigFile = jobConfigFile.replace("_cfg.py",
                                                      "_%i_cfg.py" % (job + 1))

            #--------------------------------------------------------------------
            # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
            jobCustomizations = []
            jobCustomizations.append(
                "if hasattr(process, 'ntupleOutputModule'):")
            jobCustomizations.append(
                "    process.ntupleOutputModule.fileName = '%s'" % output_file)
            jobCustomizations.append(
                "if hasattr(process, 'skimOutputModule'):")
            jobCustomizations.append(
                "    process.skimOutputModule.fileName = '%s'" % output_file)
            HLTprocessName = 'HLT'
            if 'hlt' in sample_infos['RECO_SAMPLES'][sample].keys():
                HLTprocessName = sample_infos['RECO_SAMPLES'][sample][
                    'hlt'].getProcessName()
            jobCustomizations.append("if hasattr(process, 'hltMu'):")
            jobCustomizations.append(
                "    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')"
                % HLTprocessName)
            jobCustomizations.append("process.patTrigger.processName = '%s'" %
                                     HLTprocessName)
            jobCustomizations.append(
                "process.patTriggerEvent.processName = '%s'" % HLTprocessName)
            if sample_infos['RECO_SAMPLES'][sample]['type'] == 'Data':
                jobCustomizations.append(
                    "if hasattr(process, 'prePatProductionSequence')" +
                    " and hasattr(process, 'prePatProductionSequenceGen'):")
                jobCustomizations.append(
                    "    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)"
                )
                jobCustomizations.append(
                    "if hasattr(process, 'ntupleProducer'):")
                jobCustomizations.append(
                    "    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):"
                )
                jobCustomizations.append(
                    "        delattr(process.ntupleProducer.sources, 'tauGenJets')"
                )
                jobCustomizations.append(
                    "    if hasattr(process.ntupleProducer.sources, 'genJets'):"
                )
                jobCustomizations.append(
                    "        delattr(process.ntupleProducer.sources, 'genJets')"
                )
                jobCustomizations.append(
                    "    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):"
                )
                jobCustomizations.append(
                    "        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')"
                )
                jobCustomizations.append(
                    "    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):"
                )
                jobCustomizations.append(
                    "        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')"
                )
            #jobCustomizations.append("print process.dumpPython()")
            #--------------------------------------------------------------------

            prepareConfigFile(
                configFile=configFile,
                jobInfo=jobInfo,
                newConfigFile=jobConfigFile,
                sample_infos=sample_infos,
                disableFactorization=disableFactorization,
                disableSysUncertainties=disableSysUncertainties,
                disableZrecoilCorrections=disableZrecoilCorrections,
                input_files=input_files,
                output_file=output_file,
                enableEventDumps=enableEventDumps,
                enableFakeRates=enableFakeRates,
                processName=processName,
                saveFinalEvents=saveFinalEvents,
                customizations=jobCustomizations)

            cfgFiles.append(jobConfigFile)

    # Build a script to run all jobs locally

    script = '''
#!/usr/bin/env python

import os
import subprocess
import shlex
import threading
import Queue
    
class Worker(threading.Thread):

    def __init__(self, work_queue):
        super(Worker, self).__init__()
        self.work_queue = work_queue

    def run(self):
        while True:
            try:
                cfgFileName = self.work_queue.get()
                self.process(cfgFileName)
            finally:
                self.work_queue.task_done()
    '''
    script += "\n"
    script += "    def process(self, cfgFileName):\n"
    script += "        logFilePath = '%s'\n" % logFilePath
    script += "        logFileName = os.path.basename(cfgFileName).replace('_cfg.py', '.log')\n"
    script += "        commandLine = 'rm -f %s' % os.path.join(logFilePath, logFileName)\n"
    script += "        commandLine = 'nice cmsRun %s >& %s' % (cfgFileName, os.path.join(logFilePath, logFileName))\n"
    script += "        print('calling %s...' % commandLine)\n"
    script += "        subprocess.call(commandLine, shell = True)\n"
    script += "\n"
    script += "work_queue = Queue.Queue()\n"
    script += "\n"
    script += "for iThread in range(%i):\n" % maxJobsConcurrently
    script += "    worker = Worker(work_queue)\n"
    script += "    worker.daemon = True\n"
    script += "    worker.start()\n"
    script += "\n"
    for cfgFile in cfgFiles:
        script += "work_queue.put('%s')\n" % cfgFile
    script += "\n"
    script += "work_queue.join()\n"

    scriptFileName = 'local/runAnalysis_%s_%s.py' % (channel, jobId)
    scriptFile = open(scriptFileName, 'w')
    scriptFile.write(script)
    scriptFile.close()

    print("Built %s script" % scriptFileName)

    if submit:
        subprocess.call("python %s" % scriptFileName, shell=True)
def submitAnalysisToLXBatch(configFile = None, channel = None, samples = None,
                            samplesToAnalyze = None, samplesToSkip = None,
                            disableFactorization = False,
                            disableSysUncertainties = False,
                            disableZrecoilCorrections = False,
                            script_directory=None,
                            cfgdir = 'lxbatch',
                            inputFileMap = None, outputFileMap = None,
                            outputDirectory = None,
                            queue = '1nd',
                            enableEventDumps = False,
                            enableFakeRates = False,
                            processName = None,
                            changeTauId = None,
                            saveFinalEvents = False,
                            jobExtention = ''):

    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that the configFile, channel, samples and outputDirectory
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples",
                  "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter!!" % param)

    jobId = reg.getJobId(channel)

    # If not specified take script directory from user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure our output file for the scripts is okay
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory)
                    if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting the batch job in case
            #  o a list of samples to analyze has been explicitly specified
            #    and this sample is not in it
            #  o this sample has explicitly been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {
                'channel' : channel,
                'sample' : sample,
                'id' : jobId
            }

            # Now build the scripts to feed to bsub
            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))

            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" % (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, file in enumerate(input_files):

                input_files = [file]
                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [ (None, file) for file in input_files ]
                # Need to prepend file:, and strip off the directory since we
                # always have bsub rfcp the input files to the working
                # directory.
                input_files_for_cfgOptions = [
                    'file:' + os.path.basename(file) for file in input_files]


                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(
                    input_files, add_time=False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(outputDirectory, output_file.replace(
                    '.root', '_' + str(job) + '_' + input_file_hash + '.root'))

                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                    #print " done; skipping", output_file
                    #continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(
                    configFile, cfgdir, sample,
                    jobId, index = job, label = "@lxbatch")

                write_comment_header(submit_file, " cfg: " + newConfigFile)
                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append("if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append("    process.ntupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append("    process.patTupleOutputModule.fileName = '%s'" % os.path.basename(output_file))
                jobCustomizations.append("if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append("    process.skimOutputModule.fileName = '%s'" % os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in samples['RECO_SAMPLES'][sample].keys():
                    HLTprocessName = samples['RECO_SAMPLES'][sample]['hlt'].getProcessName()
                    jobCustomizations.append("if hasattr(process, 'hltMu'):")
                    jobCustomizations.append("    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')" % HLTprocessName)
                    jobCustomizations.append("if hasattr(process, 'patTrigger'):")
                    jobCustomizations.append("    process.patTrigger.processName = '%s'" % HLTprocessName)
                    jobCustomizations.append("if hasattr(process, 'patTriggerEvent'):")
                    jobCustomizations.append("    process.patTriggerEvent.processName = '%s'" % HLTprocessName)
                if samples['RECO_SAMPLES'][sample]['type'] == 'Data':
                    jobCustomizations.append("if hasattr(process, 'prePatProductionSequence')"
                                            + " and hasattr(process, 'prePatProductionSequenceGen'):")
                    jobCustomizations.append("    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)")
                    jobCustomizations.append("if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'tauGenJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genJets'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genJets')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')")
                    jobCustomizations.append("    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):")
                    jobCustomizations.append("        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')")
                jobCustomizations.append("if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):")
                jobCustomizations.append("    process.patDefaultSequence.replace(process.patTriggerEventSequence,")
                jobCustomizations.append("                                       process.patTriggerSequence + process.patTriggerEventSequence)")    
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile = configFile, jobInfo = jobInfo,
                    newConfigFile = newConfigFile,
                    sample_infos = samples,
                    disableFactorization = disableFactorization,
                    disableSysUncertainties = disableSysUncertainties,
                    disableZrecoilCorrections = disableZrecoilCorrections, 
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files = input_files_for_cfgOptions,
                    output_file = os.path.basename(output_file),
                    enableEventDumps = enableEventDumps, enableFakeRates = enableFakeRates,
                    processName = processName,
                    saveFinalEvents = saveFinalEvents,
                    changeTauId = changeTauId,
                    customizations = jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')
                def log_file_maker(job_hash):
                    return os.path.join(
                        'lxbatch_log', "_".join(
                        ['run', channel, sample, jobId, job_hash]) + '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file, input_files_and_jobs, log_file_maker,
                    "cmsRun %s" % newConfigFile, pass_io_files = False)

                bsub_script_file = os.path.join(
                    script_directory, "_".join([
                        'analyze'+jobExtention, sample, 'job',
                        str(job), input_file_hash]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)
                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" % (queue, bsub_script_file))

        print len(tmp_files)
        garbage = tmp_files - relevant_files
        print len(garbage)
        if garbage:
            print "Found %i files not generated by this job!!" % len(garbage)
            print " You should really run:"
            print " cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
            with open('ana_garbage.txt', 'w') as garbage_script:
                for file in garbage:
                    garbage_script.write(
                        '%s\n' % os.path.join(outputDirectory, file))
        print "Run ./%s to submit jobs" % submit_file_name
        os.chmod(submit_file_name, 0755)

        return submit_file_name