def make_harvest_scripts(
        plot_regex,
        skim_regex,
        channel="",
        sampleToAnalyze="",
        job_id=None,
        # An iterable that gives the input files
        input_source=None,
        # Pass input_files_info explicitly in case the input files
        # do not yet exist at the time when make_harvest_scripts is called
        input_files_info=None,
        # Allows switching between 'genericHarvester.py' and 'hadd'
        harvester_command=_HARVESTER_CMD,
        # Abort harvesting/merging scripts in case input files fail to get copied
        abort_on_rfcp_error=True,
        # Where to put the output
        castor_output_directory=None,
        script_directory=None,
        merge_script_name=None,
        local_copy_mapper=None,
        chunk_size=1e9,  # 1 GB
        max_input_files_per_chunk=50,
        run_harvesting=True,
        run_merging=True,
        check_old_files=True,
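        # Insert a pause into the submit script after roughly this many input
        # files have been touched, to stay below batch-system file-access limits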
        max_bsub_concurrent_file_access=2000,
        verbosity=1):
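    """
    Build shell scripts that submit LXBatch (bsub) jobs to harvest plot files
    and to merge skim files taken from input_source, writing the output to
    castor_output_directory.

    Returns a dictionary with the names of the generated scripts, the bsub
    job names and the final harvested files.
    """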

    # Get the jobId from the user registry
    if job_id is None:
        job_id = reg.getJobId(channel)

    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Create the directory where we store the scripts if it doesn't exist
    if not os.path.exists(script_directory):
        os.mkdir(script_directory)

    if input_files_info is None:
        # Get all files with nonzero size in the input castor directory
        print "Getting files to harvest from input, input_source = %s" % input_source
        input_files_info = [x for x in input_source if x['size']]
    #print "input_files_info = %s" % input_files_info

    print "Getting files from destination"
    # Get all the tmp files (that are non-zero)
    # Note: local_output_directory is also used further down as the output
    # directory of the merge dependency tree, so it must be defined in every
    # branch (otherwise the CASTOR/EOS cases raise a NameError); in those
    # cases the merged files are written alongside the other tmp files.
    if castor_output_directory.find("/castor") != -1:
        local_output_directory = castor_output_directory
        tmp_files_info = [
            x for x in castor_source(castor_output_directory) if x['size']
        ]
    elif castor_output_directory.find("/store") != -1:
        local_output_directory = castor_output_directory
        tmp_files_info = [
            x for x in eos_source(castor_output_directory) if x['size']
        ]
    else:
        if castor_output_directory.find(":") != -1:
            # Strip the leading "host:" prefix to get the local path
            local_output_directory = castor_output_directory[
                castor_output_directory.find(":") + 1:]
        else:
            local_output_directory = castor_output_directory
        tmp_files_info = []
        for x in os.listdir(local_output_directory):
            file_info = {
                'path': os.path.join(local_output_directory,
                                     os.path.basename(x)),
                'size': 1,  # dummy
                'time': time.localtime(),
                'file': os.path.basename(x),
                'permissions':
                'mrw-r--r--'  # "ordinary" file access permissions
            }
            tmp_files_info.append(file_info)
    tmp_files = set(x['file'] for x in tmp_files_info)
    #print "tmp_files_info = %s" % tmp_files_info

    # Make a repository of info about our files
    all_files_dict = {}
    for file_info in input_files_info + tmp_files_info:
        all_files_dict[file_info['path']] = file_info
    #print "all_files_dict = %s" % all_files_dict

    # Keep track of files that we put in tmp with these jobs and that we care
    # about.  We can stop caring about old files if after adding new files (i.e.
    # crab jobs finishing), the merge block they were in contains a different
    # set of files.
    relevant_tmp_files = set([])

    # Keep track of the final harvested output
    final_harvest_files = []

    harvest_script_name = "_".join(
        ['submit', channel, sampleToAnalyze, job_id, 'harvest']) + '.sh'

    harvest_log = open(
        '_'.join(
            ('harvest', channel, sampleToAnalyze, job_id, 'log')) + '.txt',
        'a')

    # Keep track of the names of lxbatch jobs
    bsub_job_names = []

    if run_harvesting:
        # Select those that match our given regex for a sample
        plot_file_map = defaultdict(list)
        # Keep track of a hash of the input files that go into each sample
        plot_matcher = re.compile(plot_regex)
        plot_source_hashes = defaultdict(hashlib.md5)
        for file in input_files_info:
            match = plot_matcher.match(file['file'])
            if match:
                full_file = file['path']
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze
                else:
                    sample = match.group('sample')
                plot_file_map[sample].append(full_file)
                plot_source_hashes[sample].update(full_file)

        harvest_log.write('=========== %s ================\n' % time.asctime())
        for sample in sorted(plot_source_hashes.keys()):
            # Write down the hash of the input files
            harvest_log.write(" ".join([
                channel, sample, job_id,
                '<harvest>'
                'input[%s]' % plot_source_hashes[sample].hexdigest()
            ]) + '\n')

        submit_file = open(harvest_script_name, 'w')
        # Make the bsub scripts
        submit_file.write("#!/bin/bash\n")
        # Now build mergers for each of the samples
        # Count how many bsubs we have created, so we can put pauses in to
        # thwart the rate limit.
        bsub_file_access_counter = 0
        for sample in plot_file_map.keys():
            #if sample.find('NoPU') == -1:
            #    continue
            # Add helpful comments
            write_comment_header(
                submit_file,
                "Harvesting channel %s, sample %s" % (channel, sample))
            print " Building harvesting for channel %s, sample %s" % (channel,
                                                                      sample)
            print " -- Found %i files to harvest" % len(plot_file_map[sample])
            # Build merge tree.  We add the source hash to the sample name.
            split = 6
            if 'PPmuX' in sample:
                print "High yield sample %s detected, setting split to 4" % sample
                split = 4
            merge_jobs = jobtools.make_merge_dependency_tree(
                "_".join([channel, sample, job_id]),
                plot_file_map[sample],
                local_output_directory,
                split=split)
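            # merge_jobs is a list of layers; each layer is a list of
            # (output_file, input_files) tuples, consumed in that form below.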
            #print "merge_jobs = %s" % merge_jobs
            # Only do work that hasn't been done before.  We can check and see
            # if the output of a given merge layer is already in the temp
            # directory.  As the filenames contain a suffix with the hash of the
            # input file names, we can be sure that if a file is out of date we
            # will notice.
            merge_jobs_needed = []
            files_to_build = set([])
            print " --- Generated %i harvest layers:" % len(merge_jobs)
            for i, layer in enumerate(merge_jobs):
                # Figure out how many files we need to build
                layer_jobs_needed = []
                for layer_job in layer:
                    #print "layer_job = ", layer_job
                    # Check if we've already built this output file in the tmp
                    file_base_name = os.path.basename(layer_job[0])
                    needed = True
                    # Check if we are rebuilding a dependency
                    building_a_dependency = any(file in files_to_build
                                                for file in layer_job[1])
                    if not building_a_dependency and file_base_name in tmp_files:
                        output_m_time = all_files_dict[layer_job[0]]['time']
                        out_of_date = False
                        for input_file in layer_job[1]:
                            #print "input_file = ", input_file
                            if (input_file not in all_files_dict
                                    or all_files_dict[input_file]['time'] >
                                    output_m_time):
                                # The output is missing an input or is older
                                # than one of its inputs, so it is out of date
                                print "File: %s is older than its dependency %s, rebuilding!" % (
                                    file_base_name, input_file)
                                out_of_date = True
                                break
                        if not out_of_date:
                            needed = False
                    if needed:
                        layer_jobs_needed.append(layer_job)
                    # Keep track of the relevant files, so we can delete old
                    # cruft
                    relevant_tmp_files.add(file_base_name)
                # Check if this is the final output layer
                if len(layer) == 1:
                    final_harvest_files.append((sample, layer[0][0]))
                print " ---- layer %i has %i jobs, of which %i not done" % (
                    i, len(layer), len(layer_jobs_needed))
                merge_jobs_needed.append(layer_jobs_needed)

            # Keep track of what jobId was used for a particular output file
            job_registry = {}
            # If a file is not produced by a job (it already exists in
            # CASTOR/EOS), then the job name returned is None.
            get_job_name = lambda x: job_registry.get(x)

            for ilayer, layer in enumerate(merge_jobs_needed):
                write_comment_header(submit_file, "Layer %i" % ilayer)
                submit_file.write(
                    "echo Submitting layer %i of channel %s, sample %s\n" %
                    (ilayer, channel, sample))
                for ijob, (output_file, input_files) in enumerate(layer):
                    # Get the job name (if it exists) and file name for the
                    # input files.
                    input_files_and_jobs = [(get_job_name(file), file)
                                            for file in input_files]

                    # Build a function that constructs our log file name given the
                    # job file hash.
                    if not os.path.exists('lxbatch_log'):
                        os.makedirs('lxbatch_log')

                    def log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log', "_".join(
                                ['harvest', job_hash,
                                 'layer_%i' % ilayer]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Build the script
                    job_name, script = jobtools.make_bsub_script(
                        output_file,
                        input_files_and_jobs,
                        log_file_maker,
                        harvester_command,
                        abort_on_rfcp_error=abort_on_rfcp_error,
                        label="_".join([
                            "harvest", channel, sample, "layer",
                            str(ilayer), "job",
                            str(ijob)
                        ]))
                    #print "job_name = %s" % job_name
                    bsub_job_names.append(job_name)

                    # Register our job
                    #print "--> registering channel %s, sample %s, jobId %s" % (channel, sample, job_id)
                    #print " script_directory = %s" % script_directory
                    job_registry[output_file] = job_name
                    #print " job_registry[%s] = %s" % (output_file, job_registry[output_file])
                    script_file = os.path.join(
                        script_directory, "_".join([
                            "harvest", channel, sample, job_id, "layer",
                            str(ilayer), "job",
                            str(ijob)
                        ]) + ".sh")
                    submit_file.write("bsub < %s\n" % script_file)
                    # Keep track of how many files we access
                    bsub_file_access_counter += split
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        submit_file.write("# thwart rate limit\n")
                        submit_file.write(
                            "echo Sleeping for 500 seconds, it is now:\n")
                        submit_file.write("date\n")
                        submit_file.write("sleep 500\n")
                    with open(script_file, 'w') as script_file:
                        script_file.write(script)
        submit_file.close()

    if run_merging:
        # Now build skim merging file
        # Select those that match our given regex for a sample
        skim_file_map = defaultdict(list)
        skim_fileinhash_map = defaultdict(hashlib.md5)
        skim_fileouthash_map = defaultdict(hashlib.md5)
        skim_matcher = re.compile(skim_regex)
        for file in input_files_info:
            match = skim_matcher.match(file['file'])
            if match:
                full_file = file['path']
                # Parse the sample from the regex
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze
                else:
                    sample = match.group('sample')
                # For the skims, keep track of the file size as well, since we
                # use it to group the jobs.
                skim_file_map[sample].append(
                    (file['time'], file['size'], full_file))
                # Keep track of the hash of all input files so we know what went
                # into our output
                skim_fileinhash_map[sample].update(full_file)

        def make_skim_name(sample, chunk, hash):
            " Generate a nice name for an output skim "
            return "_".join(["skim", sample, "chunk",
                             str(chunk), hash]) + ".root"
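        # e.g. "skim_<channel>_<sample>_<jobId>_chunk_0_<hash>.root", since the
        # caller below passes "_".join([channel, sample, job_id]) as 'sample'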

        if merge_script_name is None:
            merge_script_name = "_".join(
                ['submit', channel, sampleToAnalyze, job_id, 'merge']) + '.sh'
        with open(merge_script_name, 'w') as merge_script:
            merge_jobs_counter = 0
            bsub_file_access_counter = 0
            for sample in skim_file_map.keys():
                write_comment_header(merge_script, " Merging " + sample)
                print "Merging channel %s, sample %s" % (channel, sample)
                files = skim_file_map[sample]
                num_files = len(files)
                total_file_size = sum(map(lambda x: x[1], files)) / 1e6
                # Divide the job up into chunks that are about 1 GB each in size
                chunks = list(
                    jobtools.split(files, chunk_size,
                                   max_input_files_per_chunk, lambda x: x[1]))
                print " Total sample size: %i files, %i MB - splitting into %i chunks" % (
                    num_files, total_file_size, len(chunks)),
                # Keep track of jobs we are actually running
                skim_merge_jobs = []
                for ichunk, input_files in enumerate(chunks):
                    # Figure out the name for our file.  It contains a hash of its
                    # inputs.  We don't add the time, as we never have any LXBatch
                    # job dependencies.
                    just_the_files = [x[2] for x in input_files]
                    output_file = make_skim_name(
                        "_".join([channel, sample, job_id]), ichunk,
                        jobtools.hash_files(just_the_files, add_time=False))
                    skim_fileouthash_map[sample].update(output_file)

                    relevant_tmp_files.add(output_file)
                    if output_file not in tmp_files:
                        output_file_full_path = os.path.join(
                            castor_output_directory, output_file)
                        # Add "None" as the job id of the input files to indicate we
                        # don't care about any dependencies.  The index on x takes
                        # out only the filename, not the size or the time.
                        skim_merge_jobs.append((output_file_full_path,
                                                map(lambda x: (None, x[2]),
                                                    input_files)))

                print " -- %i chunks are already done, skipping" % (
                    len(chunks) - len(skim_merge_jobs))

                for ijob, (output_file,
                           input_files) in enumerate(skim_merge_jobs):

                    def merge_log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log',
                            "_".join(['merge', job_hash,
                                      'job_%i' % ijob]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Generate script contents
                    job_name, script = jobtools.make_bsub_script(
                        output_file,
                        input_files,
                        merge_log_file_maker,
                        _MERGER_CMD,
                        abort_on_rfcp_error=abort_on_rfcp_error,
                        label="_".join(
                            ["merge", channel, sample, "chunk",
                             str(ijob)]))
                    script_file = os.path.join(
                        script_directory, "_".join([
                            "merge", channel, sample, job_id, "chunk",
                            str(ijob)
                        ]) + ".sh")

                    # Add our bsub command
                    merge_script.write("bsub < %s\n" % script_file)
                    merge_jobs_counter += 1
                    bsub_file_access_counter += len(input_files)
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        merge_script.write("# thwart rate limit\n")
                        merge_script.write(
                            "echo Sleeping for 500 seconds, it is now:\n")
                        merge_script.write('date\n')
                        merge_script.write("sleep 500\n")
                    with open(script_file, 'w') as script_file:
                        script_file.write(script)

            print " Built %i merge jobs in %s" % (merge_jobs_counter,
                                                  merge_script_name)

        for sample in sorted(skim_fileinhash_map.keys()):
            # Write down the hash of the input files
            harvest_log.write(" ".join([
                job_id,
                sample,
                '<merge>'
                'input[%s]' % skim_fileinhash_map[sample].hexdigest(),
                'output[%s]' % skim_fileouthash_map[sample].hexdigest(),
            ]) + '\n')

    if check_old_files:
        # Compute all the extra tmp files
        extra_crap = tmp_files - relevant_tmp_files
        if extra_crap:
            print " Found %i extra files from previous harvest jobs" % len(
                extra_crap)
            print " Writing these files to file garbage.txt. To delete, please run: "
            print " cat garbage.txt | xargs -P 10 -n 1 rfrm "
            with open('garbage.txt', 'w') as garbage_file:
                for file in extra_crap:
                    full_path = os.path.join(castor_output_directory, file)
                    garbage_file.write(full_path + '\n')

    local_copy_script = 'copy_harvest_local_%s.sh' % job_id
    with open(local_copy_script, 'w') as copy_script:
        for sample, file in final_harvest_files:
            copy_script.write('rfcp %s %s &\n' %
                              (file, local_copy_mapper(sample)))
        copy_script.write('wait\n')

    if verbosity > 0:
        if run_harvesting:
            print "To harvest plots, run %s" % harvest_script_name
        if run_merging:
            print "To merge skims, run %s" % merge_script_name
        print "After harvesting is done, run %s to copy files locally" % local_copy_script

    # Make all our stuff executable
    os.chmod(local_copy_script, 0755)
    if run_harvesting:
        os.chmod(harvest_script_name, 0755)
    if run_merging:
        os.chmod(merge_script_name, 0755)

    # Return name of scripts
    retVal = {}
    retVal['harvest_script_name'] = harvest_script_name
    retVal['bsub_job_names'] = bsub_job_names
    retVal['final_harvest_files'] = final_harvest_files
    retVal['merge_script_name'] = merge_script_name
    retVal['local_copy_script'] = local_copy_script
    return retVal
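
# Minimal usage sketch for make_harvest_scripts, kept as a comment so that the
# module stays import-safe.  The regular expressions, CASTOR paths and the
# local_copy_mapper callable are hypothetical placeholders that only illustrate
# the call signature; note that the regexes must define a 'sample' named group
# when sampleToAnalyze is left empty.
#
#   scripts = make_harvest_scripts(
#       plot_regex=r"plots_(?P<sample>\w+)_.*\.root",
#       skim_regex=r"skim_(?P<sample>\w+)_.*\.root",
#       channel='ZtoMuTau',
#       input_source=castor_source('/castor/cern.ch/user/j/jdoe/plots'),
#       castor_output_directory='/castor/cern.ch/user/j/jdoe/harvested',
#       local_copy_mapper=lambda sample: 'harvested_%s.root' % sample)
#   print "Harvest submit script:", scripts['harvest_script_name']
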
def submitAnalysisToLXBatch(configFile=None,
                            channel=None,
                            samples=None,
                            samplesToAnalyze=None,
                            samplesToSkip=None,
                            disableFactorization=False,
                            disableSysUncertainties=False,
                            disableZrecoilCorrections=False,
                            script_directory=None,
                            cfgdir='lxbatch',
                            inputFileMap=None,
                            outputFileMap=None,
                            outputDirectory=None,
                            queue='1nd',
                            enableEventDumps=False,
                            enableFakeRates=False,
                            processName=None,
                            changeTauId=None,
                            saveFinalEvents=False,
                            jobExtention=''):
    """
    Submit analysis job (event selection, filling of histogram)
    to local machine
    """

    # check that the configFile, channel, samples and outputDirectory
    # parameters are defined and non-empty
    for param in ["configFile", "channel", "samples", "outputDirectory"]:
        if locals()[param] is None:
            raise ValueError("Undefined '%s' parameter!!" % param)

    jobId = reg.getJobId(channel)

    # If not specified take script directory from user preferences.
    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Make sure our output file for the scripts is okay
    if not os.path.exists(script_directory):
        os.makedirs(script_directory)

    # Get all the files in our output directory that have non-zero size
    tmp_files = set(x['file'] for x in castor.nslsl(outputDirectory)
                    if x['size'])

    # Keep track of the files we care about
    relevant_files = set([])

    submit_file_name = 'submit_lxbatch_analysis_' + jobId + '.sh'
    with open(submit_file_name, 'w') as submit_file:
        # Loop over the samples to be analyzed
        for sample in samples['SAMPLES_TO_ANALYZE']:
            write_comment_header(submit_file, " Sample: " + sample)
            # Skip submitting the batch job in case
            #  o a list of samples to be analyzed has been explicitly
            #    specified and does not contain this sample
            #  o the sample has explicitly been requested to be skipped
            if samplesToAnalyze:
                if sample not in samplesToAnalyze:
                    print "Skipping", sample
                    continue
            if samplesToSkip:
                if sample in samplesToSkip:
                    print "Skipping", sample
                    continue

            sample_info = samples['RECO_SAMPLES'][sample]

            # Make job info
            jobInfo = {'channel': channel, 'sample': sample, 'id': jobId}

            # Now build the scripts to feed to bsub
            # Find the input files
            input_files = list(inputFileMap(channel, sample, jobId))

            if len(input_files) > 0:
                print("Submitting %s in %i part(s)" %
                      (sample, len(input_files)))
            else:
                print("No local input files for %s found !!" % sample)

            for job, file in enumerate(input_files):
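                # Each batch job processes exactly one input file; rebinding
                # 'input_files' below does not affect this loop, since
                # enumerate() holds a reference to the original list.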

                input_files = [file]
                # The None in the tuple indicates this file has no dependencies in
                # the batch job.
                input_files_and_jobs = [(None, file) for file in input_files]
                # Need to prepend file:, and strip off the directory since we
                # always have bsub rfcp the input files to the working
                # directory.
                input_files_for_cfgOptions = [
                    'file:' + os.path.basename(file) for file in input_files
                ]

                output_file = outputFileMap(channel, sample, jobId)
                input_file_hash = jobtools.hash_files(input_files,
                                                      add_time=False)
                # Add the hash of the input file so we know the provenance of all
                # files
                output_file = os.path.join(
                    outputDirectory,
                    output_file.replace(
                        '.root',
                        '_' + str(job) + '_' + input_file_hash + '.root'))

                relevant_files.add(os.path.basename(output_file))

                # Uncomment to skip rerunning of old jobs
                #if os.path.basename(output_file) in tmp_files:
                #    print " done; skipping", output_file
                #    continue

                # First, prepare the configuration file
                newConfigFile = getNewConfigFileName(configFile,
                                                     cfgdir,
                                                     sample,
                                                     jobId,
                                                     index=job,
                                                     label="@lxbatch")

                write_comment_header(submit_file, " cfg: " + newConfigFile)
                #--------------------------------------------------------------------
                # CV: temporary "hack" for producing (ED)Ntuples/skims for tau id. efficiency measurement
                jobCustomizations = []
                jobCustomizations.append(
                    "if hasattr(process, 'ntupleOutputModule'):")
                jobCustomizations.append(
                    "    process.ntupleOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                jobCustomizations.append(
                    "if hasattr(process, 'patTupleOutputModule'):")
                jobCustomizations.append(
                    "    process.patTupleOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                jobCustomizations.append(
                    "if hasattr(process, 'skimOutputModule'):")
                jobCustomizations.append(
                    "    process.skimOutputModule.fileName = '%s'" %
                    os.path.basename(output_file))
                HLTprocessName = 'HLT'
                if 'hlt' in samples['RECO_SAMPLES'][sample].keys():
                    HLTprocessName = samples['RECO_SAMPLES'][sample][
                        'hlt'].getProcessName()
                    jobCustomizations.append("if hasattr(process, 'hltMu'):")
                    jobCustomizations.append(
                        "    process.hltMu.selector.src = cms.InputTag('TriggerResults::%s')"
                        % HLTprocessName)
                    jobCustomizations.append(
                        "if hasattr(process, 'patTrigger'):")
                    jobCustomizations.append(
                        "    process.patTrigger.processName = '%s'" %
                        HLTprocessName)
                    jobCustomizations.append(
                        "if hasattr(process, 'patTriggerEvent'):")
                    jobCustomizations.append(
                        "    process.patTriggerEvent.processName = '%s'" %
                        HLTprocessName)
                if samples['RECO_SAMPLES'][sample]['type'] == 'Data':
                    jobCustomizations.append(
                        "if hasattr(process, 'prePatProductionSequence')" +
                        " and hasattr(process, 'prePatProductionSequenceGen'):"
                    )
                    jobCustomizations.append(
                        "    process.prePatProductionSequence.remove(process.prePatProductionSequenceGen)"
                    )
                    jobCustomizations.append(
                        "if hasattr(process, 'ntupleProducer'):")
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'tauGenJets'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'tauGenJets')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genJets'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genJets')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genPhaseSpaceEventInfo')"
                    )
                    jobCustomizations.append(
                        "    if hasattr(process.ntupleProducer.sources, 'genPileUpEventInfo'):"
                    )
                    jobCustomizations.append(
                        "        delattr(process.ntupleProducer.sources, 'genPileUpEventInfo')"
                    )
                jobCustomizations.append(
                    "if hasattr(process, 'patTriggerEventSequence') and hasattr(process, 'patTriggerSequence'):"
                )
                jobCustomizations.append(
                    "    process.patDefaultSequence.replace(process.patTriggerEventSequence,"
                )
                jobCustomizations.append(
                    "                                       process.patTriggerSequence + process.patTriggerEventSequence)"
                )
                #jobCustomizations.append("print process.dumpPython()")
                #--------------------------------------------------------------------

                prepareConfigFile(
                    configFile=configFile,
                    jobInfo=jobInfo,
                    newConfigFile=newConfigFile,
                    sample_infos=samples,
                    disableFactorization=disableFactorization,
                    disableSysUncertainties=disableSysUncertainties,
                    disableZrecoilCorrections=disableZrecoilCorrections,
                    # We always copy the input files to the local directory
                    # before running cmsRun, so just take the basename
                    input_files=input_files_for_cfgOptions,
                    output_file=os.path.basename(output_file),
                    enableEventDumps=enableEventDumps,
                    enableFakeRates=enableFakeRates,
                    processName=processName,
                    saveFinalEvents=saveFinalEvents,
                    changeTauId=changeTauId,
                    customizations=jobCustomizations)

                # Build a function that constructs our log file name given the
                # job file hash.
                if not os.path.exists('lxbatch_log'):
                    os.makedirs('lxbatch_log')

                def log_file_maker(job_hash):
                    return os.path.join(
                        'lxbatch_log',
                        "_".join(['run', channel, sample, jobId, job_hash]) +
                        '.log')

                # Build our batch job
                jobname, script = jobtools.make_bsub_script(
                    output_file,
                    input_files_and_jobs,
                    log_file_maker,
                    "cmsRun %s" % newConfigFile,
                    pass_io_files=False)

                bsub_script_file = os.path.join(
                    script_directory, "_".join([
                        'analyze' + jobExtention, sample, 'job',
                        str(job), input_file_hash
                    ]) + '.sh')
                with open(bsub_script_file, 'w') as bsub_script:
                    bsub_script.write(script)
                # Add this bsub to our submission script
                submit_file.write("bsub -q %s < %s\n" %
                                  (queue, bsub_script_file))

        print "Found %i files in the output directory" % len(tmp_files)
        garbage = tmp_files - relevant_files
        if garbage:
            print "Found %i files not generated by this job!!" % len(garbage)
            print " You should really run:"
            print " cat ana_garbage.txt | xargs -n 1 -P 10 rfrm"
            with open('ana_garbage.txt', 'w') as garbage_script:
                for file in garbage:
                    garbage_script.write('%s\n' %
                                         os.path.join(outputDirectory, file))
        print "Run ./%s to submit jobs" % submit_file_name
        os.chmod(submit_file_name, 0755)

        return submit_file_name
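
# Minimal, commented-out usage sketch for submitAnalysisToLXBatch.  The config
# file name, sample dictionary contents and file-map callables below are
# hypothetical placeholders (only the dictionary keys actually read above are
# meaningful), not values shipped with this package.
#
#   samples = {
#       'SAMPLES_TO_ANALYZE': ['Ztautau'],
#       'RECO_SAMPLES': {'Ztautau': {'type': 'MC'}},
#   }
#   submitAnalysisToLXBatch(
#       configFile='my_analysis_cfg.py',
#       channel='ZtoMuTau',
#       samples=samples,
#       inputFileMap=lambda channel, sample, jobId: ['skim_%s.root' % sample],
#       outputFileMap=lambda channel, sample, jobId: 'plots_%s.root' % sample,
#       outputDirectory='/castor/cern.ch/user/j/jdoe/analysis')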
Example #4
0
def make_harvest_scripts(plot_regex, skim_regex,
                         channel = "",
                         sampleToAnalyze = "",
                         job_id = None,
                         # An iterable that gives the input files
                         input_source = None,
                         # Pass input_files_info explicitely in case input files
                         # do not yet exist an time when make_harvest_scripts is called
                         input_files_info = None,
                         # Allow to switch between 'genericHarvester.py' and 'hadd'
                         harvester_command = _HARVESTER_CMD,
                         # Abort harvesting/merging scripts in case input files fail to get copied
                         abort_on_rfcp_error = True,
                         # Where to put the output
                         castor_output_directory = None,
                         script_directory = None,
                         merge_script_name = None,
                         local_copy_mapper = None,
                         chunk_size = 1e9, # 1 GB
                         max_input_files_per_chunk = 50,
                         run_harvesting = True,
                         run_merging = True,
                         check_old_files = True,
                         max_bsub_concurrent_file_access = 2000,
                         verbosity = 1):

    # Get the jobId from the user registry
    if job_id is None:
        job_id = reg.getJobId(channel)

    if script_directory is None:
        script_directory = reg.getHarvestScriptLocation()

    # Create the directory where we store the scripts if it doesn't exist
    if not os.path.exists(script_directory):
        os.mkdir(script_directory)

    if input_files_info is None:
        # Get all files with nonzero size in the input castor directory
        print "Getting files to harvest from input, input_source = %s" % input_source
        input_files_info = [ x for x in input_source if x['size'] ]
    #print "input_files_info = %s" % input_files_info

    print "Getting files from destination"
    # Get all the tmp files (that are non-zero)
    if castor_output_directory.find("/castor") != -1:
        tmp_files_info = [ x for x in castor_source(castor_output_directory) if x['size'] ]
    elif castor_output_directory.find("/store") != -1:
        tmp_files_info = [ x for x in eos_source(castor_output_directory) if x['size'] ]
    else:
        local_output_directory = None
        if castor_output_directory.find(":") != -1:
            local_output_directory = castor_output_directory[castor_output_directory.find(":") + 1:]
        else:
            local_output_directory = castor_output_directory
        tmp_files_info = []
        for x in os.listdir(local_output_directory):
            file_info = {
                'path'        : os.path.join(local_output_directory, os.path.basename(x)),
                'size'        : 1,           # dummy
                'time'        : time.localtime(),
                'file'        : os.path.basename(x),
                'permissions' : 'mrw-r--r--' # "ordinary" file access permissions
            }
            tmp_files_info.append(file_info)
    tmp_files = set(x['file'] for x in tmp_files_info)
    #print "tmp_files_info = %s" % tmp_files_info

    # Make a repository of info about our files
    all_files_dict = {}
    for file_info in input_files_info + tmp_files_info:
        all_files_dict[file_info['path']] = file_info
    #print "all_files_dict = %s" % all_files_dict

    # Keep track of files that we put in tmp with these jobs and that we care
    # about.  We can stop caring about old files if after adding new files (i.e.
    # crab jobs finishing), the merge block they were in contains a different
    # set of files.
    relevant_tmp_files = set([])

    # Keep track of the final harvested output
    final_harvest_files = []

    harvest_script_name = "_".join(['submit', channel, sampleToAnalyze, job_id, 'harvest']) + '.sh'
        
    harvest_log = open('_'.join(('harvest', channel, sampleToAnalyze, job_id, 'log')) + '.txt', 'a')

    # Keep track of the names of lxbatch jobs
    bsub_job_names = []

    if run_harvesting:
        # Select those that match our given regex for a sample
        plot_file_map = defaultdict(list)
        # Keep track of a hash of
        plot_matcher = re.compile(plot_regex)
        plot_source_hashes = defaultdict(hashlib.md5)
        for file in input_files_info:
            match = plot_matcher.match(file['file'])
            if match:
                full_file = file['path']
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze
                else:
                    sample = match.group('sample')
                plot_file_map[sample].append(full_file)
                plot_source_hashes[sample].update(full_file)

        harvest_log.write('=========== %s ================' % time.asctime())
        for sample in sorted(plot_source_hashes.keys()):
            # Write down the hash of the input files
            harvest_log.write(
                " ".join([channel, sample, job_id, '<harvest>' 'input[%s]' %
                          plot_source_hashes[sample].hexdigest()])+'\n')

        submit_file = open(harvest_script_name, 'w')
        # Make the bsub scripts
        submit_file.write("#!/bin/bash\n")
        # Now build mergers for each of the samples
        # Count how many bsubs we have created, so we can put pauses in to
        # thwart the rate limit.
        bsub_file_access_counter = 0
        for sample in plot_file_map.keys():
            #if sample.find('NoPU') == -1:
            #    continue
            # Add helpful comments
            write_comment_header(submit_file, "Harvesting channel %s, sample %s" % (channel, sample))
            print " Building harvesting for channel %s, sample %s" % (channel, sample)
            print " -- Found %i files to harvest" % len(plot_file_map[sample])
            # Build merge tree.  We add the source has to the sample name.
            split = 6
            if 'PPmuX' in sample:
                print "High yield sample %s detected, setting split to 4" % sample
                split = 4
            merge_jobs = jobtools.make_merge_dependency_tree(
                "_".join([channel, sample, job_id]), plot_file_map[sample],
                local_output_directory, split = split)
            #print "merge_jobs = %s" % merge_jobs
            # Only do work that hasn't been done before.  We can check and see
            # if the output of a given merge layer is already in the temp
            # directory.  As the filenames contain a suffix with the hash of the
            # input file names, we can be sure that if a file is out of date we
            # will notice.
            merge_jobs_needed = []
            files_to_build = set([])
            print " --- Generated %i harvest layers:" % len(merge_jobs)
            for i, layer in enumerate(merge_jobs):
                # Figure out how many files we need to build
                layer_jobs_needed = []
                for layer_job in layer:
                    #print "layer_job = ", layer_job
                    # Check if we've already built this output file in the tmp
                    file_base_name = os.path.basename(layer_job[0])
                    needed = True
                    # Check if we are rebuilding a dependency
                    building_a_dependency = any(
                        file in files_to_build for file in layer_job[1])
                    if not building_a_dependency and file_base_name in tmp_files:
                        output_m_time = all_files_dict[layer_job[0]]['time']
                        out_of_date = False
                        for input_file in layer_job[1]:
                            #print "input_file = ", input_file
                            if not input_file in all_files_dict.keys() or all_files_dict[input_file]['time'] > output_m_time:
                                print "File: %s is older than its dependency %s, rebuilding!" % (
                                    file_base_name, input_file)
                                # Check if it's out of date
                                out_of_date = True
                                break
                        if not out_of_date:
                            needed = False
                    if needed:
                        layer_jobs_needed.append(layer_job)
                    # Keep track of the relevant files, so we can delete old
                    # cruft
                    relevant_tmp_files.add(file_base_name)
                # Check if this is the final output layer
                if len(layer) == 1:
                    final_harvest_files.append((sample, layer[0][0]))
                print " ---- layer %i has %i jobs, of which %i not done" % (
                    i, len(layer), len(layer_jobs_needed))
                merge_jobs_needed.append(layer_jobs_needed)

            # Keep track of what jobId was used for a paticular output file
            job_registry = {}
            # If a file is not produced by a job (already exists in CASTOR/EOS),
            # then the job ID returned is none.
            get_job_name = lambda x : x in job_registry and job_registry[x] or None

            for ilayer, layer in enumerate(merge_jobs_needed):
                write_comment_header(submit_file, "Layer %i" % ilayer)
                submit_file.write("echo Submitting layer %i of channel %s, sample %s\n"
                                  % (ilayer, channel, sample))
                for ijob, (output_file, input_files) in enumerate(layer):
                    # Get the job name (if it exists) and file name for the
                    # input files.
                    input_files_and_jobs = [
                        (get_job_name(file), file) for file in input_files]

                    # Build a function that constructs our log file name given the
                    # job file hash.
                    # Build a function that constructs our log file name given the
                    # job file hash.
                    if not os.path.exists('lxbatch_log'):
                        os.makedirs('lxbatch_log')
                    def log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log', "_".join(
                                ['harvest', job_hash, 'layer_%i' % ilayer]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Build the script                    
                    job_name, script = jobtools.make_bsub_script(
                        output_file, input_files_and_jobs,
                        log_file_maker,
                        harvester_command,
                        abort_on_rfcp_error = abort_on_rfcp_error,
                        label = "_".join([
                            "harvest",
                            channel, sample,
                            "layer", str(ilayer),
                            "job", str(ijob)]))
                    #print "job_name = %s" % job_name
                    bsub_job_names.append(job_name)
                    
                    # Register our job
                    #print "--> registering channel %s, sample %s, jobId %s" % (channel, sample, job_id)
                    #print " script_directory = %s" % script_directory
                    job_registry[output_file] = job_name
                    #print " job_registry[%s] = %s" % (output_file, job_registry[output_file])
                    script_file = os.path.join(
                        script_directory, "_".join([
                            "harvest",
                            channel, sample, job_id,
                            "layer", str(ilayer),
                            "job", str(ijob)]
                        ) + ".sh")
                    submit_file.write("bsub < %s\n" % script_file)
                    # Keep track of how many files we access
                    bsub_file_access_counter += split
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        submit_file.write("# thwart rate limit\n")
                        submit_file.write(
                            "echo Sleeping for 500 seconds, it is now:\n")
                        submit_file.write(
                            "date\n")
                        submit_file.write("sleep 500\n")
                    with open(script_file, 'w') as script_handle:
                        script_handle.write(script)
        submit_file.close()

    if run_merging:
        # Now build skim merging file
        # Select those that match our given regex for a sample
        skim_file_map = defaultdict(list)
        skim_fileinhash_map = defaultdict(hashlib.md5)
        skim_fileouthash_map = defaultdict(hashlib.md5)
        skim_matcher = re.compile(skim_regex)
        for file in input_files_info:
            match = skim_matcher.match(file['file'])
            if match:
                full_file = file['path']
                # Parse the sample from the regex
                sample = None
                if sampleToAnalyze != "":
                    sample = sampleToAnalyze                    
                else:
                    sample = match.group('sample')
                # For the skims, keep track of the file size as well, since we
                # use it to group the jobs into chunks.
                skim_file_map[sample].append(
                    (file['time'], file['size'], full_file))
                # Keep track of the hash of all input files so we know what went
                # into our output
                skim_fileinhash_map[sample].update(full_file)

        def make_skim_name(sample, chunk, file_hash):
            """ Generate a nice name for an output skim """
            return "_".join(["skim", sample, "chunk", str(chunk), file_hash]) + ".root"
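        # e.g. make_skim_name("mySample", 3, "d41d8cd9") yields
        # "skim_mySample_chunk_3_d41d8cd9.root" (the sample name and hash here
        # are just illustrative placeholders)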

        if merge_script_name is None:
            merge_script_name = "_".join(['submit', channel, sampleToAnalyze, job_id, 'merge']) + '.sh'
        with open(merge_script_name, 'w') as merge_script:
            merge_jobs_counter = 0
            bsub_file_access_counter = 0
            for sample in skim_file_map.keys():
                write_comment_header(merge_script, " Merging " + sample)
                print "Merging channel %s, sample %s" % (channel, sample)
                files = skim_file_map[sample]
                num_files = len(files)
                total_file_size = sum(map(lambda x: x[1], files)) / 1e6
                # Divide the sample up into chunks of at most chunk_size bytes
                # (1 GB by default) and max_input_files_per_chunk files each
                chunks = list(jobtools.split(files, chunk_size, max_input_files_per_chunk, lambda x: x[1]))
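                # Each chunk is expected to be a list of the same
                # (time, size, path) tuples; only the path (index 2) is used
                # below when building the merge jobs.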
                print " Total sample size: %i files, %i MB - splitting into %i chunks" % (
                    num_files, total_file_size, len(chunks)),
                # Keep track of jobs we are actually running
                skim_merge_jobs = []
                for ichunk, input_files in enumerate(chunks):
                    # Figure out the name for our file.  It contains a hash of its
                    # inputs.  We don't add the time, as we never have any LXBatch
                    # job dependencies.
                    just_the_files = [x[2] for x in input_files]
                    output_file = make_skim_name(
                        "_".join([channel, sample, job_id]), ichunk, jobtools.hash_files(
                            just_the_files, add_time=False))
                    skim_fileouthash_map[sample].update(output_file)

                    relevant_tmp_files.add(output_file)
                    if output_file not in tmp_files:
                        output_file_full_path = os.path.join(
                            castor_output_directory, output_file)
                        # Add "None" as the job id of the input files to indicate we
                        # don't care about any dependencies.  The index on x takes
                        # out only the filename, not the size or the time.
                        skim_merge_jobs.append(
                            (output_file_full_path,
                             map(lambda x: (None, x[2]), input_files))
                        )

                print " -- %i chunks are already done, skipping" % (
                    len(chunks) - len(skim_merge_jobs))
 
                for ijob, (output_file, input_files) in enumerate(skim_merge_jobs):
                    # Make sure the LXBatch log directory exists (it is only
                    # created above when run_harvesting is enabled)
                    if not os.path.exists('lxbatch_log'):
                        os.makedirs('lxbatch_log')
                    def merge_log_file_maker(job_hash):
                        log_fileName = os.path.join(
                            'lxbatch_log', "_".join(
                                ['merge', job_hash, 'job_%i' % ijob]) + '.log')
                        # CV: delete log-files from previous job submissions
                        os.system("rm -f %s" % log_fileName)
                        return log_fileName

                    # Generate script contents
                    job_name, script = jobtools.make_bsub_script(
                        output_file,
                        input_files,
                        merge_log_file_maker,
                        _MERGER_CMD,
                        abort_on_rfcp_error = abort_on_rfcp_error,
                        label = "_".join([
                            "merge",
                            channel, sample, 
                            "chunk", str(ijob)]))
                    script_file = os.path.join(
                        script_directory, "_".join([
                            "merge",
                            channel, sample, job_id,
                            "chunk", str(ijob)])
                        + ".sh")
                    
                    # Add our bsub command
                    merge_script.write("bsub < %s\n" % script_file)
                    merge_jobs_counter += 1
                    bsub_file_access_counter += len(input_files)
                    if bsub_file_access_counter > max_bsub_concurrent_file_access:
                        bsub_file_access_counter = 0
                        merge_script.write("# thwart rate limit\n")
                        merge_script.write(
                            "echo Sleeping for 500 seconds, it is now:\n")
                        merge_script.write('date\n')
                        merge_script.write("sleep 500\n")
                    with open(script_file, 'w') as script_handle:
                        script_handle.write(script)

            print " Built %i merge jobs in %s" % (merge_jobs_counter,
                                                  merge_script_name)

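        # Each harvest_log line has the format (after the join below):
        #   <job_id> <sample> <merge> input[<md5 over the input file paths>] output[<md5 over the produced skim file names>]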
        for sample in sorted(skim_fileinhash_map.keys()):
            # Write down the hashes of the input and output files
            harvest_log.write(
                " ".join([job_id, sample, '<merge>',
                          'input[%s]' % skim_fileinhash_map[sample].hexdigest(),
                          'output[%s]' % skim_fileouthash_map[sample].hexdigest(),
                          ]) + '\n')

    if check_old_files:
        # Compute all the extra tmp files
        extra_crap = tmp_files - relevant_tmp_files
        if extra_crap:
            print " Found %i extra files from previous harvest jobs" % len(extra_crap)
            print " Writing these files to garbage.txt. To delete them, please run:"
            print " cat garbage.txt | xargs -P 10 -n 1 rfrm "
            with open('garbage.txt', 'w') as garbage_file:
                for file in extra_crap:
                    full_path = os.path.join(castor_output_directory, file)
                    garbage_file.write(full_path + '\n')

    local_copy_script = 'copy_harvest_local_%s.sh' % job_id
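    # Write a small helper script that copies each final harvested file to the
    # location given by local_copy_mapper(sample); the rfcp copies run in the
    # background and the script waits for all of them to finish.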
    with open(local_copy_script, 'w') as copy_script:
        for sample, file in final_harvest_files:
            copy_script.write('rfcp %s %s &\n' % (
                file, local_copy_mapper(sample)))
        copy_script.write('wait\n')

    if verbosity > 0:
        if run_harvesting:
            print "To harvest plots, run %s" % harvest_script_name
        if run_merging:
            print "To merge skims, run %s" % merge_script_name
        print "After harvesting is done, run %s to copy files locally" % local_copy_script

    # Make all our stuff executable
    os.chmod(local_copy_script, 0755)
    if run_harvesting:
        os.chmod(harvest_script_name, 0755)
    if run_merging:    
        os.chmod(merge_script_name, 0755)

    # Return the generated script names together with the bsub job names and
    # the final harvest files
    retVal = {}
    retVal['harvest_script_name'] = harvest_script_name
    retVal['bsub_job_names']      = bsub_job_names
    retVal['final_harvest_files'] = final_harvest_files
    retVal['merge_script_name']   = merge_script_name
    retVal['local_copy_script']   = local_copy_script
    return retVal
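
# A minimal usage sketch (all values below are hypothetical placeholders, not
# taken from any real configuration; the remaining keyword arguments keep
# their defaults):
#
#   scripts = make_harvest_scripts(
#       r"plots_(?P<sample>\w+)_.*\.root",       # plot regex
#       r"skim_(?P<sample>\w+)_.*\.root",        # skim regex (needs a 'sample' group)
#       channel = "myChannel",
#       job_id = "v1",
#       input_files_info = my_input_files_info,  # list of {'path', 'file', 'size', 'time', ...} dicts
#       castor_output_directory = "/store/user/jdoe/harvested",
#       local_copy_mapper = lambda sample: "/tmp/%s_harvested.root" % sample)
#
#   # scripts['harvest_script_name'] and scripts['merge_script_name'] are the
#   # generated bsub submit scripts; scripts['local_copy_script'] copies the
#   # final harvest files to local disk.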