Example No. 1
def main():
    options = get_options()

    import platform
    if 'ingrid' in platform.node():
        storagePrefix = "/storage/data/cms"
    else:
        storagePrefix = "root://cms-xrd-global.cern.ch/"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    print "done"

    print("")
    
    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    # print inputDataset
    values = get_dataset(inputDataset)
    # print values
    if( len(values) == 0 ):
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert( len(values) == 1 )
    dataset_name, dataset_id, dataset_nevents = values[0]
    print "done"

    print("")
    
    print "##### Get info from crab (outputs, report)"
    # Since the API outputs AND prints the same data, hide whatever is printed on screen
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    if not options.debug:
        sys.stdout = sys.stderr = open(os.devnull, "w")
    taskdir = os.path.join(workArea, 'crab_' + requestName)
    # list output
    output_files = crabCommand('getoutput', '--dump', dir = taskdir )
    # get crab report
    report = crabCommand('report', dir = taskdir )
    # restore print to stdout 
    if not options.debug:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
#    print "log_files=", log_files
#    print "output_files=", output_files
#    print "report=", report
    print "done"

    print("")

    print "##### Get information from the output files"
    files = []
    for (i, lfn) in enumerate(output_files['lfn']):
        pfn = output_files['pfn'][i]
        files.append({'lfn': lfn, 'pfn': pfn})

    folder = os.path.dirname(output_files['lfn'][0])
    folder = storagePrefix + folder

    db_files = []
    dataset_sumw = 0
    dataset_nselected = 0
    for f in files:
        (sumw, entries) = get_file_data(storagePrefix + f['lfn'])
        if not sumw:
            print("Warning: failed to retrieve sum of event weight for %r" % f['lfn'])
            # Skip this file: a missing sum of weights would break the sums below
            continue

        dataset_sumw += sumw

        if not entries:
            print("Warning: failed to retrieve number of entries for %r" % f['lfn'])

        dataset_nselected += entries

        db_files.append(File(unicode(f['lfn']), unicode(f['pfn']), sumw, entries))

    print "∑w = %.4f" % dataset_sumw
    print "Number of selected events: %d" % dataset_nselected

    print("")

    print "##### Check if the job processed the whole sample"
    has_job_processed_everything = (dataset_nevents == report['eventsRead'])
    is_data = (module.config.Data.splitting == 'LumiBased')
    if has_job_processed_everything:
        print "done"
    else:
        if is_data:
            # This is data, it is expected to not run on everything given we use a lumiMask
            print "done"
        else:
            # Warn
            print "Warning: You are about to add in the DB a sample which has not been completely processed (%d events out of %d, %.2f%%)" % (report['eventsRead'], dataset_nevents, report['eventsRead'] / dataset_nevents * 100)
            print "If you want to update this sample later on with more statistics, simply re-execute this script with the same arguments."

    print("")

    processed_lumi = None
    if is_data:
        processed_lumi = report['analyzedLumis']

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWRepo, FWUrl = getGitTagRepoUrl( os.path.join(CMSSW_BASE, 'src/cp3_llbb/Framework') )
    print "FWUrl=", FWUrl
    # then the version of the analyzer
    AnaHash, AnaRepo, AnaUrl = getGitTagRepoUrl( os.path.dirname( psetName ) )
    print "AnaUrl=", AnaUrl

    print("")

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    # workArea
    # requestName
    # psetName
    # inputDataset
    # dataset_id
    # report['eventsRead']) (not necessarily equal to dataset_nevents)
    # log_files
    # output_files
    # report
    # FWHash, FWRepo, FWUrl
    # AnaHash, AnaRepo, AnaUrl
    # dataset_nselected
    # localpath
    NAME = requestName + '_' + FWHash + '_' + AnaRepo + '_' + AnaHash
    add_sample(NAME, folder, "NTUPLES", report['eventsRead'], dataset_nselected, AnaUrl, FWUrl, dataset_id, dataset_sumw, has_job_processed_everything, dataset_nevents, db_files, processed_lumi)
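The example above silences the CRAB API by reassigning sys.stdout and sys.stderr by hand and never closes the devnull handle. A minimal sketch of the same idea as a reusable context manager, which also restores the streams if crabCommand raises (the name quiet_crab is hypothetical and not part of the original script):

import contextlib
import os
import sys

@contextlib.contextmanager
def quiet_crab(enabled=True):
    # Temporarily send stdout/stderr to os.devnull, e.g. around crabCommand calls
    if not enabled:
        yield
        return
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    devnull = open(os.devnull, "w")
    sys.stdout = sys.stderr = devnull
    try:
        yield
    finally:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
        devnull.close()

# Hypothetical usage:
# with quiet_crab(enabled=not options.debug):
#     output_files = crabCommand('getoutput', '--dump', dir=taskdir)
#     report = crabCommand('report', dir=taskdir)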
Example No. 2
def main():
    options = get_options()

    import platform

    if "ingrid" in platform.node():
        storagePrefix = "/storage/data/cms"
    else:
        storagePrefix = "root://cms-xrd-global.cern.ch/"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    print "done"

    print ("")

    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    # print inputDataset
    values = get_dataset(inputDataset)
    # print values
    if len(values) == 0:
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert len(values) == 1
    dataset_name, dataset_id, dataset_nevents = values[0]
    print "done"

    print ("")

    print "##### Get info from crab (outputs, report)"
    # Since the API outputs AND prints the same data, hide whatever is printed on screen
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    if not options.debug:
        sys.stdout = sys.stderr = open(os.devnull, "w")
    taskdir = os.path.join(workArea, "crab_" + requestName)
    # list output
    output_files = crabCommand("getoutput", "--dump", dir=taskdir)
    # get crab report
    report = crabCommand("report", dir=taskdir)
    # restore print to stdout
    if not options.debug:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    #    print "log_files=", log_files
    #    print "output_files=", output_files
    #    print "report=", report
    print "done"

    print ("")

    print "##### Get information from the output files"
    files = []
    for (i, lfn) in enumerate(output_files["lfn"]):
        pfn = output_files["pfn"][i]
        files.append({"lfn": lfn, "pfn": pfn})

    # DEBUG
    # files.append({'lfn': '/store/user/sbrochet/TTTo2L2Nu_13TeV-powheg/TTTo2L2Nu_13TeV-powheg_MiniAODv2/160210_181039/0000/output_mc_1.root', 'pfn': 'srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/sbrochet/TTTo2L2Nu_13TeV-powheg/TTTo2L2Nu_13TeV-powheg_MiniAODv2/160210_181039/0000/output_mc_1.root'})

    folder = os.path.dirname(output_files["lfn"][0])
    folder = storagePrefix + folder

    db_files = []
    dataset_sumw = 0
    dataset_extras_sumw = {}
    dataset_nselected = 0
    file_missing = False
    for f in files:
        (sumw, extras_sumw, entries) = get_file_data(storagePrefix + f["lfn"])
        if not sumw:
            print ("Warning: failed to retrieve sum of event weight for %r" % f["lfn"])
            file_missing = True
            continue

        dataset_sumw += sumw
        dataset_extras_sumw = sum_dicts(dataset_extras_sumw, extras_sumw)

        if not entries:
            print ("Warning: failed to retrieve number of entries for %r" % f["lfn"])

        dataset_nselected += entries

        # Convert python dict to json
        extras_sumw_json = unicode(json.dumps(extras_sumw, separators=(",", ":")))

        db_files.append(File(unicode(f["lfn"]), unicode(f["pfn"]), sumw, extras_sumw_json, entries))

    print "∑w = %.4f" % dataset_sumw
    print "Number of selected events: %d" % dataset_nselected
    print "Number of output files (crab / really on the storage): %d / %d" % (len(files), len(db_files))

    print ("")

    print "##### Check if the job processed the whole sample"
    has_job_processed_everything = (dataset_nevents == report["eventsRead"]) and not file_missing
    is_data = module.config.Data.splitting == "LumiBased"
    if has_job_processed_everything:
        print "done"
    else:
        if is_data:
            # This is data, it is expected to not run on everything given we use a lumiMask
            print "done"
        else:
            # Warn
            print "Warning: You are about to add in the DB a sample which has not been completely processed (%d events out of %d, %.2f%%)" % (
                report["eventsRead"],
                dataset_nevents,
                report["eventsRead"] / dataset_nevents * 100,
            )
            print "If you want to update this sample later on with more statistics, simply re-execute this script with the same arguments."

    print ("")

    processed_lumi = None
    if is_data:
        processed_lumi = report["analyzedLumis"]

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWRepo, FWUrl = getGitTagRepoUrl(os.path.join(CMSSW_BASE, "src/cp3_llbb/Framework"))
    print "FWUrl=", FWUrl
    # then the version of the analyzer
    AnaHash, AnaRepo, AnaUrl = getGitTagRepoUrl(os.path.dirname(psetName))
    print "AnaUrl=", AnaUrl

    print ("")

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    # workArea
    # requestName
    # psetName
    # inputDataset
    # dataset_id
    # report['eventsRead']) (not necessarily equal to dataset_nevents)
    # log_files
    # output_files
    # report
    # FWHash, FWRepo, FWUrl
    # AnaHash, AnaRepo, AnaUrl
    # dataset_nselected
    # localpath
    NAME = requestName + "_" + FWHash + "_" + AnaRepo + "_" + AnaHash
    add_sample(
        NAME,
        folder,
        "NTUPLES",
        report["eventsRead"],
        dataset_nselected,
        AnaUrl,
        FWUrl,
        dataset_id,
        dataset_sumw,
        dataset_extras_sumw,
        has_job_processed_everything,
        dataset_nevents,
        db_files,
        processed_lumi,
    )
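Example No. 2 relies on a sum_dicts helper that is not part of the listing. Assuming it merges two {name: weight} dictionaries by adding values key by key, a minimal sketch could look like this (the real implementation may differ):

def sum_dicts(a, b):
    # Key-wise sum of two {name: value} dictionaries (assumed behaviour of the
    # helper used in Example No. 2; the actual function is not shown above)
    result = dict(a)
    for key, value in b.items():
        result[key] = result.get(key, 0) + value
    return result

# sum_dicts({'pdf_up': 1.5}, {'pdf_up': 2.0, 'pdf_down': 0.5})
# -> {'pdf_up': 3.5, 'pdf_down': 0.5}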
Example No. 3
def main():
    options = get_options()

    import platform
    if 'ingrid' in platform.node():
        storagePrefix = "/storage/data/cms"
    else:
        storagePrefix = "root://cms-xrd-global.cern.ch/"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    print "done"

    print("")

    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    # print inputDataset
    values = get_dataset(inputDataset)
    # print values
    if (len(values) == 0):
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert (len(values) == 1)
    dataset_name, dataset_id, dataset_nevents = values[0]
    print "done"

    print("")

    print "##### Get info from crab (outputs, report)"
    # Since the API outputs AND prints the same data, hide whatever is printed on screen
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    if not options.debug:
        sys.stdout = sys.stderr = open(os.devnull, "w")
    taskdir = os.path.join(workArea, 'crab_' + requestName)
    # list output
    output_files = crabCommand('getoutput', '--dump', dir=taskdir)
    # get crab report
    report = crabCommand('report', dir=taskdir)
    # restore print to stdout
    if not options.debug:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr


#    print "log_files=", log_files
#    print "output_files=", output_files
#    print "report=", report
    print "done"

    print("")

    print "##### Get information from the output files"
    files = []
    for (i, lfn) in enumerate(output_files['lfn']):
        pfn = output_files['pfn'][i]
        files.append({'lfn': lfn, 'pfn': pfn})

    folder = os.path.dirname(output_files['lfn'][0])
    folder = storagePrefix + folder

    db_files = []
    dataset_sumw = 0
    dataset_nselected = 0
    for f in files:
        (sumw, entries) = get_file_data(storagePrefix + f['lfn'])
        if not sumw:
            print("Warning: failed to retrieve sum of event weight for %r" %
                  f['lfn'])
            # Skip this file: a missing sum of weights would break the sums below
            continue

        dataset_sumw += sumw

        if not entries:
            print("Warning: failed to retrieve number of entries for %r" %
                  f['lfn'])

        dataset_nselected += entries

        db_files.append(
            File(unicode(f['lfn']), unicode(f['pfn']), sumw, entries))

    print "∑w = %.4f" % dataset_sumw
    print "Number of selected events: %d" % dataset_nselected

    print("")

    print "##### Check if the job processed the whole sample"
    has_job_processed_everything = (dataset_nevents == report['eventsRead'])
    is_data = (module.config.Data.splitting == 'LumiBased')
    if has_job_processed_everything:
        print "done"
    else:
        if is_data:
            # This is data, it is expected to not run on everything given we use a lumiMask
            print "done"
        else:
            # Warn
            print "Warning: You are about to add in the DB a sample which has not been completely processed (%d events out of %d, %.2f%%)" % (
                report['eventsRead'], dataset_nevents,
                100.0 * report['eventsRead'] / dataset_nevents)
            print "If you want to update this sample later on with more statistics, simply re-execute this script with the same arguments."

    print("")

    processed_lumi = None
    if is_data:
        processed_lumi = report['analyzedLumis']

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWRepo, FWUrl = getGitTagRepoUrl(
        os.path.join(CMSSW_BASE, 'src/cp3_llbb/Framework'))
    print "FWUrl=", FWUrl
    # then the version of the analyzer
    AnaHash, AnaRepo, AnaUrl = getGitTagRepoUrl(os.path.dirname(psetName))
    print "AnaUrl=", AnaUrl

    print("")

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    # workArea
    # requestName
    # psetName
    # inputDataset
    # dataset_id
    # report['eventsRead']) (not necessarily equal to dataset_nevents)
    # log_files
    # output_files
    # report
    # FWHash, FWRepo, FWUrl
    # AnaHash, AnaRepo, AnaUrl
    # dataset_nselected
    # localpath
    NAME = requestName + '_' + FWHash + '_' + AnaRepo + '_' + AnaHash
    add_sample(NAME, folder, "NTUPLES", report['eventsRead'],
               dataset_nselected, AnaUrl, FWUrl, dataset_id, dataset_sumw,
               has_job_processed_everything, dataset_nevents, db_files,
               processed_lumi)
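The getGitTagRepoUrl helper used in Examples No. 1-3 is not shown either. A rough sketch of what such a helper could do, assuming it returns the checked-out commit hash, a repository name and the remote URL of a git working copy (the real function may work differently):

import os
import subprocess

def get_git_tag_repo_url(path):
    # Sketch only: ask git for the current commit and the origin URL,
    # and derive a repository name from that URL
    def git(*args):
        return subprocess.check_output(['git'] + list(args), cwd=path).strip()
    commit = git('rev-parse', 'HEAD')
    url = git('config', '--get', 'remote.origin.url')
    repo = os.path.splitext(os.path.basename(url))[0]
    return commit, repo, url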
Example No. 4
def main():
    options = get_options()
    ingridStoragePrefix = "/storage/data/cms"

    print "##### Get information out of the crab config file (work area, dataset, pset)"
    module = load_file(options.CrabConfig)
    workArea = module.config.General.workArea
    requestName = module.config.General.requestName
    psetName = module.config.JobType.psetName
    inputDataset = unicode(module.config.Data.inputDataset)
    
    print "##### Check if the dataset exists in the database"
    # if yes then grab its ID
    # if not then run das_import.py to add it
    print inputDataset
    values = get_dataset(inputDataset)
    print values
    if( len(values) == 0 ):
        tmp_sysargv = sys.argv
        sys.argv = ["das_import.py", inputDataset]
        print "calling das_import"
        das_import.main()
        print "done"
        sys.argv = tmp_sysargv
        values = get_dataset(inputDataset)
    # if there is more than one sample then we're in trouble, crash here
    assert( len(values) == 1 )
    dataset_name, dataset_id, dataset_nevents = values[0]
    
    print "##### Get info from crab (logs, outputs, report)"
    taskdir = os.path.join(workArea, 'crab_' + requestName)
    # list logs
    log_files = crabCommand('getlog', '--dump', dir = taskdir )
    # list output
    output_files = crabCommand('getoutput', '--dump', dir = taskdir )
    # get crab report
    report = crabCommand('report', dir = taskdir )
#    print report

    print "##### Check if the job processed the whole sample"
    # FIXME: should it rather be just a warning ? Will likely be problematic at some point for big samples
    # FIXME: if changing to a warning, be careful what you actually insert in the DB
#    assert( dataset_nevents == report['eventsRead'] )

    print "##### Figure out the code(s) version"
    # first the version of the framework
    FWHash, FWBranch, FWUrl = getGitTagBranchUrl( os.path.join(CMSSW_BASE, 'src/cp3_llbb/Framework') )
    # then the version of the analyzer
    AnaHash, AnaBranch, AnaUrl = getGitTagBranchUrl( os.path.dirname( psetName ) )

    print "##### Figure out the number of selected events"
    # Need to get this from the output log of the jobs, and sum them all
#    log_files = {'lfn': ['/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/log/cmsRun_1.log.tar.gz'], 'pfn': ['srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/log/cmsRun_1.log.tar.gz']}
    nselected = 0
    for lfn in log_files['lfn']:
        tarLog = ingridStoragePrefix + lfn
        with tarfile.open(tarLog) as tar:
            for tarFile in tar.getmembers():
                if 'stdout' not in tarFile.name:
                    continue
                # The file object returned by tar.extractfile() does not support the with
                # statement in Python 2.7, so use contextlib.closing to close it cleanly
                with contextlib.closing(tar.extractfile(tarFile)) as file:
                    for line in file:
                        if 'processed' not in line and 'selected' not in line:
                            continue
                        # The event count is the 4th whitespace-separated token of the matching line
                        tokens = line.split()
                        nselected += int(tokens[3])
#    print nselected

    print "##### For the path, check that the files there do correspond EXACTLY to the list of output files from crab"
    # (crab being crab, we're never too careful!)
#    output_files = {'lfn': ['/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/output_mc_1.root'], 'pfn': ['srm://ingrid-se02.cism.ucl.ac.be:8444/srm/managerv2?SFN=/storage/data/cms/store/user/obondu/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_13TeV-madgraph/GluGluToRadionToHHTo2B2VTo2L2Nu_M-260_narrow_Asympt25ns/150728_092137/0000/output_mc_1.root']}
    # first check what we do have locally
    p = os.path.dirname( output_files['lfn'][0] )
    localpath = ingridStoragePrefix + p
    localfiles = [ os.path.join(p, f) for f in os.listdir(localpath) if os.path.isfile( os.path.join(localpath, f) ) and 'root' in f ]
    # unordered comparison: the two list should be equal
    if set(localfiles) != set(output_files['lfn']):
        print "ERROR: the content of the path and the list of crab outputs are different, abort now!"
        print "localfiles (what's here locally)= ", localfiles
        print "outputfiles (what crab is saying)= ", output_files['lfn']
        raise AssertionError("CRAB3 and local list of files do not match")
#    print localpath 

    print "##### Put it all together: write this sample into the database"
    # all the info we have gathered is:
    # workArea
    # requestName
    # psetName
    # inputDataset
    # dataset_id
    # dataset_nevents (asserted to be equal to report['eventsRead'])
    # log_files
    # output_files
    # report
    # FWHash, FWBranch, FWUrl
    # AnaHash, AnaBranch, AnaUrl
    # nselected
    # localpath
    NAME = requestName + '_' + FWHash + '_' + AnaHash
    add_sample(NAME, localpath, "NTUPLES", dataset_nevents, nselected, AnaUrl, FWUrl, dataset_id)