Example #1
import re
import sys

# get_from_deep_json(), get_das_store_json(), get_mcm_dict(),
# get_cmsDriver_script() and get_dataset_format() are helper functions
# defined elsewhere in the same module

def get_prepId_from_das(dataset, das_dir):
    "get prepid for dataset"

    # get prepid from das/dataset
    prepid = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'prep_id')

    if prepid == None:
        # try to get from das/mcm:
        prepid = get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir), 'prepid')
        # todo also try different queries from the json. prep_id?

    return prepid
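
A minimal usage sketch, assuming a local DAS dump directory laid out as this module expects; the dataset path and directory below are hypothetical placeholders, not values from the original code:

def demo_prepid_lookup():
    """Illustrative sketch only, not part of the original module."""
    dataset = '/SomePrimary/SomeProcessing/AODSIM'  # hypothetical dataset path
    das_dir = './das'  # hypothetical local DAS dump directory
    prepid = get_prepId_from_das(dataset, das_dir)
    if prepid is None:
        print('no prepid found in das/dataset or das/mcm records')
    else:
        print('prepid:', prepid)
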
def get_conffile_ids(dataset, das_dir):
    """Return location of the configuration files for the dataset."""
    ids = {}
    byoutput = get_from_deep_json(
        get_das_store_json(dataset, 'config', das_dir), 'byoutputdataset')
    byinput = get_from_deep_json(
        get_das_store_json(dataset, 'config', das_dir), 'byinputdataset')
    if byoutput:
        for someid in byoutput:
            ids[someid] = 1
    if byinput:
        for someid in byinput:
            ids[someid] = 1
    return list(ids.keys())

def get_size(dataset, das_dir):
    """Return size of the dataset."""
    size = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir),
                              'size')
    if size:
        return size
    return 0

def get_number_files(dataset, das_dir):
    """Return number of files for the dataset."""
    number_files = get_from_deep_json(
        get_das_store_json(dataset, 'dataset', das_dir), 'nfiles')
    if number_files:
        return number_files
    return 0

def get_number_events(dataset, das_dir):
    """Return number of events for the dataset."""
    number_events = get_from_deep_json(
        get_das_store_json(dataset, 'dataset', das_dir), 'nevents')
    if number_events:
        return number_events
    return 0
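
The three getters above share one pattern: read a value from the DAS 'dataset' record and fall back to 0. A one-line summary helper is a natural companion; this is a sketch, and summarize_dataset is a hypothetical name, not part of the original module:

def summarize_dataset(dataset, das_dir):
    """Illustrative sketch: one-line summary built from the getters above."""
    return '{ds}: {size} bytes, {nf} files, {ne} events'.format(
        ds=dataset,
        size=get_size(dataset, das_dir),
        nf=get_number_files(dataset, das_dir),
        ne=get_number_events(dataset, das_dir))
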
def get_generator_name(dataset, das_dir, mcm_dir):
    "Return list of generators used for that dataset"
    generator_names = []
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    generators = get_from_deep_json(mcm_dict, 'generators')
    input_generators = []

    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        mcm_dict = get_mcm_dict(input_dataset, mcm_dir)
        input_generators = get_from_deep_json(mcm_dict, 'generators')

    if input_generators:
        # merge generators from the input dataset even when the dataset
        # record itself listed none
        generators = (generators or []) + input_generators

    if generators:
        for item in generators:
            for char in ['"', '\\', '[', ']']:  # remove ", \, [, ]
                item = item.replace(char, '')
            if item not in generator_names:
                generator_names.append(item)

    return generator_names
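
The cleaning loop above strips JSON escaping artefacts (quotes, backslashes, brackets) from generator names. Here is the same normalization in isolation, as a sketch with made-up inputs:

def normalize_generator_names(raw_names):
    """Illustrative sketch: the same cleaning as in get_generator_name()."""
    cleaned = []
    for item in raw_names:
        for char in ['"', '\\', '[', ']']:  # remove ", \, [, ]
            item = item.replace(char, '')
        if item and item not in cleaned:
            cleaned.append(item)
    return cleaned

# normalize_generator_names(['["pythia6"', 'madgraph]']) gives
# ['pythia6', 'madgraph']  (made-up inputs, for illustration only)
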
def get_conffile_ids_from_das(dataset, das_dir, mcm_dir):
    """Return location of the configuration files for the dataset from DAS."""
    ids = {}
    output = get_from_deep_json(get_das_store_json(dataset, 'config', das_dir),
                                'byoutputdataset')
    if output:
        for someid in output:
            ids[someid] = 1
    else:
        print("Error: No config id found from DAS config for " + dataset,
              file=sys.stderr)
    return list(ids.keys())

def get_genfragment_url(dataset, mcm_dir, das_dir):
    """Return the list of URLs of the generator fragments used."""
    input_dataset = ''
    url = []

    # get GEN-SIM dataset
    if get_dataset_format(dataset) == 'AODSIM':
        dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
        input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    else:
        input_dataset = dataset

    script_path = get_cmsDriver_script(input_dataset, mcm_dir)
    if script_path is None:
        return None

    with open(script_path, 'r') as script:
        for line in script:
            if 'curl' in line:
                curl = re.search(r'(?P<url>https?://[^\s]+)', line)
                if curl:
                    url.append(curl.group('url'))
    return url
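
The curl detection above hinges on one regex; extracting it into a standalone sketch makes the pattern easy to test against a made-up script line:

def extract_urls(text):
    """Illustrative sketch: the same URL regex as in get_genfragment_url()."""
    return [match.group('url')
            for match in re.finditer(r'(?P<url>https?://[^\s]+)', text)]

# extract_urls('curl -s https://example.invalid/fragment.py -o frag.py')
# returns ['https://example.invalid/fragment.py']  (made-up line)
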
Example #9
import subprocess
import sys

# get_das_store_json(), get_from_deep_json() and get_prepid_from_mcm() are
# helper functions defined elsewhere in the same module

def mcm_downloader(prepid, dataset, mcm_dir, das_dir):
    "Query dictionary and setup script from McM database"
    # this function is so ugly... but finally works! You're welcome to refactor it though

    cmd = "curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/{query}/{prepId}"

    # Some datasets carry a prep_id with underscores in DAS, while McM expects
    # it without them, so strip all underscores from the prep_id
    if "_" in prepid:
        print("Found some underscores in prep_id: " + prepid + ", removing...")
        prepid = prepid.replace("_", "")

    mcm_dict = subprocess.run(cmd.format(query="get", prepId=prepid),
                              shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    mcm_dict_out = str(mcm_dict.stdout.decode("utf-8"))
    mcm_script_out = str(mcm_script.stdout.decode("utf-8"))

    # check if results are not empty
    if mcm_dict_out == '{"results": {}}\n':
        print("[ERROR] Empty McM dict (get) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_dict_out)

    if mcm_script_out == '' or mcm_script_out[0] == '{':
        print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/scripts/" + dataset.replace('/', '@') + ".sh"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_script_out)

    # same thing for "input_dataset": hopefully the GEN-SIM step
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')  # /bla/ble/GEN-SIM
    if input_dataset:
        mcm_dict = subprocess.run(cmd.format(query="produces", prepId=input_dataset[1:]),
                                  shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        mcm_out = str(mcm_dict.stdout.decode("utf-8"))
        # check if results are not empty
        if mcm_out == '{"results": {}}' or mcm_out == '{"results": {}}\n':
            print("[ERROR] Empty McM dict (get) for {ds}".format(ds=input_dataset),
                  file=sys.stderr)
        else:
            outfile = mcm_dir + "/dict/" + input_dataset.replace('/', '@') + ".json"
            with open(outfile, 'w') as dict_file:
                dict_file.write(mcm_out)

            prepid = get_prepid_from_mcm(input_dataset, mcm_dir)
            if prepid is not None:
                mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                mcm_script_out = mcm_script.stdout.decode("utf-8")
                # guard against empty output before checking the first char
                if mcm_script_out == '' or mcm_script_out[0] == '{':
                    print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=input_dataset),
                          file=sys.stderr)
                else:
                    outfile = mcm_dir + "/scripts/" + input_dataset.replace('/', '@') + ".sh"
                    with open(outfile, 'w') as dict_file:
                        dict_file.write(mcm_script_out)
            else:
                print("[ERROR] No prep_id in McM Store for record {ds}".format(ds=input_dataset),
                      file=sys.stderr)
    else:
        print("[ERROR] No input_dataset in das_store/mcm for record {ds}".format(ds=dataset),
              file=sys.stderr)
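
The shell pipelines above work but spawn a shell for every request; the same McM GET can be issued with an argument list instead. A sketch using the same public REST endpoint as in the function above; fetch_mcm_record is a hypothetical helper, not part of the original module:

def fetch_mcm_record(query, prepid):
    """Illustrative sketch: fetch one McM record without shell=True."""
    url = ('https://cms-pdmv.cern.ch/mcm/public/restapi/requests/'
           '{query}/{prepId}'.format(query=query, prepId=prepid))
    result = subprocess.run(['curl', '-s', '-k', url],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return result.stdout.decode('utf-8')
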
Example #10
# get_doi(), get_prepid_from_mcm(), get_global_tag(), get_cmssw_version(),
# get_dataset_energy(), get_generator_parameters() and get_parent_dataset()
# are helper functions defined elsewhere in the same module

def print_ancestor_information(dataset, das_dir, mcm_dir, recid_file,
                               doi_info):
    "All the information we have so far"
    # everything should be a sublist item (4 spaces of indentation):
    # - dataset_name
    #     - info

    # TODO add to this function:
    # - config files present
    #   - step GEN
    #   - step RECO
    #   - step HLT
    # - gen_parameters:
    #   - cross section from XSECDB.
    #     see github issue opendata.cern.ch#1137
    #     ideally we should make a local cache of that.
    # - LHE stuff?
    # - Data popularity from github.com/katilp/cms-data-popularity
    #   ideally we should make a local cache of that.
    # it would be very nice if this printer script did not need external (non-cached) information

    # record ID as in the Open Data portal
    # TODO: move this code elsewhere; no need to open the file every time
    RECID_INFO = {}
    _locals = locals()
    with open(recid_file, 'r') as rf:
        exec(rf.read(), globals(), _locals)
    RECID_INFO = _locals['RECID_INFO']

    try:
        recid = RECID_INFO[dataset]
        print("    - Record ID: [{recid}]({url})".format(
            recid=recid, url='http://opendata.cern.ch/record/' + str(recid)))
    except KeyError:
        pass

    # DOI
    doi = get_doi(dataset, doi_info)
    if doi:
        print("    - DOI: [{doi}]({url})".format(doi=doi,
                                                 url='https://doi.org/' +
                                                 str(doi)))

    # PrepId
    prepid = get_prepId_from_das(dataset, das_dir)
    if not prepid:
        prepid = get_prepid_from_mcm(dataset, mcm_dir)
    if prepid:
        print("    - PrepId: [{prepid}]({url})".format(
            prepid=prepid,
            url='https://cms-pdmv.cern.ch/mcm/requests?prepid=' + str(prepid)))

    # global tag & cmssw version
    global_tag = get_global_tag(dataset, mcm_dir)
    cmssw_ver = get_cmssw_version(dataset, mcm_dir)
    if global_tag:
        print("    - Global Tag:", global_tag)
    if cmssw_ver:
        print("    - CMSSW version:", cmssw_ver)

    # Energy
    print("    - Collision Energy: ", get_dataset_energy(dataset, mcm_dir),
          "TeV")

    # Generators
    generators = get_generator_name(dataset, das_dir, mcm_dir)
    if generators:
        print("    - Generators: ", generators)

    # GEN-SIM dataset used to produce the AODSIM
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        print("    - Input Dataset:", input_dataset)

        input_global_tag = get_global_tag(input_dataset, mcm_dir)
        input_cmssw_ver = get_cmssw_version(input_dataset, mcm_dir)
        if input_global_tag:
            print("        - Global Tag:", input_global_tag)
        if input_cmssw_ver:
            print("        - CMSSW version:", input_cmssw_ver)

        gen_fragment = get_genfragment_url(dataset, mcm_dir, das_dir)
        if gen_fragment:
            for url in gen_fragment:
                print("        - Gen Fragment: [{url}]({url})".format(url=url))

    # gen parameters of input dataset
    generator_parameters = get_generator_parameters(dataset, das_dir)
    if generator_parameters:
        print('        - Generator parameters:')
        print('            - Cross section:',
              generator_parameters.get('cross_section', None))
        print('            - Filter efficiency:',
              generator_parameters.get('filter_efficiency', None))
        print('            - Filter efficiency error:',
              generator_parameters.get('filter_efficiency_error', None))
        print('            - Match efficiency:',
              generator_parameters.get('match_efficiency', None))
        print('            - Match efficiency error:',
              generator_parameters.get('match_efficiency_error', None))

    # mcm scripts with cmsDriver instructions
    cmsDriver1 = get_cmsDriver_script(input_dataset, mcm_dir)
    cmsDriver2 = get_cmsDriver_script(dataset, mcm_dir)
    global DATASETS_WITH_BOTH_CMSDRIVER
    global DATASETS_WITH_CMSDRIVER1
    global DATASETS_WITH_CMSDRIVER2

    if cmsDriver1 or cmsDriver2:
        print("    - cmsDriver scripts:")
        if cmsDriver1:
            print('        - GEN-SIM:', cmsDriver1)
            DATASETS_WITH_CMSDRIVER1 += 1
        if cmsDriver2:
            print('        - RECO-HLT:', cmsDriver2)
            DATASETS_WITH_CMSDRIVER2 += 1

        if cmsDriver1 and cmsDriver2:
            DATASETS_WITH_BOTH_CMSDRIVER += 1

    # python config files
    conffile_ids = get_conffile_ids(dataset, das_dir)
    parent = get_parent_dataset(dataset, das_dir)
    while parent:  # stop when there is no parent dataset
        conffile_ids += get_conffile_ids(parent, das_dir)
        parent = get_parent_dataset(parent, das_dir)
    global DATASETS_WITH_3CONFFILES
    if conffile_ids:
        print("    - python config scripts: ", conffile_ids)
        if len(conffile_ids) > 2:
            DATASETS_WITH_3CONFFILES += 1

    global DATASETS_WITH_FULL_PROVENANCE
    if (cmsDriver1 and cmsDriver2) or len(conffile_ids) > 2:
        DATASETS_WITH_FULL_PROVENANCE += 1

    # pile up information
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        pileup = get_from_deep_json(mcm_dict, 'pileup')
        pileup_dataset = get_from_deep_json(mcm_dict, 'pileup_dataset_name')
        if pileup or pileup_dataset:
            print('    - pile-up:')
            if pileup:
                print('        -', pileup)
            if pileup_dataset:
                print('        -', pileup_dataset)

        notes = get_from_deep_json(mcm_dict, 'notes')
        if notes is not None:
            # some notes span several lines; re-indenting keeps them inside
            # the same markdown list item
            print('    - notes:', notes.replace('\n', '\n        '))
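
A driver sketch tying the printer together; every path and the dataset list below are hypothetical placeholders, and the module-level DATASETS_WITH_* counters used above are assumed to be initialized elsewhere in the module:

def main():
    """Illustrative sketch: print the provenance report for a few datasets."""
    das_dir = './das'  # hypothetical local DAS dump directory
    mcm_dir = './mcm'  # hypothetical local McM dump directory
    recid_file = './recids.py'  # hypothetical file defining RECID_INFO
    doi_info = {}  # hypothetical DOI lookup table
    for dataset in ['/SomePrimary/SomeProcessing/AODSIM']:
        print('-', dataset)
        print_ancestor_information(dataset, das_dir, mcm_dir,
                                   recid_file, doi_info)
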