def get_generator_name(dataset, das_dir, mcm_dir):
    """Return list of generator names used for DATASET.

    Combines the generators recorded in the dataset's own McM dict with
    those of its input (parent) dataset, strips JSON artefacts
    ('"', '\\', '[', ']') from each entry and removes duplicates while
    preserving first-seen order.
    """
    generator_names = []
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    generators = get_from_deep_json(mcm_dict, 'generators') or []

    # Also collect the generators of the input dataset (e.g. the GEN-SIM
    # step that produced this AODSIM), when one is recorded.
    input_generators = []
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        mcm_dict = get_mcm_dict(input_dataset, mcm_dir)
        input_generators = get_from_deep_json(mcm_dict, 'generators') or []

    # FIX: previously input_generators was merged only when the dataset
    # itself also had generators, so a dataset whose own McM dict lacked
    # 'generators' returned [] even when its input dataset had them.
    strip_table = str.maketrans('', '', '"\\[]')  # remove ", \, [, ]
    for item in generators + input_generators:
        cleaned = item.translate(strip_table)
        if cleaned not in generator_names:
            generator_names.append(cleaned)
    return generator_names
def get_prepid_from_mcm(dataset, mcm_dir):
    """Return the prepid for DATASET from the McM store, or None.

    McM records are inconsistent about the key name, so both 'prep_id'
    and 'prepid' are tried.
    """
    # FIX: fetch the McM dict once instead of twice; `is None` per PEP 8.
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    prepid = get_from_deep_json(mcm_dict, 'prep_id')
    if prepid is None:
        # try different queries from the json. prep_id?
        prepid = get_from_deep_json(mcm_dict, 'prepid')
    return prepid
def get_prepId_from_das(dataset, das_dir):
    """Return the prepid for DATASET from the DAS store, or None.

    Tries the das/dataset record ('prep_id') first, then falls back to
    the das/mcm record ('prepid').
    """
    # FIX: `is None` identity check per PEP 8 (was `== None`).
    prepid = get_from_deep_json(
        get_das_store_json(dataset, 'dataset', das_dir), 'prep_id')
    if prepid is None:
        # try to get from das/mcm:
        prepid = get_from_deep_json(
            get_das_store_json(dataset, 'mcm', das_dir), 'prepid')
        # TODO also try different queries from the json. prep_id?
    return prepid
def get_conffile_ids(dataset, das_dir):
    """Return configuration-file IDs for DATASET, duplicates removed.

    Merges the 'byoutputdataset' and 'byinputdataset' entries of the DAS
    config record; insertion order is preserved.
    """
    # FIX: fetch the DAS config JSON once instead of twice.
    config_json = get_das_store_json(dataset, 'config', das_dir)
    byoutput = get_from_deep_json(config_json, 'byoutputdataset')
    byinput = get_from_deep_json(config_json, 'byinputdataset')
    ids = {}  # dict used as an ordered set
    for someid in (byoutput or []):
        ids[someid] = 1
    for someid in (byinput or []):
        ids[someid] = 1
    return list(ids.keys())
def get_number_files(dataset, das_dir):
    """Return number of files for the dataset, or 0 when unknown."""
    das_json = get_das_store_json(dataset, 'dataset', das_dir)
    return get_from_deep_json(das_json, 'nfiles') or 0
def get_size(dataset, das_dir):
    """Return size of the dataset, or 0 when unknown."""
    das_json = get_das_store_json(dataset, 'dataset', das_dir)
    return get_from_deep_json(das_json, 'size') or 0
def get_number_events(dataset, das_dir):
    """Return number of events for the dataset, or 0 when unknown."""
    das_json = get_das_store_json(dataset, 'dataset', das_dir)
    return get_from_deep_json(das_json, 'nevents') or 0
def get_cmssw_version_from_das(dataset, das_dir):
    """Return CMSSW release version from DAS JSON.

    Returns the first 'name' entry of the DAS 'release' record, or an
    empty dict when none is present (mirrors the sibling getters).
    """
    names = get_from_deep_json(
        get_das_store_json(dataset, 'release', das_dir), 'name')
    return names[0] if names else {}
def get_generator_parameters_from_mcm(dataset, mcm_dir):
    """Return generator parameters dictionary for given dataset.

    Takes the first 'generator_parameters' entry of the McM dict;
    returns an empty dict when nothing is found.
    """
    params = get_from_deep_json(get_mcm_dict(dataset, mcm_dir),
                                'generator_parameters')
    return params[0] if params else {}
def get_global_tag(dataset, mcm_dir):
    """Return the global tag ('conditions') from the McM dict, or ''."""
    conditions = get_from_deep_json(get_mcm_dict(dataset, mcm_dir),
                                    'conditions')
    return conditions or ''
def get_cmssw_version_from_mcm(dataset, mcm_dir):
    """Return the CMSSW release from the McM dict, or ''."""
    release = get_from_deep_json(get_mcm_dict(dataset, mcm_dir),
                                 'cmssw_release')
    return release or ''
def get_generator_parameters(dataset, das_dir):
    """Return generator parameters dictionary for given dataset.

    Takes the first 'generator_parameters' entry from the das/mcm
    record; returns an empty dict when nothing is found.
    """
    # TODO get from mcm store instead? and/or from xsecDB
    params = get_from_deep_json(
        get_das_store_json(dataset, 'mcm', das_dir), 'generator_parameters')
    return params[0] if params else {}
def get_parent_dataset(dataset, das_dir):
    "Return parent dataset to the given dataset or an empty string if no parent found."
    # NOTE(review): this function is redefined later in this file; at import
    # time the later definition shadows this one — confirm which is intended.
    parent_dataset = ''
    # Only consult the DAS store when a non-empty cached 'parent' JSON file
    # exists on disk (avoids errors on missing/empty cache entries).
    filepath = das_dir + '/parent/' + dataset.replace('/', '@') + '.json'
    if os.path.exists(filepath) and os.stat(filepath).st_size != 0:
        parent_dataset = get_from_deep_json(
            get_das_store_json(dataset, 'parent', das_dir),
            'parent_dataset')
    return parent_dataset
def get_parent_dataset(dataset, das_dir):
    """Return parent dataset to the given dataset, or '' if none found.

    Lookup failures (missing or malformed DAS 'parent' record) are
    treated as "no parent" rather than an error.
    """
    parent_dataset = ''
    try:
        parent_dataset = get_from_deep_json(
            get_das_store_json(dataset, 'parent', das_dir),
            'parent_dataset')
    # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # narrow to Exception while keeping the best-effort behavior.
    except Exception:
        # troubles getting information about parent
        pass
    return parent_dataset
def get_conffile_ids_from_das(dataset, das_dir, mcm_dir):
    """Return location of the configuration files for the dataset from DAS.

    Reads the 'byoutputdataset' entry of the DAS config record; reports
    an error on stderr when no ID is found. (mcm_dir is accepted for
    interface compatibility but not used here.)
    """
    config_json = get_das_store_json(dataset, 'config', das_dir)
    byoutput = get_from_deep_json(config_json, 'byoutputdataset')
    unique_ids = {}  # dict used as an ordered set
    if byoutput:
        for conf_id in byoutput:
            unique_ids[conf_id] = 1
    else:
        print("Error: No config id found from DAS config for " + dataset,
              file=sys.stderr)
    return list(unique_ids.keys())
def get_dataset_energy(dataset, mcm_dir):
    "Return energy of that dataset in TeV"
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if not mcm_dict:
        # No McM record: fall back to a year-based lookup.
        # NOTE: the fallback returns 0 (an int) for unknown years while the
        # other paths return strings — callers should be aware.
        energies = {
            2010: '7TeV',
            2011: '7TeV',
            2012: '8TeV',
            2015: '13TeV',
            2016: '13TeV',
        }
        return energies.get(get_dataset_year(dataset), 0)
    energy = get_from_deep_json(mcm_dict, 'energy')
    if isinstance(energy, str):
        # Already a string in McM; returned as-is (no 'TeV' suffix added).
        return energy
    # Numeric energy: drop a trailing '.0' and append the unit.
    return str(energy).replace('.0', '') + 'TeV'
def get_genfragment_url(dataset, mcm_dir, das_dir):
    """Return list of URLs of the gen fragments used, or None.

    For AODSIM datasets the cmsDriver script of the input (GEN-SIM)
    dataset is scanned; otherwise the dataset's own script is used.
    Every line containing 'curl' contributes one URL. Returns None when
    no cmsDriver script is cached for the dataset.
    """
    url = []
    # get GEN-SIM dataset
    if get_dataset_format(dataset) == 'AODSIM':
        dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
        input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    else:
        input_dataset = dataset
    script_path = get_cmsDriver_script(input_dataset, mcm_dir)
    # FIX: identity comparison with None (was `== None`).
    if script_path is None:
        return None
    with open(script_path, 'r') as script:
        for line in script:
            if 'curl' in line:
                # FIX: raw string so the \s escape in the regex is explicit.
                curl = re.search(r'(?P<url>https?://[^\s]+)', line)
                if curl:
                    url.append(curl.group('url'))
    return url
def get_conffile_ids_from_mcm(dataset, das_dir, mcm_dir):
    """Return location of the configuration files for the dataset from McM.

    Reads 'config_id' from the McM dict. (das_dir is accepted for
    interface compatibility but not used here.)
    """
    return get_from_deep_json(get_mcm_dict(dataset, mcm_dir), 'config_id')
def get_parent_dataset_from_mcm(dataset, das_dir, mcm_dir):
    "Return parent dataset to given DATASET from McM."
    # 'input_dataset' in the McM record is the parent in the production
    # chain. (das_dir is accepted for interface compatibility, unused.)
    return get_from_deep_json(get_mcm_dict(dataset, mcm_dir), 'input_dataset')
def get_pileup_from_mcm(dataset, mcm_dir):
    """Return pileup_dataset_name for given dataset."""
    return get_from_deep_json(get_mcm_dict(dataset, mcm_dir),
                              'pileup_dataset_name')
def print_ancestor_information(dataset, das_dir, mcm_dir, recid_file, doi_info):
    "All the information we have so far"
    # Prints a Markdown-style summary of everything known about DATASET:
    # record ID, DOI, prepid, global tag, CMSSW version, energy, generators,
    # input dataset, gen fragment URLs, generator parameters, cmsDriver
    # scripts, config files and pile-up info. Also updates module-level
    # statistics counters as a side effect; returns nothing.
    # everything should be a sublist item (4 spaces of indentation):
    # - dataset_name
    #     - info
    # TODO add to this function:
    # - config files present
    #     - step GEN
    #     - step RECO
    #     - step HLT
    # - gen_parameters:
    #     - cross section from XSECDB.
    #       see github issue opendata.cern.ch#1137
    #       ideally we should make a local cache of that.
    # - LHE stuff?
    # - Data popularity from github.com/katilp/cms-data-popularity
    #   ideally we should make a local cache of that.
    # it would be very nice if this printer script needed not external (non cached) information

    # record ID as in OpenData portal
    # TODO move this code to other place, no need to open a file everytime
    # NOTE(review): exec() trusts the recid file's content completely — fine
    # for a local curation script, but confirm it is never user-supplied.
    RECID_INFO = {}
    _locals = locals()
    exec(open(recid_file, 'r').read(), globals(), _locals)
    RECID_INFO = _locals['RECID_INFO']

    try:
        recid = RECID_INFO[dataset]
        print(" - Record ID: [{recid}]({url})".format(
            recid=recid,
            url='http://opendata.cern.ch/record/' + str(recid)))
    # NOTE(review): bare except silently skips datasets without a recid
    except:
        pass

    # DOI
    doi = get_doi(dataset, doi_info)
    if doi:
        print(" - DOI: [{doi}]({url})".format(
            doi=doi, url='https://doi.org/' + str(doi)))

    # PrepId: try DAS first, then fall back to McM
    prepid = get_prepId_from_das(dataset, das_dir)
    if not prepid:
        prepid = get_prepid_from_mcm(dataset, mcm_dir)
    if prepid:
        print(" - PrepId: [{prepid}]({url})".format(
            prepid=prepid,
            url='https://cms-pdmv.cern.ch/mcm/requests?prepid=' + str(prepid)))

    # global tag & cmssw version
    global_tag = get_global_tag(dataset, mcm_dir)
    cmssw_ver = get_cmssw_version(dataset, mcm_dir)
    if global_tag:
        print(" - Global Tag:", global_tag)
    if cmssw_ver:
        print(" - CMSSW version:", cmssw_ver)

    # Energy
    # NOTE(review): get_dataset_energy may already return a 'TeV'-suffixed
    # string, in which case this line prints the unit twice — confirm.
    print(" - Collision Energy: ", get_dataset_energy(dataset, mcm_dir), "TeV")

    # Generators
    generators = get_generator_name(dataset, das_dir, mcm_dir)
    if generators:
        print(" - Generators: ", generators)

    # GEN-SIM dataset used to produce the AODSIM
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')
    if input_dataset:
        print(" - Input Dataset:", input_dataset)
        input_global_tag = get_global_tag(input_dataset, mcm_dir)
        input_cmssw_ver = get_cmssw_version(input_dataset, mcm_dir)
        if input_global_tag:
            print(" - Global Tag:", input_global_tag)
        if input_cmssw_ver:
            print(" - CMSSW version:", input_cmssw_ver)

    gen_fragment = get_genfragment_url(dataset, mcm_dir, das_dir)
    if gen_fragment:
        for url in gen_fragment:
            print(" - Gen Fragment: [{url}]({url})".format(url=url))

    # gen parameters of input dataset
    generator_parameters = get_generator_parameters(dataset, das_dir)
    if generator_parameters:
        print(' - Generator parameters:')
        print(' - Cross section:',
              generator_parameters.get('cross_section', None))
        print(' - Filter efficiency:',
              generator_parameters.get('filter_efficiency', None))
        print(' - Filter efficiency error:',
              generator_parameters.get('filter_efficiency_error', None))
        print(' - Match efficiency:',
              generator_parameters.get('match_efficiency', None))
        print(' - Match efficiency error:',
              generator_parameters.get('match_efficiency_error', None))

    # mcm scripts with cmsDriver instructions
    cmsDriver1 = get_cmsDriver_script(input_dataset, mcm_dir)
    cmsDriver2 = get_cmsDriver_script(dataset, mcm_dir)
    # module-level statistics counters, updated as a side effect
    global DATASETS_WITH_BOTH_CMSDRIVER
    global DATASETS_WITH_CMSDRIVER1
    global DATASETS_WITH_CMSDRIVER2
    if cmsDriver1 or cmsDriver2:
        print(" - cmsDriver scripts:")
    if cmsDriver1:
        print(' - GEN-SIM:', cmsDriver1)
        DATASETS_WITH_CMSDRIVER1 += 1
    if cmsDriver2:
        print(' - RECO-HLT:', cmsDriver2)
        DATASETS_WITH_CMSDRIVER2 += 1
    if cmsDriver1 and cmsDriver2:
        DATASETS_WITH_BOTH_CMSDRIVER += 1

    # python config files: collect for the dataset and all of its ancestors
    conffile_ids = get_conffile_ids(dataset, das_dir)
    parent = get_parent_dataset(dataset, das_dir)
    while parent != '' and parent:
        conffile_ids += get_conffile_ids(parent, das_dir)
        parent = get_parent_dataset(parent, das_dir)
    global DATASETS_WITH_3CONFFILES
    if conffile_ids:
        print(" - python config scripts: ", conffile_ids)
        if len(conffile_ids) > 2:
            DATASETS_WITH_3CONFFILES += 1
    global DATASETS_WITH_FULL_PROVENANCE
    if (cmsDriver1 and cmsDriver2) or len(conffile_ids) > 2:
        DATASETS_WITH_FULL_PROVENANCE += 1

    # pile up information
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    if mcm_dict:
        pileup = get_from_deep_json(mcm_dict, 'pileup')
        pileup_dataset = get_from_deep_json(mcm_dict, 'pileup_dataset_name')
        if pileup or pileup_dataset:
            print(' - pile-up:')
            if pileup:
                print(' -', pileup)
            if pileup_dataset:
                print(' -', pileup_dataset)
        notes = get_from_deep_json(mcm_dict, 'notes')
        if notes != None:
            # some notes have several lines, this makes the markdown use them
            # in the same item list
            print(' - notes:', notes.replace('\n', '\n '))
def mcm_downloader(prepid, dataset, mcm_dir, das_dir):
    "Query dictionary and setup script from McM database"
    # Downloads and caches, for DATASET and (when available) its input
    # dataset: the McM dictionary under mcm_dir/dict/ and the setup script
    # under mcm_dir/scripts/. Failures are reported on stderr; no return
    # value.
    # this function is so ugly... but finally works! You're welcome to refactor it though
    cmd = "curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/{query}/{prepId}"
    # As prep_id in DAS for some datasets can be found with underscores and MCM
    # takes without underscores, we need to process prep_id removing all of them
    if "_" in prepid:
        print("Found some underscores in prep_id: " + prepid + ", removing...")
        prepid = prepid.replace("_", "")
    # NOTE(review): shell=True with interpolated identifiers — acceptable for
    # curated dataset names, but confirm these never come from untrusted input.
    mcm_dict = subprocess.run(cmd.format(query="get", prepId=prepid),
                              shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    mcm_dict_out = str(mcm_dict.stdout.decode("utf-8"))
    mcm_script_out = str(mcm_script.stdout.decode("utf-8"))
    # check if results are not empty
    if mcm_dict_out == '{"results": {}}\n':
        print("[ERROR] Empty McM dict (get) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_dict_out)
    # a body starting with '{' from get_setup is a JSON error payload,
    # not a shell script
    if mcm_script_out == '' or mcm_script_out[0] == '{':
        print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/scripts/" + dataset.replace('/', '@') + ".sh"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_script_out)
    # same thing for "input_dataset": hopefully the GEN-SIM step
    dataset_json = get_das_store_json(dataset, 'mcm', das_dir)
    input_dataset = get_from_deep_json(dataset_json, 'input_dataset')  # /bla/ble/GEN-SIM
    if input_dataset:
        # [1:] drops the dataset's leading '/' for the 'produces' query URL
        mcm_dict = subprocess.run(cmd.format(query="produces", prepId=input_dataset[1:]),
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        mcm_out = str(mcm_dict.stdout.decode("utf-8"))
        # check if results are not empty
        if mcm_out == '{"results": {}}' or mcm_out == '{"results": {}}\n':
            print("[ERROR] Empty McM dict (get) for {ds}".format(ds=input_dataset),
                  file=sys.stderr)
        else:
            outfile = mcm_dir + "/dict/" + input_dataset.replace('/', '@') + ".json"
            with open(outfile, 'w') as dict_file:
                dict_file.write(mcm_out)
            # get_prepid_from_mcm reads the dict file just written above
            prepid = get_prepid_from_mcm(input_dataset, mcm_dir)
            if prepid != None:
                mcm_script = subprocess.run(cmd.format(query="get_setup", prepId=prepid),
                                            shell=True,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)
                if mcm_script.stdout.decode("utf-8")[0] == '{':
                    print("[ERROR] Empty McM script (get_setup) for {ds}".format(ds=input_dataset),
                          file=sys.stderr)
                else:
                    outfile = mcm_dir + "/scripts/" + input_dataset.replace('/', '@') + ".sh"
                    with open(outfile, 'w') as dict_file:
                        dict_file.write(mcm_script.stdout.decode("utf-8"))
            else:
                print("[ERROR] No prep_id in McM Store for record {ds}".format(ds=input_dataset),
                      file=sys.stderr)
    else:
        print("[ERROR] No input_dataset in das_store/mcm for record {ds}".format(ds=dataset),
              file=sys.stderr)
def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir):
    """Return DICT with all information about the generator steps."""
    # Walks the production chain from DATASET back through its parents,
    # building one "step" dict per dataset (release, global tag, generators,
    # configuration files, step type), then reverses the list into
    # production order and applies two post-generation fixes.
    info = {}
    info["description"] = "<p>These data were generated in several steps (see also <a href=\"/docs/cms-mc-production-overview\">CMS Monte Carlo production overview</a>):</p>"
    info["steps"] = []
    input_dataset = dataset
    while input_dataset:
        step = {}
        process = ''
        step['output_dataset'] = input_dataset
        # NOTE(review): get_cmssw_version is called with three arguments here
        # but with two elsewhere in this file — confirm which signature is
        # current.
        release = get_cmssw_version(input_dataset, das_dir, mcm_dir)
        if release:
            step['release'] = release
        global_tag = get_global_tag(input_dataset, mcm_dir)
        if global_tag:
            step['global_tag'] = global_tag
        # cmsDriver production script, if cached locally
        cmsdriver_path = get_cmsDriver_script(input_dataset, mcm_dir)
        step['configuration_files'] = []
        if cmsdriver_path:
            with open(cmsdriver_path) as myfile:
                configuration_files = {}
                configuration_files['title'] = 'Production script'
                configuration_files['script'] = myfile.read()
                if configuration_files:
                    step['configuration_files'].append(configuration_files)
        generator_names = get_generator_name(input_dataset, das_dir, mcm_dir)
        if generator_names:
            step['generators'] = generator_names
        # generator fragment(s): fetched from their URL when reachable
        gen_fragment = get_genfragment_url(input_dataset, mcm_dir, das_dir)
        if gen_fragment:
            for url in gen_fragment:
                configuration_files = {}
                configuration_files['title'] = 'Generator parameters'
                configuration_files['url'] = url
                try:
                    script = requests.get(url, verify=False).text
                    configuration_files['script'] = script
                # NOTE(review): bare except — fragment download is best-effort
                except:
                    pass
                if configuration_files:
                    step['configuration_files'].append(configuration_files)
        # ConfDB configuration files; also used to build the process type
        # NOTE(review): called with three arguments, while get_conffile_ids
        # earlier in this file takes two — confirm which version applies.
        config_ids = get_conffile_ids(input_dataset, das_dir, mcm_dir)
        if config_ids:
            for config_id in config_ids:
                afile = config_id + '.configFile'
                proc = get_process(afile, conf_dir)
                # accumulate the space-separated process type string
                if process:
                    process += " " + proc
                else:
                    process += proc
                configuration_files = {}
                configuration_files['title'] = 'Configuration file'
                configuration_files['process'] = proc
                configuration_files['cms_confdb_id'] = config_id
                globaltag = get_globaltag_from_conffile(afile, conf_dir)
                # only fill global_tag when McM did not already provide one
                if not 'global_tag' in step:
                    step['global_tag'] = globaltag
                step['configuration_files'].append(configuration_files)
        # if we couldn't detect process type from config files, try guessing
        # via extension:
        if not process:
            if input_dataset.endswith('/LHE'):
                process = 'LHE'
            elif input_dataset.endswith('/SIM'):
                process = 'SIM'
            elif input_dataset.endswith('/GEN-SIM'):
                process = 'SIM'
        #if process == 'LHE':
        #    step['note'] = "To get the exact generator parameters, please see <a href=\"/docs/cms-mc-production-overview#finding-the-generator-parameters\">Finding the generator parameters</a>."
        step['type'] = process
        # For cases where SIM and LHE steps are done together
        datatier = get_from_deep_json(get_mcm_dict(input_dataset, mcm_dir),
                                      'datatier')
        if datatier == ["GEN-SIM", "LHE"]:
            step['type'] = "LHE SIM"
            for i, configuration_files in enumerate(step['configuration_files']):
                if configuration_files['title'] == 'Generator parameters':
                    step['configuration_files'][i]['title'] = 'Hadronizer parameters'
        info["steps"].append(step)
        # find parent dataset, first via DAS, then via McM
        input_dataset_das = get_parent_dataset(input_dataset, das_dir)
        input_dataset_mcm = get_parent_dataset_from_mcm(input_dataset,
                                                        das_dir, mcm_dir)
        if input_dataset_mcm == 'Default':  # workaround McM bugs
            input_dataset_mcm = ''
        if input_dataset_das:
            input_dataset = input_dataset_das
        else:
            input_dataset = input_dataset_mcm
    # reverse order of steps for provenance
    info['steps'].reverse()
    # post-generation fix: if we have LHE step, let's modify the
    # configuration file titles for other steps:
    lhe_present = False
    for step in info['steps']:
        if lhe_present:
            for configuration_file in step.get('configuration_files'):
                if configuration_file['title'] == 'Generator parameters':
                    configuration_file['title'] = 'Hadronizer parameters'
        if 'LHE' in step['type']:
            lhe_present = True
    # post-generation fix: keep generators only for the first step, remove
    # from others:
    generators_present = False
    for step in info['steps']:
        if generators_present:
            if 'generators' in step:
                del (step['generators'])
        else:
            if 'generators' in step:
                generators_present = True
    return info
def mcm_downloader(dataset, mcm_dir, das_dir):
    """Query dictionary and setup script from McM database.

    Caches the McM dict under mcm_dir/dict/ and the setup script under
    mcm_dir/scripts/. Skips datasets already cached. The prepid is taken
    from the McM 'produces' answer when available, otherwise from the
    DAS store; without a prepid nothing can be downloaded.
    """
    filepath = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json"
    if os.path.exists(filepath) and os.stat(filepath).st_size != 0:
        print("==> " + dataset + "\n==> Already exist. Skipping...")
        return
    cmd = "curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/"
    mcm_dict = subprocess.run(cmd + "produces" + dataset,
                              shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    mcm_dict_out = str(mcm_dict.stdout.decode("utf-8"))
    prepid = None
    # FIX: the original guard used `or` between the two `!=` tests, which is
    # always true; with `and` the JSON is only parsed when the 'produces'
    # query actually returned something. The DAS fallback below still runs
    # whenever the prepid could not be extracted.
    if mcm_dict_out != '{"results": {}}\n' and mcm_dict_out != '{"results": {}}':
        # get prepid from mcm/dataset; key name varies between records
        prepid = get_from_deep_json(json.loads(mcm_dict_out), "prepid")
        if prepid is None:
            prepid = get_from_deep_json(json.loads(mcm_dict_out), "prep_id")
    if prepid is None:
        prepid = get_prepId_from_das(dataset, das_dir)
    if prepid is None:
        print("Error: prepid not found in mcm, das, and das/mcm for " + dataset +
              "\n==> Skipping dataset McM dict and script",
              file=sys.stderr)
        return
    # check if McM dict is empty try to get it by das prepid ( /get/perpid instead of /produces/dataset)
    if mcm_dict_out == '{"results": {}}\n' or mcm_dict_out == '{"results": {}}':
        mcm_dict = subprocess.run(cmd + "get/" + prepid,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        mcm_dict_out = str(mcm_dict.stdout.decode("utf-8"))
    # check if it is still empty (then there is no way to get dataset McM dict)
    if mcm_dict_out == '{"results": {}}\n' or mcm_dict_out == '{"results": {}}':
        print("[ERROR] Empty McM dict (get) for {ds} \n with prepid {pd}".format(
            ds=dataset, pd=prepid), file=sys.stderr)
    else:
        outfile = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_dict_out)
    # setup script; a body starting with '{' is a JSON error payload,
    # not a shell script
    mcm_script = subprocess.run(cmd + "get_test/" + prepid,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    mcm_script_out = str(mcm_script.stdout.decode("utf-8"))
    if mcm_script_out == '' or mcm_script_out[0] == '{':
        print("[ERROR] Empty McM script (get_test) for {ds}".format(ds=dataset),
              file=sys.stderr)
    else:
        outfile = mcm_dir + "/scripts/" + dataset.replace('/', '@') + ".sh"
        with open(outfile, 'w') as dict_file:
            dict_file.write(mcm_script_out)