def add_datasets(self, input_prim_proc_acq_tier_config, number=1):
    "Add datasets for a given primary dataset/era/tier/configs input"
    idict = deepcopy(input_prim_proc_acq_tier_config)
    prim_val = idict['prim_ds']
    proc_ver = idict['processing_era']
    acq_era = idict['acquisition_era']
    tier = idict['tier']
    config = idict['configs']
    # normalize single dict entries into one-element lists
    func = lambda x: [x] if isinstance(x, dict) else x
    prim_val = func(prim_val)
    proc_ver = func(proc_ver)
    acq_era = func(acq_era)
    tier = func(tier)
    config = func(config)
    output = []
    for item_prim, item_proc, item_acq, item_tier, item_config \
            in zip(prim_val, proc_ver, acq_era, tier, config):
        prim = item_prim['primary_ds_name']
        acq = item_acq['acquisition_era_name']
        data_tier = item_tier['data_tier_name']
        processing_version = item_proc['processing_version']
        attrs = {'prim': prim,
                 'processing_version': processing_version,
                 'acquisition_era_name': acq,
                 'tier': data_tier,
                 'output_configs': [item_config]}
        res = self.datasets(number, **attrs)
        for row in res:
            output.append(row['dataset'])
    idict['dataset'] = output
    return idict
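# A minimal standalone sketch (not from the provider API) of the input
# normalization used in add_datasets: each key may hold a single dict or a
# list of dicts, and zip() pairs the lists positionally. The sample values
# below are illustrative only.
def _normalize_example():
    to_list = lambda x: [x] if isinstance(x, dict) else x
    prims = to_list({'primary_ds_name': 'RelValMinBias'})
    eras = to_list([{'acquisition_era_name': 'EraA'},
                    {'acquisition_era_name': 'EraB'}])
    # zip stops at the shortest input, so a single prim pairs with EraA only
    return [(p['primary_ds_name'], e['acquisition_era_name'])
            for p, e in zip(prims, eras)]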
def add_files(self, input_dataset, number_of_files=1):
    "Add files to a given dataset"
    record = deepcopy(input_dataset)
    block = record['block']
    if not isinstance(block, list):
        block = [block]
    output = []
    for rec in block:
        # block name has the form /primary/processed/tier#uid
        _, prim, proc, tier = rec['block_name'].split('#')[0].split('/')
        attrs = {'prim': prim, 'proc': proc, 'tier': tier,
                 'block_name': rec['block_name'],
                 'output_configs': record['configs']}
        # accumulate files across all blocks instead of keeping only the
        # last block's result
        output += self.files(number_of_files, **attrs)
    return output
def add_blocks(self, input_dataset, number_of_blocks=1):
    "Add blocks to a given dataset"
    dataset = deepcopy(input_dataset)
    name = dataset['dataset']['name']
    res = self.blocks(number_of_blocks)
    for row in res:
        buid = generate_block_uid()
        row['block']['name'] = '%s#%s' % (name, buid)
    # only open datasets accept new blocks
    if dataset['dataset']['is-open'] == 'y':
        blocks = dataset['dataset']['blocks']
        blocks += res
    return dataset
def add_blocks(self, input_dataset, number=1):
    "Add blocks to a given dataset"
    idict = deepcopy(input_dataset)
    datasets = idict['dataset']
    if isinstance(datasets, dict):
        datasets = [datasets]
    output = []
    for item in datasets:
        # dataset name has the form /primary/processed/tier
        _, prim, proc, tier = item['dataset'].split('/')
        attrs = {'prim': prim, 'proc': proc, 'tier': tier}
        res = self.blocks(number, **attrs)
        for row in res:
            output.append(row['block'])
    idict['block'] = output
    return idict
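# Standalone illustration of the naming convention the methods above rely on:
# a dataset path is '/<primary>/<processed>/<tier>' and a block name appends
# '#<uid>'. The concrete names and uid here are made up for the example.
def _split_names_example():
    dataset_name = '/RelValMinBias/LC_Test-v1/GEN-SIM'
    block_name = dataset_name + '#123e4567-e89b-12d3-a456-426614174000'
    _, prim, proc, tier = dataset_name.split('/')
    assert block_name.split('#')[0].split('/')[1:] == [prim, proc, tier]
    return prim, proc, tier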
def dataset(self):
    "return dataset object"
    if not hasattr(self, '_dataset'):
        self.generate_dataset()
    phedex_data = {'dbs_name': self.dbs_name}
    dataset = deepcopy(self._dataset)
    dataset.update({'is-open': self.dataset_is_open})
    for block in dataset['blocks']:
        # update block information with file count and total size
        size = sum([f['file']['bytes'] for f in block['block']['files']])
        block['block'].update({"nfiles": len(block['block']['files']),
                               "size": size})
    phedex_data.update(dict(dataset=dataset))
    return phedex_data
def gen_runs(self, file_record, number_of_runs=1):
    "Generate run/lumis for a given file record"
    if not isinstance(file_record, dict) or \
            'file' not in file_record or \
            'logical_file_name' not in file_record['file']:
        msg = 'To generate run/lumis please provide valid file record/JSON file'
        raise Exception(msg)
    row = deepcopy(file_record)
    records = []
    for _ in range(0, number_of_runs):
        run = random.randint(100000, 200000)
        # each run gets a random number (1-10) of lumi sections
        for _ in range(0, random.randint(1, 10)):
            lumi = random.randint(1, 100)
            rec = {'run_num': str(run), 'lumi_section_num': str(lumi)}
            records.append(rec)
    row['file']['file_lumi_list'] = records
    return row
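# The validation at the top of gen_runs only requires a dict with a 'file'
# key whose value carries a 'logical_file_name'. A minimal stand-in record
# that would pass that check (the LFN below is made up):
minimal_file_record = {
    'file': {
        'logical_file_name': '/store/mc/fake/LC_Test/GEN-SIM/file.root',
    }
}
# e.g. provider.gen_runs(minimal_file_record, number_of_runs=2)
# (assumes a provider instance exposing gen_runs)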
def add_files(self, input_dataset, number_of_files=1):
    "Add files to a given dataset"
    dataset = deepcopy(input_dataset)
    for block in dataset['dataset']['blocks']:
        # skip closed blocks
        if block['block']['is-open'] != 'y':
            continue
        block_name = block['block']['name']
        _, prim, proc, tier = block_name.split('#')[0].split('/')
        attrs = {'prim': prim, 'proc': proc, 'tier': tier}
        if "tags" in dataset['dataset']:
            attrs['tags'] = dataset['dataset']['tags']
        res = self.files(number_of_files, **attrs)
        size = 0
        for row in res:
            size += row['file']['bytes']
        if block['block']['files']:
            block['block']['files'] += res
            block['block']['size'] += size
            block['block']['nfiles'] += len(res)
        else:
            block['block']['files'] = res
            block['block']['size'] = size
            block['block']['nfiles'] = len(res)
    return dataset
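# Standalone sketch of the per-block bookkeeping done in dataset() and
# add_files() above: 'nfiles' and 'size' are derived from the attached file
# records. The block layout mirrors the code above; the byte counts are
# made up.
def _block_bookkeeping_example():
    block = {'block': {'name': '/P/Q/TIER#uid', 'is-open': 'y',
                       'files': [{'file': {'bytes': 1000}},
                                 {'file': {'bytes': 2500}}]}}
    size = sum([f['file']['bytes'] for f in block['block']['files']])
    block['block'].update({'nfiles': len(block['block']['files']),
                           'size': size})
    return block  # size == 3500, nfiles == 2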
def workflow(fin, fout, verbose=None):
    "LifeCycle workflow"
    initial_payload = None  # initial payload, should be provided by LifeCycle
    new_payload = []  # newly created payloads will be returned by LifeCycle

    with open(fin, 'r') as source:
        initial_payload = json.load(source)

    if verbose:
        print "\n### input workflow"
        print pprint.pformat(initial_payload)

    ### read inputs from payload
    workflow = initial_payload['workflow']

    # check if input are read from configuration file
    try:
        cfg = workflow['DataProviderCfg']
    except KeyError:
        # No configuration, try to use values provided in the workflow
        # for backward compatibility; values read via get are optional
        cdict = {'process': {'NumberOfDatasets': workflow['NumberOfDatasets'],
                             'NumberOfBlocks': workflow['NumberOfBlocks'],
                             'NumberOfFiles': workflow['NumberOfFiles'],
                             'NumberOfRuns': workflow['NumberOfRuns'],
                             'NumberOfLumis': workflow['NumberOfLumis']},
                 'dbs': {'DBSSkipFileFail': workflow.get('DBSSkipFileFail', None),
                         'DBSChangeCksumFail': workflow.get('DBSChangeCksumFail', None),
                         'DBSChangeSizeFail': workflow.get('DBSChangeSizeFail', None)},
                 'phedex': {'PhedexSkipFileFail': workflow.get('PhedexSkipFileFail', None),
                            'PhedexChangeCksumFail': workflow.get('PhedexChangeCksumFail', None),
                            'PhedexChangeSizeFail': workflow.get('PhedexChangeSizeFail', None),
                            'PhedexDBSName': workflow['PhedexDBSName']}}
    else:
        cdict = read_configparser(cfg)

    process_cfg = cdict['process']
    dbs_cfg = cdict['dbs']
    phedex_cfg = cdict['phedex']

    phedex_dbs_name = phedex_cfg.get('PhedexDBSName')
    number_of_datasets = int(process_cfg.get('NumberOfDatasets'))
    number_of_blocks = int(process_cfg.get('NumberOfBlocks'))
    number_of_files = int(process_cfg.get('NumberOfFiles'))
    number_of_runs = int(process_cfg.get('NumberOfRuns'))
    number_of_lumis = int(process_cfg.get('NumberOfLumis'))

    try:
        phedex_file = float(phedex_cfg.get('PhedexSkipFileFail'))
        phedex_cksum = float(phedex_cfg.get('PhedexChangeCksumFail'))
        phedex_size = float(phedex_cfg.get('PhedexChangeSizeFail'))
        dbs_file = float(dbs_cfg.get('DBSSkipFileFail'))
        dbs_cksum = float(dbs_cfg.get('DBSChangeCksumFail'))
        dbs_size = float(dbs_cfg.get('DBSChangeSizeFail'))
    # if value is None, the cast will fail, which means no failures are used
    except TypeError:
        failure_rates = None
    else:
        failure_rates = dict(PhedexSkipFileFail=phedex_file)
        failure_rates.update(PhedexChangeCksumFail=phedex_cksum)
        failure_rates.update(PhedexChangeSizeFail=phedex_size)
        failure_rates.update(DBSSkipFileFail=dbs_file)
        failure_rates.update(DBSChangeCksumFail=dbs_cksum)
        failure_rates.update(DBSChangeSizeFail=dbs_size)

    print failure_rates

    phedex_provider = PhedexProvider(dbs_name=phedex_dbs_name,
                                     failure_rates=failure_rates)
    dbs_provider = DBSProvider(failure_rates=failure_rates)

    for _ in xrange(number_of_datasets):
        # clone initial payload
        payload = deepcopy(initial_payload)
        phedex_provider.generate_dataset()
        phedex_provider.add_blocks(number_of_blocks)
        phedex_provider.add_files(number_of_files)
        payload['workflow']['Phedex'] = [phedex_provider.dataset()]
        payload['workflow']['DBS'] = dbs_provider.block_dump(number_of_runs,
                                                             number_of_lumis)
        phedex_provider.reset()
        new_payload.append(payload)

    with open(fout, 'w') as output:
        json.dump(new_payload, output)

    if verbose:
        print "\n### output workflow"
        print pprint.pformat(new_payload)
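# Minimal input payload for the backward-compatibility branch of workflow()
# (no 'DataProviderCfg' key). Only the keys read above are included; the
# values and file names are made-up examples. Omitting the *Fail keys makes
# the float(None) cast raise TypeError, so no failure rates are used.
def _workflow_example():
    payload = {'workflow': {'NumberOfDatasets': 1,
                            'NumberOfBlocks': 2,
                            'NumberOfFiles': 5,
                            'NumberOfRuns': 1,
                            'NumberOfLumis': 10,
                            'PhedexDBSName': 'global'}}
    with open('lifecycle_in.json', 'w') as fds:
        json.dump(payload, fds)
    workflow('lifecycle_in.json', 'lifecycle_out.json', verbose=1)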