def checkForSubmission(self):
    """Additional checks for unit submission"""

    # call the base class
    if not super(AtlasUnit, self).checkForSubmission():
        return False

    # Check that parent units are complete. ITransform.update checks for
    # submission before performing submissions, so otherwise the parent
    # datasets may not have been created yet.
    if not self.checkParentUnitsAreComplete():
        return False

    # Add a check for chain units to have frozen their input DS
    if len(self.req_units) > 0 and self.inputdata._name == "DQ2Dataset" and not self.inputdata.tag_info:

        # check datasets are frozen
        for uds in self.inputdata.dataset:
            try:
                dq2_lock.acquire()
                try:
                    # list datasets in container
                    ds_list = dq2.listDatasetsInContainer(uds)
                    cont_ok = True
                    for ds in ds_list:
                        # find locations and check if frozen
                        loc_dict = dq2.listDatasetReplicas(ds)
                        locations = []
                        for loc in loc_dict[loc_dict.keys()[0]]:
                            locations += loc_dict[loc_dict.keys()[0]][loc]

                        ds_ok = False
                        for loc in locations:
                            if loc == "":
                                continue
                            datasetsiteinfo = dq2.listFileReplicas(loc, ds)
                            if datasetsiteinfo[0]['found'] is not None:
                                ds_ok = True
                                break

                        if not ds_ok:
                            cont_ok = False
                            break
                except Exception:
                    logger.warning("Unable to check if datasets are frozen")
                    cont_ok = False
            finally:
                dq2_lock.release()

            # at least one dataset wasn't frozen
            if not cont_ok:
                return False

    return True
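# Illustrative sketch, not part of the original module: the
# acquire()/try/finally pattern around dq2_lock recurs in every method here.
# Assuming dq2_lock follows the standard threading.Lock interface, a small
# context manager (hypothetical helper) would make the locking harder to get
# wrong:

from contextlib import contextmanager

@contextmanager
def dq2_locked():
    """Hold dq2_lock while the 'with' body runs; release it on any exit."""
    dq2_lock.acquire()
    try:
        yield
    finally:
        dq2_lock.release()

# With this helper, the frozen-dataset check above could be written as:
#     with dq2_locked():
#         ds_list = dq2.listDatasetsInContainer(uds)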
def check(self):
    super(AnaTransform, self).check()
    if not self.inputdata.dataset:
        return
    if not self.backend:
        logger.warning("Determining backend and cloud...")

        # Get ddm sites of atlas_dbrelease, if present
        db_sites = None
        if self.application.atlas_dbrelease == "LATEST":
            from pandatools import Client
            self.application.atlas_dbrelease = getPandaClient().getLatestDBRelease(False)
        if self.application.atlas_dbrelease:
            try:
                db_dataset = self.application.atlas_dbrelease.split(':')[0]
                try:
                    dq2_lock.acquire()
                    db_locations = dq2.listDatasetReplicas(db_dataset).values()[0][1]
                finally:
                    dq2_lock.release()
            except Exception as x:
                raise ApplicationConfigurationError(x, 'Problem in AnaTask - j.application.atlas_dbrelease is wrongly configured!')
            db_sites = stripSites(db_locations)

        # Get complete/incomplete ddm sites for input dataset
        ds = self.inputdata.dataset[0]
        try:
            dq2_lock.acquire()
            if ds[-1] != "/":
                try:
                    replicas = {ds: dq2.listDatasetReplicas(ds)}
                except DQUnknownDatasetException:
                    ds += "/"
            if ds[-1] == "/":
                replicas = dq2.listDatasetReplicasInContainer(ds)
        finally:
            dq2_lock.release()

        # check that every tid dataset has at least one replica
        for tid in replicas:
            if len(replicas[tid]) == 0:
                raise ApplicationConfigurationError(None, "No replicas for dataset %s found!" % tid)
        replicas = [r.values()[0] for r in replicas.values()]  # (each replica dict has only one entry)

        # Get allowed sites for each backend:
        backends = [be for be in config["backendPreference"] if be in GPI.__dict__]
        allowed_sites = {}
        if "LCG" in backends:
            allowed_sites["LCG"] = GPI.LCG().requirements.list_sites(True, True)
        if "Panda" in backends:
            from pandatools import Client
            allowed_sites["Panda"] = getPandaClient().PandaSites.keys()
            #allowed_sites["Panda"] = [site["ddm"] for site in Client.getSiteSpecs()[1].values()]
        if "NG" in backends:
            allowed_sites["NG"] = getConfig("Athena")["AllowedSitesNGDQ2JobSplitter"]
        #if "PBS" in backends:
        #    allowed_sites["PBS"] = []  # should be local DQ2 storage element!

        # Get the list of cloud-backend pairs (cbl) common to all complete
        # replicas (see the standalone sketch after this method)
        common_cbl = None
        for r in replicas:
            cbl = self.findCompleteCloudBackend(db_sites, allowed_sites, r)
            if common_cbl is None:
                common_cbl = cbl
            else:
                common_cbl = [cb for cb in cbl if cb in common_cbl]
        #print "CLOUD/BACKEND list for COMPLETE replicas: ", common_cbl

        # ..and for incomplete replicas
        if common_cbl is None or len(common_cbl) == 0:
            if len(replicas) > 1:
                raise ApplicationConfigurationError(None, 'Container dataset %s has no complete replica on one site and backend. Please specify individual tid datasets or use t.initializeFromDataset("%s")' % (ds, ds))
            common_cbl = self.findIncompleteCloudBackend(db_sites, allowed_sites, replicas[0])
            #print "CLOUD/BACKEND list for INCOMPLETE replicas: ", common_cbl
            if common_cbl is None or len(common_cbl) == 0:
                raise ApplicationConfigurationError(None, 'Container dataset %s has no replica on one site and backend. Please specify individual tid datasets!' % (ds))

        cb = common_cbl[0]
        using_cloud = cb[0]
        using_backend = cb[1]
        assert using_cloud and using_backend

        if using_backend == "Panda":
            self.backend = stripProxy(GPI.Panda())
            self.backend.requirements.cloud = using_cloud
        elif using_backend == "NG":
            self.backend = stripProxy(GPI.NG())
        elif using_backend == "LCG":
            self.backend = stripProxy(GPI.LCG())
            self.backend.requirements = stripProxy(GPI.AtlasLCGRequirements())
            self.backend.requirements.cloud = using_cloud
        assert self.backend
        logger.warning("Running on cloud %s using backend %s", using_cloud, using_backend)

    logger.warning("Determining partition splitting...")
    try:
        if not self.backend.requirements.cloud:
            self.backend.requirements.cloud = "DE"
    except AttributeError:
        # not every backend exposes requirements.cloud
        pass
    if not self.inputdata.dataset:
        return
    splitter = DQ2JobSplitter()
    splitter.numfiles = self.files_per_job
    #splitter.update_siteindex = False  # commented to use default value
    #splitter.use_lfc = True
    sjl = splitter.split(self)  # This works even for Panda; no special "Job" properties are used anywhere.
    self.partitions_data = [sj.inputdata for sj in sjl]
    try:
        self.partitions_sites = [sj.backend.requirements.sites for sj in sjl]
    except AttributeError:
        self.partitions_sites = [sj.backend.site for sj in sjl]
    self.setPartitionsLimit(len(self.partitions_data) + 1)
    self.setPartitionsStatus([c for c in range(1, len(self.partitions_data) + 1)
                              if self.getPartitionStatus(c) != "completed"], "ready")
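# Illustrative sketch, not part of the original module: the common_cbl
# computation in check() is a running intersection - a (cloud, backend) pair
# survives only if every replica offers it. A standalone version of that loop
# (hypothetical helper, made-up example data):

def intersect_cloud_backend_pairs(pair_lists):
    """Return the (cloud, backend) pairs present in every list of pair_lists."""
    common = None
    for pairs in pair_lists:
        if common is None:
            common = list(pairs)
        else:
            # keep only the pairs seen in all lists so far
            common = [cb for cb in pairs if cb in common]
    return common or []

# Example: only ("DE", "Panda") is available for all three replicas.
# intersect_cloud_backend_pairs([
#     [("DE", "Panda"), ("UK", "LCG")],
#     [("DE", "Panda"), ("DE", "LCG")],
#     [("DE", "Panda")],
# ])  ->  [("DE", "Panda")]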
def checkCompletedApp(self, app):
    task = self._getParent()
    j = app._getParent()
    for odat in j.outputdata.outputdata:
        # Look out: if this is changed, there is another one like it below!
        if 0 == len([f for f in j.outputdata.output if ".".join(odat.split(".")[:-1]) in f]):
            logger.error("Job %s has not produced %s file, only: %s" % (j.id, odat, j.outputdata.output))
            return False

    # if this is the first app to complete the partition...
    if self.getPartitionStatus(self._app_partition[app.id]) != "completed":
        task_container, subtask_dsname = task.container_name, self.dataset_name

        infos = {}
        for oinfo in j.outputdata.output:
            try:
                dq2_lock.acquire()
                info = oinfo.split(",")
                # get master replica from dataset - info is not set to the SE,
                # but to ANALY_XYZ from panda
                master_replica = dq2.getMasterReplicaLocation(info[0])
                if master_replica:
                    info[5] = master_replica
                else:
                    replicas = dq2.listDatasetReplicas(info[0]).values()
                    if len(replicas) == 0:
                        try:
                            info[5] = getPandaClient().PandaSites[info[5]]["ddm"]
                        except KeyError:
                            pass
                    else:
                        complete, incomplete = replicas[0].values()
                        info[5] = (complete + incomplete)[0]
                if info[4][:3] == "ad:":
                    info[4] = info[4][3:]
            finally:
                dq2_lock.release()

            datasetname = subtask_dsname + '.' + info[5]
            info[0] = datasetname
            infos.setdefault(datasetname, []).append(",".join(info))

        for ds in infos.keys():
            outputdata = DQ2OutputDataset()
            try:
                outputdata.create_dataset(ds)
            except DQDatasetExistsException:
                pass
            try:
                outputdata.register_datasets_details(None, infos[ds])
            except DQFileExistsInDatasetException:
                pass

        # Register Container
        try:
            containerinfo = {}
            dq2_lock.acquire()
            try:
                containerinfo = dq2.listDatasets(task_container)
            except Exception:
                containerinfo = {}
            if containerinfo == {}:
                try:
                    dq2.registerContainer(task_container)
                    logger.debug('Registered container for Task %i: %s' % (task.id, task_container))
                except Exception as x:
                    logger.error('Problem registering container for Task %i, %s: %s %s' % (task.id, task_container, x.__class__, x))
            for ds in infos.keys():
                try:
                    dq2.registerDatasetsInContainer(task_container, [ds])
                except DQContainerAlreadyHasDataset:
                    pass
                except Exception as x:
                    logger.error('Problem registering dataset %s in container %s: %s %s' % (subtask_dsname, task_container, x.__class__, x))
        finally:
            dq2_lock.release()
    return True
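# Illustrative sketch, not part of the original module: the completeness check
# at the top of checkCompletedApp() strips the last extension from each
# expected output name and looks for the stem as a substring of the produced
# file records, so "ntuple.root" also matches e.g. "ntuple.root.1". A
# standalone version (hypothetical helper, made-up file names):

def produced(expected, outputs):
    """True if some produced file record contains the expected name's stem."""
    stem = ".".join(expected.split(".")[:-1])
    return any(stem in f for f in outputs)

# produced("ntuple.root", ["user.me.123.ntuple.root.1,guid,1,9,SITE"])  -> True
# produced("hist.root",   ["user.me.123.ntuple.root.1,guid,1,9,SITE"])  -> False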