def getOSFEntry(dagManJobId):
    """
    Find all the blackboard entries associated with a DAG and return them in
    an OSF-like form.

    `dagManJobId` is either a bare DAGMan ClusterId or a full GlobalJobId of
    the form <submit host>#<ClusterId>.0#<whatever>.

    Returns a tuple (Dataset, Owner, DAGManJobId, entries): the first three
    values are read from the first matching entry and `entries` is the list
    of all matching Blackboard rows ordered by ClusterId and then ProcId.
    Returns None when `dagManJobId` is empty or when nothing matches.

    WARNING: a bare ClusterId (which is what is stored in the database as
    well as in each Job ClassAd) works, but it is risky: we may return
    Jobs/Blackboard entries that are not associated with that DAG and simply
    happen to have the same DAGManJobId ClusterId (e.g. because they were
    submitted on a different host, or because Condor was reinstalled in the
    meantime). It is therefore always better to pass a GlobalJobId, or at
    least to fabricate one of the right form:
        <submit host>#<ClusterId>.0#<whatever>
    Even that is not completely safe, as Condor might have been reinstalled
    and the ClusterId counter reset.
    """
    # Define the database connection.
    elixir.metadata.bind = DATABASE_CONNECTION_STR
    elixir.metadata.bind.echo = False
    elixir.setup_all()

    if not dagManJobId:
        return None

    # See if dagManJobId is a global job id or a local one. For a global id
    # keep the submit host and reduce the id to its integer ClusterId.
    submit_host = None
    if condorutils.is_globaljobid(dagManJobId):
        submit_host, jobId, _ = condorutils.parse_globaljobid(dagManJobId)
        dagManJobId = int(jobId.split('.')[0])

    query = Blackboard.query.filter_by(DAGManJobId=unicode(dagManJobId))
    if submit_host:
        # This is why global job ids are much safer (but not super safe).
        # Anchor the prefix on the '#' separator that follows the host in a
        # GlobalJobId, so that host "node1" does not also match jobs that
        # were submitted from "node10" or "node1.example.org".
        host_prefix = unicode(submit_host) + u'#'
        query = query.filter(Blackboard.GlobalJobId.startswith(host_prefix))

    entries = query.order_by(Blackboard.ClusterId, Blackboard.ProcId).all()
    if not entries:
        return None

    # Now build the OSF-like entry from the first row plus the full list.
    first = entries[0]
    return (first.Dataset, first.Owner, first.DAGManJobId, entries)
def getOSFEntry(dagManJobId):
    """
    Find all the blackboard entries associated with a DAG and return them in
    an OSF-like form.

    `dagManJobId` is either a bare DAGMan ClusterId or a full GlobalJobId of
    the form <submit host>#<ClusterId>.0#<whatever>.

    Returns a tuple (Dataset, Owner, DAGManJobId, entries): the first three
    values are read from the first matching entry and `entries` is the list
    of all matching Blackboard rows ordered by ClusterId and then ProcId.
    Returns None when `dagManJobId` is empty or when nothing matches.

    WARNING: a bare ClusterId (which is what is stored in the database as
    well as in each Job ClassAd) works, but it is risky: we may return
    Jobs/Blackboard entries that are not associated with that DAG and simply
    happen to have the same DAGManJobId ClusterId (e.g. because they were
    submitted on a different host, or because Condor was reinstalled in the
    meantime). It is therefore always better to pass a GlobalJobId, or at
    least to fabricate one of the right form:
        <submit host>#<ClusterId>.0#<whatever>
    Even that is not completely safe, as Condor might have been reinstalled
    and the ClusterId counter reset.
    """
    # Define the database connection.
    elixir.metadata.bind = DATABASE_CONNECTION_STR
    elixir.metadata.bind.echo = False
    elixir.setup_all()

    if not dagManJobId:
        return None

    # See if dagManJobId is a global job id or a local one. For a global id
    # keep the submit host and reduce the id to its integer ClusterId.
    submit_host = None
    if condorutils.is_globaljobid(dagManJobId):
        submit_host, jobId, _ = condorutils.parse_globaljobid(dagManJobId)
        dagManJobId = int(jobId.split('.')[0])

    query = Blackboard.query.filter_by(DAGManJobId=unicode(dagManJobId))
    if submit_host:
        # This is why global job ids are much safer (but not super safe).
        # Anchor the prefix on the '#' separator that follows the host in a
        # GlobalJobId, so that host "node1" does not also match jobs that
        # were submitted from "node10" or "node1.example.org".
        host_prefix = unicode(submit_host) + u'#'
        query = query.filter(Blackboard.GlobalJobId.startswith(host_prefix))

    entries = query.order_by(Blackboard.ClusterId, Blackboard.ProcId).all()
    if not entries:
        return None

    # Now build the OSF-like entry from the first row plus the full list.
    first = entries[0]
    return (first.Dataset, first.Owner, first.DAGManJobId, entries)
def _fix_dagman_job_id(self):
    """
    Expand a bare DAGManJobId into a GlobalJobId-like string.

    The job classad only carries the parent DAGMan ClusterId in DAGManJobId.
    When that attribute and the job environment are both available, it is
    rewritten in place as <host>#<ClusterId>.0#<timestamp>, where the host
    is parsed out of this job's own GlobalJobId and the timestamp is the
    last ':'-separated field of CONDOR_PARENT_ID in the job environment.

    Nothing happens when DAGManJobId is missing/falsy or when the
    environment dictionary is empty. An Exception is raised when the
    environment is present but does not define CONDOR_PARENT_ID.
    """
    # Condor ClusterIds start from 1, not 0, so a plain truth test is enough
    # to tell "no DAGManJobId" apart from a real one. A classad that has a
    # DAGManJobId is assumed to be a Job classad.
    cluster_id = getattr(self, 'DAGManJobId', None)
    if not cluster_id:
        return

    # Without the job environment there is no CONDOR_PARENT_ID to use.
    if not self.environmentdict:
        return

    # Per the original author's note, CONDOR_PARENT_ID looks like
    # submit_host:integer:timestamp.
    parent_id = self.environmentdict.get('CONDOR_PARENT_ID', '')
    if not parent_id:
        raise Exception(
            'CONDOR_PARENT_ID not defined in Job Environment string.')

    submit_timestamp = parent_id.split(':')[-1]
    host, _cluster_proc, _stamp = condorutils.parse_globaljobid(
        self.GlobalJobId)
    self.DAGManJobId = '%s#%s.0#%s' % (host, cluster_id, submit_timestamp)
    return