def updateQuery(self, resubmit=False):
    """Update the dataset information of all attached transforms. This will
    include any new data in the processing, or re-run jobs that have data
    which has been removed."""
    # Tried to use multithreading; better to check the TaskRegistry class.
    # Also tried multiprocessing, but the bottleneck is at the server.
    for t in self.transforms:
        try:
            t.updateQuery(resubmit)
        except GangaException as e:
            logger.warning(str(e))
            continue

    # update the status of the Task in case we've started running again
    self.updateStatus()
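# Minimal usage sketch for updateQuery(), assuming an interactive Ganga
# session where `tasks` is the GPI task registry (the index is illustrative):
#
#   t = tasks(0)                   # an existing LHCbTask
#   t.updateQuery()                # pick up any new data from the queries
#   t.updateQuery(resubmit=True)   # also re-run jobs whose data was removed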
def updateStatus(self, status):
    """Update status hook"""
    # check for input data deletion of chain data
    if status == "completed" and self._getParent().delete_chain_input and len(self.req_units) > 0:

        # the inputdata field *must* be filled from the parent task
        # NOTE: When changing to inputfiles, will probably need to check
        # for any specified in trf.inputfiles

        # check that the parent replicas have been copied by checking
        # backend status == Done
        job_list = []
        for req_unit in self.req_units:

            trf = self._getParent()._getParent().transforms[
                int(req_unit.split(":")[0])]
            req_unit_id = req_unit.split(":")[1]

            if req_unit_id != "ALL":
                unit = trf.units[int(req_unit_id)]
                job_list.append(GPI.jobs(unit.active_job_ids[0]))
            else:
                for unit in trf.units:
                    job_list.append(GPI.jobs(unit.active_job_ids[0]))

        for j in job_list:
            if j.subjobs:
                for sj in j.subjobs:
                    if sj.backend.status != "Done":
                        return
            else:
                if j.backend.status != "Done":
                    return

        job = GPI.jobs(self.active_job_ids[0])
        for f in job.inputdata.files:
            logger.warning("Removing chain inputdata file '%s'..." % f.name)
            f.remove()

    super(LHCbUnit, self).updateStatus(status)
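# Sketch of the req_units bookkeeping this hook parses (values are
# illustrative): each entry encodes "<transform index>:<unit index>", and a
# unit index of "ALL" matches every unit of that transform.
#
#   req_units = ["0:2", "1:ALL"]
#   trf_id, unit_id = req_units[0].split(":")   # -> ("0", "2")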
def removeUnusedData(self):
    """Remove any output data from orphaned jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing data from job '%d'..." % jid)
                job = GPI.jobs(jid)

                jlist = []
                if len(job.subjobs) > 0:
                    jlist = job.subjobs
                else:
                    jlist = [job]

                for sj in jlist:
                    for f in sj.outputfiles:
                        # only remove grid files, i.e. DiracFiles with a
                        # registered LFN
                        if isType(f, DiracFile) and f.lfn:
                            f.remove()
            except Exception:
                logger.error("Problem deleting data for job '%d'" % jid)
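# Usage sketch (assumes `tasks(0)` is an LHCbTask in a GPI session; indices
# are illustrative): output left behind by superseded jobs is cleaned per
# transform.
#
#   trf = tasks(0).transforms[0]
#   trf.removeUnusedData()   # deletes DiracFile output of orphaned jobs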
def createUnits(self):
    """Create new units if required given the inputdata"""

    # call parent for chaining
    super(LHCbTransform, self).createUnits()

    if len(self.inputdata) > 0:
        # check for conflicting input
        if self.mc_num_units > 0:
            logger.warning("Inputdata specified - MC Event info ignored")

        # loop over input data and see if we need to create any more units
        import copy
        for id, inds in enumerate(self.inputdata):

            if not isType(inds, LHCbDataset):
                continue

            # go over the units and see what files have been assigned
            assigned_data = LHCbDataset()
            for unit in self.units:

                if unit.input_datset_index != id:
                    continue

                assigned_data.files += unit.inputdata.files

            # any new files
            new_data = LHCbDataset(
                files=self.inputdata[id].difference(assigned_data).files)

            if len(new_data.files) == 0:
                continue

            # create units for these files
            if self.files_per_unit > 0:

                # loop over the file array and create units for each set
                num = 0
                while num < len(new_data.files):
                    unit = LHCbUnit()
                    unit.name = "Unit %d" % len(self.units)
                    unit.input_datset_index = id
                    self.addUnitToTRF(unit)
                    unit.inputdata = copy.deepcopy(self.inputdata[id])
                    unit.inputdata.files = []
                    unit.inputdata.files += new_data.files[
                        num:num + self.files_per_unit]
                    num += self.files_per_unit
            else:
                # new unit required for this dataset
                unit = LHCbUnit()
                unit.name = "Unit %d" % len(self.units)
                # record which input dataset these files came from
                unit.input_datset_index = id
                self.addUnitToTRF(unit)
                unit.inputdata = copy.deepcopy(self.inputdata[id])
                unit.inputdata.files = []
                unit.inputdata.files += new_data.files

    elif self.mc_num_units > 0:
        if len(self.units) == 0:
            # check for an appropriate splitter
            from GPI import GaussSplitter
            if not self.splitter or not isType(self.splitter, GaussSplitter):
                logger.warning(
                    "No GaussSplitter specified - first event info ignored")

            # create units for MC generation
            for i in range(0, self.mc_num_units):
                unit = LHCbUnit()
                unit.name = "Unit %d" % len(self.units)
                self.addUnitToTRF(unit)
    else:
        logger.error(
            "Please specify either inputdata or MC info for unit generation")
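# Configuration sketch for the two unit-creation modes above (all attribute
# values and the LFN are illustrative):
#
#   # data-driven: one unit per chunk of files_per_unit new files
#   trf = LHCbTransform(files_per_unit=50)
#   trf.inputdata = [LHCbDataset(files=[DiracFile(lfn='/lhcb/user/some.dst')])]
#
#   # MC-driven: mc_num_units generation units, split by a GaussSplitter
#   trf = LHCbTransform(mc_num_units=10, splitter=GaussSplitter())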