jediTaskID = int(options.tid) if True: if options.resurrectDS: sd, so = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', { ':id': jediTaskID, ':t1': 'output', ':t2': 'log' }) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print('resurrect {0}'.format(datasetName)) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None) except: pass print(Client.reloadInput(jediTaskID)[-1]) print('done for jediTaskID={0}'.format(jediTaskID)) else: print('failed')
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None): # options parser = argparse.ArgumentParser() if taskBuffer: parser.add_argument('--ds',action='store',dest='ds',default=None, help='dataset name') else: parser.add_argument('--ds',action='store',dest='ds',default=None,required=True, help='dataset name') parser.add_argument('--files',action='store',dest='files',default=None, help='comma-separated list of lost file names. The list is dedeuced if this option is omitted') parser.add_argument('--noChildRetry',action='store_const',const=True,dest='noChildRetry',default=False, help='not retry child tasks') parser.add_argument('--resurrectDS',action='store_const',const=True,dest='resurrectDS',default=False, help='resurrect output and log datasets if they were already deleted') parser.add_argument('--dryRun',action='store_const',const=True,dest='dryRun',default=False, help='dry run') parser.add_argument('--force', action='store_const', const=True, dest='force', default=False, help='force retry even if no lost files') parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent', default=False, help='reproduce the input files from which the lost files were produced. ' 'Typically useful to recover merged files when unmerged files were already deleted') # parse options if taskBuffer: if args_list: options = parser.parse_args(args_list) else: options, unknown = parser.parse_known_args() else: if args_list: options = parser.parse_args(args_list) else: options = parser.parse_args() # executed via command-line givenTaskID = None dn = None if taskBuffer is None: # instantiate TB from pandaserver.taskbuffer.TaskBuffer import taskBuffer taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) else: # set options from dict if exec_options is None: exec_options = {} keys = set(vars(options).keys()) for k in exec_options: if k in keys: setattr(options, k, exec_options[k]) if 'jediTaskID' in exec_options: givenTaskID = exec_options['jediTaskID'] if 'userName' in exec_options: dn = exec_options['userName'] ds_files = {} if options.files is not None: files = options.files.split(',') ds_files[options.ds] = files else: # look for lost files if not givenTaskID: # get files from rucio st, files_rucio = get_files_from_rucio(options.ds, log_stream) if st is not True: return st, files_rucio # get files from panda dsName = options.ds.split(':')[-1] fd, fo = taskBuffer.querySQLS( 'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c ' 'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND ' 'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ', {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName}) for tmpLFN, in fo: if tmpLFN not in files_rucio: ds_files.setdefault(options.ds, []) ds_files[options.ds].append(tmpLFN) # get taskID td, to = taskBuffer.querySQLS( 'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets ' 'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ', {':t1': 'output', ':t2': 'log', ':datasetName': dsName}) jediTaskID, = to[0] else: # get dataset names dd, do = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets ' 'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ', {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID}) # get files from rucio files_rucio = set() for tmpDS, in do: st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream) if st is None: return st, tmp_files_rucio # ignore unknown dataset if st: files_rucio = files_rucio.union(tmp_files_rucio) # get files from rucio fd, fo = taskBuffer.querySQLS( 'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c ' 'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND ' 'd.type IN (:t1,:t2) AND c.status=:s ', {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID}) for tmpDS, tmpLFN in fo: if tmpLFN not in files_rucio: ds_files.setdefault(tmpDS, []) ds_files[tmpDS].append(tmpLFN) for tmpDS in ds_files: files = ds_files[tmpDS] msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files)) if log_stream: log_stream.info(msgStr) else: print(msgStr) # no lost files if not ds_files and not options.force: return True, "No lost files. Use --force to ignore this check" # reset file status s = False for tmpDS in ds_files: files = ds_files[tmpDS] if dn: ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS, files, options.reproduceParent, options.dryRun) else: ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS, files, options.reproduceParent, options.dryRun) msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID) if log_stream: log_stream.info(msgStr) else: print(msgStr) s |= ts # recover parent if options.reproduceParent: # reproduce input for lostDS in lostInputFiles: com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS'] if options.dryRun: com_args.append('--dryRun') com_args += ['--files', ','.join(lostInputFiles[lostDS])] main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args) # go ahead if options.dryRun: return True, 'Done in the dry-run mode with {}'.format(s) if s or options.force: if options.resurrectDS: sd,so = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', {':id': jediTaskID, ':t1': 'output', ':t2': 'log'}) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print('resurrect {0}'.format(datasetName)) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None) except Exception: pass if not options.reproduceParent: msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1] else: msgStr = Client.reloadInput(jediTaskID)[-1][-1] if log_stream: log_stream.info("Retried task with {}".format(msgStr)) log_stream.info("Done") else: print("Retried task: done with {}".format(msgStr)) return True, msgStr else: msgStr = 'failed' if log_stream: log_stream.error(msgStr) else: print(msgStr) return False, msgStr