def mark_good(workflow, files, oracleDB, logger):
    """
    Mark the list of files as transferred
    """
    wfnamemsg = "%s: " % workflow
    for lfn in files:
        data = {}
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        msg = "Marking file %s as published." % lfn
        msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
        logger.info(wfnamemsg + msg)
        data['asoworker'] = config.General.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = docId
        data['list_of_publication_state'] = 'DONE'
        data['list_of_retry_value'] = 1
        data['list_of_failure_reason'] = ''
        try:
            result = oracleDB.post(config.General.oracleFileTrans,
                                   data=encodeRequest(data))
            logger.debug("updated: %s %s " % (docId, result))
        except Exception as ex:
            logger.error("Error during status update: %s" % ex)
def mark_good(workflow, files, oracleDB, logger):
    """
    Mark the list of files as transferred
    """
    wfnamemsg = "%s: " % workflow
    for lfn in files:
        data = {}
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        msg = "Marking file %s as published." % lfn
        msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
        logger.info(wfnamemsg + msg)
        data['asoworker'] = 'asodciangot1'  # NOTE: hardcoded asoworker; the variant above reads it from config.General.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = docId
        data['list_of_publication_state'] = 'DONE'
        data['list_of_retry_value'] = 1
        data['list_of_failure_reason'] = ''
        try:
            result = oracleDB.post(config.General.oracleFileTrans,
                                   data=encodeRequest(data))
            logger.debug("updated: %s %s " % (docId, result))
        except Exception as ex:
            logger.error("Error during status update: %s" % ex)
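# --- Illustration (not part of the original module) --------------------------
# The dictionary below mirrors the fields that mark_good() above posts to the
# 'updatePublication' subresource. encodeRequest comes from CRAB's
# ServerUtilities; plain urlencode is used here only as a rough stand-in so the
# sketch runs on its own. The docId and asoworker values are made up.
from urllib.parse import urlencode

exampleDocId = 'a1b2c3d4e5f67890'          # hashed source LFN (made-up value)
exampleUpdate = {
    'asoworker': 'asoworker_example',      # placeholder, not a real config value
    'subresource': 'updatePublication',
    'list_of_ids': exampleDocId,
    'list_of_publication_state': 'DONE',
    'list_of_retry_value': 1,
    'list_of_failure_reason': '',
}
print(urlencode(exampleUpdate))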
def mark_failed(files, crabServer, logger, failure_reason=""): """ Something failed for these files so increment the retry count """ msg = "Marking %s file(s) as failed" % len(files) logger.info(msg) if dryRun: logger.debug("DryRun: skip marking failes files") return nMarked = 0 for lfn in files: source_lfn = lfn docId = getHashLfn(source_lfn) data = dict() data['asoworker'] = config.General.asoworker data['subresource'] = 'updatePublication' data['list_of_ids'] = [docId] data['list_of_publication_state'] = ['FAILED'] data['list_of_retry_value'] = [1] data['list_of_failure_reason'] = [failure_reason] logger.debug("data: %s ", data) try: result = crabServer.post(api='filetransfers', data=encodeRequest(data)) logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result) except Exception as ex: logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn) logger.error("Error reason: %s", ex) nMarked += 1 if nMarked % 10 == 0: logger.info('marked %d files', nMarked)
def mark_good(files, crabServer, logger):
    """
    Mark the list of files as transferred
    """
    msg = "Marking %s file(s) as published." % len(files)
    logger.info(msg)
    if dryRun:
        logger.info("DryRun: skip marking good files")
        return
    nMarked = 0
    for lfn in files:
        data = {}
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        data['asoworker'] = config.General.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = [docId]
        data['list_of_publication_state'] = ['DONE']
        data['list_of_retry_value'] = [1]
        data['list_of_failure_reason'] = ['']
        try:
            result = crabServer.post(api='filetransfers', data=encodeRequest(data))
            logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
        except Exception as ex:
            logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
            logger.error("Error reason: %s", ex)
        nMarked += 1
        if nMarked % 10 == 0:
            logger.info('marked %d files', nMarked)
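# --- Illustration (not part of the original module) --------------------------
# Minimal sketch of how the crabServer flavours of mark_good()/mark_failed()
# above could be driven. The stub client, the module-level dryRun flag and the
# LFN lists are assumptions for illustration only; the real Publisher wires in
# a configured CRAB REST client. With dryRun = True both helpers just log the
# intent and return without posting anything.
import logging

logging.basicConfig(level=logging.INFO)
exampleLogger = logging.getLogger("PublisherSketch")
dryRun = True  # module-level flag consulted by mark_good()/mark_failed()

class _StubCrabServer:
    """Stand-in REST client: records the call instead of contacting crabserver."""
    def post(self, api, data=None):
        exampleLogger.info("POST api=%s data=%s", api, data)
        return {"result": "ok"}

publishedLfns = ["/store/user/jdoe/out/file_1.root"]   # made-up LFNs
failedLfns = ["/store/user/jdoe/out/file_2.root"]

mark_good(publishedLfns, _StubCrabServer(), exampleLogger)
mark_failed(failedLfns, _StubCrabServer(), exampleLogger, failure_reason="DBS publication error")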
def mark_failed(files, oracleDB, logger, failure_reason=""): """ Something failed for these files so increment the retry count """ h = 0 for lfn in files: h += 1 logger.debug("Marking failed %s" % h) source_lfn = lfn docId = getHashLfn(source_lfn) logger.debug("Marking failed %s" % docId) try: docbyId = oracleDB.get(config.General.oracleUserTrans, data=encodeRequest({ 'subresource': 'getById', 'id': docId })) except Exception: logger.exception("Error updating failed docs.") continue document = oracleOutputMapping(docbyId, None)[0] logger.debug("Document: %s" % document) try: fileDoc = dict() fileDoc['asoworker'] = 'asodciangot1' fileDoc['subresource'] = 'updatePublication' fileDoc['list_of_ids'] = docId fileDoc['list_of_publication_state'] = 'FAILED' #if force_failure or document['publish_retry_count'] > self.max_retry: # fileDoc['list_of_publication_state'] = 'FAILED' #else: # fileDoc['list_of_publication_state'] = 'RETRY' # TODO: implement retry fileDoc['list_of_retry_value'] = 1 fileDoc['list_of_failure_reason'] = failure_reason logger.debug("fileDoc: %s " % fileDoc) _ = oracleDB.post(config.General.oracleFileTrans, data=encodeRequest(fileDoc)) logger.debug("updated: %s " % docId) except Exception as ex: msg = "Error updating document: %s" % fileDoc msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) continue
def mark_failed(files, oracleDB, logger, failure_reason=""): """ Something failed for these files so increment the retry count """ h = 0 for lfn in files: h += 1 logger.debug("Marking failed %s" % h) source_lfn = lfn docId = getHashLfn(source_lfn) logger.debug("Marking failed %s" % docId) try: docbyId = oracleDB.get(config.General.oracleUserTrans, data=encodeRequest({'subresource': 'getById', 'id': docId})) except Exception: logger.exception("Error updating failed docs.") continue document = oracleOutputMapping(docbyId, None)[0] logger.debug("Document: %s" % document) try: fileDoc = dict() fileDoc['asoworker'] = config.General.asoworker fileDoc['subresource'] = 'updatePublication' fileDoc['list_of_ids'] = docId fileDoc['list_of_publication_state'] = 'FAILED' #if force_failure or document['publish_retry_count'] > self.max_retry: # fileDoc['list_of_publication_state'] = 'FAILED' #else: # fileDoc['list_of_publication_state'] = 'RETRY' # TODO: implement retry fileDoc['list_of_retry_value'] = 1 fileDoc['list_of_failure_reason'] = failure_reason logger.debug("fileDoc: %s " % fileDoc) _ = oracleDB.post(config.General.oracleFileTrans, data=encodeRequest(fileDoc)) logger.debug("updated: %s " % docId) except Exception as ex: msg = "Error updating document: %s" % fileDoc msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) continue
def testFileTransferPUT(self):
    """
    _testFileTransferPUT_

    Just test simple testFileTransferPUT with fake data
    """
    # We just send fake data which is not monitored by dashboard.
    # Only the first iteration decides whether publication is ON or NOT.
    for user in self.users:
        timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime())
        for i in range(self.totalFiles):
            now = int(time.time())
            # Generate a taskname
            workflowName = ""
            taskname = ""
            if user not in self.tasks:
                workflowName = "".join([random.choice(string.ascii_lowercase) for _ in range(20)]) + "_" + str(now)
                publicationState = random.choice(['NEW', 'NOT_REQUIRED'])
            else:
                workflowName = self.tasks[user]['workflowName']
                publicationState = self.tasks[user]['publication']
            transferState = random.choice(['NEW', 'DONE'])
            taskname = generateTaskName(user, workflowName, timestamp)
            finalLfn = self.lfnBase % (user, workflowName, i, random.randint(1, 9999))
            idHash = getHashLfn(finalLfn)
            self.fileDoc['id'] = idHash
            self.fileDoc['job_id'] = i
            self.fileDoc['username'] = user
            self.fileDoc['taskname'] = taskname
            self.fileDoc['start_time'] = int(time.time())
            self.fileDoc['source_lfn'] = finalLfn
            self.fileDoc['destination_lfn'] = finalLfn
            self.fileDoc['transfer_state'] = transferState
            self.fileDoc['publication_state'] = publicationState
            print(self.fileDoc)
            self.server.put('/crabserver/dev/fileusertransfers', data=encodeRequest(self.fileDoc))
            # If we put the same doc twice, it should raise an error.
            # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc))
            # These tasks are kept for the next calls
            if user not in self.tasks:
                self.tasks[user] = {'workflowName': workflowName, 'taskname': taskname, 'listOfIds': [],
                                    'publication': publicationState, 'toTransfer': 0, 'toPublish': 0,
                                    'total': self.totalFiles}
            if self.tasks[user]['publication'] == 'NEW':
                self.tasks[user]['toPublish'] += 1
            if transferState == 'NEW':
                self.tasks[user]['toTransfer'] += 1
            self.tasks[user]['listOfIds'].append(idHash)
    # This should raise an error
    for username in self.tasks:
        taskname = self.tasks[username]['taskname']
        for query in ['getTransferStatus', 'getPublicationStatus']:
            result = self.server.get('/crabserver/dev/fileusertransfers',
                                     data=encodeRequest({'subresource': query,
                                                         'username': username,
                                                         'taskname': taskname}))
            print(result)
            print(result[0]['result'])
            taskInfoDict = oracleOutputMapping(result, 'id')
            print(taskInfoDict)
            for key, docDict in taskInfoDict.items():
                result = self.server.get('/crabserver/dev/fileusertransfers',
                                         data=encodeRequest({'subresource': 'getById', 'id': key}))
    randomUsers = random.sample(set(self.users), 3)  # Take half of the users and kill their transfers for a specific task
    for username in randomUsers:
        taskname = self.tasks[username]['taskname']
        result = self.server.post('/crabserver/dev/fileusertransfers',
                                  data=encodeRequest({'subresource': 'killTransfers',
                                                      'username': username,
                                                      'taskname': taskname}))
        print(result)
    # oneUser is left for killing a list of IDs
    # leftUsers will be killing transfers one by one for a specific id.
    leftUsers = list(set(self.users) - set(randomUsers))
    oneUser = random.sample(set(leftUsers), 1)
    leftUsers = list(set(leftUsers) - set(oneUser))
    for username in leftUsers:
        # First get all remaining ids for this user
        result = self.server.get('/crabserver/dev/fileusertransfers',
                                 data=encodeRequest({'subresource': 'getTransferStatus',
                                                     'username': username,
                                                     'taskname': self.tasks[username]['taskname']}))
        resultOut = oracleOutputMapping(result, None)
        print("**" * 50)
        for outDict in resultOut:
            print(outDict)
            result = self.server.post('/crabserver/dev/fileusertransfers',
                                      data=encodeRequest({'subresource': 'killTransfersById',
                                                          'username': username,
                                                          'listOfIds': outDict['id']}))
            print(result)
        print(resultOut)
        print(result)
    for username in oneUser:
        result = self.server.post('/crabserver/dev/fileusertransfers',
                                  data=encodeRequest({'subresource': 'killTransfersById',
                                                      'username': username,
                                                      'listOfIds': self.tasks[username]['listOfIds']},
                                                     ['listOfIds']))
        # As it asks to kill everything that is in NEW, double check against what we
        # submitted before and whether the kill output is correct.
        print(result)
        print(self.tasks[username])
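# --- Illustration (not part of the original test) -----------------------------
# The killTransfersById call above passes ['listOfIds'] as a second argument to
# encodeRequest so that list-valued parameters are serialised as repeated keys.
# This is a rough standard-library approximation of that behaviour; the values
# are made up and the real encoding is done by ServerUtilities.encodeRequest.
from urllib.parse import urlencode

examplePayload = {
    'subresource': 'killTransfersById',
    'username': 'jdoe',
    'listOfIds': ['id1hash', 'id2hash', 'id3hash'],   # made-up document ids
}
# doseq=True repeats the key once per list element, e.g. listOfIds=id1hash&listOfIds=id2hash...
print(urlencode(examplePayload, doseq=True))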
def startSlave(self, task):
    """
    Start a slave process to deal with publication for a single task.
    :param task: one tuple describing a task as returned by active_tasks()
    :return: 0  It will always terminate normally; if publication fails it will mark it in the DB
    """
    # TODO: lock task!
    # - process logger
    logger = setSlaveLogger(str(task[0][3]))
    logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

    self.force_publication = False
    workflow = str(task[0][3])
    if len(task[1]) > self.max_files_per_block:
        self.force_publication = True
        msg = "All datasets have more than %s ready files." % (self.max_files_per_block)
        msg += " No need to retrieve task status nor last publication time."
        logger.info(msg)
    else:
        msg = "At least one dataset has less than %s ready files." % (self.max_files_per_block)
        logger.info(msg)
        # Retrieve the workflow status. If the status can not be retrieved, continue
        # with the next workflow.
        workflow_status = ''
        msg = "Retrieving status"
        logger.info(msg)
        data = encodeRequest({'workflow': workflow})
        try:
            res = self.crabServer.get(api='workflow', data=data)
        except Exception as ex:
            logger.warn('Error retrieving status from crabserver for %s:\n%s', workflow, str(ex))
            return 0
        try:
            workflow_status = res[0]['result'][0]['status']
            msg = "Task status is %s." % workflow_status
            logger.info(msg)
        except ValueError:
            msg = "Workflow removed from WM."
            logger.error(msg)
            workflow_status = 'REMOVED'
        except Exception as ex:
            msg = "Error loading task status!"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(msg)
        # If the workflow status is terminal, go ahead and publish all the ready files
        # in the workflow.
        if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
            self.force_publication = True
            if workflow_status in ['KILLED', 'REMOVED']:
                self.force_failure = True
            msg = "Considering task status as terminal. Will force publication."
            logger.info(msg)
        # Otherwise...
        else:
            ## TODO put this else in a function like def checkForPublication()
            msg = "Task status is not considered terminal."
            logger.info(msg)
            msg = "Getting last publication time."
            logger.info(msg)
            # Get the last time a publication was done for this workflow (this should be
            # more or less independent of the output dataset in case there is more than one).
            last_publication_time = None
            data = encodeRequest({'workflow': workflow, 'subresource': 'search'})
            try:
                result = self.crabServer.get(api='task', data=data)
                logger.debug("task: %s ", str(result[0]))
                last_publication_time = getColumn(result[0], 'tm_last_publication')
            except Exception as ex:
                logger.error("Error during task doc retrieving:\n%s", ex)
            if last_publication_time:
                date = last_publication_time  # datetime in Oracle format
                timetuple = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f").timetuple()  # convert to time tuple
                last_publication_time = time.mktime(timetuple)  # convert to seconds since Epoch (float)
                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
            # If this is the first time a publication would be done for this workflow, go
            # ahead and publish.
            if not last_publication_time:
                self.force_publication = True
                msg = "There was no previous publication. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:
                last = last_publication_time
                msg = "Last published block time: %s" % last
                logger.debug(msg)
                # If the last publication was a long time ago (> our block publication timeout),
                # go ahead and publish.
                now = int(time.time()) - time.timezone
                time_since_last_publication = now - last
                hours = int(time_since_last_publication / 60 / 60)
                minutes = int((time_since_last_publication - hours * 60 * 60) / 60)
                timeout_hours = int(self.block_publication_timeout / 60 / 60)
                timeout_minutes = int((self.block_publication_timeout - timeout_hours * 60 * 60) / 60)
                msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                if time_since_last_publication > self.block_publication_timeout:
                    self.force_publication = True
                    msg += " (more than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                    msg += " Will force publication."
                else:
                    msg += " (less than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                    msg += " Not enough to force publication."
                logger.info(msg)

    # logger.info(task[1])
    try:
        if self.force_publication:
            # - get info
            active_ = [{'key': [x['username'],
                                x['user_group'],
                                x['user_role'],
                                x['taskname']],
                        'value': [x['destination'],
                                  x['source_lfn'],
                                  x['destination_lfn'],
                                  x['input_dataset'],
                                  x['dbs_url'],
                                  x['last_update']]}
                       for x in task[1] if x['transfer_state'] == 3 and
                       x['publication_state'] not in [2, 3, 5]]

            lfn_ready = []
            wf_jobs_endtime = []
            pnn, input_dataset, input_dbs_url = "", "", ""
            for active_file in active_:
                job_end_time = active_file['value'][5]
                if job_end_time:
                    wf_jobs_endtime.append(int(job_end_time) - time.timezone)
                source_lfn = active_file['value'][1]
                dest_lfn = active_file['value'][2]
                self.lfn_map[dest_lfn] = source_lfn
                if not pnn or not input_dataset or not input_dbs_url:
                    pnn = str(active_file['value'][0])
                    input_dataset = str(active_file['value'][3])
                    input_dbs_url = str(active_file['value'][4])
                lfn_ready.append(dest_lfn)

            username = task[0][0]

            # Get metadata
            toPublish = []
            toFail = []
            publDescFiles_list = self.getPublDescFiles(workflow, lfn_ready, logger)
            for file_ in active_:
                metadataFound = False
                for doc in publDescFiles_list:
                    # logger.info(type(doc))
                    # logger.info(doc)
                    if doc["lfn"] == file_["value"][2]:
                        doc["User"] = username
                        doc["Group"] = file_["key"][1]
                        doc["Role"] = file_["key"][2]
                        doc["UserDN"] = self.myDN
                        doc["Destination"] = file_["value"][0]
                        doc["SourceLFN"] = file_["value"][1]
                        toPublish.append(doc)
                        metadataFound = True
                        break
                # If we failed to find metadata, mark publication as failed to avoid
                # looking at the same files over and over.
                if not metadataFound:
                    toFail.append(file_["value"][1])
            with open(self.taskFilesDir + workflow + '.json', 'w') as outfile:
                json.dump(toPublish, outfile)
            logger.debug('Unitarity check: active_:%d toPublish:%d toFail:%d',
                         len(active_), len(toPublish), len(toFail))
            if len(toPublish) + len(toFail) != len(active_):
                logger.error("SOMETHING WRONG IN toPublish vs toFail !!")
            if toFail:
                logger.info('Did not find useful metadata for %d files. Mark as failed', len(toFail))
                from ServerUtilities import getHashLfn
                nMarked = 0
                for lfn in toFail:
                    source_lfn = lfn
                    docId = getHashLfn(source_lfn)
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updatePublication'
                    data['list_of_ids'] = docId
                    data['list_of_publication_state'] = 'FAILED'
                    data['list_of_retry_value'] = 1
                    data['list_of_failure_reason'] = 'File type not EDM or metadata not found'
                    try:
                        result = self.crabServer.post(api='filetransfers', data=encodeRequest(data))
                        #logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
                    except Exception as ex:
                        logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
                        logger.error("Error reason: %s", ex)
                    nMarked += 1
                    #if nMarked % 10 == 0:
                logger.info('marked %d files as Failed', nMarked)

        # find the location in the current environment of the script we want to run
        import Publisher.TaskPublish as tp
        taskPublishScript = tp.__file__
        cmd = "python %s " % taskPublishScript
        cmd += " --configFile=%s" % self.configurationFile
        cmd += " --taskname=%s" % workflow
        if self.TPconfig.dryRun:
            cmd += " --dry"
        logger.info("Now execute: %s", cmd)
        stdout, stderr, exitcode = executeCommand(cmd)
        if exitcode != 0:
            errorMsg = 'Failed to execute command: %s.\n StdErr: %s.' % (cmd, stderr)
            raise Exception(errorMsg)
        else:
            logger.info('TaskPublishScript done : %s', stdout)
        jsonSummary = stdout.split()[-1]
        with open(jsonSummary, 'r') as fd:
            summary = json.load(fd)
        result = summary['result']
        reason = summary['reason']
        taskname = summary['taskname']
        if result == 'OK':
            if reason == 'NOTHING TO DO':
                logger.info('Taskname %s is OK. Nothing to do', taskname)
            else:
                msg = 'Taskname %s is OK. Published %d files in %d blocks.' % \
                      (taskname, summary['publishedFiles'], summary['publishedBlocks'])
                if summary['nextIterFiles']:
                    msg += ' %d files left for next iteration.' % summary['nextIterFiles']
                logger.info(msg)
        if result == 'FAIL':
            logger.error('Taskname %s : TaskPublish failed with: %s', taskname, reason)
            if reason == 'DBS Publication Failure':
                logger.error('Taskname %s : %d blocks failed for a total of %d files',
                             taskname, summary['failedBlocks'], summary['failedFiles'])
                logger.error('Taskname %s : Failed block(s) details have been saved in %s',
                             taskname, summary['failedBlockDumps'])
    except Exception as ex:
        logger.exception("Exception when calling TaskPublish!\n%s", str(ex))

    return 0
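# --- Illustration (not part of the original module) --------------------------
# startSlave() above parses the JSON summary written by TaskPublish. The fields
# shown here are the ones the code relies on (result, reason, taskname,
# publishedFiles, publishedBlocks, nextIterFiles); the values are made up.
import json

exampleSummary = {
    'result': 'OK',
    'reason': '',
    'taskname': '230101_120000:jdoe_crab_exampleTask',   # made-up task name
    'publishedFiles': 40,
    'publishedBlocks': 2,
    'nextIterFiles': 5,
}
print(json.dumps(exampleSummary, indent=2))
if exampleSummary['result'] == 'OK' and exampleSummary['reason'] != 'NOTHING TO DO':
    print('Published %d files in %d blocks, %d left for the next iteration' %
          (exampleSummary['publishedFiles'], exampleSummary['publishedBlocks'],
           exampleSummary['nextIterFiles']))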