def pubFailed(self, task, files, failure_reasons=None, force_failure=False):
    """
    Mark a list of files as FAILED for publication in the Oracle backend.

    :param task: task name (currently unused, kept for interface compatibility)
    :param files: iterable of tuples whose first element is the source LFN
    :param failure_reasons: list(str) of failure reasons to store with the docs
    :param force_failure: unused flag, kept for interface compatibility
    :return: None; errors are logged, not raised
    """
    # Fix: the original used `failure_reasons=list()`, a mutable default
    # evaluated once at def time and shared across calls. Use None-sentinel.
    if failure_reasons is None:
        failure_reasons = []
    id_list = []
    for entry in files:
        source_lfn = entry[0]
        docId = getHashLfn(source_lfn)
        id_list.append(docId)
        self.logger.debug("Marking failed %s" % docId)

    fileDoc = dict()
    # NOTE(review): hard-coded worker name differs from self.config.asoworker
    # used by every other method in this class -- confirm it is intentional.
    fileDoc['asoworker'] = 'asodciangot1'
    fileDoc['subresource'] = 'updatePublication'
    fileDoc['list_of_ids'] = id_list
    fileDoc['list_of_publication_state'] = ['FAILED' for x in id_list]
    # TODO: implement retry, publish_retry_count missing from input?
    fileDoc['list_of_retry_value'] = [1 for x in id_list]
    fileDoc['list_of_failure_reason'] = failure_reasons

    try:
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(fileDoc))
        self.logger.debug("updated failed: %s " % id_list)
    except Exception:
        msg = "Error updating failed documents"
        self.logger.exception(msg)
def pubDone(self, workflow, files):
    """
    Flag every file of a workflow as successfully published ('DONE').

    :param workflow: workflow name, used only as a prefix in log messages
    :param files: iterable of source LFNs
    :return: None; errors are logged, not raised
    """
    prefix = "%s: " % workflow
    doc_ids = list()
    for source_lfn in files:
        doc_id = getHashLfn(source_lfn)
        doc_ids.append(doc_id)
        note = "Marking file %s as published." % source_lfn
        note += " Document id: %s (source LFN: %s)." % (doc_id, source_lfn)
        self.logger.info(prefix + note)

    # Single bulk update for all documents of this workflow.
    payload = {
        'asoworker': self.config.asoworker,
        'subresource': 'updatePublication',
        'list_of_ids': doc_ids,
        'list_of_publication_state': ['DONE' for _ in doc_ids],
    }
    try:
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(payload))
        self.logger.debug("updated done: %s " % doc_ids)
    except Exception as ex:
        self.logger.error("Error during status update for published docs: %s" % ex)
def transferred(self, files):
    """
    Mark the list of files as transferred (transfer state -> DONE).

    Only LFNs with 'temp' at offset 7 (i.e. /store/temp/... style paths,
    same convention as submitted()) are considered.

    :param files: iterable of tuples whose first element is the source LFN
    :return: 0 on success, 1 on any error while updating the documents
    """
    # Fix: dropped `updated_lfn`, which was appended to but never read.
    good_ids = []
    try:
        for entry in files:
            lfn = entry[0]
            if lfn.find('temp') == 7:
                docId = getHashLfn(lfn)
                good_ids.append(docId)
                self.logger.debug("Marking done %s" % lfn)
                self.logger.debug("Marking done %s" % docId)
        data = dict()
        data['asoworker'] = self.config.asoworker
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = good_ids
        data['list_of_transfer_state'] = ["DONE" for x in good_ids]
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(data))
        self.logger.debug("Marked good %s" % good_ids)
    except Exception:
        self.logger.exception("Error updating documents")
        return 1
    return 0
def failed(self, files, failures_reasons=None, max_retry=3, force_fail=False, submission_error=False):
    """
    Mark transfers as FAILED, or put them back on RETRY.

    :param files: tuple (source_lfn, dest_lfn)
    :param failures_reasons: list(str) with reasons of failure, aligned with files
    :param max_retry: number of retry before giving up
    :param force_fail: flag for triggering failure without retry
    :param submission_error: error during fts submission
    :return: 0 on success, 1 on any error
    """
    # Fix: the original used `failures_reasons=[]`, a mutable default shared
    # across calls. Use the None-sentinel idiom instead.
    if failures_reasons is None:
        failures_reasons = []
    updated_lfn = []
    for entry in files:
        lfn = entry[0]
        # Load the document to read its current retry_count.
        docId = getHashLfn(lfn)
        self.logger.debug("Marking failed %s" % docId)
        try:
            docbyId = self.oracleDB.get(
                self.config.oracleUserFileTrans.replace('filetransfer', 'fileusertransfers'),
                data=encodeRequest({'subresource': 'getById', 'id': docId}))
            document = oracleOutputMapping(docbyId, None)[0]
            self.logger.debug("Document: %s" % document)
        except Exception as ex:
            self.logger.error("Error updating failed docs: %s" % ex)
            return 1

        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'updateTransfers'
        fileDoc['list_of_ids'] = docId
        if failures_reasons:
            try:
                # Reasons are positionally aligned with the files list.
                fileDoc['list_of_failure_reason'] = failures_reasons[files.index(entry)]
            except (ValueError, IndexError):
                # Fix: was a bare `except:`; narrowed to the two errors
                # index lookup can actually raise. Also fixed the typo
                # "unexcpected" in the stored reason.
                fileDoc['list_of_failure_reason'] = "unexpected error, missing reasons"
                self.logger.exception("missing reasons")

        if force_fail or document['transfer_retry_count'] + 1 > max_retry:
            fileDoc['list_of_transfer_state'] = 'FAILED'
            fileDoc['list_of_retry_value'] = 1
        else:
            fileDoc['list_of_transfer_state'] = 'RETRY'
        if submission_error:
            fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
            fileDoc['list_of_retry_value'] = 1
        else:
            # TODO: retry value is always 1; wire in a real counter/backoff.
            fileDoc['list_of_retry_value'] = 1

        self.logger.debug("update: %s" % fileDoc)
        try:
            updated_lfn.append(docId)
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
        except Exception:
            self.logger.exception('ERROR updating failed documents')
            return 1
    self.logger.debug("failed file updated")
    return 0
def submitted(self, files):
    """
    Mark the list of files as submitted once the FTS submission succeeded.

    ACQUIRED -> SUBMITTED

    Return the lfns updated successfully and report data for dashboard.

    :param files: tuple (source_lfn, dest_lfn)
    :return: (lfn_in_transfer, dash_rep) where dash_rep is currently empty
    """
    lfn_in_transfer = []
    dash_rep = ()
    id_list = list()
    docId = ''
    for entry in files:
        lfn = entry[0]
        # Only /store/temp/... style source LFNs ('temp' at offset 7).
        if lfn.find('temp') == 7:
            self.logger.debug("Marking acquired %s" % lfn)
            docId = getHashLfn(lfn)
            self.logger.debug("Marking acquired %s" % docId)
            # Fix: the original appended `lfn` to lfn_in_transfer twice
            # (once inside a try/except that could never fail -- plain
            # list.append -- and once right after it), so every matching
            # LFN was reported in duplicate. Append exactly once.
            id_list.append(docId)
            lfn_in_transfer.append(lfn)
    # TODO: add dashboard stuff
    # dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])
    try:
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'updateTransfers'
        fileDoc['list_of_ids'] = id_list
        fileDoc['list_of_transfer_state'] = ["SUBMITTED" for x in id_list]
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(fileDoc))
        self.logger.debug("Marked acquired %s" % (id_list))
    except Exception as ex:
        self.logger.error("Error during status update: %s" % ex)
    return lfn_in_transfer, dash_rep
def worker(self, i, inputs):
    """
    Long-running worker loop: pull work units off the queue and submit them to FTS.

    - Retrieve userDN
    - Retrieve user proxy
    - Delegate proxy to fts is needed
    - submit fts job
    - update doc states

    :param i: thread number
    :param inputs: queue of tuples (lfns, _user, source, dest, tfc_map)
    :return: 0 when self.STOP is set and the loop exits
    """
    # TODO: differentiate log messages per USER!
    logger = self.logger
    logger.info("Process %s is starting. PID %s", i, os.getpid())
    # Per-worker lock, handed to critical_failure together with the queue.
    lock = Lock()
    Update = update(logger, self.config)
    while not self.STOP:
        # Idle-poll the queue rather than blocking, so STOP is re-checked.
        if inputs.empty():
            time.sleep(10)
            continue
        try:
            lfns, _user, source, dest, tfc_map = inputs.get()
            [user, group, role] = _user
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            continue
        start = time.time()
        if not self.config.TEST:
            # Production path: resolve the user's DN and obtain a delegated proxy.
            try:
                userDN = getDNFromUserName(user, logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            except Exception as ex:
                logger.exception('Cannot retrieve user DN')
                self.critical_failure(lfns, lock, inputs)
                continue
            defaultDelegation = {'logger': logger,
                                 'credServerPath': self.config.credentialDir,
                                 'myProxySvr': 'myproxy.cern.ch',
                                 'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                 'serverDN': self.config.serverDN,
                                 'uisource': '',
                                 'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
            cache_area = self.config.cache_area
            try:
                # The myproxy account is the hostname part of the cache_area URL.
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
            except IndexError:
                logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
                self.critical_failure(lfns, lock, inputs)
                continue
            if getattr(self.config, 'serviceCert', None):
                defaultDelegation['server_cert'] = self.config.serviceCert
            if getattr(self.config, 'serviceKey', None):
                defaultDelegation['server_key'] = self.config.serviceKey
            try:
                defaultDelegation['userDN'] = userDN
                defaultDelegation['group'] = group
                defaultDelegation['role'] = role
                logger.debug('delegation: %s' % defaultDelegation)
                valid_proxy, user_proxy = getProxy(defaultDelegation, logger)
                if not valid_proxy:
                    # No proxy: flag the docs for retry (Update.failed with
                    # submission_error) instead of treating it as critical.
                    logger.error('Failed to retrieve user proxy... putting docs on retry')
                    logger.error('docs on retry: %s' % Update.failed(lfns, submission_error=True))
                    continue
            except Exception:
                logger.exception('Error retrieving proxy')
                self.critical_failure(lfns, lock, inputs)
                continue
        else:
            # Test mode: skip the whole delegation machinery.
            user_proxy = self.config.opsProxy
            self.logger.debug("Using opsProxy for testmode")
        context = dict()
        try:
            if self.config.TEST:
                logger.debug("Running in test mode, submitting fake jobs")
            else:
                # Delegate the user proxy to the FTS server for 48 hours.
                context = fts3.Context(self.config.serverFTS, user_proxy, user_proxy, verify=True)
                logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))
        except Exception:
            logger.exception("Error submitting to FTS")
            self.critical_failure(lfns, lock, inputs)
            continue
        failed_lfn = list()
        try:
            if self.config.TEST:
                # Fake submission: job id derived from the first LFN's hash.
                submitted_lfn = lfns
                jobid = getHashLfn(lfns[0][0])
                self.logger.debug('Fake job id: ' + jobid)
            else:
                # Submission() returns -1 as jobid on unrecoverable failure.
                failed_lfn, submitted_lfn, jobid = Submission(lfns, source, dest, i, self.logger, fts3, context, tfc_map)
                if jobid == -1:
                    self.critical_failure(lfns, lock, inputs)
                    continue
                logger.info('Submitted %s files' % len(submitted_lfn))
        except Exception:
            logger.exception("Unexpected error during FTS job submission!")
            self.critical_failure(lfns, lock, inputs)
            continue
        # TODO: add file FTS id and job id columns for kill command
        # NOTE(review): the whole work unit (lfns), not just submitted_lfn,
        # is marked SUBMITTED here -- confirm this is intended.
        try:
            Update.submitted(lfns)
        except Exception:
            logger.exception("Error updating document status")
            self.critical_failure(lfns, lock, inputs)
            continue
        try:
            Update.failed(failed_lfn)
        except Exception:
            logger.exception("Error updating document status, job submission will be retried later...")
            self.critical_failure(lfns, lock, inputs)
            continue
        try:
            # Drop a per-job JSON snapshot for the monitor component.
            createLogdir('Monitor/' + user)
            with open('Monitor/' + user + '/' + str(jobid) + '.txt', 'w') as outfile:
                json.dump(lfns, outfile)
            logger.info('Monitor files created')
        except Exception:
            logger.exception("Error creating file for monitor")
            self.critical_failure(lfns, lock, inputs)
            continue
        end = time.time()
        self.logger.info('Input processed in %s', str(end - start))
        time.sleep(0.5)
    logger.debug("Worker %s exiting.", i)
    return 0