def testXMLJSON(self):
    """
    Test XML and JSON in the same scope
    """
    site = 'T1_US_FNAL_Buffer'

    # One client per response flavour, both pointed at the test data service.
    json_endpoint = {'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/json/test"}
    phedexJSON = PhEDEx(responseType='json', httpDict=json_endpoint)
    xml_endpoint = {'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/xml/test"}
    phedexXML = PhEDEx(responseType='xml', httpDict=xml_endpoint)

    # Fetch and cache the site's trivial file catalogue via the XML client.
    phedexXML.getNodeTFC(site)
    tfc_file = phedexXML.cacheFileName('tfc', inputdata={'node': site})
    tfc_map = {site: readTFC(tfc_file)}

    # The TFC must map the LFN to the expected srm PFN, and the JSON client
    # must resolve the node's storage element.
    pfn = tfc_map[site].matchLFN('srmv2', '/store/user/jblow/dir/test.root')
    self.assertTrue(pfn == 'srm://cmssrm.fnal.gov:8443/srm/managerv2?SFN=/11/store/user/jblow/dir/test.root')
    self.assertTrue(phedexJSON.getNodeSE('T1_US_FNAL_Buffer') == 'cmssrm.fnal.gov')
def get_tfc_rules(site):
    """
    Get the TFC regexp for a given site.
    """
    # Ask the data service for the node's trivial file catalogue, then
    # parse the locally cached copy into rule objects.
    data_svc = PhEDEx(responseType='xml')
    data_svc.getNodeTFC(site)
    cached_tfc = data_svc.cacheFileName('tfc', inputdata={'node': site})
    return readTFC(cached_tfc)
def get_tfc_rules(site):
    """
    Get the TFC regexp for a given site.
    """
    # NOTE(review): this duplicates an identical definition earlier in the
    # file; the later one wins at import time.
    client = PhEDEx(responseType='xml')
    client.getNodeTFC(site)
    return readTFC(client.cacheFileName('tfc', inputdata={'node': site}))
def testDataServiceXML(self):
    # asks for PEM pass phrase ...
    raise nose.SkipTest
    # --- unreachable until the skip above is lifted ---
    site = 'T2_UK_SGrid_Bristol'
    lfn = '/store/users/metson/file'
    protocol = 'srmv2'
    phedex = PhEDEx(responseType='xml')
    # Resolve the PFN two ways: locally via the cached TFC, and remotely
    # via the data service; both must agree.
    phedex.getNodeTFC(site)
    tfc = readTFC(phedex.cacheFileName('tfc', inputdata={'node': site}))
    phedex_pfn = phedex.getPFN(site, lfn, protocol)[(site, lfn)]
    pfn = tfc.matchLFN(protocol, lfn)
    msg = 'TFC pfn (%s) did not match PhEDEx pfn (%s)' % (pfn, phedex_pfn)
    self.assertEqual(phedex_pfn, pfn, msg)
def testDataServiceXML(self):
    # asks for PEM pass phrase ...
    raise nose.SkipTest
    phedex = PhEDEx(responseType='xml')
    site, lfn, protocol = 'T2_UK_SGrid_Bristol', '/store/users/metson/file', 'srmv2'
    phedex.getNodeTFC(site)
    tfc_file = phedex.cacheFileName('tfc', inputdata={'node': site})
    tfc = readTFC(tfc_file)
    # Cross-check the TFC-derived PFN against the data service's answer.
    pfn_dict = phedex.getPFN(site, lfn, protocol)
    phedex_pfn = pfn_dict[(site, lfn)]
    pfn = tfc.matchLFN(protocol, lfn)
    self.assertEqual(phedex_pfn, pfn,
                     'TFC pfn (%s) did not match PhEDEx pfn (%s)' % (pfn, phedex_pfn))
def testXMLJSON(self):
    """
    Test XML and JSON in the same scope
    """
    site = 'T1_US_FNAL_Buffer'
    phedexJSON = PhEDEx(responseType='json',
                        httpDict={'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/json/test"})
    phedexXML = PhEDEx(responseType='xml',
                       httpDict={'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/xml/test"})

    # Pull the site TFC through the XML client and match an LFN against it.
    phedexXML.getNodeTFC(site)
    tfc_map = {}
    tfc_map[site] = readTFC(phedexXML.cacheFileName('tfc', inputdata={'node': site}))
    pfn = tfc_map[site].matchLFN('srmv2', '/store/user/jblow/dir/test.root')

    expected_pfn = 'srm://cmssrm.fnal.gov:8443/srm/managerv2?SFN=/11/store/user/jblow/dir/test.root'
    self.assertTrue(pfn == expected_pfn)
    self.assertTrue(phedexJSON.getNodeSE('T1_US_FNAL_Buffer') == 'cmssrm.fnal.gov')
class TransferDaemon(BaseDaemon):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """

    def __init__(self, config):
        """
        Initialise class members:
        1. check and create dropbox dir
        2. define oracle and couch (config and file instance) server connection
        3. PhEDEx connection
        4. Setup wmcore factory
        """
        self.doc_acq = ''
        # Need a better way to test this without turning off this next line
        BaseDaemon.__init__(self, config, 'AsyncTransfer')

        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        for directory in (self.dropbox_dir, "/tmp/DashboardReport"):
            if not os.path.isdir(directory):
                try:
                    os.makedirs(directory)
                except OSError as e:
                    if not e.errno == errno.EEXIST:
                        # BUGFIX: original did 'Unknown error in mkdir' % e.errno,
                        # which raises TypeError (no conversion specifier).
                        self.logger.exception('Unknown error in mkdir: %s' % e.errno)
                        raise

        try:
            config_server = CouchServer(dburl=self.config.config_couch_instance)
            self.config_db = config_server.connectDatabase(self.config.config_database)
        except Exception:
            # Narrowed from a bare except: still logs and re-raises.
            self.logger.exception('Failed when contacting local couch')
            raise
        try:
            self.oracleDB = HTTPRequests(self.config.oracleDB,
                                         self.config.opsProxy,
                                         self.config.opsProxy)
        except Exception:
            self.logger.exception('Failed when contacting Oracle')
            raise

        self.pool = Pool(processes=self.config.pool_size)
        self.factory = WMFactory(self.config.schedAlgoDir,
                                 namespace=self.config.schedAlgoDir)

        self.site_tfc_map = {}
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy,
                                       'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise
        # TODO: decode xml
        try:
            self.phedex2 = PhEDEx(responseType='json',
                                  dict={'key': self.config.opsProxy,
                                        'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise

        self.logger.debug(type((self.phedex2.getNodeMap())['phedex']['node']))
        for site in [x['name'] for x in self.phedex2.getNodeMap()['phedex']['node']]:
            if site and str(site) != 'None' and str(site) != 'unknown':
                # Fetch the TFC rules once per site (the original fetched them
                # twice: once to store and once just for the debug line).
                self.site_tfc_map[site] = self.get_tfc_rules(site)
                self.logger.debug('tfc site: %s %s' % (site, self.site_tfc_map[site]))

    # Over riding setup() is optional, and not needed here

    def algorithm(self, parameters=None):
        """
        1. Get transfer config from couchdb config instance
        2. Get a list of users with files to transfer from the db instance
           (oracle or couch, by config flag)
        3. For each user get a suitably sized input for submission (call to a list)
        4. Submit to a subprocess
        """
        if self.config.isOracle:
            users = self.oracleSiteUser(self.oracleDB)
        else:
            users = self.active_users(self.db)
            sites = self.active_sites()
            self.logger.info('%s active sites' % len(sites))
            self.logger.debug('Active sites are: %s' % sites)

        self.logger.debug('kicking off pool')
        for u in users:
            # Normalise falsy fields so user triples compare consistently.
            for i in range(len(u)):
                if not u[i]:
                    u[i] = ''
            self.logger.debug('current_running %s' % current_running)
            self.logger.debug('Testing current running: %s %s %s'
                              % (u, current_running, (u not in current_running)))
            if u not in current_running:
                self.logger.debug('processing %s' % u)
                current_running.append(u)
                self.logger.debug('processing %s' % current_running)
                self.pool.apply_async(ftscp, (u, self.site_tfc_map, self.config),
                                      callback=log_result)

    def oracleSiteUser(self, db):
        """
        1. Acquire transfers from DB
        2. Get acquired users and destination sites
        """
        self.logger.info('Retrieving users...')
        fileDoc = dict()
        fileDoc['subresource'] = 'activeUsers'
        fileDoc['grouping'] = 0
        fileDoc['asoworker'] = self.config.asoworker

        result = dict()
        try:
            result = db.get(self.config.oracleFileTrans,
                            data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
            return []

        self.logger.debug(oracleOutputMapping(result))
        # TODO: translate result into list((user,group,role),...)
        if len(oracleOutputMapping(result)) != 0:
            self.logger.debug(type([[x['username'].encode('ascii', 'ignore'),
                                     x['user_group'], x['user_role']]
                                    for x in oracleOutputMapping(result)]))
            try:
                docs = oracleOutputMapping(result)
                users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
                self.logger.info('Users to process: %s' % str(users))
            except Exception:
                self.logger.exception('User data malformed. ')
                # BUGFIX: without returning here, `users` would be unbound in
                # the loop below and raise NameError.
                return []
        else:
            self.logger.info('No new user to acquire')
            return []

        actives = list()
        for user in users:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquireTransfers'
            fileDoc['username'] = user[0]

            self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user[0])
            try:
                result = db.post(self.config.oracleFileTrans,
                                 data=encodeRequest(fileDoc))
            except Exception as ex:
                self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
                continue
            self.doc_acq = str(result)
            # Stringify fields and blank out None/NULL placeholders in place.
            for i in range(len(user)):
                if not user[i] or user[i] in ['None', 'NULL']:
                    user[i] = ''
                user[i] = str(user[i])
            actives.append(user)
        # NOTE(review): the original returns `users`, not `actives`; both hold
        # the same (mutated) lists, so behaviour is preserved.
        self.logger.debug("Transfers retrieved from oracleDB. %s " % users)
        return users

    def active_users(self, db):
        """
        Query a view for users with files to transfer.
        Get this from the following view:
            ftscp?group=true&group_level=1
        """
        query = {'group': True, 'group_level': 3}
        try:
            users = db.loadView(self.config.ftscp_design, 'ftscp_all', query)
        except Exception as e:
            self.logger.exception('A problem occured when contacting couchDB: %s' % e)
            return []

        if len(users['rows']) <= self.config.pool_size:
            active_users = [x['key'] for x in users['rows']]
        else:
            # Delegate user selection to the configured scheduling algorithm.
            sorted_users = self.factory.loadObject(self.config.algoName,
                                                   args=[self.config, self.logger,
                                                         users['rows'],
                                                         self.config.pool_size],
                                                   getFromCache=False, listFlag=True)
            active_users = sorted_users()[:self.config.pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)
        return active_users

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.
        """
        query = {'group': True, 'stale': 'ok'}
        try:
            sites = self.db.loadView('AsyncTransfer', 'sites', query)
        except Exception as e:
            self.logger.exception('A problem occured when contacting couchDB: %s' % e)
            return []

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, sites['rows'])

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def terminate(self, parameters=None):
        """
        Called when thread is being terminated.
        """
        self.pool.close()
        self.pool.join()
class ReporterWorker:
    """
    Consume Reporter JSON drop-files for one user and propagate the final
    transfer state (done/failed/retry) to the couch or oracle backend.
    """

    def __init__(self, user, config):
        """
        store the user and tfc the worker
        """
        self.user = user
        self.config = config
        self.dropbox_dir = '%s/dropbox/inputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.site_tfc_map = {}
        self.logger = logging.getLogger('AsyncTransfer-Reporter-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.cleanEnvironment = ''
        self.userDN = ''
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        # TODO: improve how the worker gets a log
        self.logger.debug("Trying to get DN")
        try:
            self.userDN = getDNFromUserName(self.user, self.logger)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return
        defaultDelegation = {'logger': self.logger,
                             'credServerPath': self.config.credentialDir,
                             # It will be moved to be getfrom couchDB
                             'myProxySvr': 'myproxy.cern.ch',
                             'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                             'serverDN': self.config.serverDN,
                             'uisource': self.uiSetupScript,
                             'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
        if hasattr(self.config, "cache_area"):
            try:
                defaultDelegation['myproxyAccount'] = \
                    re.compile('https?://([^/]*)/.*').findall(self.config.cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s'
                                  % self.config.cache_area)
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey
        self.valid = False
        try:
            self.valid, proxy = getProxy(self.userDN, "", "",
                                         defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
        if self.valid:
            self.userProxy = proxy
        else:
            # Use the operator's proxy when the user proxy in invalid.
            # This will be moved soon
            self.logger.error('Did not get valid proxy. Setting proxy to ops proxy')
            self.userProxy = config.opsProxy
        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             config.opsProxy,
                                             config.opsProxy)
            except Exception:
                # BUGFIX: Logger.exception() requires a message argument;
                # the original called it with none (TypeError).
                self.logger.exception('Failed when contacting Oracle')
                raise
        else:
            server = CouchServer(dburl=self.config.couch_instance,
                                 ckey=self.config.opsProxy,
                                 cert=self.config.opsProxy)
            self.db = server.connectDatabase(self.config.files_database)
        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace=self.config.pluginDir)
        self.commandTimeout = 1200
        self.max_retry = config.max_retry
        # Proxy management in Couch
        os.environ['X509_USER_PROXY'] = self.userProxy
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy,
                                       'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)

    def __call__(self):
        """
        a. makes the ftscp copyjob
        b. submits ftscp
        c. deletes successfully transferred files from the DB
        """
        self.logger.info("Retrieving files for %s" % self.user)
        files_to_update = self.files_for_update()
        self.logger.info("%s files to process" % len(files_to_update))
        self.logger.debug("%s files to process" % files_to_update)
        for input_file in files_to_update:
            remove_good = True
            remove_failed = True
            failed_lfns = []
            failure_reason = []
            good_lfns = []
            self.logger.info("Updating %s" % input_file)
            if not os.path.basename(input_file).startswith('Reporter'):
                self.logger.info('File not for the Reporter %s' % input_file)
                continue
            try:
                json_data = json.loads(open(input_file).read())
            except ValueError as e:
                self.logger.error("Error loading %s" % e)
                self.logger.debug('Removing %s' % input_file)
                os.unlink(input_file)
                continue
            except Exception as e:
                self.logger.error("Error loading %s" % e)
                self.logger.debug('Removing %s' % input_file)
                os.unlink(input_file)
                continue
            if not json_data:
                self.logger.info('Empty file %s' % input_file)
                continue
            self.logger.debug('Inputs: %s %s %s' % (json_data['LFNs'],
                                                    json_data['transferStatus'],
                                                    json_data['failure_reason']))
            # BUGFIX: the original condition was
            #   'FAILED' or 'abandoned' or 'CANCELED' or 'lost' in ...
            # which is always true ('FAILED' is truthy); test membership of
            # each state instead.
            if any(state in json_data['transferStatus']
                   for state in ('FAILED', 'abandoned', 'CANCELED', 'lost')):
                # Sort failed files
                failed_indexes = [i for i, x in enumerate(json_data['transferStatus'])
                                  if x == 'FAILED' or x == 'CANCELED']
                abandoned_indexes = [i for i, x in enumerate(json_data['transferStatus'])
                                     if x == 'abandoned']
                failed_indexes.extend(abandoned_indexes)
                self.logger.info('failed indexes %s' % len(failed_indexes))
                self.logger.debug('failed indexes %s' % failed_indexes)
                for i in failed_indexes:
                    failed_lfns.append(json_data['LFNs'][i])
                    failure_reason.append(json_data['failure_reason'][i])
                self.logger.debug('Marking failed %s %s' % (failed_lfns, failure_reason))
                updated_failed_lfns = self.mark_failed(failed_lfns, failure_reason)
            # BUGFIX: same always-true pattern as above for 'Done' or 'FINISHED'.
            if any(state in json_data['transferStatus']
                   for state in ('Done', 'FINISHED')):
                # Sort good files
                good_indexes = [i for i, x in enumerate(json_data['transferStatus'])
                                if (x == 'Done' or x == 'FINISHED' or x == 'Finishing')]
                self.logger.info('good indexes %s' % len(good_indexes))
                self.logger.debug('good indexes %s' % good_indexes)
                for i in good_indexes:
                    good_lfns.append(json_data['LFNs'][i])
                self.logger.info('Marking good %s' % (good_lfns))
                try:
                    updated_good_lfns = self.mark_good(good_lfns)
                except Exception:
                    self.logger.exception('Either no files to mark or failed to update state')
            # Remove the json file
            self.logger.debug('Removing %s' % input_file)
            os.unlink(input_file)
        self.logger.info('Update completed')
        return

    def files_for_update(self):
        """
        Retrieve the list of files to update.
        """
        user_dir = os.path.join(self.dropbox_dir, self.user)
        self.logger.info('Looking into %s' % user_dir)
        return [os.path.join(self.dropbox_dir, self.user, user_file)
                for user_file in os.listdir(user_dir)]

    def mark_good(self, files):
        """
        Mark the list of files as tranferred
        """
        updated_lfn = []
        good_ids = []
        if len(files) == 0:
            return updated_lfn
        for it, lfn in enumerate(files):
            hash_lfn = getHashLfn(lfn)
            self.logger.info("Marking good %s" % hash_lfn)
            self.logger.debug("Marking good %s" % lfn)
            if not self.config.isOracle:
                try:
                    document = self.db.document(hash_lfn)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                self.logger.info("Doc %s Loaded" % hash_lfn)
            try:
                now = str(datetime.datetime.now())
                last_update = time.time()
                if self.config.isOracle:
                    # Oracle updates are batched after the loop.
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                else:
                    if document['state'] != 'killed' and \
                       document['state'] != 'done' and \
                       document['state'] != 'failed':
                        outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                        data = dict()
                        data['end_time'] = now
                        data['state'] = 'done'
                        data['lfn'] = outputLfn
                        data['last_update'] = last_update
                        updateUri = "/" + self.db.name + \
                                    "/_design/AsyncTransfer/_update/updateJobs/" + \
                                    getHashLfn(lfn)
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(lfn)
                        self.logger.debug("Marked good %s" % lfn)
                    else:
                        updated_lfn.append(lfn)
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
            except Exception as ex:
                msg = "Error updating document"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
        if self.config.isOracle:
            try:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = good_ids
                data['list_of_transfer_state'] = ["DONE" for x in good_ids]
                result = self.oracleDB.post(self.config.oracleFileTrans,
                                            data=encodeRequest(data))
                self.logger.debug("Marked good %s" % good_ids)
            except Exception:
                self.logger.exception('Error updating document')
                return {}
            # NOTE(review): `docId`/`lfn` below are the values left over from
            # the *last* loop iteration, so only the last file's source copy
            # is removed. Preserved as-is — confirm whether all files were
            # meant to be cleaned up.
            self.logger.info("Transferred file %s updated, removing now source file" % docId)
            try:
                docbyId = self.oracleDB.get(
                    self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'),
                    data=encodeRequest({'subresource': 'getById', 'id': docId}))
                document = oracleOutputMapping(docbyId, None)[0]
            except Exception:
                msg = "Error getting file from source"
                self.logger.exception(msg)
                return {}
            if document["source"] not in self.site_tfc_map:
                self.logger.debug("site not found... gathering info from phedex")
                self.site_tfc_map[document["source"]] = self.get_tfc_rules(document["source"])
            pfn = self.apply_tfc_to_lfn('%s:%s' % (document["source"], lfn))
            self.logger.debug("File has to be removed now from source site: %s" % pfn)
            self.remove_files(self.userProxy, pfn)
            self.logger.debug("Transferred file removed from source")
        return updated_lfn

    def remove_files(self, userProxy, pfn):
        """
        Delete one PFN from the source storage with gfal-rm under the
        given proxy; best-effort, failures are only logged.
        """
        command = 'env -i X509_USER_PROXY=%s gfal-rm -v -t 180 %s' % \
            (userProxy, pfn)
        logging.debug("Running remove command %s" % command)
        try:
            rc, stdout, stderr = execute_command(command, self.logger, 3600)
        except Exception as ex:
            self.logger.error(ex)
        if rc:
            logging.info("Deletion command failed with output %s and error %s"
                         % (stdout, stderr))
        else:
            logging.info("File Deleted.")
        return

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception:
            # BUGFIX: the original message carried an unfilled %s placeholder.
            self.logger.exception('A problem occured when getting the TFC regexp for site %s' % site)
            return None
        return readTFC(tfc_file)

    def apply_tfc_to_lfn(self, file):
        """
        Take a CMS_NAME:lfn string and make a pfn.
        Update pfn_to_lfn_mapping dictionary.
        """
        try:
            site, lfn = tuple(file.split(':'))
        except Exception:
            self.logger.exception('It does not seem to be an lfn %s' % file.split(':'))
            return None
        if site in self.site_tfc_map:
            pfn = self.site_tfc_map[site].matchLFN('srmv2', lfn)
            # TODO: improve fix for wrong tfc on sites
            try:
                if pfn.find("\\") != -1:
                    pfn = pfn.replace("\\", "")
                if pfn.split(':')[0] != 'srm' and pfn.split(':')[0] != 'gsiftp':
                    self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                    return None
            except IndexError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            except AttributeError:
                # matchLFN returned None (no rule matched).
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            return pfn
        else:
            self.logger.error('Wrong site %s!' % site)
            return None

    def mark_failed(self, files=None, failures_reasons=None, force_fail=False):
        """
        Something failed for these files so increment the retry count
        """
        # BUGFIX: mutable default arguments replaced with None sentinels.
        files = files if files is not None else []
        failures_reasons = failures_reasons if failures_reasons is not None else []
        updated_lfn = []
        for lfn in files:
            data = {}
            self.logger.debug("Document: %s" % lfn)
            # Normalise to the /store/temp form used to derive the doc id.
            if not isinstance(lfn, dict):
                if 'temp' not in lfn:
                    temp_lfn = lfn.replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn
            else:
                if 'temp' not in lfn['value']:
                    temp_lfn = lfn['value'].replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn['value']
            docId = getHashLfn(temp_lfn)
            # Load document to get the retry_count
            if self.config.isOracle:
                try:
                    self.logger.debug("Document: %s" % docId)
                    docbyId = self.oracleDB.get(
                        self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'),
                        data=encodeRequest({'subresource': 'getById', 'id': docId}))
                    document = oracleOutputMapping(docbyId)[0]
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = docId
                    if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                        data['list_of_transfer_state'] = 'FAILED'
                        data['list_of_retry_value'] = 0
                    else:
                        data['list_of_transfer_state'] = 'RETRY'
                        fatal_error = self.determine_fatal_error(
                            failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['list_of_transfer_state'] = 'FAILED'
                    data['list_of_failure_reason'] = failures_reasons[files.index(lfn)]
                    data['list_of_retry_value'] = 0
                    self.logger.debug("update: %s" % data)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(data))
                    if not data['list_of_transfer_state'] == 'RETRY':
                        updated_lfn.append(lfn)
                    self.logger.debug("Marked failed %s" % lfn)
                except Exception as ex:
                    self.logger.error("Error updating document status: %s" % ex)
                    continue
            else:
                try:
                    document = self.db.document(docId)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] != 'killed' and \
                   document['state'] != 'done' and \
                   document['state'] != 'failed':
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                        data['end_time'] = now
                    else:
                        data['state'] = 'retry'
                        fatal_error = self.determine_fatal_error(
                            failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['state'] = 'failed'
                            data['end_time'] = now
                    self.logger.debug("Failure list: %s" % failures_reasons)
                    self.logger.debug("Files: %s" % files)
                    self.logger.debug("LFN %s" % lfn)
                    data['failure_reason'] = failures_reasons[files.index(lfn)]
                    data['last_update'] = last_update
                    data['retry'] = now
                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + \
                                    "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                else:
                    updated_lfn.append(docId)
                    self.logger.debug("failed file updated")
        return updated_lfn

    def determine_fatal_error(self, failure=""):
        """
        Determine if transfer error is fatal or not.
        """
        permanent_failure_reasons = [
            ".*canceled because it stayed in the queue for too long.*",
            ".*permission denied.*",
            ".*disk quota exceeded.*",
            ".*operation not permitted*",
            ".*mkdir\(\) fail.*",
            ".*open/create error.*",
            ".*mkdir\: cannot create directory.*",
            ".*does not have enough space.*"
        ]
        failure = str(failure).lower()
        for permanent_failure_reason in permanent_failure_reasons:
            if re.match(permanent_failure_reason, failure):
                return True
        return False

    def mark_incomplete(self, files=None):
        """
        Mark the list of files as acquired
        """
        # BUGFIX: Logger objects are not callable; the original
        # `self.logger(...)` would raise TypeError. Also replaced the
        # mutable default argument.
        self.logger.info('Something called mark_incomplete which should never be called')
class TransferDaemon(BaseDaemon):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """

    def __init__(self, config):
        """
        Initialise class members
        """
        # Need a better way to test this without turning off this next line
        BaseDaemon.__init__(self, config, 'AsyncTransfer')

        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        if not os.path.isdir(self.dropbox_dir):
            try:
                os.makedirs(self.dropbox_dir)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    # BUGFIX: original did 'Unknown error in mkdir' % e.errno,
                    # which raises TypeError (no conversion specifier).
                    self.logger.error('Unknown error in mkdir: %s' % e.errno)
                    raise
        server = CouchServer(dburl=self.config.couch_instance,
                             ckey=self.config.opsProxy,
                             cert=self.config.opsProxy)
        self.db = server.connectDatabase(self.config.files_database)
        config_server = CouchServer(dburl=self.config.config_couch_instance)
        self.config_db = config_server.connectDatabase(self.config.config_database)
        self.logger.debug('Connected to CouchDB')
        self.pool = Pool(processes=self.config.pool_size)
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy,
                                       'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.schedAlgoDir,
                                 namespace=self.config.schedAlgoDir)

    # NOTE(review): in the original these sat between __init__ and algorithm;
    # algorithm() references the bare name `current_running`, which only
    # resolves if a module-level binding exists — confirm against the full file.
    result_list = []
    current_running = []

    # Over riding setup() is optional, and not needed here

    def algorithm(self, parameters=None):
        """
        1. Get a list of users with files to transfer from the couchdb instance
        2. For each user get a suitably sized input for ftscp (call to a list)
        3. Submit the ftscp to a subprocess
        """
        # Refresh tunables from the config database; fall back to the static
        # config file when the view has no rows.
        query = {'stale': 'ok'}
        try:
            params = self.config_db.loadView('asynctransfer_config',
                                             'GetTransferConfig', query)
            self.config.max_files_per_transfer = params['rows'][0]['key'][1]
            self.config.algoName = params['rows'][0]['key'][2]
        except IndexError:
            self.logger.exception('Config data could not be retrieved from the '
                                  'config database. Fallback to the config file')
        except Exception as e:
            self.logger.exception('A problem occured when contacting couchDB: %s' % e)

        users = self.active_users(self.db)
        sites = self.active_sites()
        self.logger.info('%s active sites' % len(sites))
        self.logger.debug('Active sites are: %s' % sites)

        site_tfc_map = {}
        for site in sites:
            # TODO: Remove this check once the ASO request will be validated
            # before the upload.
            if site and str(site) != 'None' and str(site) != 'unknown':
                site_tfc_map[site] = self.get_tfc_rules(site)

        self.logger.debug('kicking off pool')
        for u in users:
            self.logger.debug('current_running %s' % current_running)
            if u not in current_running:
                self.logger.debug('processing %s' % u)
                current_running.append(u)
                self.logger.debug('processing %s' % current_running)
                self.pool.apply_async(ftscp, (u, site_tfc_map, self.config),
                                      callback=log_result)

    def active_users(self, db):
        """
        Query a view for users with files to transfer. Get this from the
        following view:
            ftscp?group=true&group_level=1
        """
        # TODO: Remove stale=ok for now until tested
        # query = {'group': True, 'group_level': 3, 'stale': 'ok'}
        query = {'group': True, 'group_level': 3}
        try:
            users = db.loadView(self.config.ftscp_design, 'ftscp_all', query)
        except Exception as e:
            self.logger.exception('A problem occured when contacting couchDB: %s' % e)
            return []

        active_users = []
        if len(users['rows']) <= self.config.pool_size:
            active_users = users['rows']

            def keys_map(inputDict):
                """
                Map function.
                """
                return inputDict['key']

            active_users = map(keys_map, active_users)
        else:
            # Delegate selection of which users run this cycle to the
            # configured scheduling algorithm.
            sorted_users = self.factory.loadObject(self.config.algoName,
                                                   args=[self.config, self.logger,
                                                         users['rows'],
                                                         self.config.pool_size],
                                                   getFromCache=False, listFlag=True)
            # active_users = random.sample(users['rows'], self.config.pool_size)
            active_users = sorted_users()[:self.config.pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)
        return active_users

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.
        """
        query = {'group': True, 'stale': 'ok'}
        try:
            sites = self.db.loadView('AsyncTransfer', 'sites', query)
        except Exception as e:
            self.logger.exception('A problem occured when contacting couchDB: %s' % e)
            return []

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, sites['rows'])

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def terminate(self, parameters=None):
        """
        Called when thread is being terminated.
        """
        self.pool.close()
        self.pool.join()
class Getter(object):
    """
    Get transfers to be submitted.

    Master process: acquires transfer documents, groups them by
    (user, source, destination) link and feeds chunks of LFN pairs to a
    pool of worker threads that submit FTS jobs.
    """
    def __init__(self, config, quiet, debug, test=False):
        """
        initialize log, connections etc

        :param config: daemon configuration; only the Getter section is used
        :param quiet: raise the root logger level to WARNING
        :param debug: lower the root logger level to DEBUG
        :param test: NOTE(review): currently unused -- self.TEST is hardcoded
                     to False while the workers consult self.config.TEST;
                     confirm which flag is authoritative
        """
        self.config = config.Getter
        self.TEST = False
        createLogdir('Monitor')

        def setRootLogger(quiet, debug):
            """Taken from CRABServer TaskWorker
            Sets the root logger with the desired verbosity level
            The root logger logs to logs/asolog.txt and every single logging instruction is propagated to it (not really nice to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            createLogdir('logs')
            createLogdir('logs/processes')
            if self.TEST:
                # if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/submitter.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        # BUGFIX: set up the logger *before* contacting PhEDEx -- the except
        # clause below logs through self.logger, which previously was not
        # assigned yet at this point and would itself have raised.
        self.logger = setRootLogger(quiet, debug)
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy,
                                       'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        self.documents = dict()
        self.doc_acq = ''
        self.STOP = False
        self.q = Queue()
        self.active_lfns = list()
        self.Update = update(self.logger, self.config)
        # Pre-compute the TFC (LFN->PFN rules) for every known PhEDEx node.
        self.site_tfc_map = {}
        for site in [x['name'] for x in
                     json.loads(self.phedex.getNodeMap())['phedex']['node']]:
            if site and str(site) != 'None' and str(site) != 'unknown':
                # Fetch the rules once and reuse them for the debug line
                # (the original called get_tfc_rules twice per site, doubling
                # the PhEDEx round-trips).
                rules = self.get_tfc_rules(site)
                self.site_tfc_map[site] = rules
                self.logger.debug('tfc site: %s %s' % (site, rules))

    def algorithm(self):
        """
        - Get Users
        - Get Source dest
        - create queue for each (user, link)
        - feed threads
        """
        workers = list()
        for i in range(self.config.max_threads_num):
            worker = Thread(target=self.worker, args=(i, self.q))
            worker.setDaemon(True)
            worker.start()
            workers.append(worker)
        site_tfc_map = dict()  # NOTE(review): unused local, kept for compatibility
        while not self.STOP:
            sites, users = self.oracleSiteUser(self.Update)
            self.Update.retry()
            for _user in users:
                for source in sites:
                    for dest in sites:
                        # Files for this (user, source, dest) link that are not
                        # already queued.  BUGFIX: compare the lfn *pair* against
                        # active_lfns -- the previous check (x not in
                        # self.active_lfns) compared the whole document dict to a
                        # list of pairs and could never match, allowing duplicates.
                        lfns = [[x['source_lfn'], x['destination_lfn']]
                                for x in self.documents
                                if x['source'] == source
                                and x['destination'] == dest
                                and x['username'] == _user[0]
                                and [x['source_lfn'], x['destination_lfn']] not in self.active_lfns]
                        self.active_lfns = self.active_lfns + lfns
                        # IMPORTANT: remove only on final states
                        for files in chunks(lfns, self.config.files_per_job):
                            self.q.put((files, _user, source, dest, self.site_tfc_map))
            self.logger.debug('Queue lenght: %s' % self.q.qsize())
            time.sleep(4)
        for w in workers:
            w.join()
        self.logger.info('Submitter stopped.')

    def oracleSiteUser(self, Update):
        """
        1. Acquire transfers from DB
        2. Get acquired users and destination sites

        :param Update: DB helper used to acquire/fetch transfer documents
        :return: (list of distinct sites, list of [username, group, role])
        """
        # TODO: flexible with other DBs and get users list
        users = Update.acquire()
        # NOTE(review): acquire() returning 1 appears to signal "nothing
        # acquired / failure" -- confirm against the Update implementation.
        if users != 1:
            self.documents = Update.getAcquired(users)
        for doc in self.documents:
            if doc['user_role'] is None:
                doc['user_role'] = ""
            if doc['user_group'] is None:
                doc['user_group'] = ""
        unique_users = list()
        try:
            unique_users = [list(i) for i in
                            set(tuple([x['username'], x['user_group'], x['user_role']])
                                for x in self.documents)]
        except Exception as ex:
            self.logger.error("Failed to map active users: %s" % ex)
        if len(unique_users) <= self.config.pool_size:
            active_users = unique_users
        else:
            active_users = unique_users[:self.config.pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)
        active_sites_dest = [x['destination'] for x in self.documents]
        active_sites = active_sites_dest + [x['source'] for x in self.documents]
        self.logger.debug('Active sites are: %s' % list(set(active_sites)))
        return list(set(active_sites)), active_users

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.

        PhEDEx failures are logged and swallowed; if no cache file could be
        resolved, readTFC(None) is still attempted and will fail loudly.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def critical_failure(self, lfns, lock, inputs):
        """
        if an exception occurs before the end, remove lfns from active to
        let it be reprocessed later.

        :param lfns: lfn pairs to release back for reprocessing
        :param lock: lock guarding self.active_lfns
        :param inputs: the work queue; task_done is signalled here
        :return:
        """
        lock.acquire()
        for lfn in lfns:
            self.active_lfns.remove(lfn)
        lock.release()
        inputs.task_done()

    def worker(self, i, inputs):
        """
        - Retrieve userDN
        - Retrieve user proxy
        - Delegate proxy to fts is needed
        - submit fts job
        - update doc states

        :param i: thread number
        :param inputs: queue of tuples (lfns, _user, source, dest, tfc_map)
        :return:
        """
        # TODO: differentiate log messages per USER!
        logger = self.logger
        logger.info("Process %s is starting. PID %s", i, os.getpid())
        lock = Lock()
        Update = update(logger, self.config)
        while not self.STOP:
            if inputs.empty():
                time.sleep(10)
                continue
            try:
                lfns, _user, source, dest, tfc_map = inputs.get()
                [user, group, role] = _user
            except (EOFError, IOError):
                crashMessage = "Hit EOF/IO in getting new work\n"
                crashMessage += "Assuming this is a graceful break attempt.\n"
                logger.error(crashMessage)
                continue
            start = time.time()
            if not self.config.TEST:
                # Production path: resolve the user's DN and obtain a myproxy
                # delegation to submit on the user's behalf.
                try:
                    userDN = getDNFromUserName(user, logger,
                                               ckey=self.config.opsProxy,
                                               cert=self.config.opsProxy)
                except Exception as ex:
                    logger.exception('Cannot retrieve user DN')
                    self.critical_failure(lfns, lock, inputs)
                    continue
                defaultDelegation = {'logger': logger,
                                     'credServerPath': self.config.credentialDir,
                                     'myProxySvr': 'myproxy.cern.ch',
                                     'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                     'serverDN': self.config.serverDN,
                                     'uisource': '',
                                     'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
                cache_area = self.config.cache_area
                try:
                    # The myproxy account is the host part of the cache URL.
                    defaultDelegation['myproxyAccount'] = re.compile(
                        'https?://([^/]*)/.*').findall(cache_area)[0]
                except IndexError:
                    logger.error(
                        'MyproxyAccount parameter cannot be retrieved from %s . '
                        % self.config.cache_area)
                    self.critical_failure(lfns, lock, inputs)
                    continue
                if getattr(self.config, 'serviceCert', None):
                    defaultDelegation['server_cert'] = self.config.serviceCert
                if getattr(self.config, 'serviceKey', None):
                    defaultDelegation['server_key'] = self.config.serviceKey
                try:
                    defaultDelegation['userDN'] = userDN
                    defaultDelegation['group'] = group
                    defaultDelegation['role'] = role
                    logger.debug('delegation: %s' % defaultDelegation)
                    valid_proxy, user_proxy = getProxy(defaultDelegation, logger)
                    if not valid_proxy:
                        logger.error('Failed to retrieve user proxy... putting docs on retry')
                        logger.error('docs on retry: %s'
                                     % Update.failed(lfns, submission_error=True))
                        continue
                except Exception:
                    logger.exception('Error retrieving proxy')
                    self.critical_failure(lfns, lock, inputs)
                    continue
            else:
                user_proxy = self.config.opsProxy
                self.logger.debug("Using opsProxy for testmode")
            context = dict()
            try:
                if self.config.TEST:
                    logger.debug("Running in test mode, submitting fake jobs")
                else:
                    context = fts3.Context(self.config.serverFTS,
                                           user_proxy, user_proxy, verify=True)
                    logger.debug(fts3.delegate(context,
                                               lifetime=timedelta(hours=48),
                                               force=False))
            except Exception:
                logger.exception("Error submitting to FTS")
                self.critical_failure(lfns, lock, inputs)
                continue
            failed_lfn = list()
            try:
                if self.config.TEST:
                    submitted_lfn = lfns
                    jobid = getHashLfn(lfns[0][0])
                    self.logger.debug('Fake job id: ' + jobid)
                else:
                    failed_lfn, submitted_lfn, jobid = Submission(
                        lfns, source, dest, i, self.logger, fts3, context, tfc_map)
                    if jobid == -1:
                        self.critical_failure(lfns, lock, inputs)
                        continue
                    logger.info('Submitted %s files' % len(submitted_lfn))
            except Exception:
                logger.exception("Unexpected error during FTS job submission!")
                self.critical_failure(lfns, lock, inputs)
                continue
            # TODO: add file FTS id and job id columns for kill command
            try:
                Update.submitted(lfns)
            except Exception:
                logger.exception("Error updating document status")
                self.critical_failure(lfns, lock, inputs)
                continue
            try:
                Update.failed(failed_lfn)
            except Exception:
                logger.exception(
                    "Error updating document status, job submission will be retried later...")
                self.critical_failure(lfns, lock, inputs)
                continue
            # Drop a per-job file so the monitor process can track this job.
            try:
                createLogdir('Monitor/' + user)
                with open('Monitor/' + user + '/' + str(jobid) + '.txt', 'w') as outfile:
                    json.dump(lfns, outfile)
                logger.info('Monitor files created')
            except Exception:
                logger.exception("Error creating file for monitor")
                self.critical_failure(lfns, lock, inputs)
                continue
            end = time.time()
            self.logger.info('Input processed in %s', str(end - start))
            time.sleep(0.5)
        logger.debug("Worker %s exiting.", i)
        return 0

    def quit_(self):
        """
        set STOP to True

        :return:
        """
        self.logger.info(
            "Received kill request. Setting STOP flag in the master and threads...")
        self.STOP = True
class TransferDaemon(BaseWorkerThread):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """
    def __init__(self, config):
        """
        Initialise class members

        :param config: agent configuration; only the AsyncTransfer section is used
        """
        #Need a better way to test this without turning off this next line
        BaseWorkerThread.__init__(self)
        # self.logger is set up by the BaseWorkerThread, we just set it's level
        self.config = config.AsyncTransfer
        try:
            self.logger.setLevel(self.config.log_level)
        except Exception:
            # BUGFIX: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit.  Fall back to the root logger
            # when BaseWorkerThread did not provide one.
            import logging
            self.logger = logging.getLogger()
            self.logger.setLevel(self.config.log_level)
        self.logger.debug('Configuration loaded')
        server = CouchServer(self.config.couch_instance)
        self.db = server.connectDatabase(self.config.files_database)
        self.logger.debug('Connected to CouchDB')
        self.pool = Pool(processes=self.config.pool_size)
        self.phedex = PhEDEx(responseType='xml')

    # Over riding setup() is optional, and not needed here
    def algorithm(self, parameters=None):
        """
        1. Get a list of users with files to transfer from the couchdb instance
        2. For each user get a suitably sized input for ftscp (call to a list)
        3. Submit the ftscp to a subprocess
        """
        users = self.active_users(self.db)
        sites = self.active_sites()
        self.logger.info('%s active sites' % len(sites))
        self.logger.debug('Active sites are: %s' % sites)
        # One TFC rule set per site, shared by all ftscp tasks.
        site_tfc_map = {}
        for site in sites:
            site_tfc_map[site] = self.get_tfc_rules(site)
        self.logger.debug('kicking off pool')
        r = [self.pool.apply_async(ftscp, (u, site_tfc_map, self.config))
             for u in users]
        for result in r:
            # Blocks until each ftscp task finishes; its result is logged.
            self.logger.info(result.get())

    def active_users(self, db):
        """
        Query a view for users with files to transfer. Get this from the
        following view:
            ftscp?group=true&group_level=1

        :param db: couch database handle
        :return: list of view keys, at most self.config.pool_size entries
        """
        query = {'group': True, 'group_level': 3}
        users = db.loadView('AsyncTransfer', 'ftscp', query)
        active_users = []
        if len(users['rows']) <= self.config.pool_size:
            active_users = users['rows']
        else:
            #TODO: have a plugin algorithm here...
            active_users = random.sample(users['rows'], self.config.pool_size)
        # List comprehension instead of map(): same list on Python 2, and a
        # real list (usable more than once, len()-able) on Python 3.
        return [row['key'] for row in active_users]

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.

        :return: list of site names (view keys)
        """
        query = {'group': True}
        sites = self.db.loadView('AsyncTransfer', 'sites', query)
        return [row['key'] for row in sites['rows']]

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        return readTFC(tfc_file)

    def terminate(self, parameters=None):
        """
        Called when thread is being terminated.  Drain and join the pool.
        """
        self.pool.close()
        self.pool.join()