def __init__(self, serverName, serverPort, cfg_params, proxyPath=None):
    """
    Open the communication with an Analysis Server by passing the
    server URL and the port.

    serverName  -- hostname of the Analysis Server
    serverPort  -- port the server listens on
    cfg_params  -- parsed crab.cfg parameters, kept for later use
    proxyPath   -- optional path to the user proxy (unused here,
                   presumably consumed by CliServerParams — TODO confirm)

    Raises CrabException if the credential subject cannot be obtained.
    """
    # User-facing wiki page listing the available CRAB servers.
    self.ServerTwiki = 'https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideCrabServerForUsers#Server_available_for_users'

    self.asSession = C_AS_Session(serverName, serverPort)
    self.cfg_params = cfg_params
    self.userSubj = ''
    self.serverName = serverName

    # CAF/LSF schedulers authenticate with a Token instead of a Grid proxy.
    credentialType = 'Proxy'
    if common.scheduler.name().upper() in ['CAF', 'LSF']:
        credentialType = 'Token'
    CliServerParams(self)

    self.crab_task_name = common.work_space.topDir().split('/')[-2]  # nice task name "crab_0_..."

    configAPI = {'credential': credentialType,
                 'logger': common.logger()}
    CredAPI = CredentialAPI(configAPI)
    try:
        self.userSubj = CredAPI.getSubject()
    except Exception:
        # Fixed Python-2-only "except Exception, err" syntax; the bound
        # variable was unused (the full traceback is logged instead).
        common.logger.debug("Getting Credential Subject: " + str(traceback.format_exc()))
        raise CrabException("Error Getting Credential Subject")
def __init__(self, serverName, serverPort, cfg_params, proxyPath=None):
    """
    Open the communication with an Analysis Server by passing the
    server URL and the port.

    serverName  -- hostname of the Analysis Server
    serverPort  -- port the server listens on
    cfg_params  -- parsed crab.cfg parameters, kept for later use
    proxyPath   -- optional path to the user proxy (unused here,
                   presumably consumed by CliServerParams — TODO confirm)

    Raises CrabException if the credential subject cannot be obtained.
    """
    # User-facing wiki page listing the available CRAB servers.
    self.ServerTwiki = (
        "https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideCrabServerForUsers#Server_available_for_users"
    )
    self.asSession = C_AS_Session(serverName, serverPort)
    self.cfg_params = cfg_params
    self.userSubj = ""
    self.serverName = serverName

    # CAF/LSF schedulers authenticate with a Token instead of a Grid proxy.
    credentialType = "Proxy"
    if common.scheduler.name().upper() in ["CAF", "LSF"]:
        credentialType = "Token"
    CliServerParams(self)

    self.crab_task_name = common.work_space.topDir().split("/")[-2]  # nice task name "crab_0_..."

    configAPI = {"credential": credentialType, "logger": common.logger()}
    CredAPI = CredentialAPI(configAPI)
    try:
        self.userSubj = CredAPI.getSubject()
    except Exception:
        # Fixed Python-2-only "except Exception, err" syntax; the bound
        # variable was unused (the full traceback is logged instead).
        common.logger.debug("Getting Credential Subject: " + str(traceback.format_exc()))
        raise CrabException("Error Getting Credential Subject")
def checkProxy(self, minTime=10):
    """
    Function to check the Globus proxy.

    minTime -- minimum remaining proxy lifetime required (units as
               expected by CredentialAPI.checkCredential — presumably
               hours; TODO confirm against ProdCommon)

    Raises CrabException if the manual renewal fails.
    """
    if self.proxyValid:
        return
    ### Just return if asked to do so
    if self.dontCheckProxy == 1:
        self.proxyValid = 1
        return

    CredAPI_config = {'credential': 'Proxy',
                      'myProxySvr': self.proxyServer,
                      'logger': common.logger()}
    from ProdCommon.Credential.CredentialAPI import CredentialAPI
    CredAPI = CredentialAPI(CredAPI_config)

    # Renew only when the proxy is short-lived or lacks the requested
    # VOMS group/role attributes.
    if not CredAPI.checkCredential(Time=int(minTime)) or \
       not CredAPI.checkAttribute(group=self.group, role=self.role):
        try:
            CredAPI.ManualRenewCredential(group=self.group, role=self.role)
        except Exception as ex:
            # Fixed Python-2-only "except Exception, ex" syntax.
            raise CrabException(str(ex))
def checkProxy(self, minTime=10):
    """
    Function to check the Globus proxy.

    minTime -- minimum remaining proxy lifetime required (units as
               expected by CredentialAPI.checkCredential — presumably
               hours; TODO confirm against ProdCommon)

    Raises CrabException if the manual renewal fails.
    """
    if self.proxyValid:
        return
    ### Just return if asked to do so
    if self.dontCheckProxy == 1:
        self.proxyValid = 1
        return

    CredAPI_config = {'credential': 'Proxy',
                      'myProxySvr': self.proxyServer,
                      'logger': common.logger()}
    from ProdCommon.Credential.CredentialAPI import CredentialAPI
    CredAPI = CredentialAPI(CredAPI_config)

    # Renew only when the proxy is short-lived or lacks the requested
    # VOMS group/role attributes.
    if not CredAPI.checkCredential(Time=int(minTime)) or \
       not CredAPI.checkAttribute(group=self.group, role=self.role):
        try:
            CredAPI.ManualRenewCredential(group=self.group, role=self.role)
        except Exception as ex:
            # Fixed Python-2-only "except Exception, ex" syntax.
            raise CrabException(str(ex))
class CredentialRenew(Actor): def __init__(self, cfg_params): self.cfg_params = cfg_params self.credentialType = 'Proxy' if common.scheduler.name().upper() in ['LSF', 'CAF']: self.credentialType = 'Token' # init client server params... CliServerParams(self) def run(self): """ """ common.logger.debug("CredentialRenew::run() called") # FIXME With MyProxy delegation this part is completely overlapped with the method manageDelegation # in SubmitServer. We should to maintain just one version of the method in a common part try: myproxyserver = Downloader( "http://cmsdoc.cern.ch/cms/LCG/crab/config/").config( "myproxy_server.conf") myproxyserver = myproxyserver.strip() if myproxyserver is None: raise CrabException("myproxy_server.conf retrieved but empty") except Exception, e: common.logger.info( "Problem setting myproxy server endpoint: using myproxy.cern.ch" ) common.logger.debug(e) myproxyserver = 'myproxy.cern.ch' configAPI = {'credential' : self.credentialType, \ 'myProxySvr' : myproxyserver,\ 'serverDN' : self.server_dn,\ 'shareDir' : common.work_space.shareDir() ,\ 'userName' : getUserName(),\ 'serverName' : self.server_name, \ 'logger' : common.logger() \ } try: CredAPI = CredentialAPI(configAPI) except Exception, err: common.logger.debug("Configuring Credential API: " + str(traceback.format_exc())) raise CrabException( "ERROR: Unable to configure Credential Client API %s\n" % str(err))
common.logger.info("Problem setting myproxy server endpoint: using myproxy.cern.ch") common.logger.debug(e) myproxyserver = 'myproxy.cern.ch' configAPI = {'credential' : self.credentialType, \ 'myProxySvr' : myproxyserver,\ 'serverDN' : self.server_dn,\ 'shareDir' : common.work_space.shareDir() ,\ 'userName' : getUserName(),\ 'serverName' : self.server_name, \ 'proxyPath' : self.proxy_path, \ 'logger' : common.logger() \ } try: CredAPI = CredentialAPI( configAPI ) except Exception, err : common.logger.debug("Configuring Credential API: " +str(traceback.format_exc())) raise CrabException("ERROR: Unable to configure Credential Client API %s\n"%str(err)) if self.credentialType == 'Proxy': # Proxy delegation through MyProxy, 4 days lifetime minimum if not CredAPI.checkMyProxy(Time=4, checkRetrieverRenewer=True) : common.logger.info("Please renew MyProxy delegated proxy:\n") try: CredAPI.credObj.serverDN = self.server_dn CredAPI.ManualRenewMyProxy() except Exception, ex: common.logger.debug("Delegating Credentials to MyProxy : " +str(traceback.format_exc())) raise CrabException(str(ex))
) common.logger.debug(e) myproxyserver = 'myproxy.cern.ch' configAPI = {'credential' : self.credentialType, \ 'myProxySvr' : myproxyserver,\ 'serverDN' : self.server_dn,\ 'shareDir' : common.work_space.shareDir() ,\ 'userName' : getUserName(),\ 'serverName' : self.server_name, \ 'proxyPath' : self.proxy_path, \ 'logger' : common.logger() \ } try: CredAPI = CredentialAPI(configAPI) except Exception, err: common.logger.debug("Configuring Credential API: " + str(traceback.format_exc())) raise CrabException( "ERROR: Unable to configure Credential Client API %s\n" % str(err)) if self.credentialType == 'Proxy': # Proxy delegation through MyProxy, 4 days lifetime minimum if not CredAPI.checkMyProxy(Time=4, checkRetrieverRenewer=True): common.logger.info("Please renew MyProxy delegated proxy:\n") try: CredAPI.credObj.serverDN = self.server_dn CredAPI.ManualRenewMyProxy() except Exception, ex:
def pollProxies(self, credConfig): """ __pollProxies__ loops on the proxies and makes related actions """ logging.info( "Start proxy's polling...." ) expiredtask = [] from ProdCommon.Credential.CredentialAPI import CredentialAPI CredAPI = CredentialAPI( credConfig ) CredAPI.credObj.myproxyServer = '$MYPROXY_SERVER' mySession = BossLiteAPI("MySQL", self.bossCfgDB) tlapi = TaskLifeAPI() if os.path.exists(self.proxiespath): ## get the list of proxies if credConfig['credential'] == 'Token': proxieslist=[] proxiesTemp = tlapi.getListTokens( mySession.bossLiteDB ) for proxy in proxiesTemp: if os.path.exists(proxy): proxieslist.append(proxy) else: proxieslist = tlapi.getListProxies( mySession.bossLiteDB ) for proxyfull in proxieslist: ## get the remaining proxy life time logging.info("Checking proxy [%s]"% str(proxyfull)) timeleft = 0 try: timeleft = CredAPI.getTimeLeft( proxyfull ) except Exception, exc: logging.info("Problem checking proxy validity: %s"% str(exc)) import traceback logging.info( str(traceback.format_exc()) ) continue ## credential expired ## if timeleft <= 0: logging.info( "Credential expired [%s]: %s s"% (proxyfull, str(timeleft)) ) tasksbymail = tlapi.getTaskList(proxyfull, mySession.bossLiteDB) allTasks = [] for mail, tasks in tasksbymail.iteritems(): for task in tasks: ## archive tlapi.archiveBliteTask(mySession, task) tlapi.archiveServerTask(task, mySession.bossLiteDB) ## append for hand clean allTasks.append(task) expiredtask.append(task) try: logging.info("Destroying proxy %s" %proxyfull) CredAPI.destroyCredential( proxyfull ) except Exception, ex: logging.error("Problem '%s' destroying credential '%s'."%(str(ex),str(proxyfull))) ## if not already asked notify the admin to hand-clean if not self.cleanasked(proxyfull): self.notifyToClean(allTasks) self.askclean(proxyfull) ## short credential ## elif timeleft <= self.minimumleft: logging.info("Credential still valid for: %s s"% str(timeleft)) ## proxy renewal through myproxy delegation ## 
delegatedtimeleft = 0 if credConfig['credential'] == 'Proxy': logging.info("Trying to renew proxy [%s]"% str(proxyfull)) if self.useGlExecDelegation == True: # glExec renewal specific parts # TODO # Sanjay fix here # change the proxy ownership so that CrabServer can renew it pass try: CredAPI.renewalMyProxy(proxyfull) delegatedtimeleft = CredAPI.getTimeLeft(proxyfull) logging.info("Renewed credential still valid for: %s s"% str(delegatedtimeleft)) except Exception, exc: logging.info("Problem renewing proxy : %s"% str(exc)) import traceback logging.info( str(traceback.format_exc()) ) delegatedtimeleft = 0 if self.useGlExecDelegation == True: # glExec renewal specific parts # TODO # Sanjay fix here # set again the proxy ownership for glExec pass if credConfig['credential'] == 'Token': logging.info("Trying to renew Token [%s]"% str(proxyfull)) try: CredAPI.renewalMyToken(proxyfull) delegatedtimeleft = CredAPI.getTimeLeft(proxyfull) logging.info("Renewed credential still valid for: %s s"% str(delegatedtimeleft)) except Exception, exc: logging.info("Problem renewing Token : %s"% str(exc)) import traceback logging.info( str(traceback.format_exc()) ) delegatedtimeleft = 0
def run(args): dash_checker = cmssw.dash.JobStateChecker(300) with open(args.configfile) as configfile: config = yaml.load(configfile) workdir = config['workdir'] if not os.path.exists(workdir): os.makedirs(workdir) util.register_checkpoint(workdir, "version", get_distribution('Lobster').version) else: util.verify(workdir) cmsjob = False if config.get('type', 'cmssw') == 'cmssw': cmsjob = True from ProdCommon.Credential.CredentialAPI import CredentialAPI cred = CredentialAPI({'credential': 'Proxy'}) if cred.checkCredential(Time=60): if not 'X509_USER_PROXY' in os.environ: os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy() else: if config.get('advanced', {}).get('renew proxy', True): try: cred.ManualRenewCredential() except Exception as e: print("could not renew proxy") sys.exit(1) else: print("please renew your proxy") sys.exit(1) print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log')) if not args.foreground: ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a') print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err')) signals = daemon.daemon.make_default_signal_map() signals[signal.SIGTERM] = lambda num, frame: kill(args) with daemon.DaemonContext( detach_process=not args.foreground, stdout=sys.stdout if args.foreground else ttyfile, stderr=sys.stderr if args.foreground else ttyfile, working_directory=workdir, pidfile=util.get_lock(workdir), signal_map=signals): fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'), maxBytes=500e6, backupCount=10) fileh.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s")) fileh.setLevel(config.get('advanced', {}).get('log level', 2) * 10) logger.addHandler(fileh) logger.setLevel(config.get('advanced', {}).get('log level', 2) * 10) if args.foreground: console = logging.StreamHandler() console.setLevel(config.get('advanced', {}).get('log level', 2) * 10) 
console.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s")) logger.addHandler(console) config['configdir'] = args.configdir config['filename'] = args.configfile config['startdir'] = args.startdir if cmsjob: job_src = cmssw.JobProvider(config) actions = cmssw.Actions(config) else: job_src = job.SimpleJobProvider(config) actions = None logger.info("using wq from {0}".format(wq.__file__)) wq.cctools_debug_flags_set("all") wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log")) wq.cctools_debug_config_file_size(1 << 29) queue = wq.WorkQueue(-1) queue.specify_log(os.path.join(workdir, "work_queue.log")) queue.specify_name("lobster_" + config["id"]) queue.specify_keepalive_timeout(300) # queue.tune("short-timeout", 600) queue.tune("transfer-outlier-factor", 4) queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND) logger.info("starting queue as {0}".format(queue.name)) logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name)) payload = config.get('advanced', {}).get('payload', 400) abort_active = False abort_threshold = config.get('advanced', {}).get('abort threshold', 400) abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4) if util.checkpoint(workdir, 'KILLED') == 'PENDING': util.register_checkpoint(workdir, 'KILLED', 'RESTART') jobits_left = 0 successful_jobs = 0 creation_time = 0 destruction_time = 0 with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile: statsfile.write( "#timestamp " + "total_workers_connected total_workers_joined total_workers_removed " + "workers_busy workers_idle " + "tasks_running " + "total_send_time total_receive_time " + "total_create_time total_return_time " + "idle_percentage " + "capacity " + "efficiency " + "total_memory " + "total_cores " + "jobits_left\n") while not job_src.done(): jobits_left = job_src.work_left() stats = queue.stats with open(os.path.join(workdir, "lobster_stats.log"), "a") as 
statsfile: now = datetime.datetime.now() statsfile.write(" ".join(map(str, [ int(int(now.strftime('%s')) * 1e6 + now.microsecond), stats.total_workers_connected, stats.total_workers_joined, stats.total_workers_removed, stats.workers_busy, stats.workers_idle, stats.tasks_running, stats.total_send_time, stats.total_receive_time, creation_time, destruction_time, stats.idle_percentage, stats.capacity, stats.efficiency, stats.total_memory, stats.total_cores, jobits_left ] )) + "\n" ) if util.checkpoint(workdir, 'KILLED') == 'PENDING': util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow())) # just in case, check for any remaining not done task that # hasn't been reported as aborted for task_id in queue._task_table.keys(): status = cmssw.dash.status_map[queue.task_state(task_id)] if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED): job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED) logger.info("terminating gracefully") break logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format( stats.workers_busy, stats.workers_busy + stats.workers_ready, jobits_left, stats.tasks_running, stats.tasks_waiting)) hunger = max(payload - stats.tasks_waiting, 0) t = time.time() while hunger > 0: jobs = job_src.obtain(50) if jobs == None or len(jobs) == 0: break hunger -= len(jobs) cores = config.get('cores per job', 1) for id, cmd, inputs, outputs in jobs: task = wq.Task(cmd) task.specify_tag(id) task.specify_cores(cores) # temporary work-around? 
# task.specify_memory(1000) # task.specify_disk(4000) for (local, remote, cache) in inputs: if os.path.isfile(local): cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE task.specify_input_file(str(local), str(remote), cache_opt) elif os.path.isdir(local): task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT, wq.WORK_QUEUE_CACHE, recursive=True) else: logger.critical("cannot send file to worker: {0}".format(local)) raise NotImplementedError for (local, remote) in outputs: task.specify_output_file(str(local), str(remote)) queue.submit(task) creation_time += int((time.time() - t) * 1e6) # update dashboard status for all not done tasks # report Done status only once when releasing the task # WAITING_RETRIEVAL is not a valid status in dashboard # so, skipping it for now monitor = job_src._JobProvider__dash queue = queue exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL) try: dash_checker.update_dashboard_states(monitor, queue, exclude_states) except Exception as e: logger.warning("Could not update job states to dashboard") task = queue.wait(300) tasks = [] while task: if task.return_status == 0: successful_jobs += 1 tasks.append(task) if queue.stats.tasks_complete > 0: task = queue.wait(1) else: task = None if len(tasks) > 0: try: t = time.time() job_src.release(tasks) destruction_time += int((time.time() - t) * 1e6) except: tb = traceback.format_exc() logger.critical("cannot recover from the following exception:\n" + tb) for task in tasks: logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname)) raise if successful_jobs >= abort_threshold and not abort_active: logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier)) abort_active = True queue.activate_fast_abort(abort_multiplier) # recurring actions are triggered here if actions: actions.take() if jobits_left == 0: logger.info("no more work left to do")
def run(args): with open(args.configfile) as configfile: config = yaml.load(configfile) workdir = config['workdir'] if not os.path.exists(workdir): os.makedirs(workdir) cmsjob = False if config.get('type', 'cmssw') == 'cmssw': cmsjob = True from ProdCommon.Credential.CredentialAPI import CredentialAPI cred = CredentialAPI({'credential': 'Proxy'}) if cred.checkCredential(Time=60): if not 'X509_USER_PROXY' in os.environ: os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy() else: if config.get('check proxy', True): try: cred.ManualRenewCredential() except Exception as e: logging.critical("could not renew proxy") sys.exit(1) else: logging.critical("please renew your proxy") sys.exit(1) mode_label = 'merge_' if args.merge else '' print "Saving log to {0}".format(os.path.join(workdir, mode_label+'lobster.log')) if not args.foreground: ttyfile = open(os.path.join(workdir, mode_label+'lobster.err'), 'a') print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label+'lobster.err')) with daemon.DaemonContext( detach_process=not args.foreground, stdout=sys.stdout if args.foreground else ttyfile, stderr=sys.stderr if args.foreground else ttyfile, working_directory=workdir, pidfile=get_lock(workdir)): logging.basicConfig( datefmt="%Y-%m-%d %H:%M:%S", format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s", level=config.get('log level', 2) * 10, filename=os.path.join(workdir, mode_label+'lobster.log')) if args.foreground: console = logging.StreamHandler() console.setLevel(config.get('log level', 2) * 10) console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s")) logging.getLogger('').addHandler(console) config['configdir'] = args.configdir config['filepath'] = args.configfile config['startdir'] = args.startdir if args.merge: if args.server: config['stageout server'] = args.server config['max megabytes'] = args.max_megabytes job_src = cmssw.MergeProvider(config) elif cmsjob: job_src = 
cmssw.JobProvider(config) else: job_src = job.SimpleJobProvider(config) wq.cctools_debug_flags_set("all") wq.cctools_debug_config_file(os.path.join(workdir, mode_label+"work_queue_debug.log")) wq.cctools_debug_config_file_size(1 << 29) queue = wq.WorkQueue(-1) queue.specify_log(os.path.join(workdir, mode_label+"work_queue.log")) queue.specify_name("lobster_" + mode_label + config["id"]) queue.specify_keepalive_timeout(300) # queue.tune("short-timeout", 600) queue.tune("transfer-outlier-factor", 4) logging.info("starting queue as {0}".format(queue.name)) logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name)) payload = config.get('tune', {}).get('payload', 400) abort_active = False abort_threshold = config.get('tune', {}).get('abort threshold', 400) abort_multiplier = config.get('tune', {}).get('abort multiplier', 4) if util.checkpoint(workdir, 'KILLED') == 'PENDING': util.register_checkpoint(workdir, 'KILLED', 'RESTART') successful_jobs = 0 creation_time = 0 destruction_time = 0 with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile: statsfile.write( "#timestamp " + "total_workers_connected total_workers_joined total_workers_removed " + "workers_busy workers_idle " + "tasks_running " + "total_send_time total_receive_time " + "total_create_time total_return_time " + "idle_percentage " + "capacity " + "efficiency " + "jobits_left\n") while not job_src.done(): jobits_left = job_src.work_left() stats = queue.stats with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile: now = datetime.datetime.now() statsfile.write(" ".join(map(str, [ int(int(now.strftime('%s')) * 1e6 + now.microsecond), stats.total_workers_connected, stats.total_workers_joined, stats.total_workers_removed, stats.workers_busy, stats.workers_idle, stats.tasks_running, stats.total_send_time, stats.total_receive_time, creation_time, destruction_time, stats.idle_percentage, stats.capacity, stats.efficiency, 
jobits_left ] )) + "\n" ) if util.checkpoint(workdir, 'KILLED') == 'PENDING': util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow())) logging.info("terminating gracefully") break logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format( stats.workers_busy, stats.workers_busy + stats.workers_ready, jobits_left, stats.tasks_running, stats.tasks_waiting)) hunger = max(payload - stats.tasks_waiting, 0) t = time.time() while hunger > 0: jobs = job_src.obtain(50) if jobs == None or len(jobs) == 0: break hunger -= len(jobs) for id, cmd, inputs, outputs in jobs: task = wq.Task(cmd) task.specify_tag(id) task.specify_cores(1) # temporary work-around? # task.specify_memory(1000) # task.specify_disk(4000) for (local, remote) in inputs: if os.path.isfile(local): task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE) elif os.path.isdir(local): task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT, wq.WORK_QUEUE_CACHE, recursive=True) else: logging.critical("cannot send file to worker: {0}".format(local)) raise NotImplementedError for (local, remote) in outputs: task.specify_output_file(str(local), str(remote)) queue.submit(task) creation_time += int((time.time() - t) * 1e6) task = queue.wait(300) tasks = [] while task: if task.return_status == 0: successful_jobs += 1 tasks.append(task) if queue.stats.tasks_complete > 0: task = queue.wait(1) else: task = None if len(tasks) > 0: try: t = time.time() job_src.release(tasks) destruction_time += int((time.time() - t) * 1e6) except: tb = traceback.format_exc() logging.critical("cannot recover from the following exception:\n" + tb) for task in tasks: logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname)) raise if successful_jobs >= abort_threshold and not abort_active: logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier)) abort_active = True queue.activate_fast_abort(abort_multiplier) if 
jobits_left == 0: logging.info("no more work left to do")