Ejemplo n.º 1
0
    def __init__(self, serverName, serverPort, cfg_params, proxyPath=None):
        """
        Open the communication with an Analysis Server by passing the server URL and the port.

        serverName/serverPort: endpoint of the CRAB Analysis Server.
        cfg_params: parsed CRAB configuration parameters.
        proxyPath: optional path to an existing proxy (kept for interface
            compatibility; not used directly here).

        Raises CrabException if the credential subject cannot be obtained.
        """
        self.ServerTwiki = 'https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideCrabServerForUsers#Server_available_for_users'

        self.asSession = C_AS_Session(serverName, serverPort)
        self.cfg_params = cfg_params
        self.userSubj = ''
        self.serverName = serverName
        # CAF/LSF schedulers authenticate with a token rather than a Grid proxy.
        credentialType = 'Proxy'
        if common.scheduler.name().upper() in ['CAF', 'LSF']:
            credentialType = 'Token'
        CliServerParams(self)
        self.crab_task_name = common.work_space.topDir().split('/')[-2]  # nice task name "crab_0_..."

        configAPI = {'credential': credentialType,
                     'logger': common.logger()}

        CredAPI = CredentialAPI(configAPI)
        try:
            self.userSubj = CredAPI.getSubject()
        except Exception as err:
            # Full stack trace goes to the debug log; the user sees a concise error.
            common.logger.debug("Getting Credential Subject: " + str(traceback.format_exc()))
            raise CrabException("Error Getting Credential Subject")
Ejemplo n.º 2
0
    def __init__(self, serverName, serverPort, cfg_params, proxyPath=None):
        """
        Open the communication with an Analysis Server by passing the server URL and the port.

        serverName/serverPort: endpoint of the CRAB Analysis Server.
        cfg_params: parsed CRAB configuration parameters.
        proxyPath: optional path to an existing proxy (kept for interface
            compatibility; not used directly here).

        Raises CrabException if the credential subject cannot be obtained.
        """
        self.ServerTwiki = (
            "https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideCrabServerForUsers#Server_available_for_users"
        )

        self.asSession = C_AS_Session(serverName, serverPort)
        self.cfg_params = cfg_params
        self.userSubj = ""
        self.serverName = serverName
        # CAF/LSF schedulers authenticate with a token rather than a Grid proxy.
        credentialType = "Proxy"
        if common.scheduler.name().upper() in ["CAF", "LSF"]:
            credentialType = "Token"
        CliServerParams(self)
        self.crab_task_name = common.work_space.topDir().split("/")[-2]  # nice task name "crab_0_..."

        configAPI = {"credential": credentialType, "logger": common.logger()}

        CredAPI = CredentialAPI(configAPI)
        try:
            self.userSubj = CredAPI.getSubject()
        except Exception as err:
            # Full stack trace goes to the debug log; the user sees a concise error.
            common.logger.debug("Getting Credential Subject: " + str(traceback.format_exc()))
            raise CrabException("Error Getting Credential Subject")
Ejemplo n.º 3
0
    def checkProxy(self, minTime=10):
        """
        Check the Globus proxy validity.

        minTime: minimum residual lifetime required of the credential
            (unit is whatever CredentialAPI.checkCredential expects --
            presumably hours; confirm against ProdCommon).

        Triggers an interactive renewal when the credential is too short
        or lacks the configured group/role attribute.  Raises
        CrabException if the manual renewal fails.
        """
        # Already validated during this session: nothing to do.
        if self.proxyValid:
            return

        ### Just return if asked to do so
        if self.dontCheckProxy == 1:
            self.proxyValid = 1
            return
        CredAPI_config = {'credential': 'Proxy',
                          'myProxySvr': self.proxyServer,
                          'logger': common.logger()
                          }
        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        CredAPI = CredentialAPI(CredAPI_config)

        # Renew when either the lifetime or the VOMS attribute check fails.
        if not CredAPI.checkCredential(Time=int(minTime)) or \
           not CredAPI.checkAttribute(group=self.group, role=self.role):
            try:
                CredAPI.ManualRenewCredential(group=self.group, role=self.role)
            except Exception as ex:
                raise CrabException(str(ex))
Ejemplo n.º 4
0
    def checkProxy(self, minTime=10):
        """
        Check the Globus proxy validity.

        minTime: minimum residual lifetime required of the credential
            (unit defined by CredentialAPI.checkCredential).

        Triggers an interactive renewal when the credential is too short
        or lacks the configured group/role attribute.  Raises
        CrabException if the manual renewal fails.
        """
        # Already validated during this session: nothing to do.
        if self.proxyValid:
            return

        ### Just return if asked to do so
        if self.dontCheckProxy == 1:
            self.proxyValid = 1
            return
        CredAPI_config = {'credential': 'Proxy',
                          'myProxySvr': self.proxyServer,
                          'logger': common.logger()
                          }
        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        CredAPI = CredentialAPI(CredAPI_config)

        # Renew when either the lifetime or the VOMS attribute check fails.
        if not CredAPI.checkCredential(Time=int(minTime)) or \
           not CredAPI.checkAttribute(group=self.group, role=self.role):
            try:
                CredAPI.ManualRenewCredential(group=self.group, role=self.role)
            except Exception as ex:
                raise CrabException(str(ex))
Ejemplo n.º 5
0
class CredentialRenew(Actor):
    """
    Actor that renews the user's credential: a Grid proxy, or a token for
    CAF/LSF schedulers.
    """

    def __init__(self, cfg_params):
        self.cfg_params = cfg_params
        # CAF/LSF schedulers use token-based credentials instead of proxies.
        self.credentialType = 'Proxy'
        if common.scheduler.name().upper() in ['LSF', 'CAF']:
            self.credentialType = 'Token'

        # init client server params...
        CliServerParams(self)

    def run(self):
        """
        Resolve the MyProxy server endpoint and configure the Credential API.

        Raises CrabException when the Credential API cannot be configured.
        """
        common.logger.debug("CredentialRenew::run() called")

        # FIXME With MyProxy delegation this part is completely overlapped with the method manageDelegation
        # in SubmitServer. We should maintain just one version of the method in a common part

        try:
            myproxyserver = Downloader(
                "http://cmsdoc.cern.ch/cms/LCG/crab/config/").config(
                    "myproxy_server.conf")
            myproxyserver = myproxyserver.strip()
            # str.strip() never returns None, so the original 'is None' test
            # could not fire: check for an empty result instead.
            if not myproxyserver:
                raise CrabException("myproxy_server.conf retrieved but empty")
        except Exception as e:
            # Best-effort: fall back to the default endpoint on any failure.
            common.logger.info(
                "Problem setting myproxy server endpoint: using myproxy.cern.ch"
            )
            common.logger.debug(e)
            myproxyserver = 'myproxy.cern.ch'

        configAPI = {'credential': self.credentialType,
                     'myProxySvr': myproxyserver,
                     'serverDN': self.server_dn,
                     'shareDir': common.work_space.shareDir(),
                     'userName': getUserName(),
                     'serverName': self.server_name,
                     'logger': common.logger()
                     }
        try:
            CredAPI = CredentialAPI(configAPI)
        except Exception as err:
            common.logger.debug("Configuring Credential API: " +
                                str(traceback.format_exc()))
            raise CrabException(
                "ERROR: Unable to configure Credential Client API  %s\n" %
                str(err))
Ejemplo n.º 6
0
            # Best-effort fallback: use the default MyProxy endpoint when the
            # myproxy_server.conf download (above this excerpt) failed.
            common.logger.info("Problem setting myproxy server endpoint: using myproxy.cern.ch")
            common.logger.debug(e)
            myproxyserver = 'myproxy.cern.ch'

        # Configuration handed to the Credential API: credential flavour,
        # MyProxy endpoint, server DN/name, share dir, user name and proxy
        # path.  NOTE(review): this excerpt starts mid-method -- the
        # enclosing 'def' is not visible here.
        configAPI = {'credential' : self.credentialType, \
                     'myProxySvr' : myproxyserver,\
                     'serverDN'   : self.server_dn,\
                     'shareDir'   : common.work_space.shareDir() ,\
                     'userName'   : getUserName(),\
                     'serverName' : self.server_name, \
                     'proxyPath'  : self.proxy_path, \
                     'logger'     : common.logger() \
                     }

        try:
            CredAPI =  CredentialAPI( configAPI )
        except Exception, err :
            # Full traceback to the debug log; short message to the caller.
            common.logger.debug("Configuring Credential API: " +str(traceback.format_exc()))
            raise CrabException("ERROR: Unable to configure Credential Client API  %s\n"%str(err))


        if  self.credentialType == 'Proxy':
            # Proxy delegation through MyProxy, 4 days lifetime minimum
            if not CredAPI.checkMyProxy(Time=4, checkRetrieverRenewer=True) :
                common.logger.info("Please renew MyProxy delegated proxy:\n")
                try:
                    CredAPI.credObj.serverDN = self.server_dn
                    CredAPI.ManualRenewMyProxy()
                except Exception, ex:
                    # Log the full traceback, then surface the failure.
                    common.logger.debug("Delegating Credentials to MyProxy : " +str(traceback.format_exc()))
                    raise CrabException(str(ex))
Ejemplo n.º 7
0
            )
            # Best-effort fallback: default MyProxy endpoint on any failure
            # of the config download (above this excerpt).
            common.logger.debug(e)
            myproxyserver = 'myproxy.cern.ch'

        # Configuration handed to the Credential API.  NOTE(review): this
        # excerpt starts mid-statement and is truncated at the end -- the
        # enclosing 'def' is not visible here.
        configAPI = {'credential' : self.credentialType, \
                     'myProxySvr' : myproxyserver,\
                     'serverDN'   : self.server_dn,\
                     'shareDir'   : common.work_space.shareDir() ,\
                     'userName'   : getUserName(),\
                     'serverName' : self.server_name, \
                     'proxyPath'  : self.proxy_path, \
                     'logger'     : common.logger() \
                     }

        try:
            CredAPI = CredentialAPI(configAPI)
        except Exception, err:
            # Full traceback to the debug log; short message to the caller.
            common.logger.debug("Configuring Credential API: " +
                                str(traceback.format_exc()))
            raise CrabException(
                "ERROR: Unable to configure Credential Client API  %s\n" %
                str(err))

        if self.credentialType == 'Proxy':
            # Proxy delegation through MyProxy, 4 days lifetime minimum
            if not CredAPI.checkMyProxy(Time=4, checkRetrieverRenewer=True):
                common.logger.info("Please renew MyProxy delegated proxy:\n")
                try:
                    CredAPI.credObj.serverDN = self.server_dn
                    CredAPI.ManualRenewMyProxy()
                except Exception, ex:
Ejemplo n.º 8
0
    def pollProxies(self, credConfig):
        """
        __pollProxies__

        Loop over the registered credentials (proxies or tokens) and act
        on each one according to its remaining lifetime:

        * expired -> archive the associated tasks, destroy the credential,
          and (only once per credential) notify the admin to hand-clean;
        * short (<= self.minimumleft) -> attempt a renewal through MyProxy
          (proxies) or the token renewal service (tokens).

        credConfig: configuration dict for CredentialAPI; its 'credential'
        key selects 'Proxy' or 'Token' handling.
        """
        logging.info("Start proxy's polling....")

        expiredtask = []

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        CredAPI = CredentialAPI(credConfig)

        # Resolve the MyProxy endpoint from the environment at run time.
        CredAPI.credObj.myproxyServer = '$MYPROXY_SERVER'

        mySession = BossLiteAPI("MySQL", self.bossCfgDB)
        tlapi = TaskLifeAPI()

        if os.path.exists(self.proxiespath):

            ## get the list of credentials to check; tokens are additionally
            ## filtered on file existence
            if credConfig['credential'] == 'Token':
                proxieslist = []
                proxiesTemp = tlapi.getListTokens(mySession.bossLiteDB)
                for proxy in proxiesTemp:
                    if os.path.exists(proxy):
                        proxieslist.append(proxy)
            else:
                proxieslist = tlapi.getListProxies(mySession.bossLiteDB)

            for proxyfull in proxieslist:

                ## get the remaining proxy life time; on failure skip this
                ## credential and continue with the next one
                logging.info("Checking proxy [%s]" % str(proxyfull))
                timeleft = 0
                try:
                    timeleft = CredAPI.getTimeLeft(proxyfull)
                except Exception as exc:
                    logging.info("Problem checking proxy validity: %s" % str(exc))
                    import traceback
                    logging.info(str(traceback.format_exc()))
                    continue

                ## credential expired ##
                if timeleft <= 0:

                    logging.info("Credential expired [%s]: %s s" % (proxyfull, str(timeleft)))
                    tasksbymail = tlapi.getTaskList(proxyfull, mySession.bossLiteDB)
                    allTasks = []
                    for mail, tasks in tasksbymail.iteritems():
                        for task in tasks:
                            ## archive
                            tlapi.archiveBliteTask(mySession, task)
                            tlapi.archiveServerTask(task, mySession.bossLiteDB)
                            ## append for hand clean
                            allTasks.append(task)
                            expiredtask.append(task)

                    # Destruction is best-effort: log and carry on if it fails.
                    try:
                        logging.info("Destroying proxy %s" % proxyfull)
                        CredAPI.destroyCredential(proxyfull)
                    except Exception as ex:
                        logging.error("Problem '%s' destroying credential '%s'." % (str(ex), str(proxyfull)))

                    ## if not already asked notify the admin to hand-clean
                    if not self.cleanasked(proxyfull):
                        self.notifyToClean(allTasks)
                        self.askclean(proxyfull)

                ## short credential ##
                elif timeleft <= self.minimumleft:
                    logging.info("Credential still valid for: %s s" % str(timeleft))

                    ## proxy renewal through myproxy delegation ##
                    delegatedtimeleft = 0
                    if credConfig['credential'] == 'Proxy':
                        logging.info("Trying to renew proxy [%s]" % str(proxyfull))

                        if self.useGlExecDelegation == True:
                            # glExec renewal specific parts
                            # TODO
                            # Sanjay fix here
                            # change the proxy ownership so that CrabServer can renew it
                            pass

                        try:
                            CredAPI.renewalMyProxy(proxyfull)
                            delegatedtimeleft = CredAPI.getTimeLeft(proxyfull)
                            logging.info("Renewed credential still valid for: %s s" % str(delegatedtimeleft))
                        except Exception as exc:
                            logging.info("Problem renewing proxy : %s" % str(exc))
                            import traceback
                            logging.info(str(traceback.format_exc()))
                            delegatedtimeleft = 0

                        if self.useGlExecDelegation == True:
                            # glExec renewal specific parts
                            # TODO
                            # Sanjay fix here
                            # set again the proxy ownership for glExec
                            pass

                    if credConfig['credential'] == 'Token':
                        logging.info("Trying to renew Token [%s]" % str(proxyfull))

                        # NOTE: the original mixed a tab into the indentation
                        # here (a TabError under Python 3); normalized to spaces.
                        try:
                            CredAPI.renewalMyToken(proxyfull)
                            delegatedtimeleft = CredAPI.getTimeLeft(proxyfull)
                            logging.info("Renewed credential still valid for: %s s" % str(delegatedtimeleft))
                        except Exception as exc:
                            logging.info("Problem renewing Token : %s" % str(exc))
                            import traceback
                            logging.info(str(traceback.format_exc()))
                            delegatedtimeleft = 0
Ejemplo n.º 9
0
def run(args):
    dash_checker = cmssw.dash.JobStateChecker(300)
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)
        util.register_checkpoint(workdir, "version", get_distribution('Lobster').version)
    else:
        util.verify(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if not 'X509_USER_PROXY' in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('advanced', {}).get('renew proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    print("could not renew proxy")
                    sys.exit(1)
            else:
                print("please renew your proxy")
                sys.exit(1)

    print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err'))

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGTERM] = lambda num, frame: kill(args)

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir),
            signal_map=signals):

        fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'), maxBytes=500e6, backupCount=10)
        fileh.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
        fileh.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        logger.addHandler(fileh)
        logger.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
            console.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
            logger.addHandler(console)

        config['configdir'] = args.configdir
        config['filename'] = args.configfile
        config['startdir'] = args.startdir
        if cmsjob:
            job_src = cmssw.JobProvider(config)
            actions = cmssw.Actions(config)
        else:
            job_src = job.SimpleJobProvider(config)
            actions = None

        logger.info("using wq from {0}".format(wq.__file__))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, "work_queue.log"))
        queue.specify_name("lobster_" + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)
        queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)

        logger.info("starting queue as {0}".format(queue.name))
        logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('advanced', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('advanced', {}).get('abort threshold', 400)
        abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        jobits_left = 0
        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "total_memory " +
                    "total_cores " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        stats.total_memory,
                        stats.total_cores,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                # just in case, check for any remaining not done task that
                # hasn't been reported as aborted
                for task_id in queue._task_table.keys():
                    status = cmssw.dash.status_map[queue.task_state(task_id)]
                    if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED):
                        job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED)

                logger.info("terminating gracefully")
                break

            logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs == None or len(jobs) == 0:
                    break

                hunger -= len(jobs)
                cores = config.get('cores per job', 1)
                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(cores)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote, cache) in inputs:
                        if os.path.isfile(local):
                            cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                            task.specify_input_file(str(local), str(remote), cache_opt)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logger.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            # update dashboard status for all not done tasks
            # report Done status only once when releasing the task
            # WAITING_RETRIEVAL is not a valid status in dashboard
            # so, skipping it for now
            monitor = job_src._JobProvider__dash
            queue = queue
            exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL)
            try:
                dash_checker.update_dashboard_states(monitor, queue, exclude_states)
            except Exception as e:
                logger.warning("Could not update job states to dashboard")

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logger.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            # recurring actions are triggered here
            if actions:
                actions.take()
        if jobits_left == 0:
            logger.info("no more work left to do")
Ejemplo n.º 10
0
def run(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if not 'X509_USER_PROXY' in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('check proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    logging.critical("could not renew proxy")
                    sys.exit(1)
            else:
                logging.critical("please renew your proxy")
                sys.exit(1)

    mode_label = 'merge_' if args.merge else ''
    print "Saving log to {0}".format(os.path.join(workdir, mode_label+'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, mode_label+'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label+'lobster.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=get_lock(workdir)):
        logging.basicConfig(
                datefmt="%Y-%m-%d %H:%M:%S",
                format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
                level=config.get('log level', 2) * 10,
                filename=os.path.join(workdir, mode_label+'lobster.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        config['configdir'] = args.configdir
        config['filepath'] = args.configfile
        config['startdir'] = args.startdir
        if args.merge:
            if args.server:
                config['stageout server'] = args.server
            config['max megabytes'] = args.max_megabytes
            job_src = cmssw.MergeProvider(config)
        elif cmsjob:
            job_src = cmssw.JobProvider(config)
        else:
            job_src = job.SimpleJobProvider(config)

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, mode_label+"work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, mode_label+"work_queue.log"))
        queue.specify_name("lobster_" + mode_label + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)

        logging.info("starting queue as {0}".format(queue.name))
        logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('tune', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('tune', {}).get('abort threshold', 400)
        abort_multiplier = config.get('tune', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                logging.info("terminating gracefully")
                break

            logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs == None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(1)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote) in inputs:
                        if os.path.isfile(local):
                            task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logging.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logging.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)
        if jobits_left == 0:
            logging.info("no more work left to do")