def call_next(self, delay=.75):
    """Reschedules this service as a new Slurm job (no-op if not running as a
    Slurm job). The next job is scheduled to begin after a fraction of the
    remaining time-to-live has elapsed; `delay` is a float between 0 and 1.
    """
    if self.slurm_id is None:
        logging.warning("Skipping self-invoked Call Next (not a Slurm Job)")
    else:
        # Compute the start delay as a fraction of the time remaining until TTL
        ts = int(dt.now().timestamp())
        total_time = int(self.ttl) - ts
        next_start_delay = round(delay * total_time)
        logging.debug('TIMES')
        logging.debug('TTL %f', self.ttl)
        logging.debug('TS  %d', ts)
        logging.info('Redis Service will schedule next job to begin in %d seconds',
                     next_start_delay)

        # for k, v in os.environ.items():
        #   print(k, ': ', v)

        # Build sbatch options for the hand-over job from the current job's
        # allocation, excluding the nodes currently in use
        params = {}
        params['time'] = self.jobinfo['TimeLimit']
        params['exclude'] = self.jobinfo['NodeList']
        params['nodes'] = os.getenv('SLURM_JOB_NUM_NODES')
        params['cpus-per-task'] = os.getenv('SLURM_CPUS_PER_TASK')
        params['partition'] = os.getenv('SLURM_JOB_PARTITION')
        params['job-name'] = os.getenv('SLURM_JOB_NAME')
        params['workdir'] = os.getcwd()
        params['begin'] = 'now+%d' % next_start_delay
        params['output'] = '/home-1/[email protected]/ddc/osvc-redis-%%j.out'

        logging.debug('CALL NEXT for next Redis Server Handover: %s', str(params))
        slurm.sbatch(taskid=params['job-name'],
                     options=params,
                     modules=set(['redis']),
                     cmd="src/overlay.py --name=%s redis start" % self._name_app)
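# For orientation, the submission assembled in call_next() corresponds roughly to
# the sbatch invocation sketched below. This is a sketch only: it assumes the
# local slurm.sbatch wrapper forwards each options key as the matching long-form
# sbatch flag (the actual translation lives in the slurm module), and the
# bracketed values are placeholders. %j is sbatch's job-id placeholder in the
# --output pattern:
#
#   sbatch --time=<TimeLimit> --exclude=<current NodeList> \
#          --nodes=$SLURM_JOB_NUM_NODES --cpus-per-task=$SLURM_CPUS_PER_TASK \
#          --partition=$SLURM_JOB_PARTITION --job-name=$SLURM_JOB_NAME \
#          --workdir=$PWD --begin=now+<next_start_delay> \
#          --output=/home-1/[email protected]/ddc/osvc-redis-%j.out \
#          <script that runs: src/overlay.py --name=<app name> redis start>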
def run(self):
    args = self.parser.parse_args()
    settings = systemsettings()
    self.experiment_number = settings.EXPERIMENT_NUMBER

    logging.info("APPLICATION: %s", settings.APPL_LABEL)
    logging.info("WORKDIR:     %s", settings.WORKDIR)

    # Read in Slurm params  (TODO: Move to abstract slurm call)
    if self.job_id is None:
        self.job_id = os.getenv('JOB_NAME')
    self.slurm_id = os.getenv('SLURM_JOB_ID')

    logging.debug('EnVars')
    for i in ['SBATCH_JOBID', 'SBATCH_JOB_NAME', 'SLURM_JOB_ID',
              'SLURM_JOBID', 'SLURM_JOB_NAME']:
        logging.debug('  %s : %s', i, os.getenv(i))
    logging.info("JOB NAME :  %s", str(self.job_id))
    logging.info("SLURM JOB:  %s", str(self.slurm_id))

    if args.debug:
        logging.debug("DEBUGGING: %s", self.name)

    if args.single:
        logging.debug("Macrothread running in single-execution mode (only 1 manager will execute).")
        self.singleuse = True

    if args.init:
        sys.exit(0)

    # Both worker & manager need the catalog to run; load it here and import the schema
    retry = 3
    connected = False
    while retry > 0:
        retry -= 1
        logging.info('Trying to establish connection to the Catalog Service')
        try:
            self.catalog = RedisClient(settings.name)
            if self.catalog.isconnected and self.catalog.ping():
                logging.info('Catalog service is connected')
                connected = True
                break
            logging.info("Catalog service is not running. Trying to start the service now")
            self.start_local_catalog()
        except (redis.RedisError, OverlayNotAvailable):
            self.catalog = None
            self.start_local_catalog()

    if not connected:
        # The catalog is unavailable: fail this thread and re-schedule it
        if args.workinput:
            relaunch_cmd = "python3 %s -c %s -w %s" % (self.fname, self.config, args.workinput)
        else:
            self.slurmParams['cpus-per-task'] = 1
            relaunch_cmd = "python3 %s -c %s" % (self.fname, self.config)

        self.slurmParams['job-name'] = self.job_id
        slurm.sbatch(taskid=self.slurmParams['job-name'],
                     options=self.slurmParams,
                     modules=self.modules,
                     cmd=relaunch_cmd)
        # NOTE: This should be handled in an exception (need to figure out which one)
        #  and then raise a custom OverlayConnectionError here
        return

    # Load some self-bootstrapping meta-data (if not already loaded):
    mthread_key = 'macrothread:' + self.name
    if not self.catalog.exists(mthread_key):
        self.catalog.hmset(mthread_key, {'fname': self.fname})
    self.catalog.loadSchema()   # Should this be called from within the catalog module?

    # Load meta-data about registered macrothreads
    # (catalog keys are of the form 'macrothread:<thread_name>')
    self.data['macrothread'] = {}
    for key in self.catalog.keys('macrothread'):
        mt_name = key.split(':')[1]
        self.data['macrothread'][mt_name] = self.catalog.hgetall(key)

    # Load current STATE from the catalog
    logging.info("Loading Thread State from the catalog:")

    # Load the standard set of simple params (init and simulation vals).
    # By default these are immutable; any vals which may change or update
    # during execution should be explicitly set in _mut or _append.
    self.load(list(settings.state.keys()))
    self.load(list(settings.sim_params.keys()))

    # Load additional state values
    self.load(self._mut, self._immut, self._append)

    if args.workinput:
        logging.debug("Running worker.")
        self.worker(args.workinput)
    else:
        self.manager()

    if self.localcatalogserver:
        logging.debug("This thread is running the catalog. Waiting on local service to terminate...")
        self.localcatalogserver.join()
        self.localcatalogserver = None
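# Illustrative command lines handled (and re-launched) by run() above. The file
# and config names are placeholders, and the flag spellings are inferred from the
# relaunch commands; the authoritative argparse setup lives where self.parser is
# configured:
#
#   manager invocation:  python3 <macrothread_source>.py -c <config>
#   worker invocation:   python3 <macrothread_source>.py -c <config> -w <work_item>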
def manager(self, fork=False):
    logging.debug("\n==========================\n  MANAGER:  %s", self.name)

    # Check global termination:
    term_flag = self.data['terminate']
    if term_flag and term_flag.lower() in ['halt', 'stop', 'now']:
        logging.info('RECEIVED TERMINATION FLAG. Shutting down')
        sys.exit(0)

    # Load data from this thread's state and the upstream thread
    if self.upstream:
        logging.debug("Loading upstream data: %s", self.upstream)
        self.load(self.upstream)

    # Check for termination
    if self.term():
        logging.info('TERMINATION condition for ' + self.name)
        return 0

    # Set elasticity policy
    self.configElasPolicy()

    # Note: The manager can become a service daemon. Thus, we allow the manager
    #  to run along with the monitor process and assume the manager overhead is
    #  small enough not to interfere. Eventually, this will be threaded
    #  differently by preventing the local service (within this object's context)
    #  from running while the manager performs its split() function and worker
    #  dispatching. The worker (below) starts a local service for reading, reads
    #  in the state, stops it, performs its work, and then starts it for writing
    #  and remains alive to monitor...... Hence, we'll eventually change this
    #  next line to False or some other state value, or we'll just let this
    #  manager become the monitor and provide the service, which means it will
    #  need to immediately re-schedule itself.
    # self.catalogPersistanceState = True
    # if self.localcatalogserver and not self.catalogPersistanceState:
    #   self.catalog.stop()
    #   self.localcatalogserver = None

    # TODO: Determine if the manager should load the entire input data set, make
    #  this abstract, or push it into the UDF portion.
    # The deferred return from split() can be either a list of items to push back
    #  or a "split" value to perform an in-line data trim on the key-store DB
    #  (optimization)
    immed, defer = self.split()

    # The manager oversees ID assignment.
    idlabel = 'id_%s' % self.name
    self.catalog.incr(idlabel)
    nextid = self.catalog.get(idlabel)

    # First ID check
    nextid = 0 if nextid is None else int(nextid)
    myid = self.fromMID()
    if myid is None:
        myid = int(nextid - 1)

    # No jobs to run.... delay and then rerun later
    if len(immed) == 0:
        delay = int(self.delay)
        logging.debug("MANAGER %s: No available input data. Delaying %d seconds and rescheduling....",
                      self.name, delay)
        self.slurmParams['begin'] = 'now+%d' % delay

    # Dispatch workers
    else:
        workernum = 1
        delay = 180 + self.delay

        # Set baseline slurm params and modules (to allow for dynamic dispatching)
        baseline_param = copy.deepcopy(self.slurmParams)
        baseline_mods = copy.deepcopy(self.modules)

        for i in immed:
            logging.debug("%s: scheduling worker, input=%s", self.name, i)
            self.preparejob(i)
            self.slurmParams['job-name'] = self.toWID(myid, workernum)
            slurm.sbatch(taskid=self.slurmParams['job-name'],
                         options=self.slurmParams,
                         modules=self.modules,
                         cmd="python3 %s -c %s -w %s" % (self.fname, self.config, str(i)))
            workernum += 1
            # Reset params and mods for the next worker
            self.slurmParams = copy.deepcopy(baseline_param)
            self.modules = copy.deepcopy(baseline_mods)

    # Single-use exit:
    if self.singleuse:
        logging.debug("SINGLE USE INVOKED. No more managers will run.")
        return 0

    # Elasticity policy to control manager rescheduling
    self.slurmParams['begin'] = 'now+%d' % delay
    self.slurmParams['job-name'] = self.toMID(nextid)
    self.slurmParams['cpus-per-task'] = 1
    slurm.sbatch(taskid=self.slurmParams['job-name'],
                 options=self.slurmParams,
                 modules=self.modules,
                 cmd="python3 %s -c %s" % (self.fname, self.config))

    # TODO: Alternate manager rescheduling: trigger-based, using
    #  after:job_id[:jobid...] with #SBATCH --dependency=<dependency_list>
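    # A minimal sketch of that trigger-based alternative, assuming (not verified
    # against the wrapper) that slurm.sbatch forwards a 'dependency' option as
    # sbatch's --dependency flag, and that the worker job ids dispatched above
    # were captured into a hypothetical worker_job_ids list:
    #
    #   self.slurmParams['dependency'] = 'afterany:' + ':'.join(worker_job_ids)
    #   del self.slurmParams['begin']   # gate the next manager on the workers, not a timer
    #   slurm.sbatch(taskid=self.toMID(nextid),
    #                options=self.slurmParams,
    #                modules=self.modules,
    #                cmd="python3 %s -c %s" % (self.fname, self.config))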
    # Consume upstream input data
    logging.debug('Consuming Upstream Data....')
    if isinstance(defer, list):
        logging.debug('Deferring a list and removing %d items tasked to run immediately', len(immed))
        self.catalog.removeItems(self.upstream, immed)
    elif defer is not None:
        logging.debug('Slicing %d items', defer)
        self.catalog.slice(self.upstream, defer)

    # Other internal thread state is saved back to the catalog
    self.save(self._mut)

    logging.debug("==========================")
    return 0
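# Note on the two upstream-consumption modes at the end of manager() above:
#   - split() returned a list in `defer`: only the `immed` items just dispatched
#     are removed from the upstream queue, and the deferred items stay in place.
#   - split() returned a non-list value in `defer`: the upstream data is trimmed
#     in-line via catalog.slice(), avoiding per-item removal.
# The exact slice semantics are defined by the catalog (RedisClient) module.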