def check_except(name):
    """Continuously probe the Redis service and track its up/down time.

    Runs forever. Every iteration sleeps half a second, opens a
    ``RedisClient`` for ``name``, pings it and bumps/reads back the
    ``testcounter`` key, asserting the stored value matches the local count.
    A ``redis.RedisError`` or ``OverlayNotAvailable`` marks the service as
    down for that iteration. Status transitions are printed and logged with
    the accumulated uptime, downtime and length of the run that just ended.

    Args:
        name: Name passed both to ``logging.getLogger`` and ``RedisClient``.
    """
    log = logging.getLogger(name)
    total_uptime = 0.
    total_downtime = 0.
    run_time = 0.
    counter = 0
    up = False
    uptxt = 'DOWN'
    log.info('START,%f,%f,%f', total_uptime, total_downtime, run_time)

    while True:
        ts = dt.now()
        time.sleep(.5)
        try:
            client = RedisClient(name)
            status = client.ping()
            if counter == 0:
                client.set('testcounter', 1)
            else:
                client.incr('testcounter')
            counter += 1
            testcounter = int(client.get('testcounter'))
            # Deliberate consistency check: an AssertionError here is not
            # caught below and stops the probe.
            assert (counter == testcounter)
        except redis.RedisError as e:
            # The class name lives on the type, not the instance;
            # ``e.__name__`` would raise AttributeError.
            print(' REDIS ERROR ===> ', type(e).__name__)
            status = False
        except OverlayNotAvailable:
            print(' OVERLAY not available')
            status = False

        # Time spent in this iteration (sleep + probe).
        delta = (dt.now() - ts).total_seconds()

        if status == up:
            run_time += delta
        else:
            # Status flipped: report the run that just ended, then reset it.
            print('STATUS Change from %s' % uptxt)
            log.info('%s,%f,%f,%f', uptxt, total_uptime, total_downtime, run_time)
            run_time = 0.

        if status:
            uptxt = 'UP'
            total_uptime += delta
        else:
            uptxt = 'DOWN'
            total_downtime += delta

        print('%s,%f' % (uptxt, run_time))
        up = status
def wait_catalog(self):
    """Block until the catalog service answers a ping.

    A client is (re)created whenever ``self.catalog`` is ``None``. When the
    overlay is reported as not available, a local catalog service is started
    and the connection is retried; on a Redis error the client is discarded
    and the connection is retried. If the total wait exceeds one second, the
    delay is logged.
    """
    began = dt.datetime.now()
    config = systemsettings()

    reachable = False
    while not reachable:
        try:
            if self.catalog is None:
                self.catalog = RedisClient(config.name)
            self.catalog.ping()
        except OverlayNotAvailable:
            # No remote service: bring one up locally and reconnect.
            self.start_local_catalog()
            self.catalog = None
        except redis.RedisError:
            # Drop the client so the next pass builds a fresh one.
            self.catalog = None
        else:
            reachable = True

    waited = (dt.datetime.now() - began).total_seconds()
    if waited > 1:
        logging.info('CLIENT_DELAY,%f', waited)
# return self.feal_list[trnum]

#############################

if __name__ == '__main__':
    # Script entry point: load the named configuration, connect to the
    # catalog and optionally run the centroid bootstrap.
    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional so that its default is
    # actually used; without it argparse requires the argument and the
    # default is never applied.
    parser.add_argument('name', nargs='?', default='default')
    parser.add_argument('--centroid', action='store_true')
    args = parser.parse_args()

    # The configuration file is named after the application.
    confile = args.name + '.json'

    settings = systemsettings()
    settings.applyConfig(confile)
    catalog = RedisClient(args.name)

    # TO Recalculate PCA Vectors from DEShaw (~30-40 mins at 10% of data)
    # calcDEShaw_PCA(catalog)
    # sys.exit(0)

    if args.centroid:
        centroid_bootstrap(catalog)

# #================
# hcf = {k: np.array([np.array(feal[i]) for i in v['elm']]) for k,v in hc5.items()}
# fmean = {k: np.mean(v, axis=0) for k,v in hcf.items()}
# def find_hc(hclist, index):
def run(self):
    """Parse arguments, connect to the catalog and run as worker or manager.

    Steps, in order:
      1. Parse command-line arguments and read job identifiers from the
         environment.
      2. Exit immediately (status 0) when ``--init`` was given.
      3. Try up to three times to connect to the catalog service, starting
         a local catalog after each failed attempt.
      4. If no connection could be made, re-submit this job through
         ``slurm.sbatch`` and return.
      5. Register this macrothread in the catalog if it is not there yet,
         load the schema, the registered macrothreads and the thread state.
      6. Dispatch to ``self.worker`` when work input was given, otherwise
         to ``self.manager``.
      7. If this thread hosts the local catalog server, wait for it to
         terminate.
    """
    args = self.parser.parse_args()

    settings = systemsettings()
    self.experiment_number = settings.EXPERIMENT_NUMBER
    logging.info("APPLICATION: %s", settings.APPL_LABEL)
    logging.info("WORKDIR: %s", settings.WORKDIR)

    # Read in Slurm params (TODO: Move to abstract slurm call)
    if self.job_id is None:
        self.job_id = os.getenv('JOB_NAME')
    self.slurm_id = os.getenv('SLURM_JOB_ID')

    logging.debug('EnVars')
    for var in ['SBATCH_JOBID', 'SBATCH_JOB_NAME', 'SLURM_JOB_ID',
                'SLURM_JOBID', 'SLURM_JOB_NAME']:
        logging.debug(' %s : %s', var, os.getenv(var))

    logging.info("JOB NAME : %s", str(self.job_id))
    logging.info("SLURM JOB: %s", str(self.slurm_id))

    if args.debug:
        logging.debug("DEBUGGING: %s", self.name)

    if args.single:
        logging.debug("Macrothread running in single exection Mode (only 1 manager will execute).")
        self.singleuse = True

    if args.init:
        sys.exit(0)

    # Both Worker & Manager need catalog to run; load it here and import schema
    connected = False
    for _attempt in range(3):
        logging.info('Trying to estabish connection to the Catalog Service')
        try:
            self.catalog = RedisClient(settings.name)
            if self.catalog.isconnected and self.catalog.ping():
                logging.info('Catalog service is connected')
                connected = True
                break
            logging.info("Catalog service is not running. Trying to start the service now")
            self.start_local_catalog()
        except (redis.RedisError, OverlayNotAvailable):
            # Discard the unusable client before starting a local service.
            self.catalog = None
            self.start_local_catalog()

    if not connected:
        # If the catalog is unavailable. Fail this thread and re-schedule it
        if args.workinput:
            # Three values need three placeholders; the original format
            # string lacked the one for the work input and raised TypeError.
            relaunch_cmd = "python3 %s -c %s -w %s" % (self.fname, self.config, args.workinput)
        else:
            self.slurmParams['cpus-per-task'] = 1
            relaunch_cmd = "python3 %s -c %s" % (self.fname, self.config)

        self.slurmParams['job-name'] = self.job_id
        slurm.sbatch(taskid=self.slurmParams['job-name'],
                     options=self.slurmParams,
                     modules=self.modules,
                     cmd=relaunch_cmd)
        # NOTE: This should be handled in an exception (need to figure out which one)
        # And then raise a custom OverlayConnectionError here
        return

    # LOAD Some self-bootstraping meta-data (if not alread loaded):
    mthread_key = 'macrothread:' + self.name
    if not self.catalog.exists(mthread_key):
        self.catalog.hmset(mthread_key, {'fname': self.fname})

    self.catalog.loadSchema()  # Should this be called from within the catalog module?

    # Load meta-data about registered mactrothreads
    self.data['macrothread'] = {}
    # NOTE(review): the pattern has no wildcard; confirm that
    # ``RedisClient.keys`` returns the 'macrothread:<name>' keys for it.
    for key in self.catalog.keys('macrothread'):
        mt_name = key.split(':')[1]
        # Index by the parsed name (the original used an undefined variable).
        self.data['macrothread'][mt_name] = self.catalog.hgetall(key)

    # Load current STATE from Catalog
    logging.info("Loading Thread State for from catalog:")

    # Load Standard set of simple params (init and simulation vals)
    # By default these are immutable. For any vals which may change or update
    # during execution, they should be explicitly set in the _mut or _append
    self.load(list(settings.state.keys()))
    self.load(list(settings.sim_params.keys()))

    # Load additional State values
    self.load(self._mut, self._immut, self._append)

    if args.workinput:
        logging.debug("Running worker.")
        self.worker(args.workinput)
    else:
        self.manager()

    if self.localcatalogserver:
        logging.debug("This thread is running the catalog. Waiting on local service to terminate...")
        self.localcatalogserver.join()
        self.localcatalogserver = None