def setUpClass(cls):
    """Load client/server configuration and secrets once for the test class.

    Caches the parsed configs plus the individual StartdLogging / metrics
    values as class attributes, and creates a scratch temp directory.
    """
    client_cfg = Config(CONFIGURATION)
    server_cfg = Config(SERVER_CONFIGURATION)
    secret_cfg = Config(SECRETS)

    cls.config = client_cfg
    cls.secrets = secret_cfg

    cls.glidein_site = client_cfg['Glidein']['site']

    startd_cfg = client_cfg['StartdLogging']
    startd_secrets = secret_cfg['StartdLogging']
    cls.minio_url = startd_cfg['url']
    cls.minio_bucket = startd_cfg['bucket']
    # NOTE(review): attribute name 'minio_acces_key' is misspelled in the
    # original; kept as-is because other tests may reference it.
    cls.minio_acces_key = startd_secrets['access_key']
    cls.minio_secret_key = startd_secrets['secret_key']
    cls.minio_secure = True

    cls.pyglidein_client_name = 'pyglidein-client'

    cls.metrics_graphite_server = server_cfg['metrics']['graphite_server']
    cls.metrics_namespace = server_cfg['metrics']['namespace']

    # scratch directory for test artifacts
    cls.tmpdir = tempfile.mkdtemp()
def main():
    """Entry point for the pyglidein client.

    Parses command-line options, loads the cluster config and secrets file,
    selects the scheduler backend, then repeatedly fetches queue state from
    the server and submits glideins up to the configured limits. Loops with
    a sleep when a positive 'delay' is configured; otherwise runs once.
    """
    parser = OptionParser()
    parser.add_option('--config', type='string', default='cluster.config',
                      help="config file for cluster")
    parser.add_option('--secrets', type='string', default='.pyglidein_secrets',
                      help="secrets file for cluster")
    parser.add_option('--uuid', type='string',
                      default=getpass.getuser() + '@' + socket.gethostname(),
                      help="Unique id for this client")
    (options, args) = parser.parse_args()

    config_dict = Config(options.config)
    config_glidein = config_dict['Glidein']
    config_cluster = config_dict['Cluster']
    if 'StartdLogging' in config_dict:
        config_startd_logging = config_dict['StartdLogging']
    else:
        config_startd_logging = {}

    if ('Mode' in config_dict and 'debug' in config_dict['Mode']
            and config_dict['Mode']['debug']):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s')
    else:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

    # Loading secrets. Fail if permissions wrong (group/other access).
    if os.path.isfile(options.secrets):
        if os.stat(options.secrets).st_mode & (stat.S_IXGRP | stat.S_IRWXO):
            logger.error('Set Permissions on {} to 600'.format(
                options.secrets))
            sys.exit(1)
        secrets_dict = Config(options.secrets)
        if 'StartdLogging' in secrets_dict:
            secrets_startd_logging = secrets_dict['StartdLogging']
        else:
            secrets_startd_logging = {}
    else:
        logger.error('Error Accessing Secrets File: {}. '.format(options.secrets)
                     + 'Did you set the --secrets flag?')
        sys.exit(1)

    # Importing the correct class to handle the submit
    sched_type = config_cluster["scheduler"].lower()
    if sched_type == "htcondor":
        scheduler = submit.SubmitCondor(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsCondor(config_dict, secrets_dict)
    elif sched_type == "pbs":
        scheduler = submit.SubmitPBS(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsPBS(config_dict, secrets_dict)
    elif sched_type == "slurm":
        scheduler = submit.SubmitSLURM(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsSlurm(config_dict, secrets_dict)
    elif sched_type == "uge":
        scheduler = submit.SubmitUGE(config_dict, secrets_dict)
        # UGE reuses the PBS metrics implementation
        metrics = client_metrics.ClientMetricsPBS(config_dict, secrets_dict)
    elif sched_type == "lsf":
        scheduler = submit.SubmitLSF(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsLSF(config_dict, secrets_dict)
    else:
        raise Exception('scheduler not supported')

    # if "glidein_cmd" not in config_dict["Glidein"]:
    #     raise Exception('no glidein_cmd')

    # Failing if startd logging is enabled and python version < 2.7
    if ('send_startd_logs' in config_startd_logging
            and config_startd_logging['send_startd_logs'] is True
            and sys.version_info < (2, 7)):
        logger.error('Python version must be > 2.7 to enable startd logging.')
        sys.exit(1)

    # Checking on startd logging configuration if enabled
    if ('send_startd_logs' in config_startd_logging
            and config_startd_logging['send_startd_logs'] is True):
        for config_val in ['url', 'bucket']:
            if config_val not in config_startd_logging:
                logger.error(
                    'Missing %s configuration value in StartdLogging Section'
                    % config_val)
                sys.exit(1)
        for secret_val in ['access_key', 'secret_key']:
            if secret_val not in secrets_startd_logging:
                logger.error(
                    'Missing %s secret value in StartdLogging Section'
                    % secret_val)
                sys.exit(1)

    while True:
        if 'ssh_state' in config_glidein and config_glidein['ssh_state']:
            state = get_ssh_state()
        else:
            state = get_state(config_glidein['address'])

        # Bug fix: the PBS memory rescale used to run inside the per-partition
        # loop over the *same* state dicts, compounding the scale factor once
        # per partition. Scale exactly once per fetched state instead.
        # NOTE(review): 1024/1000 is presumably a MiB/MB unit conversion for
        # PBS — confirm against the PBS submit backend.
        if state and sched_type == "pbs":
            for s in state:
                s["memory"] = s["memory"] * 1024 / 1000

        if 'uuid' in config_glidein:
            options.uuid = config_glidein['uuid']
        info = {
            'uuid': options.uuid,
            'glideins_idle': dict(),
            'glideins_running': dict(),
            'glideins_launched': dict(),
        }
        metrics_bundle = client_metrics.ClientMetricsBundle(options.uuid)
        if state:
            for partition in config_dict['Cluster'].get(
                    'partitions', ['Cluster']):
                config_cluster = config_dict[partition]
                if "running_cmd" not in config_cluster:
                    raise Exception('Section [%s] has no running_cmd'
                                    % partition)
                idle = 0
                try:
                    info['glideins_running'][partition] = get_running(
                        config_cluster["running_cmd"])
                    metrics_bundle.update_metric(
                        'glideins_running', partition,
                        info['glideins_running'][partition])
                    if "idle_cmd" in config_cluster:
                        idle = get_running(config_cluster["idle_cmd"])
                    info['glideins_idle'][partition] = idle
                    metrics_bundle.update_metric(
                        'glideins_idle', partition,
                        info['glideins_idle'][partition])
                except Exception:
                    # Bug fix: logger.warn is deprecated; use warning
                    logger.warning('error getting running job count',
                                   exc_info=True)
                    continue
                info['glideins_launched'][partition] = 0
                # submit no more than: the per-submit limit, the headroom to
                # max_total_jobs, and the headroom to max_idle_jobs
                limit = min(
                    config_cluster["limit_per_submit"],
                    config_cluster["max_total_jobs"]
                    - info['glideins_running'][partition],
                    max(config_cluster.get("max_idle_jobs", 1000) - idle, 0))
                # Prioritize job submission. By default, prioritize
                # submission of gpu and high memory jobs.
                state = sort_states(state, config_cluster["prioritize_jobs"])
                for s in state:
                    if limit <= 0:
                        logger.info('reached limit')
                        break
                    # Skipping CPU jobs for gpu only clusters
                    if ('gpu_only' in config_cluster
                            and config_cluster['gpu_only']
                            and s["gpus"] == 0):
                        continue
                    # skipping GPU jobs for cpu only clusters
                    if ('cpu_only' in config_cluster
                            and config_cluster['cpu_only']
                            and s["gpus"] != 0):
                        continue
                    # skipping jobs over cluster resource limits
                    if config_cluster['whole_node']:
                        prefix = 'whole_node_%s'
                    else:
                        prefix = 'max_%s_per_job'
                    for resource in ('cpus', 'gpus', 'memory', 'disk'):
                        cfg_name = prefix % resource
                        if (cfg_name in config_cluster
                                and s[resource] > config_cluster[cfg_name]):
                            break
                        cfg_name = 'min_%s_per_job' % resource
                        if (cfg_name in config_cluster
                                and s[resource] < config_cluster[cfg_name]):
                            break
                    else:
                        # all resource checks passed -> submit this request
                        if "count" in s and s["count"] > limit:
                            s["count"] = limit
                        scheduler.submit(s, partition)
                        num = 1 if "count" not in s else s["count"]
                        limit -= num
                        info['glideins_launched'][partition] += num
                metrics_bundle.update_metric(
                    'glideins_launched', partition,
                    info['glideins_launched'][partition])
                logger.info('launched %d glideins on %s',
                            info['glideins_launched'][partition], partition)
        else:
            logger.info('no state, nothing to do')

        metrics_bundle.update_metrics(metrics.get_mma_idle_time())
        metrics.send(metrics_bundle)

        if 'delay' not in config_glidein or int(config_glidein['delay']) < 1:
            break
        # Bug fix: coerce to int for consistency with the guard above;
        # config values may be strings
        time.sleep(int(config_glidein['delay']))

    # Optional cleanup of glidein working dirs once the loop exits
    for partition in config_dict['Cluster'].get('partitions', ['Cluster']):
        config_cluster = config_dict[partition]
        if "cleanup" in config_cluster and config_cluster["cleanup"]:
            scheduler.cleanup(config_cluster["running_cmd"],
                              config_cluster["dir_cleanup"])
def main():
    """Start the pyglidein server in the foreground.

    Parses options, configures logging, optionally enables metrics, then
    schedules the first condor_q poll and runs the HTTP server.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u', '--user', type='string', default=None,
                      help='Only track a single user')
    parser.add_option('--constraint', type='string', default=None,
                      help='HTCondor constraint expression')
    parser.add_option('--delay', type='int', default=300,
                      help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--config', type='string',
                      default='pyglidein_server.config',
                      help="config file for cluster")
    options, args = parser.parse_args()

    config = Config(options.config)

    log_format = '%(asctime)s %(levelname)s %(name)s : %(message)s'
    log_level = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=log_level, format=log_format)

    if not (0 <= options.delay <= 1000):
        raise Exception('delay out of range')

    metrics_sender_client = None
    if config.get('metrics', {}).get('enable_metrics', False):
        metrics_sender_client = MetricsSenderClient(config['metrics'])

    cfg = {
        'options': options,
        'config': config,
        'condor_q': False,
        'state': [],
        'monitoring': {},
        'metrics_sender_client': metrics_sender_client,
    }

    # schedule the initial condor_q load shortly after startup
    IOLoop.instance().call_later(5, partial(condor_q_helper, cfg))

    # run the server
    srv = server(cfg)
    srv.start()
def main():
    """Start the pyglidein server, daemonized by default.

    With daemon mode (the default), the first positional argument selects the
    daemon action: start (default), stop, restart, or kill. Use -n to run in
    the foreground instead.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u', '--user', type='string', default=None,
                      help='Only track a single user')
    parser.add_option('--constraint', type='string', default=None,
                      help='HTCondor constraint expression')
    parser.add_option('--delay', type='int', default=300,
                      help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--config', type='string',
                      default='pyglidein_server.config',
                      help="config file for cluster")
    parser.add_option('-n', '--no-daemon', dest='daemon', default=True,
                      action='store_false', help='do not daemonize')
    parser.add_option('--logfile', default='log',
                      help='filename for logging (daemon mode)')
    options, args = parser.parse_args()

    config = Config(options.config)

    log_settings = {
        'format': '%(asctime)s %(levelname)s %(name)s : %(message)s',
        'level': logging.DEBUG if options.debug else logging.INFO,
    }
    if options.daemon:
        # daemon mode logs to a file instead of stderr
        log_settings['filename'] = options.logfile

    if not (0 <= options.delay <= 1000):
        raise Exception('delay out of range')

    metrics_sender_client = None
    if config.get('metrics', {}).get('enable_metrics', False):
        metrics_sender_client = MetricsSenderClient(config['metrics'])

    cfg = {
        'options': options,
        'config': config,
        'condor_q': False,
        'state': [],
        'monitoring': {},
        'metrics_sender_client': metrics_sender_client,
    }

    def starter():
        # configure logging inside the runner so the daemon child gets it
        logging.basicConfig(**log_settings)
        # schedule the initial condor_q load shortly after startup
        IOLoop.instance().call_later(5, partial(condor_q_helper, cfg))
        # run the server
        s = server(cfg)
        s.start()

    if options.daemon:
        from pyglidein.daemon import Daemon
        # NOTE(review): pidfile name 'authorlist' looks copied from another
        # project — confirm intended path
        pid = '/tmp/authorlist.pid'
        d = Daemon(pidfile=pid, chdir=os.getcwd(), runner=starter)
        action = args[0] if args else None
        if not action:
            action = 'start'
        handlers = {
            'start': d.start,
            'stop': d.stop,
            'restart': d.restart,
            'kill': d.kill,
        }
        handler = handlers.get(action)
        if handler is None:
            raise Exception('unknown action')
        handler()
    else:
        starter()
def main():
    """Start the pyglidein server, daemonized by default.

    In daemon mode (the default) the first positional argument is the daemon
    action — start (default), stop, restart, or kill; -n runs in the
    foreground instead.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u', '--user', type='string', default=None,
                      help='Only track a single user')
    parser.add_option('--constraint', type='string', default=None,
                      help='HTCondor constraint expression')
    parser.add_option('--delay', type='int', default=300,
                      help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--config', type='string',
                      default='pyglidein_server.config',
                      help="config file for cluster")
    parser.add_option('-n', '--no-daemon', dest='daemon', default=True,
                      action='store_false', help='do not daemonize')
    parser.add_option('--logfile', default='log',
                      help='filename for logging (daemon mode)')
    options, args = parser.parse_args()

    config = Config(options.config)

    logformat = '%(asctime)s %(levelname)s %(name)s : %(message)s'
    log_kwargs = {
        'format': logformat,
        'level': logging.DEBUG if options.debug else logging.INFO,
    }
    if options.daemon:
        # daemon mode logs to a file instead of stderr
        log_kwargs['filename'] = options.logfile

    if not (0 <= options.delay <= 1000):
        raise Exception('delay out of range')

    if config.get('metrics', {}).get('enable_metrics', False):
        sender = MetricsSenderClient(config['metrics'])
    else:
        sender = None

    cfg = {
        'options': options,
        'config': config,
        'condor_q': False,
        'state': [],
        'monitoring': {},
        'metrics_sender_client': sender,
    }

    def starter():
        # configure logging inside the runner so the daemon child gets it
        logging.basicConfig(**log_kwargs)
        # schedule the initial condor_q load shortly after startup
        IOLoop.current().call_later(5, partial(condor_q, cfg))
        # run the server
        srv = server(cfg)
        srv.start()

    if not options.daemon:
        starter()
        return

    from pyglidein.daemon import Daemon
    # NOTE(review): pidfile name 'authorlist' looks copied from another
    # project — confirm intended path
    pid = '/tmp/authorlist.pid'
    d = Daemon(pidfile=pid, chdir=os.getcwd(), runner=starter)
    action = args[0] if args else None
    if (not action) or action == 'start':
        d.start()
    elif action == 'stop':
        d.stop()
    elif action == 'restart':
        d.restart()
    elif action == 'kill':
        d.kill()
    else:
        raise Exception('unknown action')