def main():
    """Fetch glidein state from the server and forward it to an ssh host.

    Parses ``--address``, ``--debug`` and ``--ssh-host`` from the command
    line, fetches state via ``get_state`` and pushes it with ``ssh_write``.
    """
    parser = OptionParser()
    parser.add_option('--address', type='string',
                      default='http://bosco.icecube.wisc.edu:9070',
                      help='Address to connect to (default: http://bosco.icecube.wisc.edu:9070)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--ssh-host', dest='ssh_host', type='string', default='',
                      help='ssh host')
    (options, args) = parser.parse_args()

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    state = get_state(options.address)
    if state:
        try:
            ssh_write(options.ssh_host, state)
        except Exception:
            # Best-effort: log and carry on.
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            logger.warning('error', exc_info=True)
        else:
            logger.info('done')
def main():
    """Forward glidein state over ssh (duplicate variant of the sender main).

    Reads ``--address``, ``--debug`` and ``--ssh-host`` options, pulls state
    with ``get_state`` and writes it to the remote host via ``ssh_write``.
    """
    parser = OptionParser()
    parser.add_option('--address', type='string',
                      default='http://bosco.icecube.wisc.edu:9070',
                      help='Address to connect to (default: http://bosco.icecube.wisc.edu:9070)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--ssh-host', dest='ssh_host', type='string', default='',
                      help='ssh host')
    (options, args) = parser.parse_args()

    # Debug flag selects the root logging level.
    logging.basicConfig(level=logging.DEBUG if options.debug else logging.INFO)

    state = get_state(options.address)
    if state:
        try:
            ssh_write(options.ssh_host, state)
        except Exception:
            # Fix: use Logger.warning; .warn() is a deprecated alias.
            logger.warning('error', exc_info=True)
        else:
            logger.info('done')
def main():
    """Run the glidein client: poll server state, submit glideins, report.

    Loads the cluster config, selects the scheduler-specific submit class,
    then loops: fetch state, submit up to the configured limits, send
    monitoring info, and sleep for the configured delay (or exit if none).
    """
    parser = OptionParser()
    parser.add_option('--config', type='string', default='cluster.config',
                      help="config file for cluster")
    parser.add_option('--uuid', type='string',
                      default=getpass.getuser() + '@' + socket.gethostname(),
                      help="Unique id for this client")
    (options, args) = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.optionxform = str  # preserve option-name case
    config.read(options.config)
    config_dict = config_options_dict(config)
    config_glidein = config_dict['Glidein']
    config_cluster = config_dict['Cluster']

    # Importing the correct class to handle the submit
    sched_type = config_cluster["scheduler"].lower()
    if sched_type == "htcondor":
        scheduler = submit.SubmitCondor(config_dict)
    elif sched_type == "pbs":
        scheduler = submit.SubmitPBS(config_dict)
    elif sched_type == "slurm":
        scheduler = submit.SubmitSLURM(config_dict)
    elif sched_type == "uge":
        scheduler = submit.SubmitUGE(config_dict)
    elif sched_type == "lsf":
        scheduler = submit.SubmitLSF(config_dict)
    else:
        raise Exception('scheduler not supported')

    if "running_cmd" not in config_dict["Cluster"]:
        raise Exception('no running_cmd')

    if ('Mode' in config_dict and 'debug' in config_dict['Mode']
            and config_dict['Mode']['debug']):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    while True:
        if 'ssh_state' in config_glidein and config_glidein['ssh_state']:
            state = get_ssh_state()
        else:
            state = get_state(config_glidein['address'])
        info = {
            'uuid': options.uuid,
            'glideins_running': 0,
            'glideins_launched': 0,
        }
        if state:
            idle = 0
            try:
                info['glideins_running'] = get_running(
                    config_cluster["running_cmd"])
                if "idle_cmd" in config_cluster:
                    idle = get_running(config_cluster["idle_cmd"])
            except Exception:
                # Fix: Logger.warn is a deprecated alias of Logger.warning.
                logger.warning('error getting running job count',
                               exc_info=True)
                # NOTE(review): this continue skips the delay sleep below, so
                # a persistent failure here busy-loops — confirm intended.
                continue
            limit = min(
                config_cluster["limit_per_submit"],
                config_cluster["max_total_jobs"] - info['glideins_running'],
                max(config_cluster.get("max_idle_jobs", 1000) - idle, 0))
            # Prioritize job submission. By default, prioritize submission of
            # gpu and high memory jobs.
            if "prioritize_jobs" in config_cluster:
                state = sort_states(state, config_cluster["prioritize_jobs"])
            else:
                state = sort_states(state, ["gpus", "memory"])
            for s in state:
                if sched_type == "pbs":
                    # PBS expects memory in a different unit (MiB -> MB-ish).
                    s["memory"] = s["memory"] * 1024 / 1000
                if limit <= 0:
                    logger.info('reached limit')
                    break
                # Skipping CPU jobs for gpu only clusters
                if ('gpu_only' in config_cluster
                        and config_cluster['gpu_only'] and s["gpus"] == 0):
                    continue
                # skipping GPU jobs for cpu only clusters
                if ('cpu_only' in config_cluster
                        and config_cluster['cpu_only'] and s["gpus"] != 0):
                    continue
                # skipping jobs over cluster resource limits
                skip = False
                for resource in ('cpus', 'gpus', 'memory', 'disk'):
                    cfg_name = 'max_%s_per_job' % (resource)
                    if (cfg_name in config_cluster
                            and s[resource] > config_cluster[cfg_name]):
                        skip = True
                        break
                if skip:
                    continue
                if "count" in s and s["count"] > limit:
                    s["count"] = limit
                scheduler.submit(s)
                num = 1 if "count" not in s else s["count"]
                limit -= num
                info['glideins_launched'] += num
            logger.info('launched %d glideins', info['glideins_launched'])
        else:
            logger.info('no state, nothing to do')

        # send monitoring info to server
        monitoring(config_glidein['address'], info)

        if 'delay' not in config_glidein or int(config_glidein['delay']) < 1:
            break
        # Fix: delay may come from the config as a string (the guard above
        # converts it with int()); time.sleep requires a number.
        time.sleep(int(config_glidein['delay']))

    if "cleanup" in config_cluster and config_cluster["cleanup"]:
        scheduler.cleanup(config_cluster["running_cmd"],
                          config_cluster["dir_cleanup"])
def main():
    """Glidein client loop (duplicate variant).

    Loads cluster configuration, builds the scheduler-appropriate submit
    object, then repeatedly fetches glidein state, submits jobs within the
    configured limits, reports monitoring data, and sleeps for the
    configured delay (exiting when no positive delay is set).
    """
    parser = OptionParser()
    parser.add_option('--config', type='string', default='cluster.config',
                      help="config file for cluster")
    parser.add_option('--uuid', type='string',
                      default=getpass.getuser() + '@' + socket.gethostname(),
                      help="Unique id for this client")
    (options, args) = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.optionxform = str  # keep option names case-sensitive
    config.read(options.config)
    config_dict = config_options_dict(config)
    config_glidein = config_dict['Glidein']
    config_cluster = config_dict['Cluster']

    # Map the configured scheduler name to its submit implementation.
    sched_type = config_cluster["scheduler"].lower()
    if sched_type == "htcondor":
        scheduler = submit.SubmitCondor(config_dict)
    elif sched_type == "pbs":
        scheduler = submit.SubmitPBS(config_dict)
    elif sched_type == "slurm":
        scheduler = submit.SubmitSLURM(config_dict)
    elif sched_type == "uge":
        scheduler = submit.SubmitUGE(config_dict)
    elif sched_type == "lsf":
        scheduler = submit.SubmitLSF(config_dict)
    else:
        raise Exception('scheduler not supported')

    if "running_cmd" not in config_dict["Cluster"]:
        raise Exception('no running_cmd')

    if ('Mode' in config_dict and 'debug' in config_dict['Mode']
            and config_dict['Mode']['debug']):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    while True:
        # State comes either over ssh or from the HTTP server.
        if 'ssh_state' in config_glidein and config_glidein['ssh_state']:
            state = get_ssh_state()
        else:
            state = get_state(config_glidein['address'])
        info = {
            'uuid': options.uuid,
            'glideins_running': 0,
            'glideins_launched': 0,
        }
        if state:
            idle = 0
            try:
                info['glideins_running'] = get_running(
                    config_cluster["running_cmd"])
                if "idle_cmd" in config_cluster:
                    idle = get_running(config_cluster["idle_cmd"])
            except Exception:
                # Fix: .warn() is a deprecated alias; use .warning().
                logger.warning('error getting running job count',
                               exc_info=True)
                # NOTE(review): continue bypasses the sleep below — a
                # persistent failure busy-loops; confirm intended.
                continue
            limit = min(
                config_cluster["limit_per_submit"],
                config_cluster["max_total_jobs"] - info['glideins_running'],
                max(config_cluster.get("max_idle_jobs", 1000) - idle, 0))
            # Prioritize job submission; default is gpu then high-memory.
            if "prioritize_jobs" in config_cluster:
                state = sort_states(state, config_cluster["prioritize_jobs"])
            else:
                state = sort_states(state, ["gpus", "memory"])
            for s in state:
                if sched_type == "pbs":
                    # Unit adjustment for PBS memory requests.
                    s["memory"] = s["memory"] * 1024 / 1000
                if limit <= 0:
                    logger.info('reached limit')
                    break
                # Skip CPU jobs on gpu-only clusters
                if ('gpu_only' in config_cluster
                        and config_cluster['gpu_only'] and s["gpus"] == 0):
                    continue
                # Skip GPU jobs on cpu-only clusters
                if ('cpu_only' in config_cluster
                        and config_cluster['cpu_only'] and s["gpus"] != 0):
                    continue
                # Skip jobs exceeding per-job cluster resource limits
                skip = False
                for resource in ('cpus', 'gpus', 'memory', 'disk'):
                    cfg_name = 'max_%s_per_job' % (resource)
                    if (cfg_name in config_cluster
                            and s[resource] > config_cluster[cfg_name]):
                        skip = True
                        break
                if skip:
                    continue
                if "count" in s and s["count"] > limit:
                    s["count"] = limit
                scheduler.submit(s)
                num = 1 if "count" not in s else s["count"]
                limit -= num
                info['glideins_launched'] += num
            logger.info('launched %d glideins', info['glideins_launched'])
        else:
            logger.info('no state, nothing to do')

        # send monitoring info to server
        monitoring(config_glidein['address'], info)

        if 'delay' not in config_glidein or int(config_glidein['delay']) < 1:
            break
        # Fix: the guard above int()-converts delay, implying it may be a
        # string; time.sleep requires a numeric argument.
        time.sleep(int(config_glidein['delay']))

    if "cleanup" in config_cluster and config_cluster["cleanup"]:
        scheduler.cleanup(config_cluster["running_cmd"],
                          config_cluster["dir_cleanup"])