Example 1
import logging

from optparse import OptionParser

# Module-level logger (assumed; the snippet uses `logger` without defining it)
logger = logging.getLogger(__name__)


def main():
    # Command-line options for the client
    parser = OptionParser()
    parser.add_option(
        '--address',
        type='string',
        default='http://bosco.icecube.wisc.edu:9070',
        help='Address to connect to (default: http://bosco.icecube.wisc.edu:9070)')
    parser.add_option('--debug',
                      action='store_true',
                      default=False,
                      help='Enable debug logging')
    parser.add_option('--ssh-host',
                      dest='ssh_host',
                      type='string',
                      default='',
                      help='ssh host')
    (options, args) = parser.parse_args()

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # get_state() and ssh_write() are helpers defined elsewhere in the module:
    # fetch the server's queue state, then push it to the ssh host
    state = get_state(options.address)
    if state:
        try:
            ssh_write(options.ssh_host, state)
        except Exception:
            logger.warning('error', exc_info=True)
        else:
            logger.info('done')
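
The snippet leans on two helpers defined elsewhere in its module, get_state and ssh_write. A minimal sketch of what they might look like, assuming the server returns the state as JSON over HTTP and the remote host accepts a piped payload; the transport, the `requests` dependency, and the remote file name are assumptions, not the project's actual implementation:

import json
import subprocess

import requests  # assumed available; any HTTP client would do


def get_state(address):
    # Sketch: fetch the queue state from the server as parsed JSON
    r = requests.get(address, timeout=60)
    r.raise_for_status()
    return r.json()


def ssh_write(ssh_host, state):
    # Sketch: pipe the serialized state to a remote host over ssh;
    # 'state.json' is a placeholder for wherever the remote side expects it
    proc = subprocess.Popen(['ssh', ssh_host, 'cat > state.json'],
                            stdin=subprocess.PIPE)
    proc.communicate(json.dumps(state).encode('utf-8'))
    if proc.returncode != 0:
        raise RuntimeError('ssh_write failed with code %d' % proc.returncode)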
Example 2
import getpass
import logging
import socket
import time
import ConfigParser  # Python 2 stdlib; `configparser` on Python 3

from optparse import OptionParser

import submit  # project-local module providing the Submit* scheduler classes

# Module-level logger; get_state, get_ssh_state, get_running, sort_states,
# monitoring, and config_options_dict are helpers defined elsewhere in the module
logger = logging.getLogger(__name__)


def main():
    parser = OptionParser()
    parser.add_option('--config',
                      type='string',
                      default='cluster.config',
                      help="config file for cluster")
    parser.add_option('--uuid',
                      type='string',
                      default=getpass.getuser() + '@' + socket.gethostname(),
                      help="Unique id for this client")
    (options, args) = parser.parse_args()
    config = ConfigParser.ConfigParser()
    config.optionxform = str  # preserve the case of option names
    config.read(options.config)
    config_dict = config_options_dict(config)
    config_glidein = config_dict['Glidein']
    config_cluster = config_dict['Cluster']

    # Pick the submit class matching the cluster's batch scheduler
    sched_type = config_cluster["scheduler"].lower()
    if sched_type == "htcondor":
        scheduler = submit.SubmitCondor(config_dict)
    elif sched_type == "pbs":
        scheduler = submit.SubmitPBS(config_dict)
    elif sched_type == "slurm":
        scheduler = submit.SubmitSLURM(config_dict)
    elif sched_type == "uge":
        scheduler = submit.SubmitUGE(config_dict)
    elif sched_type == "lsf":
        scheduler = submit.SubmitLSF(config_dict)
    else:
        raise Exception('scheduler not supported')

    # if "glidein_cmd" not in config_dict["Glidein"]:
    #     raise Exception('no glidein_cmd')
    if "running_cmd" not in config_dict["Cluster"]:
        raise Exception('no running_cmd')

    if ('Mode' in config_dict and 'debug' in config_dict['Mode']
            and config_dict['Mode']['debug']):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    while True:
        # Fetch the current queue state, over ssh or directly from the server
        if 'ssh_state' in config_glidein and config_glidein['ssh_state']:
            state = get_ssh_state()
        else:
            state = get_state(config_glidein['address'])
        info = {
            'uuid': options.uuid,
            'glideins_running': 0,
            'glideins_launched': 0,
        }
        if state:
            idle = 0
            try:
                info['glideins_running'] = get_running(
                    config_cluster["running_cmd"])
                if "idle_cmd" in config_cluster:
                    idle = get_running(config_cluster["idle_cmd"])
            except Exception:
                logger.warning('error getting running job count', exc_info=True)
                continue
            # How many glideins may be launched this pass: capped per submit
            # cycle, by total jobs, and by the number already idle
            limit = min(
                config_cluster["limit_per_submit"],
                config_cluster["max_total_jobs"] - info['glideins_running'],
                max(config_cluster.get("max_idle_jobs", 1000) - idle, 0))
            # Prioritize job submission. By default, prioritize submission of
            # gpu and high memory jobs.
            if "prioritize_jobs" in config_cluster:
                state = sort_states(state, config_cluster["prioritize_jobs"])
            else:
                state = sort_states(state, ["gpus", "memory"])
            for s in state:
                if sched_type == "pbs":
                    # adjust memory units for PBS (powers of 1000 vs 1024)
                    s["memory"] = s["memory"] * 1024 / 1000
                if limit <= 0:
                    logger.info('reached limit')
                    break
                # skipping CPU jobs for gpu-only clusters
                if ('gpu_only' in config_cluster and config_cluster['gpu_only']
                        and s["gpus"] == 0):
                    continue
                # skipping GPU jobs for cpu-only clusters
                if ('cpu_only' in config_cluster and config_cluster['cpu_only']
                        and s["gpus"] != 0):
                    continue
                # skipping jobs over cluster resource limits
                skip = False
                for resource in ('cpus', 'gpus', 'memory', 'disk'):
                    cfg_name = 'max_%s_per_job' % resource
                    if (cfg_name in config_cluster
                            and s[resource] > config_cluster[cfg_name]):
                        skip = True
                        break
                if skip:
                    continue
                if "count" in s and s["count"] > limit:
                    s["count"] = limit
                scheduler.submit(s)
                num = 1 if "count" not in s else s["count"]
                limit -= num
                info['glideins_launched'] += num
            logger.info('launched %d glideins', info['glideins_launched'])
        else:
            logger.info('no state, nothing to do')

        # send monitoring info to server
        monitoring(config_glidein['address'], info)

        if 'delay' not in config_glidein or int(config_glidein['delay']) < 1:
            break
        time.sleep(int(config_glidein['delay']))
    if "cleanup" in config_cluster and config_cluster["cleanup"]:
        scheduler.cleanup(config_cluster["running_cmd"],
                          config_cluster["dir_cleanup"])
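
The prioritization step calls sort_states, another helper from the same module. A sketch consistent with how the loop uses it, assuming jobs with larger requests in the named columns should be submitted first; the project's real implementation may handle more cases:

def sort_states(state, columns):
    # Sketch: descending sort over the given resource columns, most
    # significant column first, so the biggest requests are submitted first
    return sorted(state,
                  key=lambda s: tuple(s.get(c, 0) for c in columns),
                  reverse=True)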