Example #1
    def __init__(self, args, hostname):
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(f"tcp://{args.master_address}")
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        self.hostname = hostname
        next(self.remaining_timer)
        self.EXIT_FLAG = False

        self.gpus_per_node = args.gpus_per_node
        self.prefetch_count = args.worker_prefetch_count
        config_logging('serial-launcher',
                       filename=args.log_filename,
                       use_buffer=True)
        self.processes = {}
        self.outfiles = {}
        self.cuteids = {}
        self.start_times = {}
        self.retry_counts = {}
        self.job_specs = {}
        self.runnable_cache = {}
        self.occupancy = 0.0
        self.all_affinity = [
            i * SERIAL_HYPERTHREAD_STRIDE for i in range(SERIAL_CORES_PER_NODE)
        ]
        self.used_affinity = []
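
The affinity list above spreads serial jobs across physical cores by striding over hyperthread siblings. As a worked illustration with made-up values for the two constants: if SERIAL_CORES_PER_NODE = 4 and SERIAL_HYPERTHREAD_STRIDE = 2, then

all_affinity = [i * 2 for i in range(4)]   # -> [0, 2, 4, 6]

i.e. one slot on the first hyperthread of each of four physical cores.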
Example #2
    def __init__(self, args):
        self.MAX_IDLE_TIME = 120.0
        self.DELAY_PERIOD = 0.2
        self.idle_time = 0.0
        self.EXIT_FLAG = False

        config_logging('serial-launcher',
                       filename=args.log_filename,
                       use_buffer=True)
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        if args.db_prefetch_count == 0:
            prefetch = args.num_workers * 96
        else:
            prefetch = args.db_prefetch_count

        self.job_source = BalsamJobSource(prefetch, args.wf_name)
        self.status_updater = BalsamDBStatusUpdater()
        self.status_updater.start()
        self.job_source.start()

        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)
        self.socket.bind(f"tcp://*:{args.master_port}")
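
Examples #1 and #2 are the two halves of a ZeroMQ REQ/REP pair: each worker connects a REQ socket to the master's address, and the master binds a REP socket on master_port and answers one request per worker message. A minimal, self-contained sketch of that round trip (the port number is a placeholder):

import zmq

ctx = zmq.Context()
rep = ctx.socket(zmq.REP)            # master side
rep.bind("tcp://*:5555")
req = ctx.socket(zmq.REQ)            # worker side
req.connect("tcp://localhost:5555")

req.send_json({"request_num_jobs": 4})   # worker reports status / asks for jobs
print(rep.recv_json())                   # master receives the request
rep.send_json({"new_jobs": []})          # master replies with work (or none)
print(req.recv_json())                   # worker consumes the reply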
Example #3
    def main(self):
        bcast_msg = {}
        bcast_msg = comm.bcast(bcast_msg, root=0)
        self.gpus_per_node = bcast_msg["gpus_per_node"]
        self.prefetch_count = bcast_msg["worker_prefetch"]
        log_filename = bcast_msg["log_fname"]
        config_logging('serial-launcher', filename=log_filename)

        current_request, next_request = None, WorkerRequest()

        while True:
            done_pks, errors, active = self.poll_processes()
            started_pks = self.start_jobs()
            request_num_jobs = max(
                0, self.prefetch_count - len(self.runnable_cache))

            next_request.add_started(started_pks)
            next_request.add_done(done_pks)
            next_request.add_error(errors)
            next_request.set_request_num_jobs(request_num_jobs)
            if active:
                next_request.set_active()

            if current_request is None:
                current_request = next_request
                next_request = WorkerRequest()
                current_request.send()
                response_msg = {}
            else:
                response_msg = current_request.get_response()
                if response_msg is not None:
                    current_request = None
                else:
                    time.sleep(0.2)
                    response_msg = {}

            if response_msg.get('exit', False):
                logger.debug(f"rank {RANK} received EXIT")
                break

            if response_msg.get('new_jobs'):
                self.runnable_cache.update(
                    {job['pk']: job
                     for job in response_msg["new_jobs"]})

            logger.debug(f"rank {RANK} occupancy: {self.occupancy} "
                         f"[{len(self.runnable_cache)} additional prefetched "
                         f"jobs in cache]")

        self.exit()
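
WorkerRequest itself is not shown in these snippets. A plausible sketch of its shape, assuming it batches one status report and polls the worker's REQ socket without blocking (the class layout, message keys, and the module-level SOCKET are guesses, not Balsam's confirmed API):

import zmq

CTX = zmq.Context()
SOCKET = CTX.socket(zmq.REQ)   # worker's REQ socket; connected elsewhere

class WorkerRequest:
    def __init__(self):
        self.msg = {"started": [], "done": [], "error": [],
                    "request_num_jobs": 0, "active": False}

    def add_started(self, pks):
        self.msg["started"].extend(pks)

    def add_done(self, pks):
        self.msg["done"].extend(pks)

    def add_error(self, errors):
        self.msg["error"].extend(errors)

    def set_request_num_jobs(self, n):
        self.msg["request_num_jobs"] = n

    def set_active(self):
        self.msg["active"] = True

    def send(self):
        SOCKET.send_json(self.msg)

    def get_response(self):
        try:
            return SOCKET.recv_json(flags=zmq.NOBLOCK)  # poll without blocking
        except zmq.Again:
            return None   # no reply yet; the caller sleeps and retries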
Example #4
    def __init__(self):
        self.MAX_IDLE_TIME = 120.0
        self.DELAY_PERIOD = 0.2
        self.idle_time = 0.0
        self.EXIT_FLAG = False

        args = self.parse_args()
        log_filename = config_logging('serial-launcher')
        bcast_msg = {
            "gpus_per_node": args.gpus_per_node,
            "worker_prefetch": args.worker_prefetch_count,
            "log_fname": log_filename,
        }
        comm.bcast(bcast_msg, root=0)
        self.remaining_timer = remaining_time_minutes(args.time_limit_min)
        next(self.remaining_timer)

        if args.db_prefetch_count == 0:
            prefetch = (comm.size - 1) * 128
        else:
            prefetch = args.db_prefetch_count

        job_source = BalsamJobSource(prefetch, args.wf_name)
        status_updater = BalsamDBStatusUpdater()
        self.manager = ResourceManager(job_source, status_updater)
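
Both the master and each worker prime remaining_time_minutes with an initial next() call, because a generator's body does not run until it is first advanced. A minimal sketch of such a generator (the real helper in balsam.launcher.util may differ in detail):

import time

def remaining_time_minutes(time_limit_min):
    # Runs only on the first next(), which is why callers prime it.
    start = time.time()
    while True:
        elapsed_min = (time.time() - start) / 60.0
        remaining = time_limit_min - elapsed_min
        if remaining <= 0:
            return            # StopIteration: wall-time limit reached
        yield remaining

timer = remaining_time_minutes(60)
next(timer)   # priming call starts the clock and returns ~60.0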
Example #5
File: service.py Project: mgierada/balsam
        QueuedLaunch.refresh_from_scheduler()
        open_queues = get_open_queues()
        if open_queues:
            logger.info(f"Open queues: {list(open_queues.keys())}")
            qlaunch = jobpacker.create_qlaunch(open_queues)
            if qlaunch:
                submit_qlaunch(qlaunch)
        if not QueuedLaunch.acquire_advisory():
            logger.error('Failed to refresh advisory lock; aborting')
            break
        elif not EXIT_FLAG:
            source.clear_stale_locks()
            time.sleep(10)


if __name__ == "__main__":
    setup()
    config_logging('service')
    logger.info(f"Balsam Service starting on {gethostname()}")
    parser = service_subparser()
    transition_pool = transitions.TransitionProcessPool(5, '')
    source.start_tick()
    try:
        main(parser.parse_args())
    finally:
        transition_pool.terminate()
        source.release_all_owned()
        logger.info(f"Balsam Service shutdown: released all locks OK")
Example #6
    parser.add_argument('--persistent', action='store_true')
    args = parser.parse_args()
    args.master_host = args.master_address.split(':')[0]
    args.master_port = int(args.master_address.split(':')[1])
    return args


if __name__ == "__main__":
    args = parse_args()
    hostname = socket.gethostname()

    if args.run_master:
        log_fname = args.log_filename + ".master"
        config_logging(
            'serial-launcher',
            filename=log_fname,
            buffer_capacity=128,
        )
        master = Master(args)

        # TODO(KGF): factor out signal handling to SigHandler class
        # (util/sighandler.py) like in B2 1fc1824c
        def handle_term(signum, stack):
            master.EXIT_FLAG = True

        signal.signal(signal.SIGINT, handle_term)
        signal.signal(signal.SIGTERM, handle_term)
        master.main()
    else:
        log_fname = args.log_filename + "." + hostname
        config_logging(
            'serial-launcher',
            filename=log_fname,
            buffer_capacity=128,
        )
Example #7
File: launcher.py Project: ehermes/balsam
        if nthread > 0:
            transition_pool = transitions.TransitionProcessPool(
                nthread, wf_filter)
        else:
            transition_pool = None
        launcher = Launcher(wf_filter, timelimit_min, gpus_per_node)
        launcher.run()
    finally:
        if transition_pool is not None:
            transition_pool.terminate()
        logger.info("Exit: Launcher exit graceful\n\n")


def get_args(inputcmd=None):
    '''Parse command line arguments'''
    parser = config_launcher_subparser()
    if inputcmd:
        return parser.parse_args(inputcmd)
    else:
        return parser.parse_args()


if __name__ == "__main__":
    setup()
    args = get_args()
    config_logging('launcher')
    logger.info("Loading Balsam Launcher")
    main(args)
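
Because get_args accepts a pre-split argument list, the launcher entry point can also be driven programmatically, e.g. from a test. The flag names below are illustrative, not checked against config_launcher_subparser:

args = get_args(['--wf-filter', 'my_wf', '--time-limit-minutes', '60'])
main(args)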
Example #8
import logging
import signal
import time
import psutil

from mpi4py import MPI
from django.db import transaction, connections

from balsam import config_logging, settings, setup

setup()
from balsam.launcher.exceptions import *
from balsam.launcher.util import cd, get_tail, remaining_time_minutes
from balsam.core.models import BalsamJob, safe_select, PROCESSABLE_STATES

logger = logging.getLogger('balsam.launcher.mpi_ensemble')
config_logging('serial-launcher')

comm = MPI.COMM_WORLD
RANK = comm.Get_rank()
MSG_BUFSIZE = 2**16
connections.close_all()


class ResourceManager:

    FETCH_PERIOD = 2.0
    KILLED_REFRESH_PERIOD = 3.0

    def __init__(self, job_source):
        self.job_source = job_source
        self.node_occupancy = [0.0] * comm.size
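
Examples #3 and #4 pair up through comm.bcast: rank 0 broadcasts the launch settings, and every other rank receives a copy of the same dict. A self-contained sketch (run with mpirun -n 2 or more):

from mpi4py import MPI

comm = MPI.COMM_WORLD

if comm.rank == 0:
    msg = {"gpus_per_node": 4, "worker_prefetch": 32}   # illustrative values
else:
    msg = {}                      # non-root ranks pass a placeholder
msg = comm.bcast(msg, root=0)     # all ranks now hold rank 0's dict
print(f"rank {comm.rank}: {msg}")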