Exemple #1
0
    def start_driver(self):
        """Start the Mesos scheduler driver and the idle-shutdown check.

        Builds a ``FrameworkInfo`` whose name is derived from the command
        line, whose user is the current user and whose hostname is the
        local host name, then creates and starts a ``MesosSchedulerDriver``
        against ``self.master``.  Finally hands a ``check`` loop to
        ``spawn`` which calls ``self.stop()`` once there are no active jobs
        and more than ``MAX_IDLE_TIME`` seconds have passed since
        ``self.last_finish_time``.

        Raises:
            Exception: if the current user is ``root``.
        """
        command_line = os.path.abspath(sys.argv[0]) + ' ' + ' '.join(
            sys.argv[1:])
        name = '[dpark] ' + command_line
        if len(name) > 256:
            # Keep the framework name bounded; the ellipsis marks truncation.
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("dpark is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = mesos.MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheduler driver started")

        self.started = True
        # Idle time is measured from here until a job updates this value.
        self.last_finish_time = time.time()

        def check():
            """Poll once a second; stop the scheduler when idle too long."""
            while self.started:
                now = time.time()
                if not self.activeJobs and now - self.last_finish_time > MAX_IDLE_TIME:
                    logger.info("stop mesos scheduler after %d seconds idle",
                                now - self.last_finish_time)
                    self.stop()
                    break
                time.sleep(1)

        spawn(check)
Exemple #2
0
    logging.basicConfig(
        format='[drun] %(threadName)s %(asctime)-15s %(message)s',
        level=options.quiet and logging.ERROR
        or options.verbose and logging.DEBUG or logging.WARNING)

    if options.mpi:
        if options.retry > 0:
            logger.error("MPI application can not retry")
            options.retry = 0
        sched = MPIScheduler(options, command)
    else:
        sched = SubmitScheduler(options, command)

    logger.debug("Connecting to mesos master %s", options.master)
    driver = mesos.MesosSchedulerDriver(sched, sched.framework, options.master)

    driver.start()

    def handler(signm, frame):
        logger.warning("got signal %d, exit now", signm)
        sched.stop(3)

    signal.signal(signal.SIGTERM, handler)
    signal.signal(signal.SIGHUP, handler)
    signal.signal(signal.SIGABRT, handler)
    signal.signal(signal.SIGQUIT, handler)

    try:
        from rfoo.utils import rconsole
        rconsole.spawn_server(locals(), 0)
Exemple #3
0
def main() -> None:
    """Run the SDP product controller.

    Parses the command-line arguments, sets up logging, connects a katcp
    client to the master controller, starts a Mesos scheduler driver for
    this subarray product, builds the device server (and optionally the
    dashboard), and then runs ``run(sched, server)`` on the asyncio event
    loop until it completes.
    """
    parser, args = parse_args()
    prepare_env(args)
    katsdpservices.setup_logging()
    katsdpservices.setup_restart()
    # An explicit log level from the arguments overrides the root logger's
    # level.
    if args.log_level is not None:
        logging.root.setLevel(args.log_level.upper())

    logger = logging.getLogger('katsdpcontroller')
    logger.info("Starting SDP product controller...")
    logger.info('katcp: %s:%d', args.host, args.port)
    logger.info('http: %s', args.http_url)

    # Client for the master controller; it backs the image lookup and is
    # also passed to the device server below.
    master_controller = aiokatcp.Client(args.master_controller.host,
                                        args.master_controller.port)
    image_lookup = product_controller.KatcpImageLookup(master_controller)
    try:
        image_resolver_factory = make_image_resolver_factory(
            image_lookup, args)
    except ValueError as exc:
        # NOTE(review): presumably `parser` is an argparse parser whose
        # error() exits the process; otherwise image_resolver_factory would
        # be unbound below - confirm against parse_args().
        parser.error(str(exc))

    # Framework description handed to the Mesos scheduler driver. It is
    # built as an addict.Dict to match use_addict=True on the driver.
    framework_info = addict.Dict()
    framework_info.user = args.user
    framework_info.name = args.subarray_product_id
    framework_info.checkpoint = True
    framework_info.principal = args.principal
    framework_info.roles = [args.realtime_role, args.batch_role]
    framework_info.capabilities = [{
        'type': 'MULTI_ROLE'
    }, {
        'type': 'TASK_KILLING_STATE'
    }]

    loop = asyncio.get_event_loop()
    sched = scheduler.Scheduler(
        args.realtime_role,
        args.host,
        args.http_port,
        args.http_url,
        task_stats=product_controller.TaskStats(),
        runner_kwargs=dict(access_log_class=web_utils.AccessLogger))
    # Expose metrics and health endpoints on the scheduler's web app.
    sched.app.router.add_get('/metrics', web_utils.prometheus_handler)
    sched.app.router.add_get('/health', web_utils.health_handler)
    # NOTE(review): implicit_acknowledgements=False presumably means status
    # updates are acknowledged explicitly by the scheduler - confirm.
    driver = pymesos.MesosSchedulerDriver(sched,
                                          framework_info,
                                          args.mesos_master,
                                          use_addict=True,
                                          implicit_acknowledgements=False)
    sched.set_driver(driver)
    driver.start()

    dashboard_path = f'/gui/{args.subarray_product_id}/product/dashboard/'
    dashboard_url: Optional[str] = args.dashboard_url
    # Derive a dashboard URL only when a dashboard port is configured
    # (non-zero) and no explicit URL was supplied in the arguments.
    if args.dashboard_port != 0 and dashboard_url is None:
        dashboard_url = str(
            yarl.URL.build(scheme='http',
                           host=args.external_hostname,
                           port=args.dashboard_port,
                           path=dashboard_path))

    server = product_controller.DeviceServer(
        args.host,
        args.port,
        master_controller,
        args.subarray_product_id,
        sched,
        batch_role=args.batch_role,
        interface_mode=False,
        localhost=args.localhost,
        image_resolver_factory=image_resolver_factory,
        s3_config=args.s3_config if args.s3_config is not None else {},
        graph_dir=args.write_graphs,
        dashboard_url=dashboard_url)
    # A dashboard port of 0 means no dashboard is initialised.
    if args.dashboard_port != 0:
        init_dashboard(server, args, dashboard_path)

    # locals() passes this function's local variables to the monitor, so
    # renaming or removing locals in main() changes what it receives.
    with katsdpservices.start_aiomonitor(loop, args, locals()):
        loop.run_until_complete(run(sched, server))
    loop.close()