Example #1
0
def execute(config: str, debug: bool, params):
    """
    Execute a config on this machine: fail stale tasks, build dags from
    the config and run every task of every dag sequentially.

    Args:
        config: path/name of the dag config to execute.
        debug: run in debug mode (forwarded to ``_dag``).
        params: extra parameters forwarded to ``_dag``.
    """
    check_statuses()

    _create_computer()
    _create_docker()

    # Fail all InProgress tasks left on this worker: a new run on the same
    # worker means the old ones can no longer be completed.
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another tasks arrived to the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Every task gets all local GPUs; the assignment string is
    # loop-invariant, so compute it once instead of per task.
    gpu_assigned = ','.join(
        str(i) for i in range(torch.cuda.device_count()))

    # Create dags and execute their tasks in order
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for task_id in ids:  # renamed from `id` (shadowed builtin)
                task = provider.by_id(task_id)
                task.gpu_assigned = gpu_assigned

                provider.commit()
                execute_by_id(task_id, exit=False)
Example #2
0
def start(daemon: bool, debug: bool, workers: int, log_level: str):
    """
    Start both server and worker on the same machine.

    It starts: redis-server, site, worker_supervisor, workers

    Args:
        daemon: run supervisord daemonized (nodaemon=false) when True.
        debug: run the entry points from the source tree instead of the
            installed console scripts.
        workers: number of worker programs to declare in the config.
        log_level: supervisord log level.
    """
    # creating supervisord config
    supervisor_command = 'mlcomp-worker worker-supervisor'
    worker_command = 'mlcomp-worker worker'
    server_command = 'mlcomp-server start-site'

    if debug:
        supervisor_command = 'python mlcomp/worker/__main__.py ' \
                             'worker-supervisor'
        worker_command = 'python mlcomp/worker/__main__.py worker'
        server_command = 'python mlcomp/server/__main__.py start-site'

    # bundled redis-server binary sits next to this package
    folder = os.path.dirname(os.path.dirname(__file__))
    redis_path = os.path.join(folder, 'bin/redis-server')

    # supervisord's nodaemon flag is the inverse of our daemon flag
    daemon_text = 'false' if daemon else 'true'
    text = [
        '[supervisord]',
        f'nodaemon={daemon_text}',
        '',
        '[program:redis]',
        f'command={redis_path} --port {REDIS_PORT}'
        f' --requirepass {REDIS_PASSWORD}',
    ]
    conf = os.path.join(CONFIG_FOLDER, 'supervisord-redis.conf')
    with open(conf, 'w') as f:
        # write as one string; writelines on a str iterates characters
        f.write('\n'.join(text))

    # redis runs under its own supervisord instance in the background
    Popen(
        ['supervisord', f'--configuration={conf}', f'--loglevel={log_level}'])

    # give redis time to come up before checking statuses
    sleep(5)
    check_statuses()

    # daemon_text is already computed above; the original recomputed it here
    text = [
        '[supervisord]', f'nodaemon={daemon_text}', '', '[program:supervisor]',
        f'command={supervisor_command}', 'autostart=true', 'autorestart=true',
        '', '[program:server]', f'command={server_command}', 'autostart=true',
        'autorestart=true', ''
    ]

    for p in range(workers):
        text.append(f'[program:worker{p}]')
        text.append(f'command={worker_command} {p}')
        text.append('autostart=true')
        text.append('autorestart=true')
        text.append('')

    conf = os.path.join(CONFIG_FOLDER, 'supervisord.conf')
    with open(conf, 'w') as f:
        f.write('\n'.join(text))

    # main supervisord instance (site + supervisor + workers), foreground
    # unless daemonized
    os.system(f'supervisord -c {conf} -e {log_level}')
Example #3
0
def stop():
    """
    Stop supervisord started by start command
    """
    check_statuses()

    # Find the supervisord process that was launched with our config file
    # and terminate it together with its children.
    for line in os.popen('ps -ef | grep supervisord').readlines():
        if 'mlcomp/configs/supervisord.conf' in line:
            # second column of `ps -ef` output is the pid
            kill_child_processes(int(line.split()[1]))
Example #4
0
def sync(project: str, computer: str, only_from: bool, only_to: bool,
         online: bool):
    """
    Syncs specified project on this computer with other computers
    """
    check_statuses()

    _create_computer()
    _create_docker()

    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    # default to this host when no computer name was given
    target = provider.by_name(computer or socket.gethostname())
    computers = provider.all_with_last_activtiy()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    sync_folders = correct_folders(yaml_load(p.sync_folders), p.name)
    ignore_folders = correct_folders(yaml_load(p.ignore_folders), p.name)

    # normalize malformed configs to empty lists
    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    folders = [[folder, ignore_folders] for folder in sync_folders]

    for c in computers:
        if c.name == target.name:
            continue
        # in online mode, skip computers inactive for over 100 seconds
        if online and (now() - c.last_activity).total_seconds() > 100:
            continue

        if not only_from:
            sync_directed(_session, target, c, folders)
        if not only_to:
            sync_directed(_session, c, target, folders)
Example #5
0
def start(daemon: bool, debug: bool, workers: int, log_level: str):
    """
       Start worker_supervisor and workers
    """
    check_statuses()

    # creating supervisord config
    if debug:
        # run entry points from the source tree
        supervisor_command = 'python mlcomp/worker/__main__.py ' \
                             'worker-supervisor'
        worker_command = 'python mlcomp/worker/__main__.py worker'
    else:
        supervisor_command = 'mlcomp-worker worker-supervisor'
        worker_command = 'mlcomp-worker worker'

    # supervisord's nodaemon flag is the inverse of our daemon flag
    nodaemon = 'false' if daemon else 'true'
    lines = [
        '[supervisord]',
        f'nodaemon={nodaemon}',
        '',
        '[program:supervisor]',
        f'command={supervisor_command} --workers {workers}',
        'autostart=true',
        'autorestart=true',
        ''
    ]
    for idx in range(workers):
        lines += [
            f'[program:worker{idx}]',
            f'command={worker_command} {idx}',
            'autostart=true',
            'autorestart=true',
            ''
        ]

    conf = os.path.join(CONFIG_FOLDER, 'supervisord.conf')
    with open(conf, 'w') as f:
        f.write('\n'.join(lines))

    os.system(f'supervisord -c {conf} -e {log_level}')
Example #6
0
def worker_supervisor(workers: int):
    """
    Start worker supervisor.
    This program controls workers ran on the same machine.
    Also, it writes metric of resources consumption.
    """
    check_statuses()

    host = socket.gethostname()
    logger = create_logger(_session, 'worker_supervisor')

    logger.info('worker_supervisor start',
                ComponentType.WorkerSupervisor, host)

    _create_computer(workers)
    _create_docker()

    # periodically stop processes whose tasks no longer exist
    start_schedule([(stop_processes_not_exist, 10)])

    if DOCKER_MAIN:
        # only the main docker reports usage metrics and syncs files
        syncer = FileSync()
        start_schedule([(worker_usage, 0)])
        start_schedule([(syncer.sync, 0)])

    # supervisor consumes its own dedicated celery queue, one task at a time
    queue = f'{host}_{DOCKER_IMG}_supervisor'
    argv = [
        'worker',
        '--loglevel=INFO',
        '-P=solo',
        f'-n={queue}',
        '-O fair',
        '-c=1',
        '--prefetch-multiplier=1',
        '-Q',
        f'{queue}'
    ]

    logger.info('worker_supervisor run celery',
                ComponentType.WorkerSupervisor, host)

    # blocks: runs the celery worker loop in this process
    app.worker_main(argv)
Example #7
0
def dag(config: str, control_reqs: bool, params):
    """
    Create dags from the given config without executing them.

    Args:
        config: path/name of the dag config.
        control_reqs: forwarded to ``_dag``; presumably toggles
            requirements checking — confirm in ``_dag``.
        params: extra parameters forwarded to ``_dag``.
    """
    check_statuses()
    _dag(config, control_reqs=control_reqs, params=params)
Example #8
0
def stop_site():
    """
    Stop site

    Runs the status check first, then delegates to ``_stop_server``.
    """
    check_statuses()
    _stop_server()
Example #9
0
def start_site():
    """
    Start only site

    Runs the status check first, then delegates to ``_start_server``.
    """
    check_statuses()
    _start_server()