Example #1
0
def deploy(machines):
    """Try to start one pending GPU task on each machine that has free GPUs.

    For every machine, a task with status 'aborted' whose running_machine is
    this machine is preferred; otherwise the first 'queueing' task whose
    `machine` field contains the machine name is used. The machine is skipped
    when it has no free GPU, no candidate task, fewer free GPUs than the task
    requests, or an empty `ports` attribute.

    Args:
        machines: iterable of machine records exposing 'name',
            'available_gpus' and `ports`.
    """
    for node in machines:
        free_gpu_count = len(node['available_gpus'])
        if free_gpu_count == 0:
            continue

        # The queueing lookup only runs when no aborted task is found.
        candidate = (
            GpuTask.objects(running_machine=node['name'], status='aborted').first()
            or GpuTask.objects(machine__contains=node['name'], status='queueing').first()
        )
        if not candidate or candidate['gpu_num'] > free_gpu_count or not node.ports:
            continue
        deploy_task(node, candidate)
Example #2
0
def check_downloading_tasks():
    """Promote 'downloading' tasks to 'queueing' once their image check passes.

    Each task with status 'downloading' is checked with check_image_exist
    against the machine named by its 'running_machine' field and the task's
    'docker' value. When the check is truthy the task is saved with status
    'queueing'. A task naming an unknown machine raises KeyError.
    """
    machines_by_name = {}
    for machine in Machine.objects().all():
        machines_by_name[machine['name']] = machine

    for pending in GpuTask.objects(status='downloading').all():
        target = machines_by_name[pending['running_machine']]
        if not check_image_exist(target, pending['docker']):
            continue
        pending['status'] = 'queueing'
        pending.save()
Example #3
0
def main():
    """Connect to the configured database and create any missing task logs.

    For every GpuTask, a GpuTaskLog keyed by the task name is looked up; when
    none exists, one is created, saved, and a message is printed.
    """
    database = mongo_config['DB']
    address = "%s:%s" % (mongo_config['host'], mongo_config['port'])
    db.connect(database, host=address)

    for task in GpuTask.objects().all():
        if GpuTaskLog.objects(gpu_mission_name=task['name']).first():
            # A log already exists for this task.
            continue
        GpuTaskLog(gpu_mission_name=task['name']).save()
        print('create %s log' % task['name'])
Example #4
0
def pid_formatter(view, context, model, name):
    """Format a comma-separated PID field, tagging it with an owning task name.

    The signature looks like an admin column-formatter callback; `view` and
    `context` are not used here.

    Args:
        view: unused.
        context: unused.
        model: record supporting item access by field name.
        name: name of the field holding the comma-separated PIDs.

    Returns:
        '' when the field is empty; 'PIDS(TASK_NAME)' when some GpuTask has a
        running_pid among the listed PIDs; otherwise the PIDs unchanged.
    """
    raw_pids = model[name]
    if not raw_pids:
        return ''

    owner = GpuTask.objects(running_pid__in=raw_pids.split(',')).first()
    if not owner:
        return '%s' % model[name]
    return '%s(%s)' % (model[name], owner['name'])
Example #5
0
def update_machine(request_timeout=10):
    """Refresh GPU readings, free GPUs and free ports for GPU machines.

    Steps:
      1. Machines whose accept_jobs does not contain 'gpu' get an empty
         'available_gpus' list.
      2. Known Gpu documents are indexed by machine name and 'order';
         machines with no Gpu documents are handed to init_gpu.
      3. Each GPU machine's status endpoint is polled. Every reported device
         updates the matching Gpu document (power, temperature, processes,
         last_update). Devices with no processes are counted as available.
      4. GPUs and the mount port used by tasks with status 'running' on the
         machine are removed from the available sets.

    A failure on one machine is logged and that machine is left out of the
    result; the remaining machines are still processed.

    Args:
        request_timeout: seconds to wait for each machine's status endpoint.
            Without a timeout a single unresponsive machine would block the
            whole refresh indefinitely.

    Returns:
        list of the machines that were refreshed and saved successfully.
    """
    # Machines that no longer accept GPU jobs must not advertise free GPUs.
    for m in Machine.objects(accept_jobs__not__contains='gpu').all():
        m['available_gpus'] = []
        m.save()

    machines = Machine.objects(accept_jobs__contains='gpu').all()
    gpus_dict = defaultdict(dict)
    for gpu in Gpu.objects().all():
        gpus_dict[gpu['machine']['name']][gpu['order']] = gpu
    not_init_machines = [m for m in machines if m['name'] not in gpus_dict]
    init_gpu(not_init_machines)
    # NOTE(review): gpus_dict is built before init_gpu runs and is not
    # refreshed here, so machines initialised just above appear to fail the
    # lookup below and be skipped until the next call - confirm this is
    # intended.

    updated_machines = []
    for m in machines:
        try:
            m['available_gpus'] = []
            m['ports'] = available_ports
            r = requests.get("http://%s/gpu/status/json" % m['plugin'],
                             timeout=request_timeout).json()
            for i, g in enumerate(r['Devices']):
                # Devices are matched to Gpu documents by position == 'order'.
                _gpu = gpus_dict[m['name']][i]
                _gpu['power'] = g['Power']
                _gpu['temperature'] = g['Temperature']
                if g['Processes']:
                    _gpu['processes'] = ','.join([str(p['PID']) for p in g['Processes']])
                else:
                    m['available_gpus'].append(_gpu['path'])
                    _gpu['processes'] = ''
                _gpu['last_update'] = datetime.now()
                _gpu.save()

            # A running task may hold a GPU that shows no process yet, so its
            # GPUs and mount port are excluded explicitly.
            start_tasks = GpuTask.objects(running_machine=m['name'], status='running').all()
            for task in start_tasks:
                m['available_gpus'] = list(set(m['available_gpus']) - set(task['running_gpu']))
                m['ports'] = list(set(m['ports']) - {task['mount_port']})

            m['gpu_last_update'] = datetime.now()
            m.save()
            updated_machines.append(m)
        except Exception as e:
            # Best effort per machine: record which machine failed, with the
            # traceback, and carry on with the others.
            logging.exception("update machine %s failed: %s", m['name'], e)
    return updated_machines
Example #6
0
def update_task(request_timeout=10):
    """Sync logs and container status for every task with status 'running'.

    For each running task the container's logs are fetched from the task's
    machine and stored in the task's GpuTaskLog, then the container state is
    fetched and copied into the task's 'status'. When the container is no
    longer 'running', check_result is called with the task and its log.
    Any exception is stored as text in the task's 'error_log'; on success
    'error_log' is cleared. 'update_time' is always refreshed and the task
    saved.

    Args:
        request_timeout: seconds to wait for each HTTP request, so that an
            unresponsive machine cannot block the whole pass.
    """
    running_tasks = GpuTask.objects(status='running').all()
    machine_dict = {m['name']: m for m in Machine.objects().all()}

    # The loop variable has its own name; the original rebound the query set
    # name to a single task.
    for task in running_tasks:
        try:
            # The container is addressed by the first 12 characters of its
            # id. The URL shapes look like the Docker Engine API - confirm.
            container_url = "http://%s/containers/%s" % (
                machine_dict[task['running_machine']]['host'],
                task['running_id'][:12],
            )
            log_request = requests.get(container_url + "/logs?stdout=1&stderr=1",
                                       timeout=request_timeout)
            # If no log document exists, first() returns None and the
            # assignment below raises; the error then lands in 'error_log'.
            task_log = GpuTaskLog.objects(gpu_mission_name=task['name']).first()
            task_log['running_log'] = log_request.text
            task_log.save()
            r = requests.get(container_url + "/json", timeout=request_timeout).json()
            task['status'] = r['State']['Status']
            if task['status'] != 'running':
                check_result(task, task_log)
            task['error_log'] = ''
        except Exception as e:
            task['error_log'] = str(e)
        task['update_time'] = datetime.now()
        task.save()
Example #7
0
def stop_daemon(machine_name, container_name, request_timeout=30):
    """Stop a task's container on a machine and mark the task manually aborted.

    Args:
        machine_name: name of the Machine hosting the container.
        container_name: container name, which is also the GpuTask name.
        request_timeout: seconds to wait for the stop request. The default is
            generous because the endpoint looks like the Docker Engine stop
            call, which may wait for the container to exit before answering
            - confirm against the deployment.

    Returns:
        A jsonify response. 'code' is 404 with a 'msg' when the machine or
        the task is unknown, 502 with a 'msg' when the stop request could not
        be completed, otherwise the HTTP status code returned by the machine.
    """
    machine = Machine.objects(name=machine_name).first()
    if not machine:
        return jsonify({
            "code": 404,
            "msg": "do not find machine %s" % machine_name
        })
    gpu_mission = GpuTask.objects(name=container_name).first()
    if not gpu_mission:
        return jsonify({
            "code": 404,
            "msg": "do not find gpu mission %s" % container_name
        })
    try:
        r = requests.post(
            "http://{}/containers/{}/stop".format(machine['host'], container_name),
            timeout=request_timeout)
    except requests.RequestException as e:
        # Report network failures in the same JSON shape as the other errors
        # instead of letting the exception escape the handler.
        return jsonify({
            "code": 502,
            "msg": "stop request to machine %s failed: %s" % (machine_name, e)
        })
    if r.status_code < 400:
        # One timestamp so update_time and finish_time agree exactly.
        now = datetime.now()
        gpu_mission['status'] = 'manual_aborted'
        gpu_mission['update_time'] = now
        gpu_mission['finish_time'] = now
        gpu_mission.save()
    return jsonify({"code": r.status_code})