def deploy(machines):
    """Try to start one pending GPU task on every machine that has idle GPUs.

    For each machine the candidate is the first ``aborted`` task whose
    ``running_machine`` is this machine; if there is none, the first
    ``queueing`` task whose ``machine`` field contains the machine name.
    The candidate is passed to ``deploy_task`` only when it requests no more
    GPUs than the machine currently has idle and the machine's ``ports``
    value is non-empty. Otherwise the machine is skipped.

    Args:
        machines: Iterable of machine records exposing ``available_gpus`` and
            ``name`` by key and ``ports`` as an attribute.
    """
    for host in machines:
        idle_gpu_count = len(host['available_gpus'])
        if not idle_gpu_count:
            continue

        host_name = host['name']
        # Tasks aborted on this machine take precedence over queued ones.
        candidate = (
            GpuTask.objects(running_machine=host_name, status='aborted').first()
            or GpuTask.objects(machine__contains=host_name, status='queueing').first()
        )
        if not candidate:
            continue

        # Skip when the task does not fit or no port is left on the machine.
        if candidate['gpu_num'] > idle_gpu_count or not host.ports:
            continue

        deploy_task(host, candidate)
def check_downloading_tasks():
    """Move ``downloading`` tasks to ``queueing`` once their image is present.

    Every task with status ``downloading`` is checked with
    ``check_image_exist`` against the machine named in its
    ``running_machine`` field and the image named in its ``docker`` field.
    Tasks for which the check is truthy are saved with status ``queueing``.

    Raises:
        KeyError: If a task references a machine name that is not stored.
    """
    downloading = GpuTask.objects(status='downloading').all()

    # Index all machines by name for the per-task lookup below.
    machines_by_name = {}
    for machine in Machine.objects().all():
        machines_by_name[machine['name']] = machine

    for task in downloading:
        target = machines_by_name[task['running_machine']]
        if not check_image_exist(target, task['docker']):
            continue
        task['status'] = 'queueing'
        task.save()
def main():
    """Connect to MongoDB and create a log document for tasks lacking one.

    The connection parameters are read from ``mongo_config`` (``DB``,
    ``host`` and ``port``). For every ``GpuTask`` that has no ``GpuTaskLog``
    with a matching ``gpu_mission_name``, a new log document is saved and a
    line is printed.
    """
    database = mongo_config['DB']
    address = "%s:%s" % (mongo_config['host'], mongo_config['port'])
    db.connect(database, host=address)

    for task in GpuTask.objects().all():
        task_name = task['name']
        if GpuTaskLog.objects(gpu_mission_name=task_name).first():
            # This task already has its log document.
            continue
        GpuTaskLog(gpu_mission_name=task_name).save()
        print('create %s log' % task_name)
def pid_formatter(view, context, model, name):
    """Format a comma-separated PID field, appending the owning task's name.

    Args:
        view: Unused.
        context: Unused.
        model: Record supporting ``model[name]`` lookup.
        name: Key of the field holding the comma-separated PID string.

    Returns:
        str: ``''`` when the field is empty; ``'<pids>(<task name>)'`` when a
        ``GpuTask`` whose ``running_pid`` is one of the PIDs exists;
        otherwise the PID string unchanged.
    """
    raw_value = model[name]
    if not raw_value:
        return ''

    owner = GpuTask.objects(running_pid__in=raw_value.split(',')).first()
    if not owner:
        return '%s' % raw_value
    return '%s(%s)' % (raw_value, owner['name'])
def update_machine(timeout=10):
    """Refresh GPU and port availability of every machine and persist it.

    Machines that do not accept ``gpu`` jobs get an empty ``available_gpus``
    list. For each machine that does, the status endpoint
    ``http://<plugin>/gpu/status/json`` is queried. Every reported device
    updates its ``Gpu`` record (power, temperature, process list), and a
    device with no processes counts as available. GPUs and mount ports held
    by tasks with status ``running`` on that machine are then removed from
    the available sets before the machine is saved.

    A failure while refreshing one machine is logged with its traceback and
    does not stop the remaining machines from being refreshed.

    Args:
        timeout: Seconds to wait for a machine's status endpoint. Without a
            timeout a single unreachable machine would block the whole
            refresh indefinitely.

    Returns:
        list: The machines that were refreshed and saved successfully.
    """
    # Machines that do not accept GPU jobs must not advertise any GPU.
    for machine in Machine.objects(accept_jobs__not__contains='gpu').all():
        machine['available_gpus'] = []
        machine.save()

    machines = Machine.objects(accept_jobs__contains='gpu').all()

    # Known GPU records indexed as {machine name: {gpu order: Gpu}}.
    gpus_dict = defaultdict(dict)
    for gpu in Gpu.objects().all():
        gpus_dict[gpu['machine']['name']][gpu['order']] = gpu

    not_init_machines = [m for m in machines if m['name'] not in gpus_dict]
    init_gpu(not_init_machines)
    # NOTE(review): gpus_dict is not reloaded after init_gpu; if init_gpu
    # creates Gpu records, those machines presumably fail the lookup below on
    # this pass and are picked up on the next one - confirm.

    updated_machines = []
    for m in machines:
        try:
            m['available_gpus'] = []
            # Copy so the shared available_ports object is never aliased by a
            # machine record.
            m['ports'] = list(available_ports)
            r = requests.get(
                "http://%s/gpu/status/json" % m['plugin'],
                timeout=timeout,
            ).json()
            # Assumes the position of a device in the report equals the
            # stored Gpu 'order' - TODO confirm.
            for i, g in enumerate(r['Devices']):
                _gpu = gpus_dict[m['name']][i]
                _gpu['power'] = g['Power']
                _gpu['temperature'] = g['Temperature']
                if g['Processes']:
                    _gpu['processes'] = ','.join(
                        [str(p['PID']) for p in g['Processes']])
                else:
                    # No process on the device: it can take a new task.
                    m['available_gpus'].append(_gpu['path'])
                    _gpu['processes'] = ''
                _gpu['last_update'] = datetime.now()
                _gpu.save()

            # Resources held by running tasks are not available either.
            start_tasks = GpuTask.objects(
                running_machine=m['name'], status='running').all()
            for task in start_tasks:
                m['available_gpus'] = list(
                    set(m['available_gpus']) - set(task['running_gpu']))
                m['ports'] = list(set(m['ports']) - {task['mount_port']})

            m['gpu_last_update'] = datetime.now()
            m.save()
            updated_machines.append(m)
        except Exception as e:
            # Best effort per machine; keep the traceback so the failing step
            # (HTTP call, unexpected payload, missing Gpu record) is visible.
            logging.exception(e)
    return updated_machines
def update_task(timeout=10):
    """Synchronise every ``running`` task with the state of its container.

    For each task with status ``running`` the container log is fetched from
    ``http://<host>/containers/<id>/logs`` and stored in the task's
    ``GpuTaskLog``. The container state is then fetched from
    ``http://<host>/containers/<id>/json`` and copied into the task's
    ``status``. When the container is no longer ``running``,
    ``check_result`` is called with the task and its log.

    Any error raised while processing a task is stored in the task's
    ``error_log``; on success ``error_log`` is cleared. ``update_time`` is
    set and the task is saved in both cases.

    Args:
        timeout: Seconds to wait for each HTTP request. Without a timeout one
            unreachable host would block the update of all other tasks.
    """
    running_tasks = GpuTask.objects(status='running').all()
    machine_dict = {m['name']: m for m in Machine.objects().all()}

    for task in running_tasks:
        try:
            host = machine_dict[task['running_machine']]['host']
            # Only the first 12 characters of the stored id are sent.
            container = task['running_id'][:12]

            log_request = requests.get(
                "http://%s/containers/%s/logs?stdout=1&stderr=1"
                % (host, container),
                timeout=timeout,
            )
            task_log = GpuTaskLog.objects(gpu_mission_name=task['name']).first()
            if not task_log:
                # A task without a log document would otherwise fail here on
                # every pass and never get its status updated.
                task_log = GpuTaskLog(gpu_mission_name=task['name'])
            task_log['running_log'] = log_request.text
            task_log.save()

            r = requests.get(
                "http://%s/containers/%s/json" % (host, container),
                timeout=timeout,
            ).json()
            task['status'] = r['State']['Status']
            if task['status'] != 'running':
                check_result(task, task_log)
            task['error_log'] = ''
        except Exception as e:
            # Keep the failure on the task itself and continue with the rest.
            task['error_log'] = str(e)
        task['update_time'] = datetime.now()
        task.save()
def stop_daemon(machine_name, container_name):
    """Stop a task's container on a machine and mark the task as aborted.

    Args:
        machine_name: Name of the ``Machine`` hosting the container.
        container_name: Name of the ``GpuTask``; it is also used as the
            container identifier in the stop request.

    Returns:
        The ``jsonify`` response. ``code`` is 404 (with a ``msg``) when the
        machine or the task is unknown; otherwise it is the HTTP status code
        returned by the stop request. The task is saved as
        ``manual_aborted`` only when that status code is below 400.

    Raises:
        requests.RequestException: If the stop request fails or times out.
    """
    machine = Machine.objects(name=machine_name).first()
    if not machine:
        return jsonify({
            "code": 404,
            "msg": "do not find machine %s" % machine_name
        })

    gpu_mission = GpuTask.objects(name=container_name).first()
    if not gpu_mission:
        return jsonify({
            "code": 404,
            "msg": "do not find gpu mission %s" % container_name
        })

    # (connect, read) timeouts: fail fast on an unreachable host, but leave
    # room for the stop call itself, which presumably blocks until the
    # container has shut down - confirm against the daemon's grace period.
    r = requests.post(
        "http://{}/containers/{}/stop".format(machine['host'], container_name),
        timeout=(5, 60),
    )
    if r.status_code < 400:
        # One timestamp so update_time and finish_time cannot drift apart.
        now = datetime.now()
        gpu_mission['status'] = 'manual_aborted'
        gpu_mission['update_time'] = now
        gpu_mission['finish_time'] = now
        gpu_mission.save()
    return jsonify({"code": r.status_code})