def prepare_job_start_events(self):
    '''
    Seed self.job_events with one "start" entry per job in self.job_list.

    Jobs sharing the same 'submit_time' are grouped into a single event
    dict of the form {'time', 'start_jobs', 'end_jobs'}. Only start events
    are created here; 'end_jobs' is left empty (per the original note, end
    events are added later, when jobs actually start).

    Side effects: every job gets job['status'] = 'EVENT', self.job_events
    is sorted by 'time', and the resulting events are printed.
    '''
    for job in self.job_list:
        start_t = job['submit_time']
        # presumably returns the event whose 'time' equals start_t, or None
        # when there is no such event -- verify against util.search_dict_list
        event = util.search_dict_list(self.job_events, 'time', start_t)
        if event is None:
            # first job seen at this timestamp: open a new event for it
            event = {'time': start_t, 'start_jobs': [], 'end_jobs': []}
            self.job_events.append(event)
        event['start_jobs'].append(job)
        job['status'] = 'EVENT'  # job has been placed into an event

    # keep events in chronological order
    self.job_events.sort(key=lambda e: e['time'])
    util.print_fn('Init, add job start events')
    self.print_job_events()
def get_model(model_name):
    '''
    Look up the model description for model_name.

    The position of the name in the table below is used as the index into
    the module-level m_names / m_tensors / m_mem tables. An unrecognised
    name falls back to index 8 (the slot matched by 'inception3') and a
    notice is printed.

    Returns a dict with keys 'name', 'ind', 'tensors' and 'mem_util'.
    '''
    known_models = ('vgg19', 'vgg16', 'vgg11', 'alexnet', 'resnet152',
                    'resnet101', 'resnet50', 'inception4', 'inception3')
    m_idx = next(
        (pos for pos, known in enumerate(known_models) if model_name == known),
        None)
    if m_idx is None:
        # no match: use the fixed default slot rather than a random one
        m_idx = 8
        util.print_fn('No model match, pick %s' % m_names[m_idx])

    return {
        'name': m_names[m_idx],
        'ind': m_idx,
        'tensors': m_tensors[m_idx],
        'mem_util': m_mem[m_idx]
    }
def get_network_load(self, job_dict):
    '''
    Fill in the per-worker and per-parameter-server network load of a job.

    Reads job_dict['num_gpu'] and job_dict['model'] (a dict with
    'total_size' and 'tensors') and writes, in place:
      - 'w_network':  one entry per worker, each equal to model['total_size']
      - 'ps_network': one entry per parameter server; tensors are dealt out
                      round-robin and each total is rounded to 1 decimal
    The number of parameter servers equals the number of workers.

    A single-GPU job gets an empty 'ps_network', 'w_network' == [0] and
    'ps_ave' == 0. If 'num_gpu' or 'model' is missing, a message is printed
    and nothing is written. Always returns None.

    NOTE(review): the original text of this function ended in an
    unterminated triple-quote, so its tail looks truncated. For multi-GPU
    jobs this visible part never sets 'ps_ave' -- confirm against the
    complete source before relying on that key.
    '''
    if 'num_gpu' not in job_dict:
        util.print_fn('No gpu information')
        return
    if 'model' not in job_dict:
        util.print_fn('No model information')
        return

    num_w = job_dict['num_gpu']
    num_ps = num_w
    if num_w == 1:
        # single worker: nothing goes over the network
        job_dict['ps_network'] = []
        job_dict['w_network'] = [0]
        job_dict['ps_ave'] = 0
        return

    model = job_dict['model']
    job_dict['w_network'] = [model['total_size']] * num_w

    # deal tensors out to parameter servers round-robin
    ps_load = [0] * num_ps
    for i, tensor_size in enumerate(model['tensors']):
        ps_load[i % num_ps] += tensor_size
    job_dict['ps_network'] = [round(load, 1) for load in ps_load]
def get_job_model(self, job_dict):
    '''
    Attach the scaled model description to job_dict['model'].

    Requires both 'model_name' and 'model_scale' in job_dict; when either
    is missing a message is printed and job_dict is left untouched.
    '''
    has_model_info = 'model_name' in job_dict and 'model_scale' in job_dict
    if not has_model_info:
        util.print_fn('Not enough model information to get the details')
        return

    name = job_dict['model_name']
    scale = job_dict['model_scale']
    job_dict['model'] = models.get_model_with_scale(name, scale)
def __init__(self, id, num_node=0, num_gpu_p_node=0, num_cpu_p_node=0, mem_p_node=0):
    '''
    Record the switch id and the per-node resource figures, start with an
    empty node_list, and print a one-line summary.

    id              -- switch identifier (formatted with %d in the summary)
    num_node        -- number of nodes under this switch
    num_gpu_p_node  -- GPUs per node
    num_cpu_p_node  -- CPUs per node
    mem_p_node      -- memory per node
    '''
    self.num_node = num_node
    self.num_gpu_p_node = num_gpu_p_node
    self.num_cpu_p_node = num_cpu_p_node
    # The previous text chained "self.mem_p_node = mem_p_node = id", which
    # stored the id as the memory size and never set self.id.
    self.mem_p_node = mem_p_node
    self.id = id
    self.node_list = list()
    util.print_fn(' Switch[%d] has %d nodes' % (id, num_node))
def checkpoint_multi_dlas_gpu(self, event_time):
    '''
    Append one cluster/job snapshot row to self.log_list and flush it.

    Row layout:
        time, idle_node, busy_node, full_node, idle_gpu, busy_gpu,
        pending_job, running_job, completed_job

    idle_gpu is the sum of free_gpu over all groups in JOBS.gpu_job.
    busy_node is busy_gpu divided by GPUs-per-node, rounded up, and
    full_node is reported equal to busy_node. Only valid for the
    'multi-dlas-gpu' schedule; any other schedule prints an error and exits.
    '''
    if FLAGS.schedule != 'multi-dlas-gpu':
        util.print_fn("Error, not multi-dlas-gpu in checkpoint")
        exit()

    idle_gpu = sum(group.free_gpu for group in JOBS.gpu_job.values())
    busy_gpu = CLUSTER.num_gpu - idle_gpu
    busy_node = int(math.ceil(busy_gpu / CLUSTER.num_gpu_p_node))
    full_node = busy_node
    idle_node = int(CLUSTER.num_node - busy_node)

    # tally jobs by status; statuses other than these three are ignored
    tally = {'RUNNING': 0, 'PENDING': 0, 'END': 0}
    for job in JOBS.job_list:
        state = job['status']
        if state in tally:
            tally[state] += 1

    snapshot = [
        event_time,
        idle_node,
        busy_node,
        full_node,
        int(idle_gpu),
        int(busy_gpu),
        tally['PENDING'],
        tally['RUNNING'],
        tally['END'],
    ]
    self.log_list.append(snapshot)
    if len(self.log_list) >= 1:
        self.dump_all_logs()
def end_job(self, e_job):
    '''
    Retire a finished job from its GPU-size group.

    Releases one job's worth of GPUs in the group keyed by
    e_job['num_gpu'], removes the job from the group's runnable_jobs and
    from the queue indexed by e_job['q_id'], and bumps the group's end_job
    counter. Only valid for the 'multi-dlas-gpu' schedule; otherwise a
    message is printed and the process exits.
    '''
    if FLAGS.schedule != 'multi-dlas-gpu':
        util.print_fn("Not multi-dlas-gpu")
        exit()

    group = self.gpu_job[e_job['num_gpu']]
    group.release_job_gpu(1)
    group.runnable_jobs.remove(e_job)
    job_queue = group.queues[e_job['q_id']]
    job_queue.remove(e_job)
    group.end_job += 1
def sort_all_jobs(self, mode=None):
    '''
    Sort self.job_list in place by each job's 'submit_time'.

    When the schedule is 'multi-dlas-gpu' and the scheme is 'count', also
    print the total number of jobs in every GPU-size group. The mode
    argument is accepted but not used.
    '''
    self.job_list.sort(key=lambda job: job['submit_time'])
    util.print_fn(' Jobs are sorted with their start time')

    if FLAGS.schedule != 'multi-dlas-gpu' or FLAGS.scheme != 'count':
        return
    for gpu_count, group in self.gpu_job.items():
        util.print_fn('%d-GPU jobs have %d ' % (gpu_count, group.total_job))
def __init__(self, id, num_gpu=0, num_cpu=0, mem=0):
    '''
    Record the node id and its resources, with everything initially free,
    and print a one-line summary.

    id       -- node identifier (formatted with %d in the summary)
    num_gpu  -- number of GPUs on the node
    num_cpu  -- number of CPUs on the node
    mem      -- memory size (printed as "G memory")
    '''
    # The previous text had a bare "= id" here (a syntax error); the
    # assignment target is restored as self.id.
    self.id = id
    self.num_cpu = num_cpu
    self.free_cpus = num_cpu
    self.num_gpu = num_gpu
    self.free_gpus = num_gpu

    # network load: can be bandwidth or the amount of traffic;
    # in and out should be the same
    self.network_in = 0
    self.network_out = 0

    self.mem = mem
    self.free_mem = mem

    # bookkeeping used when the node is driven by the gandiva scheme
    self.job_gpu = 0
    self.num_jobs = 0

    util.print_fn(' Node[%d] has %d gpus, %d cpus, %d G memory' %
                  (id, num_gpu, num_cpu, mem))
def print_all_job_size_info(self):
    '''
    Dump per-job tensor/network size figures to CSV files in the current
    working directory.

    Files written (each starts with a single header row):
      ps_max_ave.csv    -- job['ps_max_ave'] for jobs with ps_ave != 0
      ps_max99_ave.csv  -- job['ps_max99_ave'] for jobs with ps_ave != 0
      w.csv             -- job['w_network'][0] for jobs with ps_ave != 0
      ps.csv, ps_w.csv  -- header only (no data rows are written here)

    All files are opened in one with-statement so they are closed even if
    a job is missing one of the expected keys.
    '''
    with open('ps_max_ave.csv', 'w+') as ps_max_ave_fd, \
            open('ps_max99_ave.csv', 'w+') as ps_max99_ave_fd, \
            open('w.csv', 'w+') as w_fd, \
            open('ps.csv', 'w+') as ps_fd, \
            open('ps_w.csv', 'w+') as ps_w_fd:
        ps_max_ave_writer = csv.writer(ps_max_ave_fd)
        ps_max_ave_writer.writerow(['ps_max_ave'])
        ps_max99_ave_writer = csv.writer(ps_max99_ave_fd)
        ps_max99_ave_writer.writerow(['ps_max99_ave'])
        w_writer = csv.writer(w_fd)
        w_writer.writerow(['w'])
        csv.writer(ps_fd).writerow(['ps'])
        csv.writer(ps_w_fd).writerow(['ps_w'])

        util.print_fn("Start to dump job information")
        for job in self.job_list:
            # jobs with ps_ave == 0 contribute no rows
            if job['ps_ave'] != 0:
                ps_max_ave_writer.writerow([job['ps_max_ave']])
                ps_max99_ave_writer.writerow([job['ps_max99_ave']])
                w_writer.writerow([job['w_network'][0]])
def print_job_events(self):
    '''
    Print one summary line per entry in self.job_events: the event time
    and how many jobs start and end at that time.
    '''
    util.print_fn(' Print all job events ')
    line_fmt = ' event.time[%d], with %d start_jobs, and %d end_jobs'
    for event in self.job_events:
        summary = (event['time'],
                   len(event['start_jobs']),
                   len(event['end_jobs']))
        util.print_fn(line_fmt % summary)
    util.print_fn(' ')
def init_log(self):
    '''
    Prepare the log directory and create every CSV log file with its
    header row.

    The directory comes from FLAGS.log_path (one trailing '/' is dropped)
    and is created together with any missing parents. Files created:
      cluster.csv  -- cluster snapshot columns; the 'gandiva' scheme uses
                      'fra_gpu' instead of 'idle_gpu' and adds len_g*
                      columns
      job.csv      -- per-job columns; layout depends on FLAGS.schedule
                      and FLAGS.scheme
      cpu.csv, gpu.csv, network.csv, memory.csv
                   -- only when FLAGS.scheme != 'count'; the first three
                      have one column (or in/out pair) per cluster node

    Sets self.log_path, self.log_file, self.log_job and, when the scheme
    is not 'count', self.log_cpu / log_gpu / log_network / log_mem.
    Existing files are truncated. Raises OSError if the directory cannot
    be created.
    '''
    # Local import so this method does not depend on the file's import
    # block already providing os.
    import os

    def write_header(path, header):
        # Create/truncate `path` and write a single CSV header row.
        with open(path, 'w+') as fd:
            csv.writer(fd).writerow(header)

    self.log_path = FLAGS.log_path
    # endswith() is also safe for an empty string, unlike indexing [-1]
    if self.log_path.endswith('/'):
        self.log_path = self.log_path[:-1]
    util.print_fn(self.log_path)
    util.print_fn(' ')

    # Create the folder without going through a shell, so paths holding
    # spaces or shell metacharacters are handled literally.
    if not os.path.isdir(self.log_path):
        os.makedirs(self.log_path)

    self.log_file = self.log_path + '/cluster.csv'
    self.log_job = self.log_path + '/job.csv'
    per_node_logs = FLAGS.scheme != 'count'
    if per_node_logs:
        self.log_cpu = self.log_path + '/cpu.csv'
        self.log_gpu = self.log_path + '/gpu.csv'
        self.log_network = self.log_path + '/network.csv'
        self.log_mem = self.log_path + '/memory.csv'

    # cluster-level log
    if FLAGS.scheme == 'gandiva':
        cluster_header = [
            'time', 'idle_node', 'busy_node', 'full_node', 'fra_gpu',
            'busy_gpu', 'pending_job', 'running_job', 'completed_job',
            'len_g1', 'len_g2', 'len_g4', 'len_g8', 'len_g16', 'len_g32',
            'len_g64'
        ]
    else:
        cluster_header = [
            'time', 'idle_node', 'busy_node', 'full_node', 'idle_gpu',
            'busy_gpu', 'pending_job', 'running_job', 'completed_job'
        ]
    write_header(self.log_file, cluster_header)

    # per-node resource logs
    if per_node_logs:
        node_ids = range(CLUSTER.num_node)
        write_header(self.log_cpu,
                     ['time'] + ['cpu' + str(i) for i in node_ids])
        write_header(self.log_gpu,
                     ['time'] + ['gpu' + str(i) for i in node_ids])

        # network log has an in/out column pair per node
        network_header = ['time']
        for i in node_ids:
            network_header.append('in' + str(i))
            network_header.append('out' + str(i))
        write_header(self.log_network, network_header)

        # memory log holds summary statistics, not per-node columns
        write_header(self.log_mem, ['time', 'max', '99th', '95th', 'med'])

    # job-level log
    if FLAGS.schedule == 'gpu-demands':
        job_header = [
            'time', '1-GPU', '2-GPU', '4-GPU', '8-GPU', '12-GPU',
            '16-GPU', '24-GPU', '32-GPU'
        ]
    elif FLAGS.scheme == 'count':
        job_header = [
            'time', 'job_id', 'num_gpu', 'submit_time', 'start_time',
            'end_time', 'executed_time', 'JCT', 'duration',
            'pending_time', 'preempt', 'resume', 'promote'
        ]
    else:
        job_header = [
            'time', 'job_id', 'num_gpu', 'submit_time', 'start_time',
            'end_time', 'executed_time', 'JCT', 'duration',
            'pending_time', 'preempt', 'promote'
        ]
    write_header(self.log_job, job_header)
def completion_check(self):
    '''
    Report every GPU-size group whose number of ended jobs differs from
    its total number of jobs. Prints one line per mismatching group and
    nothing when all groups agree.
    '''
    warning_fmt = ('!!!! Miss-match %d completed jobs with %d total jobs '
                   'in %d-GPU jobs')
    for gpu_count, group in self.gpu_job.items():
        if group.end_job == group.total_job:
            continue
        util.print_fn(warning_fmt %
                      (group.end_job, group.total_job, gpu_count))
def reserve_gpus(self, total_num):
    '''
    Re-divide total_num GPUs among the per-size job groups in self.gpu_job.

    Steps:
      1. Snapshot every group: GPUs in use, current demand, current size.
      2. If the total demand is zero (this includes having no groups at
         all), return without changing any reservation.
      3. Demand-based pass (groups in ascending demand order): each group
         with non-zero demand keeps its used GPUs and may gain extra ones
         up to its demand-proportional share, in whole multiples of the
         group's GPU size and limited by what is still free.
      4. Left-over pass (groups in ascending GPU-size order): while GPUs
         remain, hand them out one job-size at a time to groups whose
         demand still exceeds their reservation.
      5. Apply each reservation via get_gpu_reservation() and print a
         summary; exit if any group ends up with negative free_gpu.

    total_num -- total number of GPUs to distribute.
    Returns None.
    '''
    groups = []
    for num_gpu, gjob in self.gpu_job.items():
        groups.append({
            'num_gpu': num_gpu,
            'used_gpu': gjob.total_gpu - gjob.free_gpu,
            'demands': gjob.get_gpu_demands(),
            'cur_gpu': gjob.total_gpu,
            'cur_free_gpu': gjob.free_gpu,
            'reserve': 0,
        })

    total_free_gpu = total_num - sum(g['used_gpu'] for g in groups)
    total_demands = sum(g['demands'] for g in groups)
    if total_demands == 0:
        # nothing to do; also the path taken when there are no groups
        return

    # --- demand-based pass: keep current used_gpu, add a fair share ---
    remain_free_gpu = total_free_gpu
    groups.sort(key=lambda g: g['demands'])
    for group in groups:
        if group['demands'] == 0:
            continue

        size = group['num_gpu']
        ratio = round((group['demands'] * 1.0) / total_demands, 2)
        # proportional share, rounded down to a multiple of the job size
        cal_gpu = int(math.floor((ratio * total_num) / size) * size)
        # never plan for more than the group actually demands
        cal_gpu = min(group['demands'], cal_gpu)

        extra_gpu = cal_gpu - group['used_gpu']
        if extra_gpu <= 0:
            extra_gpu = 0
        elif extra_gpu > remain_free_gpu:
            # not enough left: take as many whole jobs as still fit
            extra_gpu = int(math.floor(remain_free_gpu / size) * size)

        group['reserve'] = group['used_gpu'] + extra_gpu
        remain_free_gpu -= extra_gpu

    # --- left-over pass: give what remains to groups still in need ---
    groups.sort(key=lambda g: g['num_gpu'])
    num_full = 0
    while remain_free_gpu > 0:
        # stop once a whole sweep found no group able to take more
        if num_full >= len(groups):
            break
        num_full = 0
        for group in groups:
            if group['demands'] <= group['reserve']:
                num_full += 1
                continue
            if remain_free_gpu >= group['num_gpu']:
                remain_free_gpu -= group['num_gpu']
                group['reserve'] += group['num_gpu']
            else:
                # still in need, but one more job does not fit
                num_full += 1
            if remain_free_gpu <= 0:
                break

    # --- execute reservation ---
    for group in groups:
        num_gpu = group['num_gpu']
        self.gpu_job[num_gpu].get_gpu_reservation(group['reserve'])
        print("%d-j, T%d, F%d, U%d, N%d, R%d; " %
              (group['num_gpu'], group['cur_gpu'], group['cur_free_gpu'],
               group['used_gpu'], group['demands'], group['reserve']),
              end=' ')

    for num_gpu, gjob in self.gpu_job.items():
        if gjob.free_gpu < 0:
            print("Error free gpu, %d" % num_gpu)
            exit()

    util.print_fn(' %s is done' % sys._getframe().f_code.co_name)
def release_job_gpu(self, num_job=1):
    '''
    Give back the GPUs held by num_job jobs of this group.

    Adds int(self.num_gpu * num_job) to self.free_gpu. A negative num_job
    prints an error and exits the process.
    '''
    if num_job < 0:
        util.print_fn("Error: num_job < 0")
        exit()

    released = int(self.num_gpu * num_job)
    self.free_gpu = self.free_gpu + released