def dispatch_kernel(self):
    """Dispatch ``self.kernel`` once and return the elapsed time in seconds.

    The kernel is dispatched with ``0, 0`` as its first two arguments, using
    this object's contexts and command queues and its CPU/GPU thread
    coarsening factors. The call blocks in ``fw.host_synchronize`` until the
    events returned by the dispatch have been waited on.

    Returns:
        Seconds between the start time reported by ``kernel.dispatch`` and
        the moment ``fw.host_synchronize`` returns.
    """
    launched_at, pending_events = self.kernel.dispatch(
        0,
        0,
        self.ctxs,
        self.cmd_qs,
        C_cpu=self.thread_coarsening_cpu,
        C_gpu=self.thread_coarsening_gpu
    )
    fw.host_synchronize(self.cmd_qs, pending_events)

    # The clock is read only after synchronisation, so the interval covers
    # everything from dispatch start to completion of the wait.
    elapsed = datetime.datetime.now() - launched_at
    return elapsed.total_seconds()
# Dispatch a single kernel, wait for it to finish, and record how long the
# whole scheduling run took.
# NOTE(review): the block nesting below was reconstructed from flattened
# source text -- confirm that `if not lw_callback:` is not meant to sit
# inside `if args.profile:`.
kernel.get_data_types_and_shapes()
logging.debug("Dispatching Kernel...")
lw_callback = False
# Dispatch with 0, 0 as the first two arguments and coarsening factors of 1.
start_time, done_events = kernel.dispatch(0, 0, ctxs, cmd_qs, C_cpu=1, C_gpu=1)
logging.debug("Waiting for events... \n")
fw.host_synchronize(cmd_qs, done_events)
sched_end_time = datetime.datetime.now()
# Measured against sched_start_time (set outside this fragment), not against
# the start_time returned by kernel.dispatch above.
seconds = (sched_end_time - sched_start_time).total_seconds()
# print("%s with partition %d and dataset %d ran %fs" % (info['name'], partition, dataset, seconds))
if args.profile:
    # Write the elapsed seconds to profiling/dumps/<name>, creating the
    # directory on first use.
    dump = os.path.join("profiling", "dumps")
    if not os.path.exists(dump):
        os.makedirs(dump)
    with open(os.path.join(dump, name), "w") as f:
        f.write(str(seconds))
if not lw_callback:
    # lw_callback is hard-coded to False above, so this always runs.
    dump_dev = fw.dump_device_history()
def _take_ready_devices(n_cpus, n_gpus):
    """Pop ``n_cpus`` CPU entries and ``n_gpus`` GPU entries from ``fw.ready_queue``.

    Busy-waits on ``fw.test_and_set(0, 1)`` before touching the queues and
    resets ``fw.rqlock[0]`` afterwards. The reset happens in a ``finally``
    block so the lock is released even if a queue turns out to be empty and
    ``popleft`` raises. CPU entries are popped before GPU entries.

    Returns:
        A ``(cpus, gpus)`` pair of lists holding the popped entries.
    """
    taken_cpus, taken_gpus = [], []
    while fw.test_and_set(0, 1):
        pass
    try:
        for _ in range(n_cpus):
            taken_cpus.append(fw.ready_queue['cpu'].popleft())
        for _ in range(n_gpus):
            taken_gpus.append(fw.ready_queue['gpu'].popleft())
    finally:
        fw.rqlock[0] = 0
    return taken_cpus, taken_gpus


def select_main(kernels, select=baseline_select):
    """Build, queue and dispatch every kernel, then return the device history.

    Each kernel is built, given random data and pushed onto a heap chosen by
    its ``partition`` value: 0 goes to the CPU queue, 10 to the GPU queue,
    and anything else to the mixed queue plus one of the two mixed
    sub-queues (``>= 5`` to the GPU-leaning one, ``< 5`` to the CPU-leaning
    one). The main loop then repeatedly asks ``select`` for the next
    ``(kernel index, partition)`` pair and dispatches it on devices taken
    from ``fw.ready_queue``.

    Args:
        kernels: Sequence of kernel objects. They must provide
            ``build_kernel``, ``random_data``, ``get_num_global_work_items``,
            ``dispatch``, ``dispatch_multiple``, ``partition`` and ``name``.
        select: Selection policy. ``baseline_select`` and
            ``look_ahead_select`` are called with ``M``, ``G`` and ``C``
            queues; any other policy is called with ``M1``, ``M2``, ``G``,
            ``C``, ``feat_max`` and ``threshold=0.4``. It must return a
            ``(now, soon)`` pair whose items are ``None`` or
            ``(index, partition)`` tuples.

    Returns:
        The result of ``fw.dump_device_history()`` after all dispatched
        events have been synchronised.
    """
    # NOTE(review): the meaning of the (4, 2) arguments is not visible here.
    cmd_qs, ctxs, gpus, cpus = fw.host_initialize(4, 2)
    cpu_q, gpu_q, mixed_q, mixedg_q, mixedc_q = [], [], [], [], []
    num_dispatched = 0
    num_kernels = len(kernels)
    # Number of not-yet-dispatched kernels that still need a CPU / a GPU.
    rCPU, rGPU = 0, 0
    num_global_work_items_max = 0

    for i, kernel in enumerate(kernels):
        kernel.build_kernel(gpus, cpus, ctxs)
        kernel.random_data()
        p = kernel.partition
        num_global_work_items = kernel.get_num_global_work_items()
        num_global_work_items_max = max(num_global_work_items_max,
                                        num_global_work_items)
        # The work-item count is negated so larger kernels sort first among
        # entries with the same leading priority.
        if p == 0:
            heapq.heappush(cpu_q, (p, -num_global_work_items, i))
            rCPU += 1
        elif p == 10:
            heapq.heappush(gpu_q, (-p, -num_global_work_items, i))
            rGPU += 1
        else:
            entry = (abs(p - 5), -num_global_work_items, i)
            heapq.heappush(mixedg_q if p >= 5 else mixedc_q, entry)
            heapq.heappush(mixed_q, entry)
            rCPU += 1
            rGPU += 1

    logging.debug("Task Queue Stats")
    logging.debug(rCPU)
    logging.debug(rGPU)
    logging.debug("CPU %s", len(cpu_q))
    logging.debug("GPU %s", len(gpu_q))
    logging.debug("Mixed %s", len(mixed_q))
    logging.debug("Mixed CPU %s", len(mixedc_q))
    logging.debug("Mixed GPU %s", len(mixedg_q))

    events = []
    now, soon = None, None
    # Two policy families drain different mixed queues (mixed_q versus
    # mixedc_q/mixedg_q), so the loop runs only while both views agree that
    # work remains.
    while ((cpu_q or gpu_q or mixed_q or num_dispatched != num_kernels)
           and (cpu_q or gpu_q or mixedc_q or mixedg_q
                or num_dispatched != num_kernels)):
        logging.debug("READY DEVICES")
        logging.debug("GPU %s", len(fw.ready_queue['gpu']))
        logging.debug("CPU %s", len(fw.ready_queue['cpu']))
        logging.debug("Number of tasks left")
        logging.debug("Mixed Queue %s", len(mixed_q))
        logging.debug("CPU Queue %s", len(cpu_q))
        logging.debug("GPU Queue %s", len(gpu_q))
        logging.debug("Mixed CPU %s", len(mixedc_q))
        logging.debug("Mixed GPU %s", len(mixedg_q))
        logging.debug("Number of available devices (CPU and GPU) %s %s",
                      fw.nCPU, fw.nGPU)

        if not (fw.nCPU > 0 or fw.nGPU > 0):
            logging.debug("Devices unavailable")
            continue

        logging.debug("Entering selection phase")
        if soon is None:
            if select is baseline_select or select is look_ahead_select:
                now, soon = select(kernels, M=mixed_q, G=gpu_q, C=cpu_q)
            else:
                now, soon = select(kernels, M1=mixedc_q, M2=mixedg_q,
                                   G=gpu_q, C=cpu_q,
                                   feat_max=num_global_work_items_max,
                                   threshold=0.4)
        else:
            # Use the candidate the previous selection deferred.
            now, soon = soon, None

        if now is None:
            # Nothing can be dispatched yet; poll again.
            logging.debug("Searching for available devices")
            continue

        i, p = now
        kernel = kernels[i]

        # When more devices are available than kernels still needing them,
        # give this kernel several devices instead of one.
        use_multiple = fw.nCPU > rCPU and fw.nGPU > rGPU
        if use_multiple:
            logging.debug("DISPATCH MULTIPLE")
            # Floor division: the factors are loop counts and must be ints
            # on Python 3 as well as Python 2.
            g_factor = fw.nGPU if rGPU == 0 else fw.nGPU // rGPU
            c_factor = fw.nCPU if rCPU == 0 else fw.nCPU // rCPU
            free_cpus, free_gpus = _take_ready_devices(
                0 if p == 10 else c_factor,
                0 if p == 0 else g_factor)
        else:
            logging.debug("DISPATCH")
            free_cpus, free_gpus = _take_ready_devices(
                0 if p == 10 else 1,
                0 if p == 0 else 1)

        # Drop the kernel's reservation using the partition it was queued
        # with, then adopt the partition chosen by the selection policy.
        if kernel.partition == 0:
            rCPU -= 1
        elif kernel.partition == 10:
            rGPU -= 1
        else:
            rCPU -= 1
            rGPU -= 1
        kernel.partition = p

        if use_multiple:
            logging.debug("Dispatching Multiple " + str(kernel.name))
            _, done_events = kernel.dispatch_multiple(
                free_gpus, free_cpus, ctxs, cmd_qs)
        else:
            # -1 is passed for a device class the kernel does not use.
            cpu = free_cpus[0] if free_cpus else -1
            gpu = free_gpus[0] if free_gpus else -1
            logging.debug("Dispatching " + str(kernel.name)
                          + " with partition class " + str(kernel.partition))
            _, done_events = kernel.dispatch(gpu, cpu, ctxs, cmd_qs)
        events.extend(done_events)
        num_dispatched += 1

    fw.host_synchronize(cmd_qs, events)
    return fw.dump_device_history()