Example no. 1
0
 def dispatch_kernel(self):
     """Dispatch this object's kernel on device slot (0, 0) and time it.

     Forwards the configured per-device thread-coarsening factors to the
     kernel, blocks until every returned completion event has fired, and
     returns the elapsed wall-clock time in seconds (float) measured from
     the timestamp the dispatch call handed back.
     """
     started_at, pending_events = self.kernel.dispatch(
         0, 0, self.ctxs, self.cmd_qs,
         C_cpu=self.thread_coarsening_cpu,
         C_gpu=self.thread_coarsening_gpu)
     # Stop the clock only after all outstanding device work has finished.
     fw.host_synchronize(self.cmd_qs, pending_events)
     elapsed = datetime.datetime.now() - started_at
     return elapsed.total_seconds()
    kernel.get_data_types_and_shapes()

    logging.debug("Dispatching Kernel...")

    lw_callback = False

    start_time, done_events = kernel.dispatch(0,
                                              0,
                                              ctxs,
                                              cmd_qs,
                                              C_cpu=1,
                                              C_gpu=1)

    logging.debug("Waiting for events... \n")
    fw.host_synchronize(cmd_qs, done_events)

    sched_end_time = datetime.datetime.now()
    seconds = (sched_end_time - sched_start_time).total_seconds()
    # print("%s with partition %d and dataset %d ran %fs" % (info['name'], partition, dataset, seconds))

    if args.profile:
        dump = os.path.join("profiling", "dumps")
        if not os.path.exists(dump):
            os.makedirs(dump)
        with open(os.path.join(dump, name), "w") as f:
            f.write(str(seconds))

    if not lw_callback:
        dump_dev = fw.dump_device_history()
Example no. 3
0
def select_main(kernels, select=baseline_select):
    """Build, classify and dispatch every kernel in *kernels*.

    Each kernel is bucketed by its ``partition`` value p into priority
    queues: CPU-only (p == 0), GPU-only (p == 10), GPU-leaning mixed
    (p >= 5) or CPU-leaning mixed (p < 5).  Heap keys are ordered so that
    kernels closest to their preferred device class, then with the most
    global work items, pop first.  A scheduling loop then repeatedly asks
    *select* for the next ``(kernel index, partition)`` pair and dispatches
    it onto whatever devices are free, either spread over several devices
    ("dispatch multiple") when supply exceeds remaining demand, or onto a
    single CPU/GPU pair otherwise.

    Parameters
    ----------
    kernels : list
        Kernel objects exposing ``build_kernel``, ``random_data``,
        ``partition``, ``get_num_global_work_items``, ``dispatch`` and
        ``dispatch_multiple``.
    select : callable
        Scheduling policy; defaults to ``baseline_select``.  Policies other
        than ``baseline_select``/``look_ahead_select`` are called with the
        split mixed queues plus ``feat_max`` and ``threshold`` arguments.

    Returns
    -------
    Whatever ``fw.dump_device_history()`` returns after all dispatched
    events have been synchronized.
    """
    cmd_qs, ctxs, gpus, cpus = fw.host_initialize(4, 2)
    cpu_q, gpu_q, mixed_q, mixedg_q, mixedc_q = [], [], [], [], []
    num_dispatched = 0
    num_kernels = len(kernels)
    # Remaining demand: how many queued kernels still want a CPU / a GPU.
    rCPU, rGPU = 0, 0
    num_global_work_items_max = 0

    for i in range(len(kernels)):
        kernels[i].build_kernel(gpus, cpus, ctxs)
        kernels[i].random_data()
        p = kernels[i].partition
        num_global_work_items = kernels[i].get_num_global_work_items()
        if num_global_work_items > num_global_work_items_max:
            num_global_work_items_max = num_global_work_items

        # Negated work-item counts make the min-heaps pop largest loads first.
        if p == 0:  # CPU-only
            heapq.heappush(cpu_q, (p, -num_global_work_items, i))
            rCPU += 1
        elif p == 10:  # GPU-only
            heapq.heappush(gpu_q, (-p, -num_global_work_items, i))
            rGPU += 1
        elif p >= 5:  # mixed, GPU-leaning
            heapq.heappush(mixedg_q, (abs(p - 5), -num_global_work_items, i))
            heapq.heappush(mixed_q, (abs(p - 5), -num_global_work_items, i))
            rCPU += 1
            rGPU += 1
        else:  # mixed, CPU-leaning
            heapq.heappush(mixedc_q, (abs(p - 5), -num_global_work_items, i))
            heapq.heappush(mixed_q, (abs(p - 5), -num_global_work_items, i))
            rCPU += 1
            rGPU += 1

    logging.debug("Task Queue Stats")
    logging.debug(rCPU)
    logging.debug(rGPU)

    logging.debug("CPU " + str(len(cpu_q)))
    logging.debug("GPU " + str(len(gpu_q)))
    logging.debug("Mixed " + str(len(mixed_q)))
    logging.debug("Mixed CPU " + str(len(mixedc_q)))
    logging.debug("Mixed GPU " + str(len(mixedg_q)))

    events = []
    now, soon = None, None
    # Loop until every kernel has been dispatched and all queues drained.
    # (Both operands of the `and` share the num_dispatched clause, so the
    # loop cannot exit while kernels remain undispatched.)
    while ((cpu_q or gpu_q or mixed_q or num_dispatched != num_kernels)
           and (cpu_q or gpu_q or mixedc_q or mixedg_q
                or num_dispatched != num_kernels)):

        logging.debug("READY DEVICES")
        logging.debug("GPU " + str(len(fw.ready_queue['gpu'])))
        logging.debug("CPU " + str(len(fw.ready_queue['cpu'])))
        logging.debug("Number of tasks left")
        logging.debug("Mixed Queue " + str(len(mixed_q)))
        logging.debug("CPU Queue " + str(len(cpu_q)))
        logging.debug("GPU Queue " + str(len(gpu_q)))
        logging.debug("Mixed CPU " + str(len(mixedc_q)))
        logging.debug("Mixed GPU " + str(len(mixedg_q)))

        logging.debug("Number of available devices (CPU and GPU) "
                      + str(fw.nCPU) + " " + str(fw.nGPU))
        if fw.nCPU > 0 or fw.nGPU > 0:
            logging.debug("Entering selection phase")
            if soon is None:
                if select is baseline_select or select is look_ahead_select:
                    now, soon = select(kernels, M=mixed_q, G=gpu_q, C=cpu_q)
                else:
                    now, soon = select(kernels, M1=mixedc_q, M2=mixedg_q,
                                       G=gpu_q, C=cpu_q,
                                       feat_max=num_global_work_items_max,
                                       threshold=0.4)
            else:
                # A look-ahead pick from the previous round goes first.
                now, soon = soon, None

            if now is None:
                # Nothing schedulable right now; spin and re-select.
                logging.debug("Searching for available devices")
                continue
            i, p = now
            if fw.nCPU > rCPU and fw.nGPU > rGPU:
                # More free devices than remaining demand: give this kernel
                # several devices per class.
                logging.debug("DISPATCH MULTIPLE")

                # Floor division is required: the factors bound range()
                # below, and '/' yields a float on Python 3.
                g_factor = fw.nGPU if rGPU == 0 else fw.nGPU // rGPU
                c_factor = fw.nCPU if rCPU == 0 else fw.nCPU // rCPU
                free_gpus, free_cpus = [], []
                # Spin-acquire the ready-queue lock before popping devices.
                while fw.test_and_set(0, 1):
                    pass
                if p == 0:
                    for _ in range(c_factor):
                        free_cpus.append(fw.ready_queue['cpu'].popleft())
                elif p == 10:
                    for _ in range(g_factor):
                        free_gpus.append(fw.ready_queue['gpu'].popleft())
                else:
                    for _ in range(c_factor):
                        free_cpus.append(fw.ready_queue['cpu'].popleft())
                    for _ in range(g_factor):
                        free_gpus.append(fw.ready_queue['gpu'].popleft())
                fw.rqlock[0] = 0  # release the ready-queue lock

                # Retire this kernel's demand under its ORIGINAL class
                # (the class it was queued with, not the class p assigned now).
                if kernels[i].partition == 0:
                    rCPU -= 1
                elif kernels[i].partition == 10:
                    rGPU -= 1
                else:
                    rCPU -= 1
                    rGPU -= 1

                kernels[i].partition = p
                logging.debug("Dispatching Multiple " + str(kernels[i].name))
                start_time, done_events = kernels[i].dispatch_multiple(
                    free_gpus, free_cpus, ctxs, cmd_qs)
                events.extend(done_events)
                num_dispatched += 1
            else:
                # Normal path: one CPU and/or one GPU for this kernel.
                logging.debug("DISPATCH")
                cpu, gpu = -1, -1
                while fw.test_and_set(0, 1):
                    pass
                if p != 10:  # CPU-only or mixed needs a CPU
                    cpu = fw.ready_queue['cpu'].popleft()
                if p != 0:   # GPU-only or mixed needs a GPU
                    gpu = fw.ready_queue['gpu'].popleft()
                fw.rqlock[0] = 0  # release the ready-queue lock

                # Retire demand under the kernel's original class, as above.
                if kernels[i].partition == 0:
                    rCPU -= 1
                elif kernels[i].partition == 10:
                    rGPU -= 1
                else:
                    rCPU -= 1
                    rGPU -= 1

                kernels[i].partition = p

                logging.debug("Dispatching " + str(kernels[i].name)
                              + " with partition class "
                              + str(kernels[i].partition))
                start_time, done_events = kernels[i].dispatch(
                    gpu, cpu, ctxs, cmd_qs)

                events.extend(done_events)
                num_dispatched += 1
        else:
            logging.debug("Devices unavailable")
    fw.host_synchronize(cmd_qs, events)

    return fw.dump_device_history()