        i1 = ref_ip[0].reshape([dataset, -1])
        i2 = ref_ip[1].reshape([dataset, -1])
        o_pred = ref_op[0].reshape([dataset, -1])
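        # recompute the expected result on the host; the kernel under test is
        # assumed to compute the matrix product of the two input buffers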
        o_act = i1.dot(i2)
        print o_pred
        print o_act
        # collapse the element-wise comparison into a single pass/fail verdict
        return np.allclose(o_pred, o_act)  # assumes numpy is available as np


if __name__ == "__main__":
    args = parse_arg(sys.argv[1:])
    src_name = args.file.split("/")[-1]
    s_name = src_name[:-5]  # kernel name (file name without the ".json" extension)
    if args.log:
        logging.basicConfig(level=logging.DEBUG)
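    # host_initialize is expected to return the OpenCL command queues, contexts
    # and device lists for the requested number of GPUs and CPUs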
    cmd_qs, ctxs, gpus, cpus = fw.host_initialize(int(args.nGPU),
                                                  int(args.nCPU))
    info_file = args.file
    with open(info_file, "r") as f:
        info = json.loads(f.read())
    dataset = int(args.dataset_size)

    st = time.time()
    if args.dump_output_file:
        fw.dump_output = True
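    # prefer the partition class given on the command line; otherwise fall back
    # to the one recorded in the kernel's info file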
    if args.partition_class is not None:
        partition = int(args.partition_class)
        kernel = fw.Kernel(info, dataset=dataset, partition=partition)
    else:
        kernel = fw.Kernel(info, dataset=dataset)
        partition = info['partition']
Example #2
    return task


if __name__ == '__main__':
    args = parse_arg(sys.argv[1:])

    if args.recreate_dag:
        fw.create_dag("./database/info/",
                      "./dag_info/dag_transformer/dag.graph",
                      "./dag_info/dag_transformer/t1.json",
                      partition=10)

    num_chunks = int(args.num_chunks)
    fw.just_for_testing_num_chunks = num_chunks


    info_file = args.file
    cmd_qs, ctxs, gpus, cpus = fw.host_initialize(int(args.nGPU), int(args.nCPU),
                                                  use_mul_queues=True)


    #Dags_folder = list()
    all_dags = [] #list of all the DAGs

    finished_task_Dag = dict()
    deleted_task_dag = list()

    # one JSON file per DAG
    all_dags_jsons = [join(info_file, f) for f in listdir(info_file)]
    gantt_label = [join(info_file, f) for f in listdir(info_file)]
    gantt = 0
    # count  = 0
    # count1 = 0
    # task_dag_id = 0
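    # frontier_Q presumably holds the DAG tasks whose dependencies have all
    # completed and that are therefore ready to be dispatched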
    frontier_Q = fw.frontier_Q
Example #3
            if local_work_size > dataset:
                continue
            #logging.info("\t\tLocal Work Size :- "+str(local_work_size))
            print "dataset size :- ", dataset
            for partition in [0, 10]:
                with open(info_file, "r") as f:
                    info = json.loads(f.read())
                    info['localWorkSize'] = [local_work_size] * info['workDimension']

                if local_work_size >= lims[info['workDimension'] - 1]:
                    break
                elif info['workDimension'] == 3 and dataset > 256:
                    break

                cmd_qs, ctxs, gpus, cpus = fw.host_initialize(1, 1)
                if partition == 0:
                    dev = "CPU"
                else:
                    dev = "GPU"
                print dev

                kernel = fw.Kernel(info, dataset=dataset, partition=partition)
                kernel.build_kernel(gpus, cpus, ctxs, profile=True)
                kernel.random_data()

                ref_ip = []
                ref_op = []
                ref_iop = []
                for i, ip in enumerate(kernel.data["input"]):
                    ref_ip.append(weakref.proxy(ip))
Example #4
def select_main(kernels, select=baseline_select):
    cmd_qs, ctxs, gpus, cpus = fw.host_initialize(4, 2)
    aCPU, aGPU = fw.nCPU, fw.nGPU
    cpu_q, gpu_q, mixed_q, mixedg_q, mixedc_q = [], [], [], [], []
    num_dispatched = 0
    num_kernels = len(kernels)
    rCPU, rGPU = 0, 0
    num_global_work_items_max = 0
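    # Sort the kernels into heaps by partition class: 0 means CPU-only, 10 means
    # GPU-only, and intermediate values split work across both devices (classes
    # of 5 and above lean towards the GPU). Each heap key negates the global
    # work size so that the largest kernels are popped first.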
    for i in range(len(kernels)):
        kernels[i].build_kernel(gpus, cpus, ctxs)
        kernels[i].random_data()
        p = kernels[i].partition
        num_global_work_items = kernels[i].get_num_global_work_items()
        if num_global_work_items > num_global_work_items_max:
            num_global_work_items_max = num_global_work_items

        if p == 0:
            heapq.heappush(cpu_q, (p, -num_global_work_items, i))
            rCPU += 1
        elif p == 10:
            heapq.heappush(gpu_q, (-p, -num_global_work_items, i))
            rGPU += 1
        elif p >= 5:
            heapq.heappush(mixedg_q, (abs(p-5), -num_global_work_items, i))
            heapq.heappush(mixed_q, (abs(p-5), -num_global_work_items, i))
            rCPU += 1
            rGPU += 1
        else:
            heapq.heappush(mixedc_q, (abs(p-5), -num_global_work_items, i))
            heapq.heappush(mixed_q, (abs(p-5), -num_global_work_items, i))
            rCPU +=1
            rGPU +=1

    logging.debug( "Task Queue Stats")
    logging.debug(rCPU)
    logging.debug(rGPU)

    logging.debug( "CPU " + str(len(cpu_q)))
    logging.debug( "GPU " + str(len(gpu_q)))
    logging.debug( "Mixed " + str(len(mixed_q)))
    logging.debug( "Mixed CPU " +str(len(mixedc_q)))
    logging.debug( "Mixed GPU " +str(len(mixedg_q)))


    events = []
    now, soon = None, None
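    # Dispatch loop: keep selecting and launching kernels while any task queue
    # still has entries or kernels remain undispatched, then wait on the
    # outstanding device events.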
    while ((len(cpu_q) > 0 or len(gpu_q) > 0 or len(mixed_q) > 0 or num_dispatched != num_kernels)
           and (len(cpu_q) > 0 or len(gpu_q) > 0 or len(mixedc_q) > 0
                or len(mixedg_q) > 0 or num_dispatched != num_kernels)):

        logging.debug( "READY DEVICES")
        logging.debug( "GPU " + str(len(fw.ready_queue['gpu'])))
        logging.debug( "CPU " + str(len(fw.ready_queue['cpu'])))
        logging.debug( "Number of tasks left")
        logging.debug( "Mixed Queue " + str(len(mixed_q)))
        logging.debug( "CPU Queue " + str(len(cpu_q)))
        logging.debug( "GPU Queue " + str(len(gpu_q)))
        logging.debug( "Mixed CPU " + str(len(mixedc_q)))
        logging.debug( "Mixed GPU " + str(len(mixedg_q)))

        logging.debug( "Number of available devices (CPU and GPU) " + str(fw.nCPU) + " " + str(fw.nGPU))
        if fw.nCPU > 0 or fw.nGPU > 0:
            logging.debug( "Entering selection phase")
            if soon is None:
                if select is baseline_select or select is look_ahead_select:

                    now, soon = select(kernels, M=mixed_q, G=gpu_q, C=cpu_q)
                else:

                    now, soon = select(kernels, M1=mixedc_q, M2=mixedg_q, G=gpu_q, C=cpu_q, feat_max=num_global_work_items_max, threshold=0.4)
            else:
                now, soon = soon, None

            if now is None:
                logging.debug( "Searching for available devices")
                continue
            i, p = now
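            # If there are more free devices of each kind than tasks that still
            # need them, give this kernel several devices at once via
            # dispatch_multiple; otherwise dispatch it on a single CPU/GPU pair
            # as dictated by its partition class.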
            if fw.nCPU > rCPU and fw.nGPU > rGPU:
                logging.debug( "DISPATCH MULTIPLE")

                g_factor = fw.nGPU if rGPU == 0 else fw.nGPU / rGPU
                c_factor = fw.nCPU if rCPU == 0 else fw.nCPU / rCPU
                free_gpus, free_cpus = [], []
                # try:
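                # fw.test_and_set appears to implement a spin lock (released by
                # setting fw.rqlock[0] = 0) that guards the shared ready queues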
                while fw.test_and_set(0,1):
                    pass
                if p == 0:
                    for j in range(c_factor):
                        free_cpus.append(fw.ready_queue['cpu'].popleft())
                elif p == 10:
                    for j in range(g_factor):
                        free_gpus.append(fw.ready_queue['gpu'].popleft())
                else:
                    for j in range(c_factor):
                        free_cpus.append(fw.ready_queue['cpu'].popleft())
                    for j in range(g_factor):
                        free_gpus.append(fw.ready_queue['gpu'].popleft())
                fw.rqlock[0] = 0
                # except:
                #     logging.debug( free_cpus, free_gpus, framework.ready_queue, framework.nCPU, framework.nGPU, c_factor, g_factor

                if kernels[i].partition == 0:
                    rCPU -= 1
                elif kernels[i].partition == 10:
                    rGPU -= 1
                else:
                    rCPU -= 1
                    rGPU -= 1

                kernels[i].partition = p
                # kernels[i].build_kernel(gpus, cpus, ctxs)
                # kernels[i].random_data()
                logging.debug( "Dispatching Multiple " + str(kernels[i].name))
                start_time, done_events = kernels[i].dispatch_multiple(free_gpus, free_cpus, ctxs, cmd_qs)
                events.extend(done_events)
                num_dispatched += 1
            # if False:
            #     pass
            else:
                logging.debug( "DISPATCH")
                cpu, gpu = -1, -1
                if p == 0:
                    while fw.test_and_set(0, 1):
                        pass
                    cpu = fw.ready_queue['cpu'].popleft()
                    fw.rqlock[0] = 0
                elif p == 10:
                    while fw.test_and_set(0, 1):
                        pass
                    gpu = fw.ready_queue['gpu'].popleft()
                    fw.rqlock[0] = 0
                else:
                    while fw.test_and_set(0, 1):
                        pass
                    cpu = fw.ready_queue['cpu'].popleft()
                    gpu = fw.ready_queue['gpu'].popleft()
                    fw.rqlock[0] = 0

                if kernels[i].partition == 0:
                    rCPU -= 1
                elif kernels[i].partition == 10:
                    rGPU -= 1
                else:
                    rCPU -=1
                    rGPU -=1

                kernels[i].partition = p

                logging.debug( "Dispatching " + str(kernels[i].name) + " with partition class " + str(kernels[i].partition))
                start_time, done_events = kernels[i].dispatch(gpu, cpu, ctxs, cmd_qs)

                events.extend(done_events)
                num_dispatched +=1
        else:
            logging.debug( "Devices unavailable")
    fw.host_synchronize(cmd_qs, events)

    return fw.dump_device_history()
Example #5
                        help='Flag for dumping output file for a kernel',
                        action="store_true")
    parser.add_argument('-lf',
                        '--layer_forward',
                        help='Test Layer ID Forward',
                        default=0)
    parser.add_argument('-lb',
                        '--layer_backward',
                        help='Test Layer ID Backward',
                        default=0)

    return parser.parse_args(args)


args = parse_arg(sys.argv[1:])
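# Initialise the OpenCL host once at import time; CLHOST presumably bundles the
# command queues, contexts and device lists returned by host_initialize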
CLHOST = fw.host_initialize(int(args.nGPU), int(args.nCPU))


class LocalController(object):
    def __init__(self):
        pass


class Transforms(object):
    def __init__(self):
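        # Per-kernel launch configuration: preprocessor macros, work sizes, and
        # the sizes/chunking of the input and output buffers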
        self.macros = dict()
        self.local_work_size = []
        self.global_work_size = []
        self.buffer_sizes = {'input': [], 'output': []}
        self.buffer_chunks = {'input': [], 'output': []}
        self.local_buffer_sizes = []