Python exprefixsumNumbaの例、MyML.helper.scan.exprefixsumNumba Pythonの例

コード例 #1

0

ファイルを表示

def prescan_test():
    """Launch ``scan.prescan`` on a 2048-element int32 array (2 blocks).

    Host-side references are computed with ``scan.exprefixsumNumba`` for the
    whole array and for each 1024-element half.  The kernel is then launched
    with 2 blocks of 512 threads and the device buffers are copied back.

    Nothing is asserted or returned; the locals are presumably meant to be
    inspected in a debugger, with the final print marking the end of the run.
    """
    n_blocks = 2
    threads_per_block = 512
    # The launch below sizes shared memory for twice as many elements as
    # there are threads in a block.
    elems_per_block = 2 * threads_per_block

    a = np.arange(n_blocks * elems_per_block).astype(np.int32)
    reference = np.empty_like(a)

    # Reference over the whole array.
    ref_sum = scan.exprefixsumNumba(a, reference)

    # References over each half, i.e. the slice each block is launched on.
    a1 = np.arange(elems_per_block).astype(np.int32)
    a2 = np.arange(elems_per_block, 2 * elems_per_block).astype(np.int32)

    ref1 = np.empty_like(a1)
    ref2 = np.empty_like(a2)

    ref_sum1 = scan.exprefixsumNumba(a1, ref1)
    ref_sum2 = scan.exprefixsumNumba(a2, ref2)

    # One auxiliary slot per block.
    dAux = cuda.device_array(n_blocks, dtype=np.int32)
    dA = cuda.to_device(a)

    # Dynamic shared memory, in bytes.
    sm_size = elems_per_block * a.dtype.itemsize

    scan.prescan[n_blocks, threads_per_block, 0, sm_size](dA, dAux)

    aux = dAux.copy_to_host()
    a_gpu = dA.copy_to_host()

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("finish")

コード例 #2

0

ファイルを表示

ファイル: test_scan.py プロジェクト: Chiroptera/ThesisWriting

def prescan_test():
    """Exercise the ``scan.prescan`` kernel on two full blocks of data.

    Builds a 2048-element int32 ramp, computes CPU references with
    ``scan.exprefixsumNumba`` (whole array and each 1024-element half),
    launches ``scan.prescan`` with 2 blocks of 512 threads, and copies the
    data array and the 2-element auxiliary array back to the host.

    The function neither asserts nor returns anything; the intermediate
    values are presumably kept for manual inspection.
    """
    a = np.arange(2048).astype(np.int32)
    reference = np.empty_like(a)

    ref_sum = scan.exprefixsumNumba(a, reference)

    # Same data split in the two halves handled by the two blocks.
    a1 = np.arange(1024).astype(np.int32)
    a2 = np.arange(1024, 2048).astype(np.int32)

    ref1 = np.empty_like(a1)
    ref2 = np.empty_like(a2)

    ref_sum1 = scan.exprefixsumNumba(a1, ref1)
    ref_sum2 = scan.exprefixsumNumba(a2, ref2)

    dAux = cuda.device_array(2, dtype=np.int32)
    dA = cuda.to_device(a)

    # Shared memory for 1024 elements per block, in bytes.
    sm_size = 1024 * a.dtype.itemsize

    # Launch configuration: [blocks, threads per block, stream, shared bytes].
    scan.prescan[2, 512, 0, sm_size](dA, dAux)

    aux = dAux.copy_to_host()
    a_gpu = dA.copy_to_host()

    # print as a call works on both Python 2 and Python 3.
    print("finish")

コード例 #3

0

ファイルを表示

def get_new_graph(dest, weight, fe, od, mst, nod, nfe, ndest, nweight):
    """Fill the ``n*`` output arrays with the graph formed by the ``mst`` edges.

    For every edge id in ``mst`` the destination vertex is read from ``dest``
    and the other endpoint is obtained from ``binaryEdgeIdSearch``.  Each
    selected edge is stored twice, once per endpoint, in ``ndest`` and
    ``nweight``.

    ``nod`` is incremented in place (it is not reset here, so the caller is
    presumably expected to pass it zeroed) and ``nfe`` is filled by
    ``exprefixsumNumba(nod, nfe, init=0)``.

    Returns 0 on success.  Returns -1 as soon as ``binaryEdgeIdSearch``
    yields -1 for an edge; the output arrays may be partially written then.
    """
    n_selected = mst.size

    # Pass 1: count how many selected edges touch each vertex.
    for k in range(n_selected):
        edge_id = mst[k]
        dst_vertex = dest[edge_id]
        src_vertex = binaryEdgeIdSearch(edge_id, dest, fe, od)
        if src_vertex == -1:
            return -1
        nod[dst_vertex] += 1
        nod[src_vertex] += 1

    # Turn the per-vertex counts into each vertex's first-edge offset.
    exprefixsumNumba(nod, nfe, init=0)

    # Per-vertex write cursors, starting at the first-edge offsets.  Copied
    # element by element, as in the original, instead of calling nfe.copy().
    n_offsets = nfe.size
    cursor = np.empty(n_offsets, dtype=np.int32)
    for v in range(n_offsets):
        cursor[v] = nfe[v]

    # Pass 2: write both directions of every selected edge.
    for k in range(n_selected):
        edge_id = mst[k]

        dst_vertex = dest[edge_id]
        src_vertex = binaryEdgeIdSearch(edge_id, dest, fe, od)
        if src_vertex == -1:
            return -1

        src_slot = cursor[src_vertex]
        dst_slot = cursor[dst_vertex]

        ndest[src_slot] = dst_vertex
        ndest[dst_slot] = src_vertex

        w = weight[edge_id]
        nweight[src_slot] = w
        nweight[dst_slot] = w

        # Advance both cursors to the next free slot.
        cursor[src_vertex] += 1
        cursor[dst_vertex] += 1

    return 0

コード例 #4

0

ファイルを表示

def last_block_test():
    """Time ``scan.last_scan`` on one 1024-element block against the CPU scan.

    The CPU reference is produced by ``scan.exprefixsumNumba`` and timed with
    the host timer.  The kernel is launched once on the host array ``a`` and
    timed with CUDA events.  Both timings are printed, followed by whether
    ``a`` now equals the reference element-wise.
    """
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    scan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    # NOTE(review): -1 presumably tells the kernel not to store a block sum
    # in ``aux`` - confirm against scan.last_scan.
    auxidx = -1

    elb = a.size  # number of elements handled by the (only) block
    # Builtin int: the np.int alias was removed from NumPy.
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb  # smallest power of two >= elb
    # Floor division keeps the thread count an int on Python 3 as well.
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize  # dynamic shared memory, in bytes

    aux = np.empty(1, dtype=np.int8)

    # Unused afterwards; kept because it touches the device before the timed
    # section (presumably a warm-up - confirm).
    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    scan.last_scan[1, tlb, 0, sm_size](a, aux, auxidx, elb, startIdx)
    e2.record()
    # The stop event must have completed before the elapsed time is queried.
    e2.synchronize()

    # %-formatting reproduces the Python 2 print-statement output on both
    # major versions.
    print("CPU took:     %s  ms" % ((end - start) * 1000,))
    print("Kernel took:  %s  ms" % (cuda.event_elapsed_time(e1, e2),))

    # Extra parentheses so .all() applies to the comparison, not to print().
    print((a == reference).all())

コード例 #5

0

ファイルを表示

ファイル: test_scan.py プロジェクト: Chiroptera/ThesisWriting

def last_block_test():
    """Compare ``scan.last_scan`` on a single block with the CPU reference.

    A 1024-element int32 ramp is scanned on the CPU by
    ``scan.exprefixsumNumba`` (host-timed) and in place by the
    ``scan.last_scan`` kernel (timed with CUDA events).  The two timings and
    the result of the element-wise comparison are printed.
    """
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    scan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    # NOTE(review): looks like -1 means "no auxiliary output" - confirm.
    auxidx = -1

    elb = a.size
    # np.int no longer exists in current NumPy; the builtin is equivalent.
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb  # element count rounded up to a power of two
    # "/" would produce a float under Python 3, which is not a valid block
    # size; "//" gives the same int on both versions.
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize

    aux = np.empty(1, dtype=np.int8)

    # Never read again; the allocation happens before the events are
    # recorded, presumably so CUDA set-up cost is not measured - confirm.
    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    scan.last_scan[1, tlb, 0, sm_size](a, aux, auxidx, elb, startIdx)
    e2.record()
    # Wait for the stop event so the elapsed-time query is valid.
    e2.synchronize()

    # Same text as the old print statements, valid on Python 2 and 3.
    print("CPU took:     %s  ms" % ((end - start) * 1000,))
    print("Kernel took:  %s  ms" % (cuda.event_elapsed_time(e1, e2),))

    print((a == reference).all())

コード例 #6

0

ファイルを表示

ファイル: test_scan.py プロジェクト: Chiroptera/ThesisWriting

def recursive_big_scan_test():
    """Run ``scan.scan_gpu`` on 2,000,000 elements and compare with the CPU scan.

    Prints the total returned by ``scan.exprefixsumNumba``, the total
    returned by ``scan.scan_gpu``, both host-measured timings, and whether
    the device result matches the CPU reference element-wise.
    """
    print("running recursive scan test")

    n = int(2e6)

    # NOTE(review): the running sum of 0..n-1 exceeds the int32 range, so the
    # comparison presumably relies on both implementations wrapping the same
    # way - confirm.
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    sum_ref = scan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    dA = cuda.to_device(a)

    start2 = timer()
    total_sum = scan.scan_gpu(dA)
    # Kernel launches return before the device has finished; wait so the
    # host timer covers the whole device computation.
    cuda.synchronize()
    end2 = timer()

    # Overwrite ``a`` with the device result.
    dA.copy_to_host(ary=a)
    sum_gpu = total_sum.copy_to_host()

    # %-formatting reproduces the Python 2 print-statement output on both
    # major versions.
    print("sum_ref =  %s" % (sum_ref,))
    print("sum_gpu =  %s" % (sum_gpu,))

    print("CPU took:     %s  ms" % ((end - start) * 1000,))
    print("Kernel took:  %s  ms" % ((end2 - start2) * 1000,))

    print((a == reference).all())

コード例 #7

0

ファイルを表示

def recursive_big_scan_test():
    """Check ``scan.scan_gpu`` against ``scan.exprefixsumNumba`` on a large input.

    A 2,000,000-element int32 ramp is scanned on the CPU into ``reference``
    and on the device in place.  The two totals, the two host-measured
    durations and the outcome of the element-wise comparison are printed.
    """
    print("running recursive scan test")

    n = int(2e6)

    # NOTE(review): prefix sums of this ramp do not fit in int32; equality
    # below presumably depends on identical wrap-around on CPU and GPU.
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    sum_ref = scan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    dA = cuda.to_device(a)

    start2 = timer()
    total_sum = scan.scan_gpu(dA)
    # Block until queued device work is done; otherwise the host timer may
    # only measure the launch overhead.
    cuda.synchronize()
    end2 = timer()

    dA.copy_to_host(ary=a)  # device result replaces the input on the host
    sum_gpu = total_sum.copy_to_host()

    # Same output text as the former print statements, on Python 2 and 3.
    print("sum_ref =  %s" % (sum_ref,))
    print("sum_gpu =  %s" % (sum_gpu,))

    print("CPU took:     %s  ms" % ((end - start) * 1000,))
    print("Kernel took:  %s  ms" % ((end2 - start2) * 1000,))

    # Parenthesised so .all() is applied to the comparison result rather
    # than to the return value of print().
    print((a == reference).all())

コード例 #8

0

ファイルを表示

def recursive_step_by_step():
    """Run the multi-block scan one kernel at a time on 5000 elements.

    Steps, in order:
      1. ``scan.prescan`` on the whole 1024-element blocks,
      2. ``scan.last_scan`` on the remaining partial block,
      3. ``scan.last_scan`` on the auxiliary array ``dAux``,
      4. ``scan.scan_sum`` over the data array and ``dAux``.

    The device buffers are then copied back to the host.  Nothing is
    asserted or returned; the locals are presumably meant for inspection.
    """
    ## setup

    MAX_TPB = 512
    n = 5000

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    # CPU reference, kept for manual comparison.
    sum_ref = scan.exprefixsumNumba(a, reference, init=0)

    dA = cuda.to_device(a)

    ## scan
    in_ary = dA

    epb = MAX_TPB * 2  # elements per block
    whole_blocks = n // epb
    el_last_block = n % epb

    n_scans = whole_blocks
    if el_last_block != 0:
        n_scans += 1

    ## prescan

    dAux = cuda.device_array(shape=n_scans, dtype=np.int32)
    sm_size = epb * in_ary.dtype.itemsize

    scan.prescan[whole_blocks, MAX_TPB, 0, sm_size](in_ary, dAux)

    ## last (partial) block

    # Builtin int: the np.int alias was removed from NumPy.
    p2elb = int(np.ceil(np.log2(el_last_block)))
    p2_el_last_block = 2 ** p2elb  # the smallest number of elements that is power of 2
    tlb = p2_el_last_block >> 1  # number of threads in last block

    sm_size = p2_el_last_block * in_ary.dtype.itemsize

    startIdx = n - el_last_block
    auxIdx = n_scans - 1

    scan.last_scan[1, tlb, 0, sm_size](in_ary, dAux, auxIdx, el_last_block,
                                       startIdx)

    ## scan of the auxiliary array

    in_ary2 = dAux
    n2 = in_ary2.size

    # For the fixed n above dAux has 5 entries, so this branch always runs;
    # total_sum is only defined here.
    if n2 < MAX_TPB << 1:
        el_last_block2 = n2

        p2elb2 = int(np.ceil(np.log2(el_last_block2)))
        # Use this level's exponent (p2elb2).  The original reused p2elb
        # from the data-array level, which oversized the launch.
        p2_el_last_block2 = 2 ** p2elb2  # the smallest number of elements that is power of 2
        tlb2 = p2_el_last_block2 >> 1  # number of threads in last block

        total_sum = cuda.device_array(shape=1, dtype=np.int32)
        sm_size2 = p2_el_last_block2 * in_ary2.dtype.itemsize

        startIdx2 = 0
        auxIdx2 = 0

        scan.last_scan[1, tlb2, 0, sm_size2](in_ary2, total_sum, auxIdx2,
                                             el_last_block2, startIdx2)

    # NOTE(review): launched with tlb (the last block's thread count), which
    # equals MAX_TPB for this n - confirm that is the intended block size.
    scan.scan_sum[n_scans, tlb](in_ary, dAux)

    tIn = in_ary.copy_to_host()
    tAux = dAux.copy_to_host()
    tSum = total_sum.copy_to_host()

    # print as a call works on both Python 2 and Python 3.
    print("finish")

コード例 #9

0

ファイルを表示

ファイル: test_scan.py プロジェクト: Chiroptera/ThesisWriting

def recursive_step_by_step():
    """Walk through the recursive GPU scan manually for a 5000-element input.

    The function launches, in sequence, ``scan.prescan`` (full blocks),
    ``scan.last_scan`` (trailing partial block), ``scan.last_scan`` again on
    the auxiliary array, and ``scan.scan_sum``.  It then copies the data
    array, the auxiliary array and the one-element total back to the host.

    There are no assertions and no return value; the host copies are
    presumably intended to be examined by hand.
    """
    ## setup

    MAX_TPB = 512
    n = 5000

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    # CPU result to compare against by hand.
    sum_ref = scan.exprefixsumNumba(a, reference, init=0)

    dA = cuda.to_device(a)

    ## scan
    in_ary = dA

    epb = MAX_TPB * 2  # each block handles two elements per thread
    whole_blocks = n // epb
    el_last_block = n % epb

    # One scan per full block plus one for the remainder, if any.
    n_scans = whole_blocks
    if el_last_block != 0:
        n_scans += 1

    ## prescan

    dAux = cuda.device_array(shape=n_scans, dtype=np.int32)
    sm_size = epb * in_ary.dtype.itemsize

    scan.prescan[whole_blocks, MAX_TPB, 0, sm_size](in_ary, dAux)

    ## trailing partial block

    # np.int was removed from NumPy; the builtin int is the replacement.
    p2elb = int(np.ceil(np.log2(el_last_block)))
    p2_el_last_block = 2 ** p2elb  # the smallest number of elements that is power of 2
    tlb = p2_el_last_block >> 1  # number of threads in last block

    sm_size = p2_el_last_block * in_ary.dtype.itemsize

    startIdx = n - el_last_block
    auxIdx = n_scans - 1

    scan.last_scan[1, tlb, 0, sm_size](in_ary, dAux, auxIdx, el_last_block, startIdx)

    ## second level: scan the auxiliary array

    in_ary2 = dAux
    n2 = in_ary2.size

    # Always true for the constants above (n2 == 5); total_sum is created
    # only inside this branch.
    if n2 < MAX_TPB << 1:
        el_last_block2 = n2

        p2elb2 = int(np.ceil(np.log2(el_last_block2)))
        # Fixed copy-paste slip: size this launch from p2elb2, not from the
        # first level's p2elb.
        p2_el_last_block2 = 2 ** p2elb2  # the smallest number of elements that is power of 2
        tlb2 = p2_el_last_block2 >> 1  # number of threads in last block

        total_sum = cuda.device_array(shape=1, dtype=np.int32)
        sm_size2 = p2_el_last_block2 * in_ary2.dtype.itemsize

        startIdx2 = 0
        auxIdx2 = 0

        scan.last_scan[1, tlb2, 0, sm_size2](in_ary2, total_sum, auxIdx2, el_last_block2, startIdx2)

    # NOTE(review): the block size here is tlb, which happens to equal
    # MAX_TPB for n = 5000 - verify this is what scan_sum expects.
    scan.scan_sum[n_scans, tlb](in_ary, dAux)

    tIn = in_ary.copy_to_host()
    tAux = dAux.copy_to_host()
    tSum = total_sum.copy_to_host()

    # Single-argument print call behaves identically on Python 2 and 3.
    print("finish")