def test_basic(self, PE, mat_A, mat_B, bias, post_scale=[1, 1],
               RELU_scale=[1, 0]):
     """Queue one FCN operation on ``PE``, run it and check the result.

     The output buffer is an int16 zero matrix of shape
     ``(mat_A.shape[0], mat_B.shape[1])``.  All four buffers are sent to the
     device, a single ``addFCNOp`` is queued with the two ``post_scale``
     entries and the two ``RELU_scale`` entries, the instruction stream is
     executed and cleared, and the fetched output is handed to
     ``self.multiply_and_cmp`` for verification.

     The list defaults are shared between calls; this method only reads them.
     """
     rows = mat_A.shape[0]
     inner = mat_A.shape[1]
     cols = mat_B.shape[1]

     print("test Fcn")
     print("test_basic: %d %d %d %d %d" %
           (rows, inner, cols, post_scale[0], post_scale[1]))
     # Max / min / mean of every input, in the order A, B, bias.
     for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print(label, np.amax(operand), np.amin(operand), np.average(operand))

     result = np.zeros((rows, cols), dtype=np.int16, order='C')
     # Transfer order is kept as A, B, output, bias.
     for buf in (mat_A, mat_B, result, bias):
         xfmlp.sendMat(buf, PE)

     xfmlp.addFCNOp(mat_A, mat_B, result, bias,
                    post_scale[0], post_scale[1],
                    RELU_scale[0], RELU_scale[1],
                    PE)
     xfmlp.execute(PE)
     xfmlp.clearInstrBuf(PE)
     xfmlp.getMat(result, PE)

     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols,
                           post_scale, RELU_scale)
def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """Chain two FCN operations in one instruction stream and verify both.

    The first op computes ``C`` from ``A`` and ``B`` with bias ``b0``; the
    second uses ``C`` as its right-hand operand together with ``D`` and bias
    ``b1`` to produce ``E``.  Random operands are int16 values drawn from
    ``[-int_range, int_range)``.  Biases are int32 and stay zero unless
    ``add_bias`` compares equal to ``True``.
    """
    print("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))

    def random_int16(shape):
        # One place for the shared randint arguments of A, B and D.
        return np.random.randint(low=-int_range, high=int_range,
                                 size=shape, dtype=np.int16)

    out_shape = (m, n)
    # Draw order (A, B, D, then biases) is preserved so that a seeded RNG
    # yields the same operands as before.
    A = random_int16((m, k))
    B = random_int16((k, n))
    C = np.zeros(out_shape, dtype=np.int16)
    D = random_int16((m, k))
    E = np.zeros(out_shape, dtype=np.int16)

    # The explicit comparison with True is kept on purpose: truthy values
    # that are not equal to True must still leave the biases at zero.
    if add_bias == True:
        b0 = np.random.randint(low=-int_range, high=int_range,
                               size=out_shape, dtype=np.int32)
        b1 = np.random.randint(low=-int_range, high=int_range,
                               size=out_shape, dtype=np.int32)
    else:
        b0 = np.zeros(out_shape, dtype=np.int32)
        b1 = np.zeros(out_shape, dtype=np.int32)

    for buf in (A, B, b0, C, D, E, b1):
        xfmlp.sendMat(buf)

    first_scale = [1, 13]
    second_scale = [1, 18]
    relu_scale = [307, 10]
    xfmlp.addFCNOp(A, B, C, b0, first_scale[0], first_scale[1],
                   relu_scale[0], relu_scale[1])
    xfmlp.addFCNOp(D, C, E, b1, second_scale[0], second_scale[1],
                   relu_scale[0], relu_scale[1])
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(C)
    xfmlp.getMat(E)

    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, first_scale, relu_scale)
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, second_scale, [307, 10])
Ejemplo n.º 3
0
def test_perf_gemm_gemm(A_range, B_range, bias_range, m, k, n, post_scale):
    """Time one GEMM on the device and optionally verify the result.

    Four timestamps are collected: before the transfers, after the op has
    been queued, after ``execute`` returns and after the result has been
    read back.  They are passed to ``test.test_perf`` together with the
    operation counts and the frequency reported by ``xfmlp.getFreq``.  The
    golden comparison is skipped only when all of ``m``, ``n`` and ``k``
    exceed 4096.
    """
    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n),
                              dtype=np.int16)
    # A bias_range of zero means "no bias": use an all-zero int32 matrix.
    bias = (np.random.randint(low=-bias_range, high=bias_range, size=(m, n),
                              dtype=np.int32)
            if bias_range != 0
            else np.zeros((m, n), dtype=np.int32, order='C'))
    C_fpga = np.zeros((m, n), dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for buf in (mat_A, mat_B, C_fpga, bias):
        xfmlp.sendMat(buf)
    xfmlp.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    xfmlp.execute()
    timePointKernel.append(time.time())  # call kernel
    xfmlp.getMat(C_fpga)
    timePointKernel.append(time.time())  # copy from FPGA

    total_parallel_operations = 2 * m * n * k
    total_operations = 2 * m * n * k + m * n * 3
    freq = xfmlp.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, m, k, n)

    skip_golden = m > 4096 and n > 4096 and k > 4096
    if not skip_golden:
        test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
    else:
        print("Skip golden comparision because large matrix size")
Ejemplo n.º 4
0
def test_perf_gemm(m, k, n, A_range=32764, B_range=32764, bias_range=32764, post_scale=[1,0]):
    """Time one GEMM on the device and always verify the result.

    Operands are random int16 matrices; the bias is a random int32 matrix,
    or all zeros when ``bias_range`` is 0.  Timestamps are taken before the
    transfers, after queuing the op, after execute + instruction-buffer
    clear, and after the read-back, then reported via ``test.test_perf``.
    The fetched output is checked with ``test.multiply_and_cmp``.

    The list default for ``post_scale`` is shared between calls; this
    function only reads it.
    """
    def timestamp(marks):
        # Record the current wall-clock time as the next measurement point.
        marks.append(time.time())

    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n),
                              dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range,
                                 size=(m, n), dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    timePointKernel = []
    timestamp(timePointKernel)  # current time
    xfmlp.sendMat(mat_A)
    xfmlp.sendMat(mat_B)
    xfmlp.sendMat(C_fpga)
    xfmlp.sendMat(bias)
    xfmlp.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    timestamp(timePointKernel)  # send to FPGA
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    timestamp(timePointKernel)  # call kernel
    xfmlp.getMat(C_fpga)
    timestamp(timePointKernel)  # copy from FPGA

    mac_ops = 2 * m * n * k
    total_operations = mac_ops + m * n * 3
    total_parallel_operations = mac_ops
    freq = xfmlp.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, m, k, n)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
 def test_textfiles(self, path_to_a, path_to_b, path_to_bias, post_scale):
     """Run one FCN operation on matrices loaded from text files and verify it.

     ``path_to_a`` and ``path_to_b`` are read with ``np.loadtxt`` as int16,
     ``path_to_bias`` as int32.  The op is queued with the two ``post_scale``
     entries and fixed trailing arguments ``1, 0``; the fetched output is
     checked with ``self.multiply_and_cmp``.
     """
     operands = {
         "A": np.loadtxt(path_to_a, dtype=np.int16),
         "B": np.loadtxt(path_to_b, dtype=np.int16),
     }
     bias = np.loadtxt(path_to_bias, dtype=np.int32)
     mat_A, mat_B = operands["A"], operands["B"]

     rows = mat_A.shape[0]
     inner = mat_A.shape[1]  # read only to keep the 2-D requirement on A
     cols = mat_B.shape[1]

     result = np.zeros((rows, cols), dtype=np.int16, order='C')
     for buf in (mat_A, mat_B, result, bias):
         xfmlp.sendMat(buf)

     xfmlp.addFCNOp(mat_A, mat_B, result, bias,
                    post_scale[0], post_scale[1], 1, 0)
     xfmlp.execute()
     xfmlp.clearInstrBuf()
     xfmlp.getMat(result)

     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale)
Ejemplo n.º 6
0
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """Run a sparse-matrix x vector product on the device and verify it.

    The element type and the alignment requirements are read from the
    module-level ``xclbin_opts`` mapping.  The coordinate arrays are padded
    with zero entries until ``nnz`` is a multiple of the required width, and
    ``m`` / ``k`` are rounded up with ``get_padded_size``.  The result is
    checked with ``test.multiply_and_cmp_spmv``.

    Raises:
        TypeError: if ``xclbin_opts["GEMX_dataType"]`` is neither ``"float"``
            nor ``"int32_t"``.
    """
    type_name = xclbin_opts["GEMX_dataType"]
    if type_name == "float":
        data_type = np.float32
    elif type_name == "int32_t":
        data_type = np.int32
    else:
        raise TypeError("type", type_name, "not supported")

    ddr_width = int(xclbin_opts["GEMX_ddrWidth"])
    k_align = ddr_width
    if xclbin_opts["GEMX_useURAM"] == "1":
        nnz_align = ddr_width
        m_align = ddr_width * int(xclbin_opts["GEMX_spmvUramGroups"])
    else:
        spmv_width = int(xclbin_opts["GEMX_spmvWidth"])
        nnz_align = spmv_width
        m_align = spmv_width * int(xclbin_opts["GEMX_spmvMacGroups"])

    # Pad with explicit zero entries, one at a time, until nnz is aligned.
    # NOTE(review): padded `data` is cast to float32 even when data_type is
    # int32 -- confirm that sendSpMat converts it as intended.
    while nnz % nnz_align != 0:
        row = np.append(row, 0).astype(np.int32)
        col = np.append(col, 0).astype(np.int32)
        data = np.append(data, 0).astype(np.float32)
        nnz += 1

    m = get_padded_size(m, m_align)
    k = get_padded_size(k, k_align)
    print("size:", m, k, "nnz:", nnz)

    if data_type != np.int32:
        vec = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(vec, k, vector_range)
    else:
        vec = np.random.randint(low=-vector_range, high=vector_range,
                                size=(k, 1), dtype=np.int32)
    out = np.zeros((m, 1), dtype=data_type)

    sparse = xfmlp.sendSpMat(row, col, data, m, k, nnz, xclbin_opts,
                             data_type)
    xfmlp.sendMat(vec)
    xfmlp.sendMat(out)
    xfmlp.addSPMVOp(sparse, vec, out, nnz, xclbin_opts)
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(out)

    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, vec, out)
 def test_basic(self, PE, mat_A, mat_B, bias, post_scale=[1, 1]):
     """Queue one GEMM operation on ``PE``, run it and check the result.

     An int16 zero output buffer of shape
     ``(mat_A.shape[0], mat_B.shape[1])`` is allocated; operands, output and
     bias are sent to the device, one ``addGEMMOp`` is queued with the two
     ``post_scale`` entries, the stream is executed and cleared, and the
     fetched output is verified with ``self.multiply_and_cmp``.

     The list default for ``post_scale`` is shared between calls; this
     method only reads it.
     """
     rows = mat_A.shape[0]
     inner = mat_A.shape[1]
     cols = mat_B.shape[1]

     print("test_basic(PE=%d): %d %d %d %d %d" %
           (PE, rows, inner, cols, post_scale[0], post_scale[1]))
     # Max / min / mean of every input, in the order A, B, bias.
     for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print(label, np.amax(operand), np.amin(operand), np.average(operand))

     result = np.zeros((rows, cols), dtype=np.int16)
     for buf in (mat_A, mat_B, result, bias):
         xfmlp.sendMat(buf, PE)

     # default test_basic will call addGEMMOp
     xfmlp.addGEMMOp(mat_A, mat_B, result, bias,
                     post_scale[0], post_scale[1], PE)
     xfmlp.execute(PE)
     xfmlp.clearInstrBuf(PE)
     xfmlp.getMat(result, PE)

     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale)
Ejemplo n.º 8
0
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` GEMM operations and verify the last one.

    Instruction ``i`` multiplies ``mat_A[i]`` by the output of instruction
    ``i - 1`` (the first one uses a random ``mat_B0``), so the caller must
    supply sizes that make consecutive products conformable.  Biases are
    all-zero int32 matrices.

    The chain, the read-backs and the golden comparison are driven by
    ``ins_count``.  Previously exactly four instructions were hard-coded
    while the buffers and the operation totals used ``ins_count``, so any
    other value either raised ``IndexError`` or reported throughput for
    operations that were never executed.  Behaviour for ``ins_count == 4``
    is unchanged.

    Args:
        ins_count: number of chained GEMM instructions (at least 1).
        m_size, k_size, n_size: per-instruction dimensions, indexable by
            ``range(ins_count)``.
        A_range, B_range: random int16 operands are drawn from
            ``[-range, range)``.
        post_scale: two-element sequence forwarded to every ``addGEMMOp``.

    Raises:
        ValueError: if ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1, got %r" % (ins_count,))

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        m, k, n = m_size[i], k_size[i], n_size[i]
        total_operations += 2 * m * n * k + m * n * 3
        total_parallel_operations += 2 * m * n * k
        mat_A.append(np.random.randint(low=-A_range, high=A_range,
                                       size=(m, k), dtype=np.int16))
        mat_bias.append(np.zeros((m, n), dtype=np.int32))
        mat_C.append(np.zeros((m, n), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range,
                               size=(k_size[0], n_size[0]), dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        xfmlp.sendMat(mat_A[i])
        xfmlp.sendMat(mat_C[i])
        xfmlp.sendMat(mat_bias[i])
    xfmlp.sendMat(mat_B0)

    # Each instruction consumes the previous instruction's output as its
    # right-hand operand; the first one consumes mat_B0.
    rhs_operands = [mat_B0] + mat_C[:-1]
    for i in range(ins_count):
        xfmlp.addGEMMOp(mat_A[i], rhs_operands[i], mat_C[i], mat_bias[i],
                        post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    xfmlp.execute()
    timePointKernel.append(time.time())  # call kernel
    for output in mat_C:
        xfmlp.getMat(output)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = xfmlp.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)

    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Only the final link of the chain is compared against the golden
        # result, using the device output of the previous link as its input.
        last = ins_count - 1
        test.multiply_and_cmp(mat_C[last], mat_A[last], rhs_operands[last],
                              mat_bias[last], m_size[last], n_size[last],
                              post_scale)
    print(m, k, n)
    A_buf.append(np.zeros((m, k), dtype=np.int16, order='C'))
    bias_buf.append(np.zeros((m, n), dtype=np.int32, order='C'))
    B_buf.append(np.zeros((k, n), dtype=np.int16, order='C'))
    C_buf.append(np.zeros((m, n), dtype=np.int16, order='C'))

# Send all four buffers of every matrix set to the device once, before the
# timed loop (order per set: B, A, C, bias).
for i in range(num_matrix):
    xfmlp.sendMat(B_buf[i])
    xfmlp.sendMat(A_buf[i])
    xfmlp.sendMat(C_buf[i])
    xfmlp.sendMat(bias_buf[i])

# Fixed two-second pause before measurements start.
# NOTE(review): the reason for the delay is not visible here -- confirm it is
# still needed.
time.sleep(2)
total_time = 0  # accumulated wall-clock seconds over all iterations
# NOTE(review): the loop variable is named `k`, which rebinds the module-level
# `k` used when sizing the buffers above.
for k in range(args.numiter):
    start_time = time.time()
    # Only the first B buffer is re-sent on every iteration.
    xfmlp.sendMat(B_buf[0])
    # Queue one FCN op per matrix set; each op writes into its own C buffer.
    for i in range(num_matrix):
        #xfmlp.addFCNOp(A_buf[i], B_buf[i], B_buf[i+1], bias_buf[i], 1,0,1,0 )
        xfmlp.addFCNOp(A_buf[i], B_buf[i], C_buf[i], bias_buf[i], 1, 0, 1, 0)

    # NOTE(review): there is no xfmlp.clearInstrBuf() after execute() here --
    # confirm that queued ops do not accumulate across iterations.
    xfmlp.execute()
    # Only the last output buffer is read back inside the timed region.
    xfmlp.getMat(C_buf[num_matrix - 1])
    #xfmlp.wait()
    total_time += time.time() - start_time

# Mean per-iteration time, converted from seconds to milliseconds.
print("Average FPGA exec time(python): ", (total_time / args.numiter) * 1000,
      " ms")

xfmlp.printStats()