def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """Queue two chained FCN operations on ``xfmlp`` and check both results.

    The first operation takes ``A`` (m x k) and ``B`` (k x n) and writes
    ``C`` (m x n).  The second reuses ``C`` as its second operand together
    with ``D`` (m x k) and writes ``E`` (m x n).  Both outputs are read back
    with ``xfmlp.getMat`` and passed to ``test.multiply_and_cmp``.

    Args:
        int_range: Bound for the random inputs, which are drawn from
            ``[-int_range, int_range)``.
        m: Row count of ``A``, ``D``, ``C``, ``E`` and the bias matrices.
        k: Column count of ``A``/``D`` and row count of ``B``.
        n: Column count of ``B``, ``C``, ``E`` and the bias matrices.
        add_bias: When truthy, the two bias matrices hold random int32
            values; otherwise they are all zeros.

    NOTE(review): ``D`` is (m x k) and ``C`` is (m x n), so if the second
    operation is a matrix product it presumably needs ``k == m`` -- confirm
    against the callers.
    """
    print("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))

    def random_int16(rows, cols):
        # Inputs are int16 values in [-int_range, int_range).
        return np.random.randint(low=-int_range, high=int_range,
                                 size=(rows, cols), dtype=np.int16)

    # Draw order (A, B, D, then the biases) is kept stable so that a seeded
    # run produces the same matrices as before.
    A = random_int16(m, k)
    B = random_int16(k, n)
    C = np.zeros((m, n), dtype=np.int16)
    D = random_int16(m, k)
    E = np.zeros((m, n), dtype=np.int16)
    if add_bias:
        b0 = np.random.randint(low=-int_range, high=int_range, size=(m, n),
                               dtype=np.int32)
        b1 = np.random.randint(low=-int_range, high=int_range, size=(m, n),
                               dtype=np.int32)
    else:
        b0 = np.zeros((m, n), dtype=np.int32)
        b1 = np.zeros((m, n), dtype=np.int32)

    # The same constants go to addFCNOp and to the checker, so name them once.
    # NOTE(review): the trailing pair is presumably the PReLU setting --
    # confirm against the xfmlp API.
    scale_c = [1, 13]
    scale_e = [1, 18]
    tail_args = [307, 10]

    for mat in (A, B, b0, C, D, E, b1):
        xfmlp.sendMat(mat)
    xfmlp.addFCNOp(A, B, C, b0, scale_c[0], scale_c[1],
                   tail_args[0], tail_args[1])
    xfmlp.addFCNOp(D, C, E, b1, scale_e[0], scale_e[1],
                   tail_args[0], tail_args[1])
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(C)
    xfmlp.getMat(E)

    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, scale_c, tail_args)
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, scale_e, tail_args)
Exemple #2
0
def test_perf_fcn(A_range, B_range, bias_range, m, k, n, post_scale):
    """Time one FCN operation on ``gemx`` and verify it for moderate sizes.

    Random int16 operands of shape (m x k) and (k x n) are generated, an
    int32 bias of shape (m x n) is either random or all zeros, and a single
    ``addFCNOp`` instruction is queued and executed.  Four timestamps are
    collected and passed to ``test.test_perf`` together with operation counts
    and ``gemx.getFreq()``.

    Args:
        A_range: Bound for the first operand, drawn from
            ``[-A_range, A_range)``.
        B_range: Bound for the second operand, drawn from
            ``[-B_range, B_range)``.
        bias_range: Bound for the bias values; ``0`` selects a zero bias.
        m, k, n: Matrix dimensions.
        post_scale: Two-element sequence; its items are forwarded to
            ``addFCNOp`` and the whole sequence to ``test.multiply_and_cmp``.
    """
    lhs = np.random.randint(low=-A_range, high=A_range, size=(m, k),
                            dtype=np.int16)
    rhs = np.random.randint(low=-B_range, high=B_range, size=(k, n),
                            dtype=np.int16)
    if bias_range == 0:
        bias_mat = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias_mat = np.random.randint(low=-bias_range, high=bias_range,
                                     size=(m, n), dtype=np.int32)
    result = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (lhs, rhs, result, bias_mat):
        gemx.sendMat(operand)
    gemx.addFCNOp(lhs, rhs, result, bias_mat, post_scale[0], post_scale[1],
                  1, 0)
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    stamps.append(time.time())  # call kernel
    gemx.getMat(result)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    clock = gemx.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, clock, m, k, n)

    # The golden comparison is skipped only when every dimension is large.
    if m <= 4096 or n <= 4096 or k <= 4096:
        test.multiply_and_cmp(result, lhs, rhs, bias_mat, m, n, post_scale)
    else:
        print("Skip golden comparision because large matrix size")
Exemple #3
0
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range,
                         post_scale):
    """Time a chain of ``ins_count`` GEMM operations on ``gemx``.

    Operation 0 uses ``mat_A[0]`` and a random ``mat_B0``; every later
    operation ``i`` uses ``mat_A[i]`` and the previous output ``mat_C[i-1]``.
    All biases are zero.  The chain length used to be fixed at four
    regardless of ``ins_count``; it now follows ``ins_count``, which matches
    the operation totals that were already accumulated per instruction.

    Args:
        ins_count: Number of chained instructions (at least 1).
        m_size, k_size, n_size: Per-instruction dimensions, indexable by
            ``0 .. ins_count - 1``.  The caller must pick them so that each
            output is a valid second operand for the next instruction.
        A_range: Bound for the ``mat_A`` values, drawn from
            ``[-A_range, A_range)``.
        B_range: Bound for the ``mat_B0`` values, drawn from
            ``[-B_range, B_range)``.
        post_scale: Two-element sequence; its items are forwarded to
            ``addGEMMOp`` and the whole sequence to ``test.multiply_and_cmp``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        mult_adds = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += mult_adds
        total_operations += mult_adds + m_size[i] * n_size[i] * 3
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Each instruction consumes the previous instruction's output.
    second_operand = mat_B0
    for i in range(ins_count):
        gemx.addGEMMOp(mat_A[i], second_operand, mat_C[i], mat_bias[i],
                       post_scale[0], post_scale[1])
        second_operand = mat_C[i]
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    for i in range(ins_count):
        gemx.getMat(mat_C[i])
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)
    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(
            n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Only the last link of the chain is checked against the reference.
        last = ins_count - 1
        last_operand = mat_C[last - 1] if last > 0 else mat_B0
        test.multiply_and_cmp(mat_C[last], mat_A[last], last_operand,
                              mat_bias[last], m_size[last], n_size[last],
                              post_scale)
Exemple #4
0
def test_perf_fcn(m,
                  k,
                  n,
                  xclbin_opts,
                  post_scale=None,
                  A_range=32764,
                  B_range=32764,
                  bias_range=32764):
    """Time one FCN operation on ``gemx`` and compare it with the reference.

    The dimensions are first padded with ``test.get_padded_size`` to
    multiples derived from ``xclbin_opts``.  Operands are int16/int32 when
    ``xclbin_opts["GEMX_dataType"]`` is ``"short"`` and float32 otherwise.

    Args:
        m, k, n: Requested matrix dimensions (padded before use).
        xclbin_opts: Mapping that provides ``GEMX_ddrWidth``,
            ``GEMX_gemmMBlocks``, ``GEMX_gemmKBlocks``, ``GEMX_gemmNBlocks``
            and ``GEMX_dataType``.
        post_scale: Two-element sequence; its items are forwarded to
            ``addFCNOp`` and the whole sequence to
            ``test.multiply_and_cmp``.  Defaults to ``[1, 0]``.
        A_range: Bound for the first operand in the ``"short"`` case.
        B_range: Bound for the second operand in the ``"short"`` case.
        bias_range: Bound for the bias in the ``"short"`` case; ``0``
            selects a zero bias.

    In the non-``"short"`` case the ranges are ignored: operands are uniform
    in ``[-128, 128)`` and the bias is zero.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between calls (and with whatever the helpers do with it).
    if post_scale is None:
        post_scale = [1, 0]

    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    m = test.get_padded_size(m,
                             int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
    k = test.get_padded_size(k,
                             int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
    n = test.get_padded_size(n,
                             int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)
    if xclbin_opts["GEMX_dataType"] == "short":
        mat_A = np.random.randint(low=-A_range,
                                  high=A_range,
                                  size=(m, k),
                                  dtype=np.int16)
        mat_B = np.random.randint(low=-B_range,
                                  high=B_range,
                                  size=(k, n),
                                  dtype=np.int16)
        if bias_range != 0:
            bias = np.random.randint(low=-bias_range,
                                     high=bias_range,
                                     size=(m, n),
                                     dtype=np.int32)
        else:
            bias = np.zeros((m, n), dtype=np.int32, order='C')
        C_fpga = np.zeros((m, n), dtype=np.int16)
    else:
        mat_A = np.random.uniform(low=-128, high=128,
                                  size=(m, k)).astype(np.float32)
        mat_B = np.random.uniform(low=-128, high=128,
                                  size=(k, n)).astype(np.float32)
        bias = np.zeros((m, n), dtype=np.float32, order='C')
        C_fpga = np.zeros((m, n), dtype=np.float32)

    # The measured interval spans sendMat through getMat.
    start_time = time.time()
    gemx.sendMat(mat_A)
    gemx.sendMat(mat_B)
    gemx.sendMat(C_fpga)
    gemx.sendMat(bias)
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1], 1,
                  0)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    end_time = time.time()
    total_operations = 2 * m * n * k + m * n * 3
    test.test_perf(end_time - start_time, total_operations, m, k, n, ddrWidth)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemple #5
0
def test_multi_fcn(ins_count,
                   m_size,
                   k_size,
                   n_size,
                   post_scale=None,
                   A_range=32764,
                   B_range=32764):
    """Run a chain of ``ins_count`` FCN operations on ``gemx`` and check it.

    Operation 0 uses ``mat_A[0]`` and a random ``mat_B0``; every later
    operation ``i`` uses ``mat_A[i]`` and the previous output ``mat_C[i-1]``.
    All biases are zero.  The chain length used to be fixed at four
    regardless of ``ins_count``; it now follows ``ins_count``.

    Args:
        ins_count: Number of chained instructions (at least 1).
        m_size, k_size, n_size: Mutable per-instruction dimensions.  They
            are padded IN PLACE with ``test.get_padded_size``; this is kept
            on purpose, because a caller that passes the same sequence for
            several dimensions relies on the entries staying equal.
        post_scale: Two-element sequence; its items are forwarded to
            ``addFCNOp`` and the whole sequence to
            ``test.multiply_and_cmp``.  Defaults to ``[1, 0]``.
        A_range: Bound for the ``mat_A`` values, drawn from
            ``[-A_range, A_range)``.
        B_range: Bound for the ``mat_B0`` values, drawn from
            ``[-B_range, B_range)``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.

    Reads the module-level ``xclbin_opts`` mapping for the padding multiples.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")
    # A fresh list per call avoids sharing one mutable default object.
    if post_scale is None:
        post_scale = [1, 0]

    mat_A = []
    mat_C = []
    mat_bias = []
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    # The padding multiples do not depend on the instruction index.
    m_multiple = int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth
    k_multiple = int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth
    n_multiple = int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth
    for i in range(ins_count):
        # Keep the m -> k -> n order: with aliased size sequences each step
        # pads the value written by the previous one.
        m_size[i] = test.get_padded_size(m_size[i], m_multiple)
        k_size[i] = test.get_padded_size(k_size[i], k_multiple)
        n_size[i] = test.get_padded_size(n_size[i], n_multiple)
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Each instruction consumes the previous instruction's output.
    second_operand = mat_B0
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], second_operand, mat_C[i], mat_bias[i],
                      post_scale[0], post_scale[1], 1, 0)
        second_operand = mat_C[i]
    gemx.execute()
    gemx.clearInstrBuf()
    for i in range(ins_count):
        gemx.getMat(mat_C[i])

    # Only the last link of the chain is checked against the reference.
    last = ins_count - 1
    last_operand = mat_C[last - 1] if last > 0 else mat_B0
    test.multiply_and_cmp(mat_C[last], mat_A[last], last_operand,
                          mat_bias[last], m_size[last], n_size[last],
                          post_scale)
Exemple #6
0
def test_perf_multi_fcn(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` FCN operations on ``gemx`` and check it.

    Operation 0 uses ``mat_A[0]`` and a random ``mat_B0``; every later
    operation ``i`` uses ``mat_A[i]`` and the previous output ``mat_C[i-1]``.
    All biases are zero.  The chain length used to be fixed at four
    regardless of ``ins_count``; it now follows ``ins_count``, which matches
    the operation totals that were already accumulated per instruction.

    Args:
        ins_count: Number of chained instructions (at least 1).
        m_size, k_size, n_size: Per-instruction dimensions, indexable by
            ``0 .. ins_count - 1``.  The caller must pick them so that each
            output is a valid second operand for the next instruction.
        A_range: Bound for the ``mat_A`` values, drawn from
            ``[-A_range, A_range)``.
        B_range: Bound for the ``mat_B0`` values, drawn from
            ``[-B_range, B_range)``.
        post_scale: Two-element sequence; its items are forwarded to
            ``addFCNOp`` and the whole sequence to ``test.multiply_and_cmp``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        mult_adds = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += mult_adds
        total_operations += mult_adds + m_size[i] * n_size[i] * 3
        mat_A.append(np.random.randint(low=-A_range, high=A_range,
                                       size=(m_size[i], k_size[i]),
                                       dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(np.zeros((m_size[i], n_size[i]), dtype=np.int16,
                              order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range,
                               size=(k_size[0], n_size[0]), dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Each instruction consumes the previous instruction's output.
    second_operand = mat_B0
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], second_operand, mat_C[i], mat_bias[i],
                      post_scale[0], post_scale[1], 1, 0)
        second_operand = mat_C[i]
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    for i in range(ins_count):
        gemx.getMat(mat_C[i])
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)

    # Only the last link of the chain is checked against the reference.
    last = ins_count - 1
    last_operand = mat_C[last - 1] if last > 0 else mat_B0
    test.multiply_and_cmp(mat_C[last], mat_A[last], last_operand,
                          mat_bias[last], m_size[last], n_size[last],
                          post_scale)