def test_perf_fcn(A_range, B_range, bias_range, m, k, n, post_scale):
    """Time one FCN operation issued through ``gemx`` and check its result.

    Random int16 operands ``A`` (m x k) and ``B`` (k x n) are generated, along
    with an int32 bias (m x n) that is all zeros when ``bias_range`` is 0.
    Four timestamps are collected (start, after sending/queuing, after
    ``gemx.execute()``, after reading the result back) and handed to
    ``test.test_perf`` together with the operation counts and the frequency
    reported by ``gemx.getFreq()``.

    Args:
        A_range: Bound for the random values of ``A`` (drawn from
            ``[-A_range, A_range)``).
        B_range: Bound for the random values of ``B``.
        bias_range: Bound for the random bias values; 0 selects a zero bias.
        m, k, n: Matrix dimensions.
        post_scale: Indexable pair; elements 0 and 1 are forwarded to
            ``gemx.addFCNOp`` and the whole object to
            ``test.multiply_and_cmp``.
    """
    # Operands are generated in a fixed order (A, B, bias) so that a seeded
    # RNG reproduces the same data.
    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for matrix in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(matrix)
    # NOTE(review): the trailing 1, 0 arguments are passed through unchanged;
    # their meaning is defined by gemx.addFCNOp - confirm against its docs.
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1], 1, 0)
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    stamps.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    stamps.append(time.time())  # copy from FPGA

    multiply_adds = 2 * m * n * k
    all_ops = multiply_adds + m * n * 3
    frequency = gemx.getFreq()
    test.test_perf(stamps, all_ops, multiply_adds, frequency, m, k, n)

    # The golden comparison is skipped only when every dimension is large.
    if all(dim > 4096 for dim in (m, n, k)):
        print("Skip golden comparision because large matrix size")
        return
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` dependent GEMM operations and check the last.

    Instance 0 multiplies ``A[0]`` by a random matrix ``B0``; every later
    instance ``i`` multiplies ``A[i]`` by the output ``C[i-1]`` of the
    previous instance. All biases are zero. Four timestamps (start, after
    sending/queuing, after ``gemx.execute()``, after reading every ``C`` back)
    are passed to ``test.test_perf`` with the accumulated operation counts.

    The chain is built for any ``ins_count >= 1``; earlier revisions were
    hard-wired to exactly four instances even though the operation counts
    were accumulated over ``ins_count``.

    Args:
        ins_count: Number of chained GEMM instances (at least 1).
        m_size, k_size, n_size: Per-instance dimensions, indexable by
            instance number. Presumably the caller must choose them so that
            ``C[i-1]`` has the shape expected for the right-hand operand of
            instance ``i`` - verify against the caller.
        A_range: Bound for the random values of each ``A[i]``.
        B_range: Bound for the random values of ``B0``.
        post_scale: Indexable pair; elements 0 and 1 are forwarded to
            ``gemx.addGEMMOp`` and the whole object to
            ``test.multiply_and_cmp``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1, got %r" % (ins_count,))

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        total_operations += 2 * m_size[i] * n_size[i] * k_size[i] + m_size[i] * n_size[i] * 3
        total_parallel_operations += 2 * m_size[i] * n_size[i] * k_size[i]
        mat_A.append(np.random.randint(low=-A_range, high=A_range,
                                       size=(m_size[i], k_size[i]), dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range,
                               size=(k_size[0], n_size[0]), dtype=np.int16)

    # Right-hand operand of each instance: B0 first, then the previous output.
    rhs_operands = [mat_B0] + mat_C[:-1]

    timePointKernel = []
    timePointKernel.append(time.time())  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)
    for i in range(ins_count):
        gemx.addGEMMOp(mat_A[i], rhs_operands[i], mat_C[i], mat_bias[i],
                       post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    for result in mat_C:
        gemx.getMat(result)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations, total_parallel_operations, freq, 0, 0, 0)

    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Only the final instance is verified, against the operands it
        # actually consumed.
        last = ins_count - 1
        test.multiply_and_cmp(mat_C[last], mat_A[last], rhs_operands[last], mat_bias[last],
                              m_size[last], n_size[last], post_scale)
def test_perf_fcn(m, k, n, xclbin_opts, post_scale=None, A_range=32764, B_range=32764, bias_range=32764):
    """Time one FCN operation end to end through ``gemx`` and check its result.

    The dimensions are first passed through ``test.get_padded_size`` using the
    block counts and DDR width read from ``xclbin_opts``. Operands are then
    generated according to ``xclbin_opts["GEMX_dataType"]``:

    * ``"short"``: random int16 ``A``/``B`` within the given ranges and an
      int32 bias (all zeros when ``bias_range`` is 0).
    * anything else: float32 ``A``/``B`` drawn uniformly from [-128, 128) and
      a zero float32 bias. ``A_range``, ``B_range`` and ``bias_range`` are
      not used in this branch.

    A single interval is measured, covering the sends, ``addFCNOp``,
    ``execute``, ``clearInstrBuf`` and the result read-back, and is reported
    via ``test.test_perf``. The result is always compared with
    ``test.multiply_and_cmp``.

    Args:
        m, k, n: Requested matrix dimensions (before padding).
        xclbin_opts: Mapping providing ``GEMX_ddrWidth``,
            ``GEMX_gemmMBlocks``, ``GEMX_gemmKBlocks``, ``GEMX_gemmNBlocks``
            and ``GEMX_dataType``.
        post_scale: Indexable pair; elements 0 and 1 are forwarded to
            ``gemx.addFCNOp`` and the whole object to
            ``test.multiply_and_cmp``. ``None`` (the default) means
            ``[1, 0]``.
        A_range: Bound for the random int16 values of ``A``.
        B_range: Bound for the random int16 values of ``B``.
        bias_range: Bound for the random int32 bias; 0 selects a zero bias.
    """
    # A fresh list per call: a list literal as the default would be shared
    # between calls and is handed to an external helper below.
    if post_scale is None:
        post_scale = [1, 0]

    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    m = test.get_padded_size(m, int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
    k = test.get_padded_size(k, int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
    n = test.get_padded_size(n, int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)

    if xclbin_opts["GEMX_dataType"] == "short":
        mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
        mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
        if bias_range != 0:
            bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
        else:
            bias = np.zeros((m, n), dtype=np.int32, order='C')
        C_fpga = np.zeros((m, n), dtype=np.int16)
    else:
        mat_A = np.random.uniform(low=-128, high=128, size=(m, k)).astype(np.float32)
        mat_B = np.random.uniform(low=-128, high=128, size=(k, n)).astype(np.float32)
        bias = np.zeros((m, n), dtype=np.float32, order='C')
        C_fpga = np.zeros((m, n), dtype=np.float32)

    start_time = time.time()
    gemx.sendMat(mat_A)
    gemx.sendMat(mat_B)
    gemx.sendMat(C_fpga)
    gemx.sendMat(bias)
    # NOTE(review): the trailing 1, 0 arguments are passed through unchanged;
    # their meaning is defined by gemx.addFCNOp - confirm against its docs.
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1], 1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    end_time = time.time()

    total_operations = 2 * m * n * k + m * n * 3
    test.test_perf(end_time - start_time, total_operations, m, k, n, ddrWidth)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_perf_multi_fcn(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` dependent FCN operations and check the last.

    Instance 0 multiplies ``A[0]`` by a random matrix ``B0``; every later
    instance ``i`` multiplies ``A[i]`` by the output ``C[i-1]`` of the
    previous instance. All biases are zero. Four timestamps (start, after
    sending/queuing, after ``gemx.execute()`` and ``gemx.clearInstrBuf()``,
    after reading every ``C`` back) are passed to ``test.test_perf`` with the
    accumulated operation counts, and the final instance is verified with
    ``test.multiply_and_cmp``.

    The chain is built for any ``ins_count >= 1``; earlier revisions were
    hard-wired to exactly four instances even though the operation counts
    were accumulated over ``ins_count``.

    Args:
        ins_count: Number of chained FCN instances (at least 1).
        m_size, k_size, n_size: Per-instance dimensions, indexable by
            instance number. Presumably the caller must choose them so that
            ``C[i-1]`` has the shape expected for the right-hand operand of
            instance ``i`` - verify against the caller.
        A_range: Bound for the random values of each ``A[i]``.
        B_range: Bound for the random values of ``B0``.
        post_scale: Indexable pair; elements 0 and 1 are forwarded to
            ``gemx.addFCNOp`` and the whole object to
            ``test.multiply_and_cmp``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1, got %r" % (ins_count,))

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        total_operations += 2 * m_size[i] * n_size[i] * k_size[i] + m_size[i] * n_size[i] * 3
        total_parallel_operations += 2 * m_size[i] * n_size[i] * k_size[i]
        mat_A.append(np.random.randint(low=-A_range, high=A_range,
                                       size=(m_size[i], k_size[i]), dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range,
                               size=(k_size[0], n_size[0]), dtype=np.int16)

    # Right-hand operand of each instance: B0 first, then the previous output.
    rhs_operands = [mat_B0] + mat_C[:-1]

    timePointKernel = []
    timePointKernel.append(time.time())  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)
    for i in range(ins_count):
        # NOTE(review): the trailing 1, 0 arguments are passed through
        # unchanged; their meaning is defined by gemx.addFCNOp - confirm.
        gemx.addFCNOp(mat_A[i], rhs_operands[i], mat_C[i], mat_bias[i],
                      post_scale[0], post_scale[1], 1, 0)
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    for result in mat_C:
        gemx.getMat(result)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations, total_parallel_operations, freq, 0, 0, 0)

    # Only the final instance is verified, against the operands it actually
    # consumed.
    last = ins_count - 1
    test.multiply_and_cmp(mat_C[last], mat_A[last], rhs_operands[last], mat_bias[last],
                          m_size[last], n_size[last], post_scale)