def test_array_multiplication():
    """
    Multiply two tensors element-wise with a pyshader-defined kernel and
    check the product through both ``data()`` and ``numpy()``.
    """
    # Manager built with default arguments (selects device 0 by default)
    mgr = kp.Manager()

    # Two inputs plus a zero-filled tensor that receives the product
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    # Initialise the tensors in the GPU
    mgr.rebuild(operands)

    # Kernel run on the GPU: one multiplication per invocation index
    @ps.python2shader
    def compute_shader_multiply(
            index=("input", "GlobalInvocationId", ps.ivec3),
            data1=("buffer", 0, ps.Array(ps.f32)),
            data2=("buffer", 1, ps.Array(ps.f32)),
            data3=("buffer", 2, ps.Array(ps.f32))):
        i = index.x
        data3[i] = data1[i] * data2[i]

    # Run the compiled kernel against the tensors defined above
    spirv = compute_shader_multiply.to_spirv()
    mgr.eval_algo_data_def(operands, spirv)

    # Sync the output tensor from the GPU back to local memory
    mgr.eval_tensor_sync_local_def([product])

    expected = [2.0, 4.0, 6.0]
    assert product.data() == expected
    assert np.all(product.numpy() == expected)
def test_shader_str():
    """
    Compile a GLSL multiplication shader held in a string, run it against
    three tensors and check the element-wise product.
    """
    glsl_source = """
        #version 450
        layout(set = 0, binding = 0) buffer tensorLhs {float valuesLhs[];};
        layout(set = 0, binding = 1) buffer tensorRhs {float valuesRhs[];};
        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[];};
        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

        void main()
        {
            uint index = gl_GlobalInvocationID.x;
            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
        }
    """

    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr = kp.Manager()
    mgr.rebuild(operands)

    # Compile the GLSL text first, then hand the result to the manager
    compiled = kp.Shader.compile_source(glsl_source)
    mgr.eval_algo_data_def(operands, compiled)

    mgr.eval_tensor_sync_local_def([product])

    assert product.data() == [2.0, 4.0, 6.0]
def test_sequence():
    """
    Run the opmult shader file asynchronously, then sync every tensor to
    local memory through a sequence named "op" and check the product.
    """
    mgr = kp.Manager(0, [2])

    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr.eval_tensor_create_def(operands)

    shader_file = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp")

    # Submit asynchronously and wait for completion before recording syncs
    mgr.eval_async_algo_file_def(operands, shader_file)
    mgr.eval_await_def()

    sync_seq = mgr.create_sequence("op")
    sync_seq.begin()
    # One sync-local record per tensor, in the same order as the operands
    for tensor in operands:
        sync_seq.record_tensor_sync_local([tensor])
    sync_seq.end()
    sync_seq.eval()

    assert product.data() == [2.0, 4.0, 6.0]
def test_opalgobase_data():
    """
    Run a GLSL multiplication shader supplied as an in-memory string via
    ``eval_algo_str_def`` and check the element-wise product.
    """
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr = kp.Manager()

    glsl_source = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer bina { float tina[]; };
        layout(set = 0, binding = 1) buffer binb { float tinb[]; };
        layout(set = 0, binding = 2) buffer bout { float tout[]; };

        void main() {
            uint index = gl_GlobalInvocationID.x;
            tout[index] = tina[index] * tinb[index];
        }
    """

    mgr.eval_tensor_create_def(operands)

    # The shader text is passed as a list of its individual characters
    shader_chars = list(glsl_source)
    mgr.eval_algo_str_def(operands, shader_chars)

    mgr.eval_tensor_sync_local_def([product])

    assert product.data() == [2.0, 4.0, 6.0]
def test_opmult():
    """
    Run the manager's ``eval_algo_mult_def`` on two input tensors and check
    that the output tensor holds their element-wise product.
    """
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr = kp.Manager()

    mgr.eval_tensor_create_def(operands)
    mgr.eval_algo_mult_def(operands)

    # Only the output tensor needs syncing back to local memory
    mgr.eval_tensor_sync_local_def([product])

    assert product.data() == [2.0, 4.0, 6.0]
def test_workgroup():
    """
    Dispatch a pyshader kernel with ``workgroup=(16, 8, 1)`` and check that
    the two output tensors hold the global x and y invocation ids, then
    destroy the sequence and tensors and confirm they are no longer
    initialised.
    """
    mgr = kp.Manager(0)

    tensor_a = kp.Tensor(np.zeros([16, 8]))
    tensor_b = kp.Tensor(np.zeros([16, 8]))

    mgr.rebuild([tensor_a, tensor_b])

    @ps.python2shader
    def compute_shader_wg(
            gl_idx=("input", "GlobalInvocationId", ps.ivec3),
            gl_wg_id=("input", "WorkgroupId", ps.ivec3),
            gl_wg_num=("input", "NumWorkgroups", ps.ivec3),
            data1=("buffer", 0, ps.Array(ps.f32)),
            data2=("buffer", 1, ps.Array(ps.f32))):
        # Flatten the (x, y) workgroup id into a single buffer index
        i = gl_wg_id.x * gl_wg_num.y + gl_wg_id.y
        data1[i] = f32(gl_idx.x)
        data2[i] = f32(gl_idx.y)

    seq = mgr.sequence("new")
    seq.begin()
    seq.record_algo_data([tensor_a, tensor_b],
                         compute_shader_wg.to_spirv(),
                         workgroup=(16, 8, 1))
    seq.end()
    seq.eval()

    # Destroying the sequence through the manager must leave it uninitialised
    mgr.destroy(seq)
    assert not seq.is_init()

    mgr.eval_tensor_sync_local_def([tensor_a, tensor_b])

    print(tensor_a.numpy())
    print(tensor_b.numpy())

    # tensor_a: 0..15 each repeated 8 times; tensor_b: 0..7 repeated 16 times
    assert np.all(tensor_a.numpy() == np.stack([np.arange(16)] * 8, axis=1).ravel())
    assert np.all(tensor_b.numpy() == np.stack([np.arange(8)] * 16, axis=0).ravel())

    mgr.destroy([tensor_a, tensor_b])
    assert not tensor_a.is_init()
    assert not tensor_b.is_init()
def test_opalgobase_file():
    """
    Run the shader stored in ``opmult.comp.spv`` via ``eval_algo_file_def``
    and check the element-wise product of the two input tensors.
    """
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr = kp.Manager()
    mgr.rebuild(operands)

    # Shader location is resolved relative to DIRNAME
    spv_file = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp.spv")
    mgr.eval_algo_file_def(operands, spv_file)

    mgr.eval_tensor_sync_local_def([product])

    assert product.data() == [2.0, 4.0, 6.0]
def test_tensor_rebuild_backwards_compat():
    """
    Initialise tensors with ``eval_tensor_create_def`` instead of
    ``rebuild``, run the opmult shader file asynchronously and check the
    product through both ``data()`` and ``numpy()``.

    NOTE(review): going by the test name, ``eval_tensor_create_def`` is
    presumably the older entry point kept for compatibility - confirm.
    """
    lhs = kp.Tensor([2, 2, 2])
    rhs = kp.Tensor([1, 2, 3])
    product = kp.Tensor([0, 0, 0])
    operands = [lhs, rhs, product]

    mgr = kp.Manager()

    mgr.eval_tensor_create_def(operands)

    relative_spv = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp.spv")
    spv_file = os.path.abspath(relative_spv)

    # Submit asynchronously, then block until the work has finished
    mgr.eval_async_algo_file_def(operands, spv_file)
    mgr.eval_await_def()

    mgr.eval_tensor_sync_local_def([product])

    expected = [2.0, 4.0, 6.0]
    assert product.data() == expected
    assert np.all(product.numpy() == expected)
def test_sequence():
    """
    Run the opmult shader file asynchronously, sync all tensors to local
    memory through a sequence named "op", check the product, and verify that
    destroying the sequence (by name) and the tensors (singly and as a list)
    leaves each of them uninitialised.
    """
    mgr = kp.Manager(0, [2])

    tensor_in_a = kp.Tensor([2, 2, 2])
    tensor_in_b = kp.Tensor([1, 2, 3])
    tensor_out = kp.Tensor([0, 0, 0])

    mgr.rebuild([tensor_in_a, tensor_in_b, tensor_out])

    shader_path = os.path.abspath(
        os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp.spv"))

    # Submit asynchronously, then block until the work has finished
    mgr.eval_async_algo_file_def(
        [tensor_in_a, tensor_in_b, tensor_out], shader_path)
    mgr.eval_await_def()

    seq = mgr.sequence("op")
    seq.begin()
    seq.record_tensor_sync_local([tensor_in_a])
    seq.record_tensor_sync_local([tensor_in_b])
    seq.record_tensor_sync_local([tensor_out])
    seq.end()
    seq.eval()

    # The sequence is destroyed by its name rather than by object here
    mgr.destroy("op")
    assert not seq.is_init()

    assert tensor_out.data() == [2.0, 4.0, 6.0]
    assert np.all(tensor_out.numpy() == [2.0, 4.0, 6.0])

    # destroy() is exercised with a single tensor and with a list of tensors
    mgr.destroy(tensor_in_a)
    mgr.destroy([tensor_in_b, tensor_out])

    assert not tensor_in_a.is_init()
    assert not tensor_in_b.is_init()
    assert not tensor_out.is_init()
def test_workgroup():
    """
    Record a GLSL shader (passed as UTF-8 bytes) with a (16, 8, 1) workgroup
    argument and check that the two output tensors hold the global x and y
    invocation ids.
    """
    mgr = kp.Manager(0)

    out_x = kp.Tensor(np.zeros([16, 8]))
    out_y = kp.Tensor(np.zeros([16, 8]))
    outputs = [out_x, out_y]

    mgr.eval_tensor_create_def(outputs)

    glsl_source = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) writeonly buffer bout { float toutx[]; };
        layout(set = 0, binding = 1) writeonly buffer bout2 { float touty[]; };

        void main() {
            uint index = gl_WorkGroupID.x*gl_NumWorkGroups.y + gl_WorkGroupID.y;

            toutx[index] = gl_GlobalInvocationID.x;
            touty[index] = gl_GlobalInvocationID.y;
        }
    """
    shader_bytes = glsl_source.encode('utf8')

    seq = mgr.create_sequence()
    seq.begin()
    seq.record_algo_data(outputs, shader_bytes, (16, 8, 1))
    seq.end()
    seq.eval()

    mgr.eval_tensor_sync_local_def(outputs)

    # out_x: 0..15 each repeated 8 times; out_y: 0..7 repeated 16 times
    expected_x = np.stack([np.arange(16)] * 8, axis=1).ravel()
    expected_y = np.stack([np.arange(8)] * 16, axis=0).ravel()
    assert np.all(out_x.numpy() == expected_x)
    assert np.all(out_y.numpy() == expected_y)
def render_base(args, folder):
    """
    Render frames ``args.start`` .. ``args.end`` (inclusive) with a compiled
    scene shader, saving each frame as ``output/image<N>.png`` and optionally
    displaying it in a pygame window.

    Args:
        args: Options object. The attributes read here are ``width``,
            ``height``, ``vis``, ``verbose``, ``device``, ``scene``,
            ``start`` and ``end``.
        folder: Prefix concatenated directly with ``args.scene + ".spv"`` to
            locate the shader file; no path separator is inserted, so it
            must already end with one if it names a directory.

    Note:
        When ``args.vis`` is set, the function does not return after the
        last frame: it keeps the window open and calls ``quit()`` once a
        pygame QUIT event arrives.
    """
    SIZE = (args.width, args.height)

    # pygame setup if visual enabled
    surf = None
    if args.vis:
        pygame.init()
        surf = pygame.display.set_mode(SIZE)

    # change verbosity level: verbose is clamped to 0..4 and mapped onto
    # logging levels 50 (least output) down to 10 (most output)
    kp_logger = logging.getLogger("kp")
    kp_logger.setLevel(50 - (max(min(args.verbose, 4), 0) * 10))

    # init manager
    mgr = kp.Manager(args.device)

    # shader inputs
    tensor_size = kp.Tensor(SIZE)
    tensor_frame = kp.Tensor([0])
    tensor_offset = kp.Tensor([0])
    # three values per pixel
    tensor_out = kp.Tensor(np.zeros((SIZE[0] * SIZE[1] * 3)))

    # allocate memory on gpu
    mgr.eval_tensor_create_def(
        [tensor_out, tensor_size, tensor_frame, tensor_offset])

    # read shader; the context manager closes the file even if a later
    # step raises
    with open(folder + args.scene + ".spv", "rb") as shader_file:
        shader_bytes = shader_file.read()

    # create sequences
    # - push the current frame number to the device
    sq_sdf = mgr.create_sequence()
    sq_sdf.begin()
    sq_sdf.record_tensor_sync_device([tensor_frame])
    sq_sdf.end()

    # - push the current chunk offset to the device
    sq_sdo = mgr.create_sequence()
    sq_sdo.begin()
    sq_sdo.record_tensor_sync_device([tensor_offset])
    sq_sdo.end()

    # - run the scene shader
    sq_r = mgr.create_sequence()
    sq_r.begin()
    sq_r.record_algo_data(
        [tensor_out, tensor_size, tensor_frame, tensor_offset], shader_bytes)
    sq_r.end()

    # - pull the rendered output back to local memory
    sq_sl = mgr.create_sequence()
    sq_sl.begin()
    sq_sl.record_tensor_sync_local([tensor_out])
    sq_sl.end()

    # render frames
    for i in range(args.start, args.end + 1):
        if args.verbose > 0:
            print("rendering frame {}".format(i))

        # copy frame number to shader
        tensor_frame[0] = i
        sq_sdf.eval()

        # split into smaller chunks
        for j in range(16):
            if args.verbose > 1:
                print("- rendering chunk {}".format(j))

            # copy offset to shader
            tensor_offset[0] = j
            sq_sdo.eval()

            # run shader
            sq_r.eval()

        # copy frame from shader once every chunk has been rendered
        sq_sl.eval()

        # save frame to output; rows are flipped vertically before saving
        frame = np.flip(
            np.array(tensor_out.data()).reshape((SIZE[1], SIZE[0], 3)),
            axis=0)
        plt.imsave("output/image{}.png".format(i), frame)

        # visualize
        if args.vis:
            # create surface from array
            surf2 = pygame.surfarray.make_surface(
                np.swapaxes(frame, 0, 1) * 255)

            # weird pygame bug: blit and update twice (kept from the
            # original as a deliberate workaround)
            surf.blit(surf2, (0, 0))
            pygame.display.update()
            surf.blit(surf2, (0, 0))
            pygame.display.update()

            # stop on last frame: keep the window open until it is closed
            if i == args.end:
                while True:
                    for event in pygame.event.get():
                        if event.type == pygame.QUIT:
                            quit()
def test_logistic_regression():
    """
    Train a two-weight logistic regression for 100 iterations, computing the
    per-sample gradients in a pyshader kernel and applying them on the host,
    then check the learned weights and bias fall in the expected ranges.
    """

    @ps.python2shader
    def compute_shader(
            index=("input", "GlobalInvocationId", ps.ivec3),
            x_i=("buffer", 0, ps.Array(ps.f32)),
            x_j=("buffer", 1, ps.Array(ps.f32)),
            y=("buffer", 2, ps.Array(ps.f32)),
            w_in=("buffer", 3, ps.Array(ps.f32)),
            w_out_i=("buffer", 4, ps.Array(ps.f32)),
            w_out_j=("buffer", 5, ps.Array(ps.f32)),
            b_in=("buffer", 6, ps.Array(ps.f32)),
            b_out=("buffer", 7, ps.Array(ps.f32)),
            l_out=("buffer", 8, ps.Array(ps.f32)),
            M=("buffer", 9, ps.Array(ps.f32))):

        i = index.x

        m = M[0]

        w_curr = vec2(w_in[0], w_in[1])
        b_curr = b_in[0]

        x_curr = vec2(x_i[i], x_j[i])
        y_curr = y[i]

        # Forward pass: sigmoid of the linear model
        z_dot = w_curr @ x_curr
        z = z_dot + b_curr
        y_hat = 1.0 / (1.0 + exp(-z))

        # Per-sample gradients, scaled by 1/m
        d_z = y_hat - y_curr
        d_w = (1.0 / m) * x_curr * d_z
        d_b = (1.0 / m) * d_z

        # Binary cross-entropy: the second term is weighted by (1 - y)
        loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))

        w_out_i[i] = d_w.x
        w_out_j[i] = d_w.y
        b_out[i] = d_b
        l_out[i] = loss

    mgr = kp.Manager(0)

    # First we create input and output tensors for shader
    tensor_x_i = kp.Tensor([0.0, 1.0, 1.0, 1.0, 1.0])
    tensor_x_j = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0])

    tensor_y = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0])

    tensor_w_in = kp.Tensor([0.001, 0.001])
    tensor_w_out_i = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])
    tensor_w_out_j = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    tensor_b_in = kp.Tensor([0.0])
    tensor_b_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    tensor_l_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0])

    # Sample count, read by the shader as M[0]
    tensor_m = kp.Tensor([tensor_y.size()])

    # We store them in an array for easier interaction
    params = [
        tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m,
    ]

    mgr.eval_tensor_create_def(params)

    # Create a managed sequence
    sq = mgr.create_sequence()

    # Clear previous operations and begin recording for new operations
    sq.begin()

    # Record operation to sync memory from local to GPU memory
    sq.record_tensor_sync_device([tensor_w_in, tensor_b_in])

    # Record operation to execute GPU shader against all our parameters
    sq.record_algo_data(params, compute_shader.to_spirv())

    # Record operation to sync memory from GPU to local memory
    sq.record_tensor_sync_local(
        [tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out])

    # Stop recording operations
    sq.end()

    ITERATIONS = 100
    learning_rate = 0.1

    # Perform machine learning training and inference across all input X and Y
    for _ in range(ITERATIONS):

        # Execute an iteration of the algorithm
        sq.eval()

        # Fetch each gradient buffer once per iteration rather than once
        # per sample; nothing modifies them until the next sq.eval()
        grads_w_i = tensor_w_out_i.data()
        grads_w_j = tensor_w_out_j.data()
        grads_b = tensor_b_out.data()

        # Apply the per-sample gradients one at a time
        for grad_w_i, grad_w_j, grad_b in zip(grads_w_i, grads_w_j, grads_b):
            tensor_w_in[0] -= learning_rate * grad_w_i
            tensor_w_in[1] -= learning_rate * grad_w_j
            tensor_b_in[0] -= learning_rate * grad_b

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
    assert tensor_w_in.data()[1] > 1.5
    assert tensor_b_in.data()[0] < 0.7