def test_dot():
    """Compare gpu.dot with np.dot on a random (2, 4) x (4, 2) float32 product.

    Two call forms are exercised:

    * ``gpu.dot(B1, B2)``, where the product is taken from the return value;
    * ``gpu.dot(B1, B2, B3)``, where ``B3`` comes from ``gpu.empty((2, 2))``
      and the test expects to find the product in it afterwards.

    Both results are copied back with ``tocpu()`` and checked against the
    NumPy product with ``4`` as the precision argument (the ``decimal``
    parameter, assuming ``t`` is ``numpy.testing`` -- its import is not
    visible in this block).
    """
    A1 = np.float32(np.random.rand(2, 4))
    A2 = np.float32(np.random.rand(4, 2))
    # Reference product, computed once on the host and shared by both checks.
    expected = np.dot(A1, A2)

    # Form 1: result returned by gpu.dot.
    B1 = gpu.array(A1)
    B2 = gpu.array(A2)
    B3 = gpu.dot(B1, B2)
    C = B3.tocpu()
    # The message names the operation under test, so a failure is not
    # mistaken for a tocpu() round-trip problem.
    t.assert_array_almost_equal(
        expected, C, 4,
        "gpu.dot(A, B) result not equal to np.dot(A, B)!")

    # Form 2: result requested in a caller-supplied output array.
    # Operands are uploaded again, as in the original test.
    B1 = gpu.array(A1)
    B2 = gpu.array(A2)
    B3 = gpu.empty((2, 2))
    gpu.dot(B1, B2, B3)
    t.assert_array_almost_equal(
        expected, B3.tocpu(), 4,
        "gpu.dot(A, B, out) result not equal to np.dot(A, B)!")
def test_dot():
    """Compare gpu.dot with np.dot on a random (2, 4) x (4, 2) float32 product.

    Two call forms are exercised:

    * ``gpu.dot(B1, B2)``, where the product is taken from the return value;
    * ``gpu.dot(B1, B2, B3)``, where ``B3`` comes from ``gpu.empty((2, 2))``
      and the test expects to find the product in it afterwards.

    Both results are copied back with ``tocpu()`` and checked against the
    NumPy product with ``4`` as the precision argument (the ``decimal``
    parameter, assuming ``t`` is ``numpy.testing`` -- its import is not
    visible in this block).
    """
    A1 = np.float32(np.random.rand(2, 4))
    A2 = np.float32(np.random.rand(4, 2))
    # Reference product, computed once on the host and shared by both checks.
    expected = np.dot(A1, A2)

    # Form 1: result returned by gpu.dot.
    B1 = gpu.array(A1)
    B2 = gpu.array(A2)
    B3 = gpu.dot(B1, B2)
    C = B3.tocpu()
    # The message names the operation under test, so a failure is not
    # mistaken for a tocpu() round-trip problem.
    t.assert_array_almost_equal(
        expected, C, 4,
        "gpu.dot(A, B) result not equal to np.dot(A, B)!")

    # Form 2: result requested in a caller-supplied output array.
    # Operands are uploaded again, as in the original test.
    B1 = gpu.array(A1)
    B2 = gpu.array(A2)
    B3 = gpu.empty((2, 2))
    gpu.dot(B1, B2, B3)
    t.assert_array_almost_equal(
        expected, B3.tocpu(), 4,
        "gpu.dot(A, B, out) result not equal to np.dot(A, B)!")
def test_timer():
    """Exercise gpu.Timer in three ways around gpu.dot on 100x100 operands.

    1. Unnamed ``tick()`` / ``tock()`` around ten products.
    2. Named ``tick("Timer test")`` / ``tock("Timer test")`` around one product.
    3. A ``tick('cumulative')`` before and after each of 100 products,
       followed by a single ``tock('cumulative')``.

    The first two readings must be positive, and the third must exceed five
    times the single-product reading.

    Returns immediately, asserting nothing, when ``gpu.lib.pt_clusterNet``
    equals ``gpu.lib.pt_clusterNetCPU`` (presumably the CPU backend is
    active -- confirm against the library).
    """
    if gpu.lib.pt_clusterNet == gpu.lib.pt_clusterNetCPU:
        return

    stopwatch = gpu.Timer()
    lhs = gpu.rand(100, 100)
    rhs = gpu.rand(100, 100)
    out = gpu.rand(100, 100)

    # Phase 1: unnamed timer spanning a batch of ten products.
    stopwatch.tick()
    for _ in range(10):
        gpu.dot(lhs, rhs, out)
    batch_elapsed = stopwatch.tock()
    assert batch_elapsed > 0

    # Phase 2: named timer spanning a single product.
    stopwatch.tick("Timer test")
    gpu.dot(lhs, rhs, out)
    single_elapsed = stopwatch.tock("Timer test")
    assert single_elapsed > 0

    # Phase 3: the same name is ticked before and after every product.
    # NOTE(review): this presumably starts/pauses the named timer so that the
    # final tock reports the accumulated total -- confirm against gpu.Timer.
    for _ in range(100):
        stopwatch.tick('cumulative')
        gpu.dot(lhs, rhs, out)
        stopwatch.tick('cumulative')
    total_elapsed = stopwatch.tock('cumulative')

    assert total_elapsed > 5 * single_elapsed
def test_timer():
    """Exercise gpu.Timer in three ways around gpu.dot on 100x100 operands.

    1. Unnamed ``tick()`` / ``tock()`` around ten products.
    2. Named ``tick("Timer test")`` / ``tock("Timer test")`` around one product.
    3. A ``tick('cumulative')`` before and after each of 100 products,
       followed by a single ``tock('cumulative')``.

    The first two readings must be positive, and the third must exceed five
    times the single-product reading.

    Returns immediately, asserting nothing, when ``gpu.lib.pt_clusterNet``
    equals ``gpu.lib.pt_clusterNetCPU`` (presumably the CPU backend is
    active -- confirm against the library).
    """
    if gpu.lib.pt_clusterNet == gpu.lib.pt_clusterNetCPU:
        return

    stopwatch = gpu.Timer()
    lhs = gpu.rand(100, 100)
    rhs = gpu.rand(100, 100)
    out = gpu.rand(100, 100)

    # Phase 1: unnamed timer spanning a batch of ten products.
    stopwatch.tick()
    for _ in range(10):
        gpu.dot(lhs, rhs, out)
    batch_elapsed = stopwatch.tock()
    assert batch_elapsed > 0

    # Phase 2: named timer spanning a single product.
    stopwatch.tick("Timer test")
    gpu.dot(lhs, rhs, out)
    single_elapsed = stopwatch.tock("Timer test")
    assert single_elapsed > 0

    # Phase 3: the same name is ticked before and after every product.
    # NOTE(review): this presumably starts/pauses the named timer so that the
    # final tock reports the accumulated total -- confirm against gpu.Timer.
    for _ in range(100):
        stopwatch.tick('cumulative')
        gpu.dot(lhs, rhs, out)
        stopwatch.tick('cumulative')
    total_elapsed = stopwatch.tock('cumulative')

    assert total_elapsed > 5 * single_elapsed
input = gpu.rand(dim_inner,dim1) W = gpu.rand(dim_outer,dim_inner) output = gpu.rand(dim_outer,dim1) input2 = gpu2.random.rand(dim_inner,dim1) W2 = gpu2.random.rand(dim_outer,dim_inner) output2 = gpu2.random.rand(dim_outer,dim1) mean_time = 0 for i in range(5): iters = 100 #warmup for j in range(1000): if batch_first_mode: gpu.dot(input,W,output) else: gpu.dot(W, input, output) t.tick(str(dim_inner)) for j in range(iters): if batch_first_mode: gpu.dot(input,W,output) else: gpu.dot(W, input, output) t.tick(str(dim_inner)) print t.tock(str(dim_inner))/5/iters mean_time = 0
dim_inner = 32 dim_outer = 256 for i in range(1000): dim_inner += 32 A = gpu.rand(dim1, dim_inner) B = gpu.rand(dim_inner, dim_outer) C = gpu.rand(dim1, dim_outer) if dim_inner > 0: iters = 1000 if dim_inner > 100: iters = 100 if dim_inner > 1000: iters = 10 if dim_inner > 3000: iters = 4 #warmup for j in range(2): gpu.dot(A, B, C) t.tick(str(dim_inner)) for j in range(iters): gpu.dot(A, B, C) sec = t.tock(str(dim_inner)) / 1000. tilesA = (dim1 / 16) * ((dim_inner / 64) + (1 if dim_inner % 64 > 0 else 0)) tilesB = ((dim_inner / 64) + (1 if dim_inner % 64 > 0 else 0)) * ( (dim_inner / 16) * ((dim_outer / 64) + (1 if dim_outer % 64 > 0 else 0))) memops = (tilesA + tilesB) * 16 * 64 + (dim_inner * dim_outer) #print sec / (memops*iters) #print (memops/sec)*4*(1024**-3)*iters #print iters*(dim**3)/(sec*1000*1000*1000) #print iters*(dim1*dim_inner*dim_outer)/(sec*1000*1000*1000)
else: input = gpu.rand(dim_inner, dim1) W = gpu.rand(dim_outer, dim_inner) output = gpu.rand(dim_outer, dim1) input2 = gpu2.random.rand(dim_inner, dim1) W2 = gpu2.random.rand(dim_outer, dim_inner) output2 = gpu2.random.rand(dim_outer, dim1) mean_time = 0 for i in range(5): iters = 100 #warmup for j in range(1000): if batch_first_mode: gpu.dot(input, W, output) else: gpu.dot(W, input, output) t.tick(str(dim_inner)) for j in range(iters): if batch_first_mode: gpu.dot(input, W, output) else: gpu.dot(W, input, output) t.tick(str(dim_inner)) print t.tock(str(dim_inner)) / 5 / iters mean_time = 0 for i in range(5): iters = 100
dim_inner = 32 dim_outer = 256 for i in range(1000): dim_inner += 32 A = gpu.rand(dim1,dim_inner) B = gpu.rand(dim_inner,dim_outer) C = gpu.rand(dim1,dim_outer) if dim_inner > 0: iters = 1000 if dim_inner > 100: iters = 100 if dim_inner > 1000: iters = 10 if dim_inner > 3000: iters = 4 #warmup for j in range(2): gpu.dot(A,B,C) t.tick(str(dim_inner)) for j in range(iters): gpu.dot(A,B,C) sec = t.tock(str(dim_inner))/1000. tilesA = (dim1/16)*((dim_inner/64) + (1 if dim_inner % 64 > 0 else 0)) tilesB = ((dim_inner/64) + (1 if dim_inner % 64 > 0 else 0))*((dim_inner/16)*((dim_outer/64) + (1 if dim_outer % 64 > 0 else 0))) memops = (tilesA+tilesB)*16*64 + (dim_inner*dim_outer) #print sec / (memops*iters) #print (memops/sec)*4*(1024**-3)*iters #print iters*(dim**3)/(sec*1000*1000*1000) #print iters*(dim1*dim_inner*dim_outer)/(sec*1000*1000*1000) print iters*dim1*dim_inner*dim_outer/(6144.*1000*1000*1000)*24, sec A2 = gpu.rand(dim1,dim_inner)