( 64, 64, 64, 1, 224,224, 1, 3, 3, 0,1,1, 1,1,1), ( 64, 64,128, 1, 112,112, 1, 3, 3, 0,1,1, 1,1,1), ( 64,128,128, 1, 112,112, 1, 3, 3, 0,1,1, 1,1,1), ( 64,128,256, 1, 56, 56, 1, 3, 3, 0,1,1, 1,1,1), ( 64,256,256, 1, 56, 56, 1, 3, 3, 0,1,1, 1,1,1), ( 64,256,512, 1, 28, 28, 1, 3, 3, 0,1,1, 1,1,1), ( 64,512,512, 1, 28, 28, 1, 3, 3, 0,1,1, 1,1,1), ( 64,512,512, 1, 14, 14, 1, 3, 3, 0,1,1, 1,1,1), (128, 3, 64, 1, 224,224, 1,11,11, 0,3,3, 1,4,4), #Alexnet (128, 64,192, 1, 27, 27, 1, 5, 5, 0,2,2, 1,1,1), (128,192,384, 1, 13, 13, 1, 3, 3, 0,1,1, 1,1,1), (128,384,256, 1, 13, 13, 1, 3, 3, 0,1,1, 1,1,1), (128,256,256, 1, 13, 13, 1, 3, 3, 0,1,1, 1,1,1),): conv = ng.conv_layer(dtype, *dims) N,C,K = conv.NCK D,H,W = conv.DHW T,R,S = conv.TRS M,P,Q = conv.MPQ pad_d, pad_h, pad_w = conv.padding str_d, str_h, str_w = conv.strides alpha, beta = (1.0, 0.0) dimI = conv.dimI2 dimF = conv.dimF2 dimO = conv.dimO2 print "cudnn:"
def run():
    """Cross-check two forward-convolution kernels on one random layer.

    Builds a single convolution layer (N=128, C=3, K=64, 64x64 input,
    8x8 filter, stride 4, no padding), runs ``fprop_cuda_conv`` and
    ``fprop_conv`` on the same random images and filters, prints the
    input/output sums of each run, and asserts that both outputs agree
    element-wise to within 1e-4.

    Raises:
        AssertionError: if any pair of corresponding output elements
            differs by 1e-4 or more.
    """
    ng = NervanaGPU(stochastic_round=False)
    dt = np.float32

    # N: Number of images in mini-batch
    # C: Number of input feature maps
    # K: Number of output feature maps
    # D: Depth of input image
    # H: Height of input image
    # W: Width of input image
    # T: Depth of filter kernel
    # R: Height of filter kernel
    # S: Width of filter kernel
    #
    # Layouts used by the cc2 kernel:
    #   images:  (numColors, imgSizeY, imgSizeX, numImages) with stride given
    #   filters: (numColors, filterPixels, numFilters) if conv
    #            (numModules, numColors, filterPixels, numFilters) otherwise
    #   targets: (numFilters, numModulesY, numModulesX, numImages)
    N = 128
    C = 3
    K = 64
    D = 1
    H = 64
    W = 64
    T = 1
    R = 8
    S = 8
    pad_h = pad_w = 0
    str_h = str_w = 4

    layer = ng.conv_layer(dt, N, C, K, D=D, H=H, W=W, T=T, R=R, S=S,
                          pad_d=0, pad_h=pad_h, pad_w=pad_w,
                          str_d=1, str_h=str_h, str_w=str_w,
                          grid_P=0, grid_Q=0, update_size=None)

    numFilters = K
    # Number of filter positions along each spatial axis.
    numModulesY = int(math.ceil(float(H - R + 1 + 2*pad_h) / str_h))
    numModulesX = int(math.ceil(float(W - S + 1 + 2*pad_w) / str_w))

    # Every print below is a single pre-formatted argument so the emitted
    # text is the same whether print is a statement or a function.
    print("Num Modules  %s %s" % (numModulesX, numModulesY))

    # Set up images, filters, and outputs
    hostImages = np.random.rand(C, H, W, N)
    hostFilters = np.random.uniform(low=0.0, high=1.0,
                                    size=(C, S*R, numFilters))
    hostOutputs = np.zeros((numFilters, numModulesY, numModulesX, N))

    print("Input sum %s" % (np.sum(hostImages),))

    # Run cc2 kernel
    devI = ng.array(hostImages, dtype=dt)
    devF = ng.array(hostFilters, dtype=dt)
    devO = ng.array(hostOutputs, dtype=dt)

    ng.fprop_cuda_conv(layer, devI, devF, devO)

    print("CC2 input sum:  %s" % (np.sum(devI.asnumpyarray()),))
    print("CC2 output sum:  %s" % (np.sum(devO.asnumpyarray()),))

    # Run maxwell kernel on the same host data, flattened to 2-D:
    #   images:  (C * H * W, N)
    #   filters: (C * S * R, numFilters)
    #   outputs: (numFilters * numModulesX * numModulesY, N)
    devI = ng.array(hostImages.reshape((C*H*W, N)), dtype=dt)
    devF = ng.array(hostFilters.reshape((C*S*R, numFilters)), dtype=dt)
    devO2 = ng.array(
        hostOutputs.reshape(numFilters*numModulesX*numModulesY, N), dtype=dt)

    ng.fprop_conv(layer, devI, devF, devO2)

    print("NG input sum:  %s" % (np.sum(devI.asnumpyarray()),))
    print("NG output sum:  %s" % (np.sum(devO2.asnumpyarray()),))

    hostOutputs1 = np.reshape(devO.asnumpyarray(), devO2.shape)
    hostOutputs2 = devO2.asnumpyarray()

    # One vectorized comparison instead of a Python-level loop over every
    # element; NaNs still fail the check because NaN < 1e-4 is False.
    assert np.all(np.abs(hostOutputs1 - hostOutputs2) < 1e-4)
(64, 64, 64, 1, 224, 224, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 64, 128, 1, 112, 112, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 128, 128, 1, 112, 112, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 128, 256, 1, 56, 56, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 256, 256, 1, 56, 56, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 256, 512, 1, 28, 28, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 512, 512, 1, 28, 28, 1, 3, 3, 0, 1, 1, 1, 1, 1), (64, 512, 512, 1, 14, 14, 1, 3, 3, 0, 1, 1, 1, 1, 1), (128, 3, 64, 1, 224, 224, 1, 11, 11, 0, 3, 3, 1, 4, 4), #Alexnet (128, 64, 192, 1, 27, 27, 1, 5, 5, 0, 2, 2, 1, 1, 1), (128, 192, 384, 1, 13, 13, 1, 3, 3, 0, 1, 1, 1, 1, 1), (128, 384, 256, 1, 13, 13, 1, 3, 3, 0, 1, 1, 1, 1, 1), (128, 256, 256, 1, 13, 13, 1, 3, 3, 0, 1, 1, 1, 1, 1), ): conv = ng.conv_layer(dtype, *dims) N, C, K = conv.NCK D, H, W = conv.DHW T, R, S = conv.TRS M, P, Q = conv.MPQ pad_d, pad_h, pad_w = conv.padding str_d, str_h, str_w = conv.strides alpha, beta = (1.0, 0.0) dimI = conv.dimI2 dimF = conv.dimF2 dimO = conv.dimO2 print "cudnn:"
# Report which GPU the benchmark is running on.
print(context.get_device().name())

# Wide, effectively untruncated array printing; floats are shown with no
# decimals and ints are right-aligned in 10 columns.
np.set_printoptions(threshold=8193, linewidth=600,
                    formatter={'int': lambda x: "%10d" % x,
                               'float': lambda x: "% .0f" % x})

ops = {"update"}  # any of "fprop", "bprop", "update"
ones = 0
cpu = 0  # Set CPU to 1 to check against CPU
repeat = 1
dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=True)

conv = ng.conv_layer(
    dtype,
    16, 3, 8,    # N,C,K
    1, 64, 64,   # D,H,W
    1, 3, 3,     # T,R,S
    0, 1, 1,     # padding
    1, 1, 1)     # strides

dimI = conv.dimI
dimF = conv.dimF
dimO = conv.dimO


# collapse outer dimensions into one and preserve inner dimension
# this allows for easy cpu convolution in numpy
def slicable(dim, pad=0):
    """Return a 2-D shape ``(prod(dim[:-1]) + pad, dim[-1])``.

    Args:
        dim: sequence of dimension sizes; must contain at least one entry.
        pad: extra rows added to the collapsed leading dimension.

    Returns:
        A 2-tuple: the product of all but the last entry of ``dim`` plus
        ``pad``, and the last entry of ``dim`` unchanged.
    """
    dim0 = reduce(mul, dim[:-1], 1) + pad
    return (dim0, dim[-1])
# Name of the active GPU device.
print(context.get_device().name())

# Array printing: wide lines, high summarization threshold, ints padded to
# 10 columns and floats rendered without a fractional part.
np.set_printoptions(
    threshold=8193,
    linewidth=600,
    formatter={
        'int': lambda x: "%10d" % x,
        'float': lambda x: "% .0f" % x,
    }
)

# Operations to run; choose from "fprop", "bprop", "update".
ops = {"update"}
ones = 0
# Set CPU to 1 to check against CPU
cpu = 0
repeat = 1
dtype = np.float32

ng = NervanaGPU(stochastic_round=False, bench=True)

conv = ng.conv_layer(dtype,
                     16, 3, 8,     # N,C,K
                     1, 64, 64,    # D,H,W
                     1, 3, 3,      # T,R,S
                     0, 1, 1,      # padding
                     1, 1, 1)      # strides

dimI, dimF, dimO = conv.dimI, conv.dimF, conv.dimO


def slicable(dim, pad=0):
    """Flatten every axis but the last into one, keeping the last as-is.

    The leading extent is the product of ``dim[:-1]`` plus ``pad``; the
    resulting 2-D shape makes a CPU convolution easy to write in numpy.
    """
    leading = 1
    for extent in dim[:-1]:
        leading = leading * extent
    return (leading + pad, dim[-1])