# copy to device
devI = fp.array(I, iwl)
devF = fp.array(F, iwl)
devE = fp.array(E, iwl)


# set output bit widths at approximately mean scaling
def scale(n, q):
    """Return the float32 exponent field of ``0x7fff**2 * n / q``, minus 126.

    The value is packed as a native single-precision float, its four bytes
    are re-read as an unsigned integer, and everything below bit 23 (the
    mantissa in the IEEE 754 binary32 layout) is shifted away before the
    constant 126 is subtracted.
    """
    magnitude = float(0x7fff ** 2 * n) / q
    packed = struct.pack('f', magnitude)
    (bits,) = struct.unpack('I', packed)
    return (bits >> 23) - 126


# One width per output buffer; each is passed as the second argument when
# the matching buffer is allocated below.
# NOTE(review): "iwl" presumably stands for integer word length -- confirm
# against the fp library.
iwlO = scale(C * T * R * S, 2)
iwlB = scale(K * T * R * S, 4)
iwlU = scale(N * M * P * Q, 4)

# allocate output
devO = fp.empty(dimO, iwlO)  # written by fprop_conv
devB = fp.zeros(dimI, iwlB)  # written by bprop_conv
devU = fp.zeros(dimF, iwlU)  # written by update_conv

# Keyword arguments shared by the three timed calls.
args = {
    'padding': padding,
    'strides': strides,
    'upscale': upscale,
    'repeat': 100,
}

# perform convolutions
# Single-argument parenthesised print emits the same text on Python 2 and 3.
print('Warming up')
# NOTE(review): unlike the timed calls, the warm-up does not pass `padding`;
# confirm that relying on the library default here is intentional.
fp.fprop_conv(
    devI, devF, devO,
    strides=strides,
    upscale=upscale,
    repeat=10,
)  # spin up clock

print('Starting')
fp.fprop_conv(devI, devF, devO, **args)
fp.bprop_conv(devF, devE, devB, **args)
fp.update_conv(devI, devE, devU, **args)
print('Done')