def backward(dY, cache):
    """Backprop one batch through the generator and the image encoder.

    dY: per-item gradients wrt the generator outputs (one entry per batch item).
    cache: the forward-pass cache; must contain 'Xe', 'F', 'Ws_shape',
        'generator_str' and 'gen_caches'.
    Returns a dict of gradients wrt the model parameters
    ('We', 'be', 'Ws' plus whatever the generator reports).
    """
    image_codes = cache['Xe']
    encoder_mat = cache['F']
    gen = decodeGenerator(cache['generator_str'])
    dWs = np.zeros(cache['Ws_shape'])   # grads wrt the word embedding matrix
    dXe = np.zeros(image_codes.shape)   # grads wrt the encoded image vectors
    grads = {}
    # backprop each item in the batch through the generator
    for i, (ix, gen_cache) in enumerate(cache['gen_caches']):
        step_grads = gen.backward(dY[i], gen_cache)
        # intercept the input-side gradients (word vectors and image vector);
        # everything left in step_grads is a model-parameter gradient
        word_grads = step_grads.pop('dXs')
        image_grad = step_grads.pop('dXi')
        accumNpDicts(grads, step_grads)
        dXe[i, :] += image_grad  # route into the encoded image vector
        for n, j in enumerate(ix):  # route into each word's embedding row
            dWs[j, :] += word_grads[n, :]
    # finally backprop into the image encoder
    dWe = encoder_mat.transpose().dot(dXe)
    dbe = np.sum(dXe, axis=0, keepdims=True)
    accumNpDicts(grads, {'We': dWe, 'be': dbe, 'Ws': dWs})
    return grads
def backward(dY, cache):
    """Compute parameter gradients for a batch: generator plus image encoder.

    dY is a sequence of gradients wrt generator outputs, one per batch item;
    cache is the matching forward-pass cache. Returns a gradient dict.
    """
    gen = decodeGenerator(cache['generator_str'])
    gen_caches = cache['gen_caches']
    dXe = np.zeros(cache['Xe'].shape)   # grads wrt encoded image vectors
    dWs = np.zeros(cache['Ws_shape'])   # grads wrt word embedding matrix
    grads = {}
    # walk the batch, backpropping each item through the generator
    for i in xrange(len(gen_caches)):
        word_ixs, one_cache = gen_caches[i]
        g = gen.backward(dY[i], one_cache)
        # pull out the input-side gradients; the rest are model params
        dXs = g.pop('dXs')
        dXe[i, :] += g.pop('dXi')
        accumNpDicts(grads, g)
        for pos, word_ix in enumerate(word_ixs):
            dWs[word_ix, :] += dXs[pos, :]
    # backprop into the linear image encoder (weights We, bias be)
    F = cache['F']
    accumNpDicts(grads, {
        'We': F.transpose().dot(dXe),
        'be': np.sum(dXe, axis=0, keepdims=True),
        'Ws': dWs,
    })
    return grads
def backward(dY, cache):
    """Batched backward pass, GPU-assisted variant with timing printouts.

    Same gradient routing as the plain backward(): generator gradients are
    split into word-embedding (Ws) and image-encoder (We, be) contributions.
    Additionally uploads the first item's WLSTM matrix to the GPU once and
    passes it to every Generator.backward call, and prints phase timings
    (Python 2 print statements) as debug instrumentation.
    """
    Xe = cache['Xe']
    generator_str = cache['generator_str']
    dWs = np.zeros(cache['Ws_shape'])  # grads wrt word embedding matrix
    gen_caches = cache['gen_caches']
    F = cache['F']
    dXe = np.zeros(Xe.shape)  # grads wrt encoded image vectors
    Generator = decodeGenerator(generator_str)
    # upload WLSTM from the FIRST cache only and reuse it for every item
    # (assumes all items carry the same weights — TODO confirm in forward())
    dmmy, gen_cache = gen_caches[0]
    g_WLSTM = cuda.to_device(np.asfortranarray(gen_cache['WLSTM']))
    # backprop each item in the batch
    grads = {}
    dt1 = 0  # cumulative time spent inside Generator.backward
    dt2 = 0  # cumulative time spent on gradient bookkeeping
    t0 = time.time()
    for i in xrange(len(gen_caches)):
        t1 = time.time()
        ix, gen_cache = gen_caches[i]  # unpack
        local_grads = Generator.backward(dY[i], gen_cache, g_WLSTM)
        dt1 += time.time() - t1
        t2 = time.time()
        # intercept the gradients wrt Xi and Xs
        dXs = local_grads['dXs']
        del local_grads['dXs']
        dXi = local_grads['dXi']
        del local_grads['dXi']
        accumNpDicts( grads, local_grads)  # add up the gradients wrt model parameters
        # now backprop from dXs to the image vector and word vectors
        dXe[i, :] += dXi  # image vector
        for n, j in enumerate(ix):  # and now all the other words
            dWs[j, :] += dXs[n, :]
        dt2 += time.time() - t2
    dt = time.time() - t0
    print 'Backward Pass:%0.4f Others :%0.4f' % (dt1, dt2)
    t0 = time.time()
    # finally backprop into the image encoder
    dWe = F.transpose().dot(dXe)
    dbe = np.sum(dXe, axis=0, keepdims=True)
    dt = time.time() - t0
    print 'MMult :%0.4f' % (dt)
    t0 = time.time()
    accumNpDicts(grads, {'We': dWe, 'be': dbe, 'Ws': dWs})
    dt = time.time() - t0
    print 'accum 2:%0.4f' % (dt)
    t0 = time.time()
    return grads
def backward(dY, cache):
    """Backprop a batch through the generator and the three view encoders.

    dY: per-item gradients wrt generator outputs. cache must hold, for each
    view v in {v1, v2, v3}: 'X_<v>_orig', 'We_<v>', 'be_<v>', plus
    'generator_str' and 'gen_caches'. Returns a gradient dict containing
    'We_<v>' / 'be_<v>' for each view and the generator's parameter grads.
    """
    views = ('v1', 'v2', 'v3')
    gen = decodeGenerator(cache['generator_str'])
    X_orig = {v: cache['X_%s_orig' % v] for v in views}
    # per-view encoder gradient accumulators, shaped like the parameters
    dWe = {v: np.zeros(cache['We_%s' % v].shape) for v in views}
    dbe = {v: np.zeros(cache['be_%s' % v].shape) for v in views}
    grads = {}
    for i, (ix, one_cache) in enumerate(cache['gen_caches']):
        g = gen.backward(dY[i], one_cache)
        # intercept each view's input-side gradient and fold it into the
        # corresponding encoder weight/bias accumulators
        for v in views:
            dX = g.pop('dX_%s' % v)
            dWe[v] += X_orig[v][i].dot(dX)
            dbe[v] += np.sum(dX, axis=0, keepdims=True)
        accumNpDicts(grads, g)  # remaining entries are model-parameter grads
    encoder_grads = {}
    for v in views:
        encoder_grads['We_%s' % v] = dWe[v]
        encoder_grads['be_%s' % v] = dbe[v]
    accumNpDicts(grads, encoder_grads)
    return grads