def L_op(self, inputs, outputs, out_grads):
    x, k = inputs
    k_grad = grad_undefined(self, 1, k, "topk: k is not differentiable")

    if not (self.return_indices or self.return_values):
        x_grad = grad_undefined(
            self,
            0,
            x,
            "topk: cannot get gradient without both indices and values",
        )
    else:
        x_shp = theano.tensor.shape(x)
        z_grad = out_grads[0]
        ndim = x.ndim
        axis = self.axis % ndim
        grad_indices = [
            arange(x_shp[i]).dimshuffle([0] + ["x"] * (ndim - i - 1))
            if i != axis
            else outputs[-1]
            for i in range(ndim)
        ]
        x_grad = x.zeros_like(dtype=z_grad.dtype)
        x_grad = set_subtensor(x_grad[tuple(grad_indices)], z_grad)

    return [x_grad, k_grad]
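# The scatter above can be hard to picture from the symbolic code alone.
# Below is a small NumPy-only sketch (illustrative, not Theano code; 1-D
# case with made-up values) of the same idea: the gradient w.r.t. the k
# selected values is written back at the returned indices, and every other
# entry of x gets a zero gradient.
import numpy as np

x = np.array([0.1, 0.7, 0.3, 0.9])
k = 2
idx = np.argsort(x)[-k:]               # indices of the top-k values
g_topk = np.array([1.0, 2.0])          # gradient w.r.t. the k outputs

g_x = np.zeros_like(x)
g_x[idx] = g_topk                      # scatter, as set_subtensor does above
print(g_x)                             # -> [0. 1. 0. 2.]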
def L_op(self, inputs, outputs, output_grads):
    # Gradients computed by Op
    assert self.compute_grad and len(outputs) == 2
    gradients = outputs[1]
    assert gradients is not None

    # Gradients of original function, to compose chain rule
    grad_op = output_grads[0]
    grad_shuffle = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(gradients)
    grad_bdot = T.basic.batched_dot(grad_op, grad_shuffle)
    grad_shuffle_reverse = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(grad_bdot)

    return [
        grad_shuffle_reverse,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
    ]
def grad(self, inputs, output_grads):
    return [
        self.gradients,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
        grad_undefined(self, 3, inputs[3]),
        grad_undefined(self, 4, inputs[4]),
    ]
def grad(self, inputs, output_grads):
    # self.gradients.shape = [seqLen, batchSize, outputSize]
    # output_grads[0].shape = [batchSize] (one cost per sequence)
    # So, reshape output_grads[0] to [1, batchSize, 1] for broadcasting
    output_grad = output_grads[0].reshape((1, -1, 1))
    return [output_grad * self.gradients,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2])]
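# A quick NumPy shape check (illustrative only, arbitrary sizes) of the
# broadcasting used above: per-sequence cost gradients of shape
# (batchSize,) are reshaped to (1, batchSize, 1) so they scale gradients
# of shape (seqLen, batchSize, outputSize) without an explicit tile.
import numpy as np

seq_len, batch_size, output_size = 5, 3, 7
gradients = np.random.rand(seq_len, batch_size, output_size)
cost_grad = np.random.rand(batch_size)

scaled = cost_grad.reshape((1, -1, 1)) * gradients
assert scaled.shape == (seq_len, batch_size, output_size)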
def grad(self, inputs, output_grads):
    gradients = CPUCTCGrad()(*inputs)
    return [
        gradients,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
        grad_undefined(self, 3, inputs[3]),
    ]
def grad(self, inputs, output_grads):
    gradients = CPUCTCGrad()(*inputs)
    # Chain rule: output_grads[0] holds one cost gradient per sequence;
    # reshape it to (1, batchSize, 1) so it broadcasts over the
    # (seqLen, batchSize, outputSize) CTC gradients.
    output_grad = output_grads[0].reshape((1, -1, 1))
    return [
        output_grad * gradients,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
        grad_undefined(self, 3, inputs[3]),
    ]
def grad(self, inputs, output_grads):
    # Enable gradient computation
    self.computeGradient.set_value(np.asarray([1], dtype=np.int32))
    return [self.gradients,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2]),
            grad_undefined(self, 3, inputs[3]),
            grad_undefined(self, 4, inputs[4])]
def L_op(self, inputs, outputs, output_grads):
    assert self.compute_grad and len(outputs) == 2
    gradients = outputs[1]
    assert gradients is not None

    grad_op = output_grads[0]
    # batched_dot treats the first axis as the batch axis, so shuffle the
    # batch axis to the front, multiply, then restore the original layout.
    total_grad = T.basic.batched_dot(
        grad_op, gradients.dimshuffle(1, 0, 2)
    ).dimshuffle(1, 0, 2)

    return [total_grad,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2])]
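# All of the snippets in this file follow the same pattern: return a real
# gradient for the differentiable inputs and grad_undefined for the rest.
# Below is a minimal, self-contained sketch of that pattern with a
# hypothetical ScaleByInt Op (not part of Theano); asking theano.grad for
# the gradient w.r.t. the integer factor raises NullTypeGradError.
import numpy as np
import theano
import theano.tensor as T
from theano.gradient import grad_undefined


class ScaleByInt(theano.Op):
    """Multiply x by an integer factor n; n is not differentiable."""

    __props__ = ()

    def make_node(self, x, n):
        x = T.as_tensor_variable(x)
        n = T.as_tensor_variable(n)
        return theano.Apply(self, [x, n], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, n = inputs
        output_storage[0][0] = np.asarray(x * n, dtype=x.dtype)

    def grad(self, inputs, output_grads):
        x, n = inputs
        gz, = output_grads
        return [gz * n,
                grad_undefined(self, 1, n,
                               "integer factor is not differentiable")]


x = T.vector("x")
n = T.iscalar("n")
cost = ScaleByInt()(x, n).sum()
gx = theano.grad(cost, x)       # well defined
# theano.grad(cost, n)          # would raise NullTypeGradError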
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go,
                                   inputIdx, outputIdx)
    hgrad = gpu_sparse_block_gemv(h.zeros_like(),
                                  W.dimshuffle((1, 0, 3, 2)),
                                  go, outputIdx, inputIdx)
    return [
        go, Wgrad, hgrad,
        grad_undefined(self, 3, inputIdx,
                       "grad of inputIdx makes no sense"),
        grad_undefined(self, 4, outputIdx,
                       "grad of outputIdx makes no sense"),
    ]
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    outer_fun = SparseBlockOuter(self.inplace)
    gemv_fun = SparseBlockGemv(self.inplace)

    Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
    hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                     go, outputIdx, inputIdx)
    return [go, Wgrad, hgrad,
            grad_undefined(self, 3, inputIdx,
                           "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx,
                           "grad of outputIdx makes no sense")]
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    Wgrad = sparse_block_outer_ss(W.zeros_like(), h, go,
                                  inputIdx, outputIdx)
    hgrad = sparse_block_gemv_ss(h.zeros_like(),
                                 W.dimshuffle((1, 0, 3, 2)),
                                 go, outputIdx, inputIdx)
    return [go, Wgrad, hgrad,
            grad_undefined(self, 3, inputIdx,
                           "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx,
                           "grad of outputIdx makes no sense")]
def grad(self, inp, grads):
    outs = self(*inp)
    grad_op = ROIPoolingGradOp(
        self.pooled_h, self.pooled_w, self.spatial_scale)
    data_grad = grad_op(*(inp + [outs[1], grads[0]]))
    return [data_grad, grad_undefined(self, 1, inp[1])]
def grad(self, inputs, ograd):
    return [
        gradient.grad_undefined(
            self, k, inp, "No gradient defined through random sampling op"
        )
        for k, inp in enumerate(inputs)
    ]
def grad(self, inputs, grads):
    o, W, h, inputIdx, outputIdx = inputs
    go = grads[0]

    # might revise that interface to not have a huge output
    Wgrad = sparse_block_outer_ss(W.zeros_like(), h, go,
                                  inputIdx, outputIdx)
    hgrad = sparse_block_gemv_ss(h.zeros_like(),
                                 W.dimshuffle((1, 0, 3, 2)),
                                 go, outputIdx, inputIdx)
    return [go, Wgrad, hgrad,
            grad_undefined(self, 3, inputIdx,
                           "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx,
                           "grad of outputIdx makes no sense")]
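# Shape sketch (NumPy, illustrative; the sizes are made up) of the
# dimshuffle used in the sparse-block gradients above.  Assuming W is laid
# out as (inBlocks, outBlocks, inSize, outSize), the gradient w.r.t. h
# reuses the forward gemv with both the block axes and the size axes
# swapped, i.e. with W transposed block-wise.
import numpy as np

W = np.zeros((4, 6, 8, 10))            # (inBlocks, outBlocks, inSize, outSize)
W_T = W.transpose(1, 0, 3, 2)          # same axes as W.dimshuffle((1, 0, 3, 2))
assert W_T.shape == (6, 4, 10, 8)      # (outBlocks, inBlocks, outSize, inSize)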
def grad(self, inputs, output_grads):
    a, axis = inputs
    indices = self.__get_argsort_indices(a, axis)
    inp_grad = output_grads[0][tuple(indices)]
    axis_grad = grad_undefined(
        self, 1, axis,
        "The gradient of sort is not defined "
        "with respect to the integer axis itself")
    return [inp_grad, axis_grad]
def grad(self, inputs, output_grads):
    # No gradient defined for integers.
    inp, axis = inputs
    inp_grad = inp.zeros_like()
    axis_grad = grad_undefined(
        self, 1, axis,
        "argsort is not defined for non-integer axes so"
        " argsort(x, axis+eps) is undefined")
    return [inp_grad, axis_grad]
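# NumPy sketch (illustrative, 1-D case) of the routing performed by the
# sort gradient above: each entry of the output gradient is sent back to
# the input position it came from, via the argsort permutation.  The
# argsort gradient itself is zero because the outputs are integer indices.
import numpy as np

x = np.array([3.0, 1.0, 2.0])
order = np.argsort(x)                      # permutation applied by sort
g_sorted = np.array([10.0, 20.0, 30.0])    # gradient w.r.t. sort(x)

g_x = np.empty_like(g_sorted)
g_x[order] = g_sorted                      # scatter back to original positions
print(g_x)                                 # -> [30. 10. 20.]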
def grad(self, inp, grads): x, neib_shape, neib_step = inp gz, = grads if self.mode in ['valid', 'ignore_borders']: if (neib_shape is neib_step or neib_shape == neib_step or # Theano Constant == do not compare the data # the equals function do that. (hasattr(neib_shape, "equals") and neib_shape.equals(neib_step))): return [neibs2images(gz, neib_shape, x.shape, mode=self.mode), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step)] return [grad_not_implemented(self, 0, x), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step)]
def grad(self, inputs, grads): logger.warning("BernoulliOp.grad(...) called") prob = inputs[0] noise = inputs[1] #import ipdb; ipdb.set_trace() #g0 = prob.zeros_like().astype(theano.config.floatX) g0 = prob * grads[0] g1 = grad_undefined(self, 1, noise) return [g0, g1]
def grad(self, inputs, output_gradients): V, W, b, d = inputs dCdH, = output_gradients # make all of these ops support broadcasting of scalar b to vector b and eplace the zeros_like in all their grads # print dCdH.broadcastable # print "dCdH.broadcastable" # quit(-1) # dCdH = printing.Print("dCdH = ",["shape"]) # Make sure the broadcasting pattern of the gradient is the the same # as the initial variable dCdV = theano.tensor.nnet.convTransp3D(W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4]) dCdV = T.patternbroadcast(dCdV, V.broadcastable) WShape = W.shape dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH) dCdW = T.patternbroadcast(dCdW, W.broadcastable) dCdb = T.sum(dCdH, axis=(0, 1, 2, 3)) dCdb = T.patternbroadcast(dCdb, b.broadcastable) dCdd = grad_undefined( self, 3, inputs[3], "The gradient of Conv3D with respect to the convolution" " stride is undefined because Conv3D is only defined for" " integer strides.") if 'name' in dir(dCdH) and dCdH.name is not None: dCdH_name = dCdH.name else: dCdH_name = 'anon_dCdH' if 'name' in dir(V) and V.name is not None: V_name = V.name else: V_name = 'anon_V' if 'name' in dir(W) and W.name is not None: W_name = W.name else: W_name = 'anon_W' if 'name' in dir(b) and b.name is not None: b_name = b.name else: b_name = 'anon_b' dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')' dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' + W_name + ')') dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' + W_name + ',b=' + b_name + ')') return [dCdV, dCdW, dCdb, dCdd]
def grad(self, inp, grads): x, neib_shape, neib_step = inp gz, = grads if self.mode in ['valid', 'ignore_borders']: if (neib_shape is neib_step or neib_shape == neib_step or # Theano Constant == do not compare the data # the equals function do that. (hasattr(neib_shape, "equals") and neib_shape.equals(neib_step) )): return [ neibs2images(gz, neib_shape, x.shape, mode=self.mode), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step) ] if self.mode in ['valid']: # Iterate over neighborhood positions, summing contributions. def pos2map(pidx, pgz, prior_result, neib_shape, neib_step): ''' Helper function that adds gradient contribution from a single neighborhood position i,j. pidx = Index of position within neighborhood. pgz = Gradient of shape (batch_size*num_channels*neibs) prior_result = Shape (batch_size, num_channnels, rows, cols) neib_shape = Number of rows, cols in a neighborhood. neib_step = Step sizes from image2neibs. ''' nrows, ncols = neib_shape rstep, cstep = neib_step batch_size, num_channels, rows, cols = prior_result.shape i = pidx // ncols j = pidx - (i * ncols) # This position does not touch some img pixels in valid mode. result_indices = prior_result[:, :, i:(rows - nrows + i + 1):rstep, j:(cols - ncols + j + 1):cstep] newshape = (batch_size, num_channels) + \ ((rows - nrows) // rstep + 1,) + \ ((cols - ncols) // cstep + 1,) return T.inc_subtensor(result_indices, pgz.reshape(newshape)) indices = T.arange(neib_shape[0] * neib_shape[1]) pgzs = gz.dimshuffle((1, 0)) result, _ = theano.scan(fn=pos2map, sequences=[indices, pgzs], outputs_info=T.zeros(x.shape), non_sequences=[neib_shape, neib_step]) grad_input = result[-1] return [ grad_input, grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step) ] return [ grad_not_implemented(self, 0, x), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step) ]
def grad(self, inputs, output_gradients): V, W, b, d = inputs dCdH, = output_gradients # make all of these ops support broadcasting of scalar b to vector b and eplace the zeros_like in all their grads # print dCdH.broadcastable # print "dCdH.broadcastable" # quit(-1) # dCdH = printing.Print("dCdH = ",["shape"]) # Make sure the broadcasting pattern of the gradient is the the same # as the initial variable dCdV = theano.tensor.nnet.convTransp3D( W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4]) dCdV = T.patternbroadcast(dCdV, V.broadcastable) WShape = W.shape dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH) dCdW = T.patternbroadcast(dCdW, W.broadcastable) dCdb = T.sum(dCdH, axis=(0, 1, 2, 3)) dCdb = T.patternbroadcast(dCdb, b.broadcastable) dCdd = grad_undefined( self, 3, inputs[3], "The gradient of Conv3D with respect to the convolution" " stride is undefined because Conv3D is only defined for" " integer strides.") if 'name' in dir(dCdH) and dCdH.name is not None: dCdH_name = dCdH.name else: dCdH_name = 'anon_dCdH' if 'name' in dir(V) and V.name is not None: V_name = V.name else: V_name = 'anon_V' if 'name' in dir(W) and W.name is not None: W_name = W.name else: W_name = 'anon_W' if 'name' in dir(b) and b.name is not None: b_name = b.name else: b_name = 'anon_b' dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')' dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' + W_name + ')') dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name + ',W=' + W_name + ',b=' + b_name + ')') return [dCdV, dCdW, dCdb, dCdd]
def grad(self, inputs, output_gradients):
    C, d, WShape, B = inputs
    dLdA, = output_gradients

    z = T.zeros_like(C[0, 0, 0, 0, :])
    dLdC = convTransp3D(dLdA, z, d, B, C.shape[1:4])
    # d actually does affect the outputs, so it's not disconnected
    dLdd = grad_undefined(self, 1, d)
    # The shape of the weights doesn't affect the output elements
    dLdWShape = DisconnectedType()()
    dLdB = conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)

    return [dLdC, dLdd, dLdWShape, dLdB]
def grad(self, inp, grads):
    axis, tensors = inp[0], inp[1:]
    gz, = grads

    rval = [grad_undefined(self, 0, axis)]
    out = ConcatenateGrad()(gz, axis, *tensors)
    if not isinstance(out, list):
        out = [out]
    rval = rval + out
    return rval
def grad(self, inp, grads): x, neib_shape, neib_step = inp gz, = grads if self.mode in ['valid', 'ignore_borders']: if (neib_shape is neib_step or neib_shape == neib_step or # Theano Constant == do not compare the data # the equals function do that. (hasattr(neib_shape, "equals") and neib_shape.equals(neib_step))): return [neibs2images(gz, neib_shape, x.shape, mode=self.mode), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step)] if self.mode in ['valid']: # Iterate over neighborhood positions, summing contributions. def pos2map(pidx, pgz, prior_result, neib_shape, neib_step): ''' Helper function that adds gradient contribution from a single neighborhood position i,j. pidx = Index of position within neighborhood. pgz = Gradient of shape (batch_size*num_channels*neibs) prior_result = Shape (batch_size, num_channnels, rows, cols) neib_shape = Number of rows, cols in a neighborhood. neib_step = Step sizes from image2neibs. ''' nrows, ncols = neib_shape rstep, cstep = neib_step batch_size, num_channels, rows, cols = prior_result.shape i = pidx // ncols j = pidx - (i * ncols) # This position does not touch some img pixels in valid mode. result_indices = prior_result[:, :, i:(rows - nrows + i + 1):rstep, j:(cols - ncols + j + 1):cstep] newshape = (batch_size, num_channels) + \ ((rows - nrows) // rstep + 1,) + \ ((cols - ncols) // cstep + 1,) return T.inc_subtensor(result_indices, pgz.reshape(newshape)) indices = T.arange(neib_shape[0] * neib_shape[1]) pgzs = gz.dimshuffle((1, 0)) result, _ = theano.scan(fn=pos2map, sequences=[indices, pgzs], outputs_info=T.zeros(x.shape), non_sequences=[neib_shape, neib_step]) grad_input = result[-1] return [grad_input, grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step)] return [grad_not_implemented(self, 0, x), grad_undefined(self, 1, neib_shape), grad_undefined(self, 2, neib_step)]
def grad(self, inp, cost_grad):
    """
    Notes
    -----
    The gradient is currently implemented for matrices only.
    """
    a, val, offset = inp
    grad = cost_grad[0]
    height, width = grad.shape

    if a.dtype.startswith("complex"):
        return [None, None]

    # only valid for matrices
    wr_a = fill_diagonal_offset(grad, 0, offset)

    offset_abs = basic.abs_(offset)
    pos_offset_flag = basic.ge(offset, 0)
    neg_offset_flag = basic.lt(offset, 0)
    min_wh = basic.minimum(width, height)

    start = offset * pos_offset_flag + offset_abs * width * neg_offset_flag
    num_of_step = basic.minimum(
        min_wh,
        width * pos_offset_flag + height * neg_offset_flag - offset_abs)

    step = a.shape[1] + 1
    end = start + step * num_of_step

    # input of slice should be integer
    start = basic.cast(start, "int32")
    step = basic.cast(step, "int32")
    end = basic.cast(end, "int32")

    wr_val = grad.flatten()[start:end:step].sum()

    wr_offset = grad_undefined(
        self, 2, offset,
        "offset is not defined for non-integer offset so"
        " fill_diagonal_offset(a,val,offset+eps) is undefined",
    )

    return [wr_a, wr_val, wr_offset]
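# Quick NumPy check (illustrative only) of the flattened-slice arithmetic
# above: the k-th diagonal of a (height, width) matrix occupies the flat
# indices start, start + (width + 1), ..., so summing that strided slice of
# the cost gradient gives the gradient w.r.t. the scalar `val`.
import numpy as np

grad = np.arange(20, dtype=float).reshape(4, 5)
height, width = grad.shape
for offset in (-2, 0, 3):
    start = offset if offset >= 0 else -offset * width
    num = min(min(height, width),
              (width if offset >= 0 else height) - abs(offset))
    step = width + 1
    end = start + step * num
    assert grad.flatten()[start:end:step].sum() == np.trace(grad, offset=offset)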
def grad(self, inputs, output_gradients):
    W, b, d, H, RShape = inputs
    dCdR, = output_gradients

    dCdH = theano.tensor.nnet.conv3D(dCdR, W,
                                     T.zeros_like(H[0, 0, 0, 0, :]), d)
    WShape = W.shape
    dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
    dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
    # not differentiable, since d affects the output elements
    dCdd = grad_undefined(self, 2, d)
    # disconnected, since RShape just determines the output shape
    dCdRShape = DisconnectedType()()

    # Give the gradients informative names for debugging.
    dCdR_name = dCdR.name if dCdR.name is not None else 'anon_dCdR'
    H_name = H.name if H.name is not None else 'anon_H'
    W_name = W.name if W.name is not None else 'anon_W'
    b_name = b.name if b.name is not None else 'anon_b'

    dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name)
    dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
                 ',W=' + W_name + ',b=' + b_name)
    dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name

    return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
def grad(self, inputs, outputs_gradients):
    a, *shape = inputs
    (dout,) = outputs_gradients

    # Determine the dimensions that were added by broadcasting
    new_dims = list(range(dout.ndim - a.ndim))
    d_wrt_a = broadcast_to(dout, shape).sum(axis=new_dims)

    # Determine the dimensions that were broadcast
    _, shape_bcast = basic.alloc_validate_shape(shape)
    bcast_sums = [
        i
        for i, (a_b, s_b) in enumerate(
            zip(a.broadcastable, shape_bcast[-a.ndim:]))
        if a_b and not s_b
    ]
    if bcast_sums:
        d_wrt_a = d_wrt_a.sum(axis=bcast_sums, keepdims=True)

    return [d_wrt_a] + [
        grad_undefined(self, i, shp) for i, shp in enumerate(shape, 1)
    ]
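# NumPy sketch (illustrative) of the reduction performed above: the
# gradient of a broadcast is the output gradient summed over every
# dimension that was added on the left and over every size-1 dimension
# that was broadcast, which restores the original shape.
import numpy as np

a = np.ones((3, 1))                       # original, broadcast to (2, 3, 4)
dout = np.random.rand(2, 3, 4)            # gradient w.r.t. the broadcast output

d_wrt_a = dout.sum(axis=0)                     # dimension added on the left
d_wrt_a = d_wrt_a.sum(axis=1, keepdims=True)   # size-1 dimension that was broadcast
assert d_wrt_a.shape == a.shape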
def grad(self, inp, grads):
    return [grad_undefined(self, 0, inp[0])]
def grad(self, inp, grads):
    return [grad_undefined(self, i, inp[i]) for i in xrange(3)]
def grad(self, inp, grads):
    x, indx = inp
    gz, = grads
    return [GpuAssigner()(x, indx, gz),
            grad_undefined(self, 1, inp[1])]
def grad(self, inp, grads):
    return [grad_undefined(self, i, inp[i]) for i in xrange(2)]
def grad(self, inp, output_grads):
    return [grad_undefined(self, i, inp[i]) for i in xrange(len(inp))]