def grad(self, inputs, output_grads):
  if not self.lua_bw_func:
    # Unknown how to calculate gradient.
    return [T.DisconnectedType()() for inp in inputs]
  assert len(self.in_info) == len(inputs)
  assert len(self.out_info) == len(output_grads)
  out_info = [info.copy() for info in self.in_info]
  for idx, info in enumerate(out_info):
    # Refer to input shapes. See infer_shape().
    info["shape"] = [(idx, i) for i in range(info["ndim"])]
  out_info = [info for info in out_info if info.get("gradient", "") != "disconnected"]
  grad_op = TorchWrapperOp(
    name="grad-of-%s" % self.name,
    in_info=self.in_info + self.out_info,  # inputs + output_grads
    out_info=out_info,
    lua_file=self.lua_file,
    lua_fw_func=self.lua_bw_func)
  input_grads = grad_op(*(inputs + output_grads))
  assert len(out_info) == len(input_grads)
  results = []
  for info in self.in_info:
    if info.get("gradient", "") == "disconnected":
      results += [T.DisconnectedType()()]
    else:
      results += input_grads[:1]
      input_grads = input_grads[1:]
  assert len(input_grads) == 0
  assert len(results) == len(self.in_info)
  return results
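
# A minimal, self-contained illustration (not part of the code above) of the
# DisconnectedType convention used throughout these grad() methods: when an Op
# declares a gradient as disconnected, T.grad() raises by default and returns
# zeros when called with disconnected_inputs="ignore".
import numpy
import theano
import theano.tensor as T
from theano import gradient

x = T.fvector("x")
y = gradient.disconnected_grad(x).sum()  # cuts the gradient path through x
gx = T.grad(y, x, disconnected_inputs="ignore")  # zeros instead of an error
f = theano.function([x], gx, on_unused_input="ignore")
print(f(numpy.zeros(3, dtype="float32")))  # prints zeros: x is disconnected
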

def grad(self, inputs, output_grads):
  (doutput,) = output_grads
  if self.breakpoint_grad:
    doutput = Breakpoint(self.var_names, self.cond, self.tb, self.py_vars, True, True)(doutput, *inputs[1:])
  return [doutput] + [T.DisconnectedType()() for _ in range(self.nvars)]

def _theano_cpu_multi_batch_beam_grad(array, start_idxs, batch_lens, beam_width, wrap_mode,
                                      pad_left=0, pad_right=0, idx_dim=0, batch_dim=1, output_grad=None):
  # Note: This is slow and hacky. It creates an index array of the size of the original array,
  # calculated on the CPU. The subtensor can then be done on the GPU, but we should avoid the first part.
  D_beam = output_grad
  prod_array_shape = T.prod(array.shape)
  prod_pad_left_shape = T.prod(pad_left.shape)
  prod_pad_right_shape = T.prod(pad_right.shape)
  D_array_tmp_size = prod_array_shape
  if wrap_mode == "pad":
    D_array_tmp_size += prod_pad_left_shape + prod_pad_right_shape
  D_array_tmp_flat = T.zeros([D_array_tmp_size], dtype="float32")  # with pad values
  if wrap_mode == "pad":
    # Calculate the indices for D_pad_left/D_pad_right in D_array_tmp_flat.
    pad_left_idxs = T.arange(prod_pad_left_shape) + prod_array_shape
    pad_right_idxs = T.arange(prod_pad_right_shape) + prod_array_shape + prod_pad_left_shape
    pad_left_idxs = pad_left_idxs.reshape(pad_left.shape)
    pad_right_idxs = pad_right_idxs.reshape(pad_right.shape)
  else:
    pad_left_idxs = pad_right_idxs = 0
  all_idxs = T.arange(T.prod(array.shape)).reshape(array.shape)
  idxs = multi_batch_beam(array=all_idxs, start_idxs=start_idxs, batch_lens=batch_lens, beam_width=beam_width,
                          wrap_mode=wrap_mode, pad_left=pad_left_idxs, pad_right=pad_right_idxs,
                          idx_dim=idx_dim, batch_dim=batch_dim)
  D_array_tmp_flat = T.inc_subtensor(D_array_tmp_flat[idxs.flatten()], D_beam.flatten())
  if wrap_mode == "pad":
    D_array = D_array_tmp_flat[:prod_array_shape].reshape(array.shape)
    D_pad_left = D_array_tmp_flat[pad_left_idxs.flatten()].reshape(pad_left.shape)
    D_pad_right = D_array_tmp_flat[pad_right_idxs.flatten()].reshape(pad_right.shape)
  else:
    D_array = D_array_tmp_flat.reshape(array.shape)
    D_pad_left = D_pad_right = T.DisconnectedType()()
  return D_array, D_pad_left, D_pad_right

def grad(self, inputs, grads):
  x, ilist = inputs
  gz, = grads
  assert len(inputs) == 2
  if self.sparse_grad:
    raise RuntimeError("sparse grad not supported for AdvancedSubtensor1Floats")
  setinc, inpl = self.set_instead_of_inc, self.inplace
  inc_op = AdvancedIncSubtensor1Floats(set_instead_of_inc=setinc, inplace=inpl)
  rval1 = [inc_op(x.zeros_like(), gz, ilist)]
  return rval1 + [T.DisconnectedType()()] * (len(inputs) - 1)
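
# Hedged usage sketch: AdvancedSubtensor1Floats (defined elsewhere in this
# module) is assumed to behave like Theano's AdvancedSubtensor1, but with
# float-typed row indices. Differentiating the gather routes through the grad()
# above, which scatters the output gradient into x.zeros_like() via
# AdvancedIncSubtensor1Floats; the gradient w.r.t. the indices is disconnected.
import theano.tensor as T

x = T.fmatrix("x")
idx = T.fvector("idx")  # float row indices
rows = AdvancedSubtensor1Floats()(x, idx)
gx = T.grad(rows.sum(), x)
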

def grad(self, inputs, output_grads):
  """
  For Theano.

  :param inputs:
  :param output_grads:
  :return:
  """
  if self.custom_grad:
    return self.custom_grad(self, inputs, output_grads)
  if not self.c_bw_code:
    # Unknown how to calculate gradient.
    return [T.DisconnectedType()() for inp in inputs]
  assert len(self.in_info) == len(inputs)
  assert len(self.out_info) == len(output_grads)
  # Some of output_grads might be of disconnected type.
  out_shapes = self.infer_shape(None, [v.shape for v in inputs])
  assert len(out_shapes) == len(output_grads)
  for i, out_grad in enumerate(output_grads):
    if isinstance(out_grad.type, T.DisconnectedType):
      output_grads[i] = T.zeros(out_shapes[i], dtype="float32")
  kwargs_for_grad = self.kwargs_for_grad_op()
  grad_op = self.__class__(**kwargs_for_grad)
  # noinspection PyCallingNonCallable
  grad_inputs = inputs + list(make_var_tuple(self(*inputs))) + output_grads
  grad_inputs = self._filter_grad_inputs(grad_inputs)
  assert len(grad_op.in_info) == len(grad_inputs)
  # noinspection PyCallingNonCallable
  grad_outputs = make_var_tuple(grad_op(*grad_inputs))
  assert len(grad_op.out_info) == len(grad_outputs)
  if grad_op.num_dummy_outs > 0:
    grad_outputs = grad_outputs[:-grad_op.num_dummy_outs]  # remove any dummy outputs
  return self.make_results_of_gradient(grad_outputs, disconnected_type=T.DisconnectedType())
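
# The helper make_var_tuple is used above but not shown here. A minimal sketch
# of the assumed behavior (normalize a single Variable or a list/tuple of
# Variables into a tuple), not necessarily the actual implementation:
import theano

def make_var_tuple(v):
  if isinstance(v, theano.Variable):
    return (v,)
  return tuple(v)
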

def grad(self, inputs, output_grads):
  array, start_idxs, batch_lens, beam_width, pad_left, pad_right = inputs
  D_beam, = output_grads
  if not isinstance(pad_left, theano.Constant):
    raise NotImplementedError("D_pad_left not implemented...")
  if not isinstance(pad_right, theano.Constant):
    raise NotImplementedError("D_pad_right not implemented...")
  grad_op = MultiBatchBeamGradAddOp(wrap_mode=self.wrap_mode, zero_with_shape=True, array_ndim=array.ndim,
                                    idx_dim=self.idx_dim, batch_dim=self.batch_dim)
  D_array = grad_op(array.shape, start_idxs, batch_lens, beam_width, D_beam)
  if self.wrap_mode == "wrap_around":
    D_pad_left = D_pad_right = T.DisconnectedType()()
  elif self.wrap_mode == "pad":
    D_pad_left = D_pad_right = T.DisconnectedType()()  # XXX...
    # D_pad_left = T.zeros(pad_left.shape, dtype="float32")
    # D_pad_right = T.zeros(pad_right.shape, dtype="float32")
  else:
    assert False, self.wrap_mode
  # These are all discrete values. The gradient is 0 almost everywhere, except at integer points, where it is not defined.
  D_start_idxs = T.DisconnectedType()()
  D_batch_lens = T.DisconnectedType()()
  D_beam_width = T.DisconnectedType()()
  return [D_array, D_start_idxs, D_batch_lens, D_beam_width, D_pad_left, D_pad_right]
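
# Hedged usage sketch of differentiating through multi_batch_beam (the
# user-facing function also called in _theano_cpu_multi_batch_beam_grad above).
# Variable dtypes and the defaults for pad_left/pad_right/idx_dim/batch_dim are
# assumptions here. Only the array input gets a real gradient; start_idxs,
# batch_lens and beam_width are index-like and declared disconnected above.
import theano.tensor as T

array = T.ftensor3("array")  # e.g. (time, batch, dim)
start_idxs = T.fvector("start_idxs")
batch_lens = T.fvector("batch_lens")
beam_width = T.iscalar("beam_width")
beam = multi_batch_beam(array=array, start_idxs=start_idxs, batch_lens=batch_lens,
                        beam_width=beam_width, wrap_mode="wrap_around")
D_array = T.grad(beam.sum(), array)
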

def grad(self, inputs, grads):
  g_output, = grads
  x, y, idx_list = inputs
  if x.dtype in theano.tensor.discrete_dtypes:
    # The output dtype is the same as x.
    gx = x.zeros_like(dtype=theano.config.floatX)
    if y.dtype in theano.tensor.discrete_dtypes:
      gy = y.zeros_like(dtype=theano.config.floatX)
    else:
      gy = y.zeros_like()
  elif x.dtype in theano.tensor.complex_dtypes:
    raise NotImplementedError("No support for complex grad yet")
  else:
    if self.set_instead_of_inc:
      gx_op = AdvancedIncSubtensor1Floats(set_instead_of_inc=True, inplace=self.inplace)
      gx = gx_op(g_output, y.zeros_like(), idx_list)
    else:
      gx = g_output
    gy = AdvancedSubtensor1Floats()(g_output, idx_list)
    gy = T.subtensor._sum_grad_over_bcasted_dims(y, gy)
  return [gx, gy] + [T.DisconnectedType()()]
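
# Hedged sketch of the scatter direction (same assumptions as the gather
# example above): incrementing rows of x by y at float indices, then
# differentiating w.r.t. both operands. Per the grad() above, d/dx passes the
# output gradient through, and d/dy gathers it back at the same indices.
import theano.tensor as T

x = T.fmatrix("x")
y = T.fmatrix("y")
idx = T.fvector("idx")
z = AdvancedIncSubtensor1Floats(set_instead_of_inc=False, inplace=False)(x, y, idx)
gx, gy = T.grad(z.sum(), [x, y])
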

def grad(self, inputs, doutputs):
  return [T.DisconnectedType()() for _ in inputs]

    if self.is_grad:
      place = 'theano gradient eval'
    else:
      place = 'theano eval'
    print('Breakpoint in %s, created at' % place, file=sys.stderr)
    print(' ...', file=sys.stderr)
    traceback.print_list(self.tb[-4:], sys.stderr)
    ipdb.set_trace()
    pass  # in theano breakpoint

  def grad(self, inputs, output_grads):
    doutput, = output_grads
    if self.breakpoint_grad:
      doutput = Breakpoint(self.var_names, self.cond, self.tb, self.py_vars, True, True)(doutput, *inputs[1:])
    return [doutput] + [T.DisconnectedType()() for _ in range(self.nvars)]


_theano_types = (theano.tensor.basic.TensorConstant,
                 theano.tensor.basic.TensorVariable,
                 theano.compile.SharedVariable)


def is_theano_var(x):
  return isinstance(x, _theano_types)


def breakpoint(output, vars=None, cond=lambda v: True, grad=True):
  tb = tuple(traceback.extract_stack()[:-1])
  py_vars = {}
  if type(vars) not in (tuple, list, dict, type(None)):
    raise ValueError('vars keyword arg must be None, dict, list or tuple')
  if not isinstance(vars, dict):