def errors(self):
  if self.loss in ('ctc', 'ce_ctc', 'ctc_warp'):
    from theano.tensor.extra_ops import cpu_contiguous
    return T.sum(BestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
  elif self.loss == 'hmm':
    from theano.tensor.extra_ops import cpu_contiguous
    return T.sum(TwoStateBestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
  elif self.loss == 'viterbi':
    scores = T.log(self.p_y_given_x) - self.prior_scale * T.log(self.priors)
    y = NumpyAlignOp(False)(self.sources[0].index, self.index, -scores, self.y)
    self.y_data_flat = y.flatten()
    return super(SequenceOutputLayer, self).errors()
  else:
    return super(SequenceOutputLayer, self).errors()
def make_node(self, activations, labels, input_lengths):
    t_activations = T.as_tensor_variable(activations)
    # Ensure activations array is C-contiguous
    t_activations = cpu_contiguous(t_activations)

    t_labels = T.as_tensor_variable(labels)
    t_input_lengths = T.as_tensor_variable(input_lengths)

    if t_activations.type.dtype != 'float32':
        raise TypeError('activations must use the float32 type!')

    if t_activations.ndim != 3:
        raise ValueError('activations must have 3 dimensions.')

    if t_labels.type.dtype != 'int32':
        raise TypeError('labels must use the int32 type!')

    if t_labels.ndim != 2:
        raise ValueError('labels must have 2 dimensions.')

    if t_input_lengths.type.dtype != 'int32':
        raise TypeError('input_lengths must use the int32 type!')

    if t_input_lengths.ndim != 1:
        raise ValueError('input_lengths must have 1 dimension.')

    costs = T.fvector(name="ctc_cost")
    outputs = [costs]

    if self.compute_grad:
        gradients = T.ftensor3(name="ctc_grad")
        outputs += [gradients]

    return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                     outputs=outputs)
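# Hedged usage sketch, not part of the original sources: it only illustrates the input
# contract that make_node() above enforces -- float32 activations of shape
# (time, batch, num_labels + 1), int32 labels of shape (batch, max_label_length), and
# int32 per-sequence input lengths. The wrapper name `theano.tensor.nnet.ctc.ctc` is an
# assumption about how the Op is exposed; adjust it to the binding actually in use.
import theano
import theano.tensor as T

def build_ctc_cost_fn():
  activations = T.ftensor3('activations')      # (time, batch, num_labels + 1), float32
  labels = T.imatrix('labels')                 # (batch, max_label_length), int32
  input_lengths = T.ivector('input_lengths')   # (batch,), int32
  from theano.tensor.nnet.ctc import ctc       # assumed warp-ctc wrapper around the Op above
  costs = ctc(activations, labels, input_lengths)  # fvector: one cost per batch entry
  return theano.function([activations, labels, input_lengths], costs)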
def cost(self):
  """
  :param y: shape (time*batch,) -> label
  :return: error scalar, known_grads dict
  """
  y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
  known_grads = None
  if self.loss == 'sprint':
    if not isinstance(self.sprint_opts, dict):
      import json
      self.sprint_opts = json.loads(self.sprint_opts)
    assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
    if self.exp_normalize:
      log_probs = T.log(self.p_y_given_x)
    else:
      log_probs = self.z
    sprint_error_op = SprintErrorSigOp(self.attrs.get("target", "classes"), self.sprint_opts)
    err, grad = sprint_error_op(log_probs, T.sum(self.index, axis=0))
    err = err.sum()
    if self.loss_like_ce:
      y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
      err = -T.sum(T.log(T.pow(self.p_y_given_x, y_ref)) * T.cast(self.index, "float32").dimshuffle(0, 1, 'x'))
    if self.ce_smoothing:
      err *= numpy.float32(1.0 - self.ce_smoothing)
      grad *= numpy.float32(1.0 - self.ce_smoothing)
      if not self.prior_scale:  # we kept the softmax bias as it was
        nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
      else:  # assume that we have subtracted the bias by the log priors beforehand
        assert self.log_prior is not None
        # In this case, for the CE calculation, we need to add the log priors again.
        y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                              (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
        nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
      ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
      err += ce
      grad += T.grad(ce, self.z)
    known_grads = {self.z: grad}
    return err, known_grads
  elif self.loss == 'ctc':
    from theano.tensor.extra_ops import cpu_contiguous
    err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
    known_grads = {self.z: grad}
    return err.sum(), known_grads, priors.sum(axis=0)
  elif self.loss == 'ce_ctc':
    y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
    p_y_given_x = T.nnet.softmax(y_m)
    #pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
    pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
    ce = -T.sum(T.log(pcx))
    return ce, known_grads
  elif self.loss == 'ctc2':
    from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
    max_time = self.z.shape[0]
    num_batches = self.z.shape[1]
    time_mask = self.index.reshape((max_time, num_batches))
    y_batches = self.y_data_flat.reshape((max_time, num_batches))
    targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
    log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
    err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
    return err, known_grads
def test_cpu_contiguous():
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    f = theano.function([a, i], cpu_contiguous(a.reshape((5, 4))[::i]))
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(node.op, CpuContiguous) for node in topo])
    assert f(a_val, 1).flags['C_CONTIGUOUS']
    assert f(a_val, 2).flags['C_CONTIGUOUS']
    assert f(a_val, 3).flags['C_CONTIGUOUS']
def test_cpu_contiguous():
    a = T.fmatrix("a")
    i = T.iscalar("i")
    a_val = np.asarray(np.random.rand(4, 5), dtype="float32")
    f = theano.function([a, i], cpu_contiguous(a.reshape((5, 4))[::i]))
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(node.op, CpuContiguous) for node in topo])
    assert f(a_val, 1).flags["C_CONTIGUOUS"]
    assert f(a_val, 2).flags["C_CONTIGUOUS"]
    assert f(a_val, 3).flags["C_CONTIGUOUS"]
    # Test the grad:
    utt.verify_grad(cpu_contiguous, [np.random.rand(5, 7, 2)])
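# Hedged standalone sketch, not from the original sources: it mirrors the pattern used by
# the output layers above -- labels are transposed with dimshuffle(1, 0) and then passed
# through cpu_contiguous() so the downstream CTC ops receive a C-contiguous int32 array
# regardless of how the transposed view is laid out in memory.
import numpy
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cpu_contiguous

y = T.imatrix('y')                              # labels, shape (time, batch)
y_bt = cpu_contiguous(y.dimshuffle(1, 0))       # (batch, time), forced C-contiguous
f = theano.function([y], y_bt)

y_val = numpy.arange(12, dtype='int32').reshape(3, 4)  # time=3, batch=4
out = f(y_val)
assert out.shape == (4, 3)
assert out.flags['C_CONTIGUOUS']                # guaranteed by cpu_contiguous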
def errors(self):
  if self.loss in ('ctc', 'ce_ctc'):
    from theano.tensor.extra_ops import cpu_contiguous
    return T.sum(BestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
  else:
    return super(SequenceOutputLayer, self).errors()
def cost(self):
  """
  :param y: shape (time*batch,) -> label
  :return: error scalar, known_grads dict
  """
  known_grads = None
  # In case that our target has another index, self.index will be that index.
  # However, the right index for self.p_y_given_x and many others is the index from the source layers.
  src_index = self.sources[0].index
  if self.loss == 'sprint':
    if not isinstance(self.sprint_opts, dict):
      import json
      self.sprint_opts = json.loads(self.sprint_opts)
    assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
    if self.exp_normalize:
      log_probs = T.log(self.p_y_given_x)
    else:
      log_probs = self.z
    if self.prior_scale:  # use own priors, assume prior scale in sprint config to be 0.0
      assert self.log_prior is not None
      log_probs -= numpy.float32(self.prior_scale) * self.log_prior
    err, grad = sprint_loss_and_error_signal(
      output_layer=self,
      target=self.attrs.get("target", "classes"),
      sprint_opts=self.sprint_opts,
      log_posteriors=log_probs,
      seq_lengths=T.sum(src_index, axis=0)
    )
    err = err.sum()
    if self.loss_like_ce:
      y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
      err = -T.sum(T.switch(T.cast(src_index, "float32").dimshuffle(0, 1, 'x'),
                            y_ref * T.log(self.p_y_given_x),
                            numpy.float32(0)))
    if self.ce_smoothing:
      err *= numpy.float32(1.0 - self.ce_smoothing)
      grad *= numpy.float32(1.0 - self.ce_smoothing)
      if not self.prior_scale:  # we kept the softmax bias as it was
        nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
      else:  # assume that we have subtracted the bias by the log priors beforehand
        assert self.log_prior is not None
        # In this case, for the CE calculation, we need to add the log priors again.
        y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                              (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
        nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
      ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
      err += ce
      grad += T.grad(ce, self.z)
    known_grads = {self.z: grad}
    return err, known_grads
  elif self.loss == 'fast_bw':
    if not isinstance(self.sprint_opts, dict):
      import json
      self.sprint_opts = json.loads(self.sprint_opts)
    assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
    y = self.p_y_given_x
    if self.attrs.get("sigmoid_outputs", False):
      y = T.nnet.sigmoid(self.z)
    assert y.ndim == 3
    y = T.clip(y, numpy.float32(1.e-20), numpy.float32(1.e20))
    nlog_scores = -T.log(y)  # in -log space
    if self.attrs.get("exp_outputs", False):
      y = T.exp(self.z)
      nlog_scores = -self.z  # in -log space
    if self.attrs.get("gauss_outputs", False):
      z_sqr = T.sqr(self.z)
      y = T.exp(-z_sqr)
      nlog_scores = z_sqr  # in -log space
    am_scores = nlog_scores
    am_scale = self.attrs.get("am_scale", 1)
    if am_scale != 1:
      am_scale = numpy.float32(am_scale)
      am_scores *= am_scale
    if self.prior_scale and not self.attrs.get("substract_prior_from_output", False):
      assert self.log_prior is not None
      # Scores are in -log space, self.log_prior is in +log space.
      # We want to subtract the prior, thus `-=`.
      am_scores -= -self.log_prior * numpy.float32(self.prior_scale)
    edges, weights, start_end_states, state_buffer = SprintAlignmentAutomataOp(self.sprint_opts)(self.network.tags)
    float_idx = T.cast(src_index, "float32")
    float_idx_bc = float_idx.dimshuffle(0, 1, 'x')
    idx_sum = T.sum(float_idx)
    fwdbwd = FastBaumWelchOp.make_op()(am_scores, edges, weights, start_end_states, float_idx, state_buffer)
    gamma = self.attrs.get("gamma", 1)
    need_renorm = False
    if gamma != 1:
      fwdbwd *= numpy.float32(gamma)
      need_renorm = True
    bw = T.exp(-fwdbwd)
    if self.attrs.get("compute_priors_via_baum_welch", False):
      assert self.priors.custom_update is not None
      self.priors.custom_update = T.sum(bw * float_idx_bc, axis=(0, 1)) / idx_sum
    if self.attrs.get("bw_norm_class_avg", False):
      cavg = T.sum(bw * float_idx_bc, axis=(0, 1), keepdims=True) / idx_sum
      bw /= T.clip(cavg, numpy.float32(1.e-20), numpy.float32(1.e20))
      need_renorm = True
    if need_renorm:
      bw /= T.clip(T.sum(bw, axis=2, keepdims=True), numpy.float32(1.e-20), numpy.float32(1.e20))
    self.baumwelch_alignment = bw
    if self.ce_smoothing > 0:
      target_layer = self.attrs.get("ce_target_layer_align", None)
      assert target_layer  # we could also use self.y but so far we only want this
      bw2 = self.network.output[target_layer].baumwelch_alignment
      bw = numpy.float32(self.ce_smoothing) * bw2 + numpy.float32(1 - self.ce_smoothing) * bw
    if self.attrs.get("loss_with_softmax_prob", False):
      y = self.p_y_given_x
      nlog_scores = -T.log(T.clip(y, numpy.float32(1.e-20), numpy.float32(1.e20)))
    err_inner = bw * nlog_scores
    if self.attrs.get("log_score_penalty", 0):
      err_inner -= numpy.float32(self.attrs["log_score_penalty"]) * nlog_scores
    err = (err_inner * float_idx_bc).sum()
    known_grads = {self.z: (y - bw) * float_idx_bc}
    if self.attrs.get("gauss_outputs", False):
      del known_grads[self.z]
    if self.prior_scale and self.attrs.get('trained_softmax_prior', False):
      bw_sum0 = T.sum(bw * float_idx_bc, axis=(0, 1))
      assert bw_sum0.ndim == self.priors.ndim == 1
      # Note that this is the other way around as usually (`bw - y` instead of `y - bw`).
      # That is because the prior is in the denominator.
      known_grads[self.trained_softmax_prior_p] = numpy.float32(self.prior_scale) * (bw_sum0 - self.priors * idx_sum)
    return err, known_grads
  elif self.loss == 'ctc':
    from theano.tensor.extra_ops import cpu_contiguous
    err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
    known_grads = {self.z: grad}
    return err.sum(), known_grads, priors.sum(axis=0)
  elif self.loss == 'hmm':
    from theano.tensor.extra_ops import cpu_contiguous
    err, grad, priors = TwoStateHMMOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
    known_grads = {self.z: grad}
    return err.sum(), known_grads, priors.sum(axis=0)
  elif self.loss == 'warp_ctc':
    import os
    os.environ['CTC_LIB'] = self.attrs.get('warp_ctc_lib', "/usr/lib")
    try:
      from theano_ctc import ctc_cost
      # from theano_ctc.cpu_ctc import CpuCtc
    except Exception:
      assert False, "install this: https://github.com/mcf06/theano_ctc"
    from TheanoUtil import print_to_file
    yr = T.set_subtensor(self.y.flatten()[self.j], numpy.int32(-1)).reshape(self.y.shape).dimshuffle(1, 0)
    yr = print_to_file('yr', yr)
    cost = T.mean(ctc_cost(self.p_y_given_x, yr, self.index_for_ctc()))
    # cost = T.mean(CpuCtc()(self.p_y_given_x, yr, self.index_for_ctc()))
    cost = print_to_file('cost', cost)
    return cost, known_grads
  elif self.loss == 'ce_ctc':
    y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
    p_y_given_x = T.nnet.softmax(y_m)
    # pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
    pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
    ce = -T.sum(T.log(pcx))
    return ce, known_grads
  elif self.loss == 'ctc2':
    from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
    max_time = self.z.shape[0]
    num_batches = self.z.shape[1]
    time_mask = self.index.reshape((max_time, num_batches))
    y_batches = self.y_data_flat.reshape((max_time, num_batches))
    targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
    log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
    err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
    return err, known_grads
  elif self.loss == 'viterbi':
    y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
    nlog_scores = T.log(self.p_y_given_x) - self.prior_scale * T.log(self.priors)
    y = NumpyAlignOp(False)(src_index, self.index, -nlog_scores, self.y)
    self.y_data_flat = y.flatten()
    nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m[self.i], y_idx=self.y_data_flat[self.i])
    return T.sum(nll), known_grads
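# Hedged sketch, not from the original sources, of how a (cost, known_grads) pair like the
# one returned by cost() above can be consumed: Theano's T.grad() accepts a `known_grads`
# dict, so the externally computed gradient w.r.t. the pre-softmax activations `z` is
# injected and backprop only has to continue from `z` down to the parameters.
# All names below (x, W, z, err, grad_z) are made up for illustration.
import numpy
import theano
import theano.tensor as T

x = T.fmatrix('x')
W = theano.shared(numpy.zeros((3, 4), dtype='float32'), name='W')
z = T.dot(x, W)                 # stands in for the layer output self.z
err = T.sum(z ** 2)             # stands in for the scalar loss value reported by cost()
grad_z = 2 * z                  # stands in for the gradient delivered by the external loss op
known_grads = {z: grad_z}

# Gradient w.r.t. W is assembled purely from the injected gradient at z (cost=None),
# i.e. the chain rule continues from z downward without differentiating the loss op itself.
dW = T.grad(cost=None, wrt=W, known_grads=known_grads)
f = theano.function([x], [err, dW])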
def __init__(self, num_layers=1, direction=0, **kwargs):
  # this has to be provided in THEANO_FLAGS as e.g. contexts=gpu0->cuda0
  context_name = kwargs.get('device', str(theano.config.device))
  #if context_name == 'cpu':
  #  context_name = 'gpu0'
  kwargs['device'] = context_name
  #kwargs['n_out'] *= 2
  super(RNNBlockLayer, self).__init__(**kwargs)
  self.params = {}
  #self.attrs['n_out'] /= 2
  #self.set_attr('nout', self.attrs['n_out'] / 4)
  from theano.gpuarray import dnn
  from theano.gpuarray.type import gpuarray_shared_constructor
  from theano.tensor.extra_ops import cpu_contiguous
  #from theano.sandbox.cuda.basic_ops import gpu_contiguous

  rnnb = dnn.RNNBlock(
    dtype=theano.config.floatX,
    hidden_size=self.attrs['n_out'],
    num_layers=num_layers,
    rnn_mode='lstm',
    input_mode='linear',
    direction_mode='unidirectional' if direction != 0 else 'bidirectional',
    context_name=context_name if context_name != 'cpu' else 'gpu0')

  buffer_size = 1  # self.attrs['n_out'] * num_layers
  #X = self.get_linear_forward_output()
  #X = T.concatenate([s.output for s in self.sources], axis=2)[::direction or 1]
  X = cpu_contiguous(T.concatenate([s.output for s in self.sources], axis=2)[::direction or 1])
  #X = cpu_contiguous(self.sources[0].output[::direction or 1])
  #X = T.concatenate([X, T.zeros((X.shape[0], batch_size - X.shape[1] + 1, X.shape[2]), X.dtype)], axis=1)[:, :-1]
  n_in = sum([s.attrs['n_out'] for s in self.sources])
  psize = rnnb.get_param_size([buffer_size, n_in])
  l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
  pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
  if context_name == 'cpu':
    params_cudnn = self.add_param(self.create_bias(psize, name='cudnn_%s' % self.name))
  else:
    params_cudnn = self.add_param(
      gpuarray_shared_constructor(pvalue, target=context_name, name='cudnn_%s' % self.name))
  c_init = cpu_contiguous(T.alloc(numpy.cast[theano.config.floatX](0), num_layers, X.shape[1], self.attrs['n_out']))
  h_init = cpu_contiguous(T.alloc(numpy.cast[theano.config.floatX](0), num_layers, X.shape[1], self.attrs['n_out']))

  W_out = self.add_param(self.create_random_uniform_weights(self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
  b_out = self.add_param(self.create_bias(self.y_in[self.attrs['target']].n_out))

  if context_name == 'cpu':
    self.cost_val = T.constant(0)
    self.error_val = T.constant(0)
    self.known_grads = {}
    return

  out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
  out = out[::-1]
  out = T.dot(out, W_out) + b_out
  self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))
  self.i = (self.index.flatten() > 0).nonzero()
  self.y_data_flat = self.y_in[self.attrs['target']].flatten()
  nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
  self.cost_val = T.sum(nll)
  #self.cost_val = -T.sum(T.log(out[:, self.y_in[self.attrs['target']].flatten()][(self.index.flatten() > 0).nonzero()]))
  self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
  self.output = out
  self.index = self.sources[0].index
  self.error_val = T.sum(T.neq(T.argmax(self.y_m[self.i], axis=-1), self.y_data_flat[self.i]))