Example #1
 def errors(self):
   if self.loss in ('ctc', 'ce_ctc', 'ctc_warp'):
     from theano.tensor.extra_ops import cpu_contiguous
     return T.sum(BestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
   elif self.loss == 'hmm':
     from theano.tensor.extra_ops import cpu_contiguous
     return T.sum(TwoStateBestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
   elif self.loss == 'viterbi':
     scores = T.log(self.p_y_given_x) - self.prior_scale * T.log(self.priors)
     y = NumpyAlignOp(False)(self.sources[0].index, self.index, -scores, self.y)
     self.y_data_flat = y.flatten()
     return super(SequenceOutputLayer, self).errors()
   else:
     return super(SequenceOutputLayer, self).errors()
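Both decode branches above run the transposed label matrix through cpu_contiguous before handing it to BestPathDecodeOp / TwoStateBestPathDecodeOp: dimshuffle(1, 0) returns a non-contiguous view, while the underlying C code expects a C-contiguous buffer. A minimal standalone sketch of that pattern (the shapes and variable names are made up for illustration):

import numpy
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cpu_contiguous

y = T.imatrix('y')                        # labels, assumed here to be (time, batch)
y_t = cpu_contiguous(y.dimshuffle(1, 0))  # transposed view, copied into C order if needed
f = theano.function([y], y_t)
out = f(numpy.arange(12, dtype='int32').reshape(3, 4))
assert out.flags['C_CONTIGUOUS']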
Example #2
    def make_node(self, activations, labels, input_lengths):
        t_activations = T.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)

        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type!')

        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type!')

        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type!')

        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        costs = T.fvector(name="ctc_cost")
        outputs = [costs]
        if self.compute_grad:
            gradients = T.ftensor3(name="ctc_grad")
            outputs += [gradients]

        return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                         outputs=outputs)
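This make_node follows the usual recipe for wrapping a C implementation: coerce the inputs with as_tensor_variable, force the activations through cpu_contiguous, validate dtype and ndim, and only then build the Apply node. A toy op written in the same style is sketched below; SumRowsOp is hypothetical and exists only to illustrate the pattern:

import numpy
import theano
import theano.tensor as T
from theano import gof
from theano.tensor.extra_ops import cpu_contiguous


class SumRowsOp(gof.Op):
    """Toy op that sums each row of a float32 matrix."""
    __props__ = ()

    def make_node(self, x):
        x = cpu_contiguous(T.as_tensor_variable(x))  # guarantee a C-contiguous buffer
        if x.type.dtype != 'float32':
            raise TypeError('x must use the float32 type!')
        if x.ndim != 2:
            raise ValueError('x must have 2 dimensions.')
        return gof.Apply(self, inputs=[x], outputs=[T.fvector()])

    def perform(self, node, inputs, output_storage):
        x, = inputs
        output_storage[0][0] = x.sum(axis=1)


x = T.fmatrix('x')
f = theano.function([x], SumRowsOp()(x))
print(f(numpy.ones((3, 4), dtype='float32')))  # [4. 4. 4.]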
Example #3
 def cost(self):
   """
   :param y: shape (time*batch,) -> label
   :return: error scalar, known_grads dict
   """
   y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
   known_grads = None
   if self.loss == 'sprint':
     if not isinstance(self.sprint_opts, dict):
       import json
       self.sprint_opts = json.loads(self.sprint_opts)
     assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
     if self.exp_normalize:
       log_probs = T.log(self.p_y_given_x)
     else:
       log_probs = self.z
     sprint_error_op = SprintErrorSigOp(self.attrs.get("target", "classes"), self.sprint_opts)
     err, grad = sprint_error_op(log_probs, T.sum(self.index, axis=0))
     err = err.sum()
     if self.loss_like_ce:
       y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
       err = -T.sum(T.log(T.pow(self.p_y_given_x, y_ref)) * T.cast(self.index, "float32").dimshuffle(0, 1, 'x'))
     if self.ce_smoothing:
       err *= numpy.float32(1.0 - self.ce_smoothing)
       grad *= numpy.float32(1.0 - self.ce_smoothing)
       if not self.prior_scale:  # we kept the softmax bias as it was
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       else:  # assume that we have subtracted the bias by the log priors beforehand
         assert self.log_prior is not None
         # In this case, for the CE calculation, we need to add the log priors again.
         y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                               (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
       ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
       err += ce
       grad += T.grad(ce, self.z)
     known_grads = {self.z: grad}
     return err, known_grads
   elif self.loss == 'ctc':
     from theano.tensor.extra_ops import cpu_contiguous
     err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
     known_grads = {self.z: grad}
     return err.sum(), known_grads, priors.sum(axis=0)
   elif self.loss == 'ce_ctc':
     y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
     p_y_given_x = T.nnet.softmax(y_m)
     #pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
     pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
     ce = -T.sum(T.log(pcx))
     return ce, known_grads
   elif self.loss == 'ctc2':
     from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
     max_time = self.z.shape[0]
     num_batches = self.z.shape[1]
     time_mask = self.index.reshape((max_time, num_batches))
     y_batches = self.y_data_flat.reshape((max_time, num_batches))
     targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
     log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
     err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
     return err, known_grads
Example #4
def test_cpu_contiguous():
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    f = theano.function([a, i], cpu_contiguous(a.reshape((5,4))[::i]))
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(node.op, CpuContiguous) for node in topo])
    assert f(a_val, 1).flags['C_CONTIGUOUS']
    assert f(a_val, 2).flags['C_CONTIGUOUS']
    assert f(a_val, 3).flags['C_CONTIGUOUS']
Example #5
def test_cpu_contiguous():
    a = T.fmatrix("a")
    i = T.iscalar("i")
    a_val = np.asarray(np.random.rand(4, 5), dtype="float32")
    f = theano.function([a, i], cpu_contiguous(a.reshape((5, 4))[::i]))
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(node.op, CpuContiguous) for node in topo])
    assert f(a_val, 1).flags["C_CONTIGUOUS"]
    assert f(a_val, 2).flags["C_CONTIGUOUS"]
    assert f(a_val, 3).flags["C_CONTIGUOUS"]
    # Test the grad:

    utt.verify_grad(cpu_contiguous, [np.random.rand(5, 7, 2)])
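This last test also verifies the gradient with utt.verify_grad. Because cpu_contiguous only copies its input into C order, its gradient simply passes through; the same thing can be seen directly with T.grad (a small sketch, not part of the original test):

import numpy
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cpu_contiguous

x = T.fmatrix('x')
# d(sum(cpu_contiguous(x)))/dx should be a matrix of ones
g = T.grad(cpu_contiguous(x).sum(), x)
f = theano.function([x], g)
x_val = numpy.random.rand(4, 5).astype('float32')
assert numpy.allclose(f(x_val), numpy.ones((4, 5), dtype='float32'))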
Example #6
 def errors(self):
   if self.loss in ('ctc', 'ce_ctc'):
     from theano.tensor.extra_ops import cpu_contiguous
     return T.sum(BestPathDecodeOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc()))
   else:
     return super(SequenceOutputLayer, self).errors()
Example #7
 def cost(self):
   """
   :param y: shape (time*batch,) -> label
   :return: error scalar, known_grads dict
   """
   known_grads = None
   # In case that our target has another index, self.index will be that index.
   # However, the right index for self.p_y_given_x and many others is the index from the source layers.
   src_index = self.sources[0].index
   if self.loss == 'sprint':
     if not isinstance(self.sprint_opts, dict):
       import json
       self.sprint_opts = json.loads(self.sprint_opts)
     assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
     if self.exp_normalize:
       log_probs = T.log(self.p_y_given_x)
     else:
       log_probs = self.z
     if self.prior_scale:  # use own priors, assume prior scale in sprint config to be 0.0
       assert self.log_prior is not None
       log_probs -= numpy.float32(self.prior_scale) * self.log_prior
     err, grad = sprint_loss_and_error_signal(
       output_layer=self,
       target=self.attrs.get("target", "classes"),
       sprint_opts=self.sprint_opts,
       log_posteriors=log_probs,
       seq_lengths=T.sum(src_index, axis=0)
     )
     err = err.sum()
     if self.loss_like_ce:
       y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
       err = -T.sum(T.switch(T.cast(src_index, "float32").dimshuffle(0, 1, 'x'),
                             y_ref * T.log(self.p_y_given_x),
                             numpy.float32(0)))
     if self.ce_smoothing:
       err *= numpy.float32(1.0 - self.ce_smoothing)
       grad *= numpy.float32(1.0 - self.ce_smoothing)
       if not self.prior_scale:  # we kept the softmax bias as it was
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       else:  # assume that we have subtracted the bias by the log priors beforehand
         assert self.log_prior is not None
         # In this case, for the CE calculation, we need to add the log priors again.
         y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                               (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
       ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
       err += ce
       grad += T.grad(ce, self.z)
     known_grads = {self.z: grad}
     return err, known_grads
   elif self.loss == 'fast_bw':
     if not isinstance(self.sprint_opts, dict):
       import json
       self.sprint_opts = json.loads(self.sprint_opts)
     assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
     y = self.p_y_given_x
     if self.attrs.get("sigmoid_outputs", False):
       y = T.nnet.sigmoid(self.z)
     assert y.ndim == 3
      y = T.clip(y, numpy.float32(1.e-20), numpy.float32(1.e20))
     nlog_scores = -T.log(y)  # in -log space
     if self.attrs.get("exp_outputs", False):
       y = T.exp(self.z)
       nlog_scores = -self.z  # in -log space
     if self.attrs.get("gauss_outputs", False):
       z_sqr = T.sqr(self.z)
       y = T.exp(-z_sqr)
       nlog_scores = z_sqr  # in -log space
     am_scores = nlog_scores
     am_scale = self.attrs.get("am_scale", 1)
     if am_scale != 1:
       am_scale = numpy.float32(am_scale)
       am_scores *= am_scale
     if self.prior_scale and not self.attrs.get("substract_prior_from_output", False):
       assert self.log_prior is not None
       # Scores are in -log space, self.log_prior is in +log space.
       # We want to subtract the prior, thus `-=`.
       am_scores -= -self.log_prior * numpy.float32(self.prior_scale)
     edges, weights, start_end_states, state_buffer = SprintAlignmentAutomataOp(self.sprint_opts)(self.network.tags)
     float_idx = T.cast(src_index, "float32")
     float_idx_bc = float_idx.dimshuffle(0, 1, 'x')
     idx_sum = T.sum(float_idx)
     fwdbwd = FastBaumWelchOp.make_op()(am_scores, edges, weights, start_end_states, float_idx, state_buffer)
     gamma = self.attrs.get("gamma", 1)
     need_renorm = False
     if gamma != 1:
       fwdbwd *= numpy.float32(gamma)
       need_renorm = True
     bw = T.exp(-fwdbwd)
     if self.attrs.get("compute_priors_via_baum_welch", False):
       assert self.priors.custom_update is not None
       self.priors.custom_update = T.sum(bw * float_idx_bc, axis=(0, 1)) / idx_sum
     if self.attrs.get("bw_norm_class_avg", False):
       cavg = T.sum(bw * float_idx_bc, axis=(0, 1), keepdims=True) / idx_sum
        bw /= T.clip(cavg, numpy.float32(1.e-20), numpy.float32(1.e20))
       need_renorm = True
     if need_renorm:
       bw /= T.clip(T.sum(bw, axis=2, keepdims=True), numpy.float32(1.e-20), numpy.float32(1.e20))
     self.baumwelch_alignment = bw
     if self.ce_smoothing > 0:
       target_layer = self.attrs.get("ce_target_layer_align", None)
       assert target_layer  # we could also use self.y but so far we only want this
       bw2 = self.network.output[target_layer].baumwelch_alignment
       bw = numpy.float32(self.ce_smoothing) * bw2 + numpy.float32(1 - self.ce_smoothing) * bw
     if self.attrs.get("loss_with_softmax_prob", False):
       y = self.p_y_given_x
        nlog_scores = -T.log(T.clip(y, numpy.float32(1.e-20), numpy.float32(1.e20)))
     err_inner = bw * nlog_scores
     if self.attrs.get("log_score_penalty", 0):
       err_inner -= numpy.float32(self.attrs["log_score_penalty"]) * nlog_scores
     err = (err_inner * float_idx_bc).sum()
     known_grads = {self.z: (y - bw) * float_idx_bc}
     if self.attrs.get("gauss_outputs", False):
       del known_grads[self.z]
     if self.prior_scale and self.attrs.get('trained_softmax_prior', False):
       bw_sum0 = T.sum(bw * float_idx_bc, axis=(0, 1))
       assert bw_sum0.ndim == self.priors.ndim == 1
       # Note that this is the other way around as usually (`bw - y` instead of `y - bw`).
       # That is because the prior is in the denominator.
       known_grads[self.trained_softmax_prior_p] = numpy.float32(self.prior_scale) * (bw_sum0 - self.priors * idx_sum)
     return err, known_grads
   elif self.loss == 'ctc':
     from theano.tensor.extra_ops import cpu_contiguous
     err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
     known_grads = {self.z: grad}
     return err.sum(), known_grads, priors.sum(axis=0)
   elif self.loss == 'hmm':
     from theano.tensor.extra_ops import cpu_contiguous
     err, grad, priors = TwoStateHMMOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)),
                                         self.index_for_ctc())
     known_grads = {self.z: grad}
     return err.sum(), known_grads, priors.sum(axis=0)
   elif self.loss == 'warp_ctc':
     import os
     os.environ['CTC_LIB'] = self.attrs.get('warp_ctc_lib', "/usr/lib")
     try:
       from theano_ctc import ctc_cost
       # from theano_ctc.cpu_ctc import CpuCtc
     except Exception:
       assert False, "install this: https://github.com/mcf06/theano_ctc"
     from TheanoUtil import print_to_file
     yr = T.set_subtensor(self.y.flatten()[self.j], numpy.int32(-1)).reshape(self.y.shape).dimshuffle(1, 0)
     yr = print_to_file('yr', yr)
     cost = T.mean(ctc_cost(self.p_y_given_x, yr, self.index_for_ctc()))
     # cost = T.mean(CpuCtc()(self.p_y_given_x, yr, self.index_for_ctc()))
     cost = print_to_file('cost', cost)
     return cost, known_grads
   elif self.loss == 'ce_ctc':
     y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
     p_y_given_x = T.nnet.softmax(y_m)
     # pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
     pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
     ce = -T.sum(T.log(pcx))
     return ce, known_grads
   elif self.loss == 'ctc2':
     from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
     max_time = self.z.shape[0]
     num_batches = self.z.shape[1]
     time_mask = self.index.reshape((max_time, num_batches))
     y_batches = self.y_data_flat.reshape((max_time, num_batches))
     targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
     log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
     err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
     return err, known_grads
   elif self.loss == 'viterbi':
     y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
     nlog_scores = T.log(self.p_y_given_x) - self.prior_scale * T.log(self.priors)
     y = NumpyAlignOp(False)(src_index, self.index, -nlog_scores, self.y)
     self.y_data_flat = y.flatten()
     nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m[self.i], y_idx=self.y_data_flat[self.i])
     return T.sum(nll), known_grads
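Several branches of this cost() blend a sequence-level criterion with a framewise cross entropy through ce_smoothing: the sequence loss is scaled by (1 - ce_smoothing), and ce_smoothing times the summed cross entropy is added on top (the gradient w.r.t. self.z gets the matching term). A standalone sketch of just that interpolation, with made-up tensors and a made-up smoothing value:

import numpy
import theano
import theano.tensor as T

z = T.fmatrix('z')              # unnormalized scores, shape (frames, classes)
y_idx = T.ivector('y_idx')      # target class index per frame
seq_err = T.fscalar('seq_err')  # e.g. a sprint/CTC sequence criterion
ce_smoothing = numpy.float32(0.1)

# framewise cross entropy, as in the ce_smoothing branches above
nll, _ = T.nnet.crossentropy_softmax_1hot(x=z, y_idx=y_idx)
err = numpy.float32(1.0 - ce_smoothing) * seq_err + ce_smoothing * T.sum(nll)
f = theano.function([z, y_idx, seq_err], err)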
Example #8
    def __init__(self, num_layers=1, direction=0, **kwargs):
        # this has to be provided in THEANO_FLAGS as e.g. contexts=gpu0->cuda0
        context_name = kwargs.get('device', str(theano.config.device))
        #if context_name == 'cpu':
        #  context_name = 'gpu0'
        kwargs['device'] = context_name
        #kwargs['n_out'] *= 2
        super(RNNBlockLayer, self).__init__(**kwargs)
        self.params = {}
        #self.attrs['n_out'] /= 2
        #self.set_attr('nout', self.attrs['n_out'] / 4)
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        from theano.tensor.extra_ops import cpu_contiguous
        #from theano.sandbox.cuda.basic_ops import gpu_contiguous

        rnnb = dnn.RNNBlock(
            dtype=theano.config.floatX,
            hidden_size=self.attrs['n_out'],
            num_layers=num_layers,
            rnn_mode='lstm',
            input_mode='linear',
            direction_mode='unidirectional'
            if direction != 0 else 'bidirectional',
            context_name=context_name if context_name != 'cpu' else 'gpu0')

        buffer_size = 1  # self.attrs['n_out'] * num_layers
        #X = self.get_linear_forward_output()
        #X = T.concatenate([s.output for s in self.sources],axis=2)[::direction or 1]
        X = cpu_contiguous(
            T.concatenate([s.output for s in self.sources],
                          axis=2)[::direction or 1])
        #X = cpu_contiguous(self.sources[0].output[::direction or 1])
        #X = T.concatenate([X,T.zeros((X.shape[0],batch_size - X.shape[1] + 1,X.shape[2]),X.dtype)],axis=1)[:,:-1]
        n_in = sum([s.attrs['n_out'] for s in self.sources])
        psize = rnnb.get_param_size([buffer_size, n_in])
        l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
        pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l,
                                                size=(psize, )),
                               dtype=theano.config.floatX)
        if context_name == 'cpu':
            params_cudnn = self.add_param(
                self.create_bias(psize, name='cudnn_%s' % self.name))
        else:
            params_cudnn = self.add_param(
                gpuarray_shared_constructor(pvalue,
                                            target=context_name,
                                            name='cudnn_%s' % self.name))
        c_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))
        h_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))

        W_out = self.add_param(
            self.create_random_uniform_weights(
                self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
        b_out = self.add_param(
            self.create_bias(self.y_in[self.attrs['target']].n_out))

        if context_name == 'cpu':
            self.cost_val = T.constant(0)
            self.error_val = T.constant(0)
            self.known_grads = {}
            return

        out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
        out = out[::-1]
        out = T.dot(out, W_out) + b_out
        self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))

        self.i = (self.index.flatten() > 0).nonzero()
        self.y_data_flat = self.y_in[self.attrs['target']].flatten()
        nll, _ = T.nnet.crossentropy_softmax_1hot(
            x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
        self.cost_val = T.sum(nll)

        #self.cost_val = -T.sum(T.log(out[:,self.y_in[self.attrs['target']].flatten()][(self.index.flatten()>0).nonzero()]))
        self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
        self.output = out
        self.index = self.sources[0].index

        self.error_val = T.sum(
            T.neq(T.argmax(self.y_m[self.i], axis=-1),
                  self.y_data_flat[self.i]))
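Note that this layer wraps both the concatenated source outputs and the zero initial hidden/cell states in cpu_contiguous so that cuDNN receives contiguous buffers. A minimal sketch of just the state-initialization step (num_layers and n_out are placeholder values):

import numpy
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cpu_contiguous

num_layers, n_out = 1, 8
X = T.ftensor3('X')  # (time, batch, features)
h_init = cpu_contiguous(
    T.alloc(numpy.cast[theano.config.floatX](0), num_layers, X.shape[1], n_out))
f = theano.function([X], h_init)
print(f(numpy.zeros((5, 3, 4), dtype='float32')).shape)  # (1, 3, 8)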