def softmax(z):
  """
  :param theano.Variable z: logits, e.g. (batch, dim) or (time, batch, dim)
  :returns: softmax over the last axis; input with more than 2 dims is flattened over (time, batch) and reshaped back
  """
  assert z.ndim >= 1
  if z.ndim <= 2:
    return T.nnet.softmax(z)
  else:
    from TheanoUtil import time_batch_make_flat
    z_flat = time_batch_make_flat(z)
    assert z_flat.ndim == 2
    return T.reshape(T.nnet.softmax(z_flat), z.shape)
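A minimal usage sketch, assuming Theano and the TheanoUtil module from this codebase are importable; the names z3, probs and f are illustrative only:

import theano
import theano.tensor as T

z3 = T.ftensor3("z3")   # (time, batch, dim) logits
probs = softmax(z3)     # flattened to (time * batch, dim), softmaxed, reshaped back to 3D
f = theano.function([z3], probs)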
Example #3
 def add_layer(self, layer):
   """
   :type layer: NetworkHiddenLayer.Layer
   :rtype: NetworkHiddenLayer.Layer
   """
   assert layer.name
   layer_errors = layer.errors()
   if isinstance(layer, OutputLayer) or layer.name == "output" or layer_errors is not None:
     is_output_layer = True
     self.output[layer.name] = layer
   else:
     is_output_layer = False
     self.hidden[layer.name] = layer
   self.add_cost_and_constraints(layer)
   if layer_errors is not None:
     self.errors[layer.name] = layer_errors
   if is_output_layer:
     if getattr(layer, "p_y_given_x", None) is None and layer.output:
        # Small hack for layers which we use as output layers but which don't set this.
       from TheanoUtil import time_batch_make_flat
       layer.p_y_given_x = time_batch_make_flat(layer.output)
     self.declare_train_params()
   return layer
Example #4
 def add_layer(self, layer):
   """
   :type layer: NetworkHiddenLayer.Layer
   :rtype: NetworkHiddenLayer.Layer
   """
   assert layer.name
   layer_errors = layer.errors()
   if isinstance(layer, OutputLayer) or layer.name == "output" or layer_errors is not None:
     is_output_layer = True
     self.output[layer.name] = layer
   else:
     is_output_layer = False
     self.hidden[layer.name] = layer
   if layer_errors is not None:
     self.errors[layer.name] = layer_errors
   if is_output_layer:
     if getattr(layer, "p_y_given_x", None) is None and layer.output:
        # Small hack for layers which we use as output layers but which don't set this.
       from TheanoUtil import time_batch_make_flat
       layer.p_y_given_x = layer.output
       layer.p_y_given_x_flat = time_batch_make_flat(layer.output)
     self.declare_train_params()
   return layer
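Compared with Example #3 above, this revision keeps the layer output untouched as p_y_given_x and stores the flattened view separately as p_y_given_x_flat. A rough numpy sketch of the relationship, assuming a (time, batch, n_out) output and that time_batch_make_flat simply merges the first two axes:

import numpy
output = numpy.zeros((5, 2, 10), dtype="float32")   # (time, batch, n_out)
p_y_given_x = output                                 # kept with full time/batch structure
p_y_given_x_flat = output.reshape((5 * 2, 10))       # shape produced by time_batch_make_flat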
Example #5
 def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None,
              sparse=False, cost_scale=1.0,
              L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
              output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
              with_bias=True,
              mask="unity", dropout=0.0, batch_norm=False, layer_drop=0.0, residual=False,
              carry=False,
              sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None,
              dtype='float32',
              **kwargs):
   """
   :param list[NetworkBaseLayer.Layer] sources: list of source layers
   :param int n_out: output dim of W_in and dim of bias
   :param float L1: l1-param-norm regularization
   :param float L2: l2-param-norm regularization
   :param str mask: "unity" or "dropout"
   :type dropout: float
   """
   super(Layer, self).__init__(**kwargs)
   self.index = index
   self.sources = sources; ":type: list[Layer]"
   self.num_sources = len(sources)
   if mask is None: mask = 'none'
   self.set_attr('mask', mask)
   self.set_attr('dropout', dropout)
   self.set_attr('sparse', sparse)
   self.set_attr('sparse_filtering', sparse_filtering)
   if not trainable:
     self.set_attr('trainable', trainable)  # only store if not default
     self.gradient_scale = 0.0  # just to be sure
   else:
     self.gradient_scale = gradient_scale
   if gradient_scale != 1.0:
     self.set_attr('gradient_scale', gradient_scale)
   self.set_attr('layer_drop', layer_drop)
   assert not carry, "not supported anymore"
   self.set_attr('residual', residual)
   self.set_attr('n_out', n_out)
   self.set_attr('L1', L1)
   self.set_attr('L2', L2)
   if L2_eye:
     self.set_attr('L2_eye', L2_eye)
   self.device = device # if device else str(theano.config.device)
   for s in self.sources:
     s.transfer_output(self.device)
   self.set_attr('varreg', varreg)
   if output_L2_reg:
     self.set_attr('output_L2_reg', output_L2_reg)
   if output_entropy_reg:
     self.set_attr('output_entropy_reg', output_entropy_reg)
   if output_entropy_exp_reg:
     self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
   self.set_attr('batch_norm', batch_norm)
   if y_in is not None:
     self.y_in = {}
     for k in y_in:
       if not isinstance(y_in[k], T.Variable): continue
       self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
       self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
   else:
     self.y_in = None
   self.constraints = T.constant(0)
   if target:
     self.set_attr('target', target)
   if target_index:
     self.set_attr('target_index', target_index)
     assert target_index in self.network.j
     self.index = index = self.network.j[target_index]
   if cost_scale != 1:
     self.set_attr("cost_scale", cost_scale)
   if with_bias:
     self.b = self.add_param(self.create_bias(n_out), 'b_%s'%self.name)
   else:
     self.set_attr('with_bias', False)
     self.b = numpy.float32(0)
   self.mass = T.constant(1., name = "mass_%s" % self.name, dtype='float32')
   self.masks = [None] * len(self.sources)
   assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
   if mask == "dropout" or (mask == 'none' and dropout > 0):
     assert 0.0 < dropout < 1.0
     # If we apply this mass during training then we don't need any mask or mass for testing.
     # The expected weight should be 1 in
     #   E[x] = mass * (1-dropout)
     # so mass has to be 1 / (1 - dropout).
     self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
     srng = RandomStreams(self.rng.randint(1234) + 1)
     if self.depth > 1:
       self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],self.depth)), theano.config.floatX) for s in self.sources]
     else:
       self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX) for s in self.sources]
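A brief numeric check of the inverted-dropout scaling used above, assuming only numpy; it verifies that mass = 1 / (1 - dropout) keeps the expected activation at its original value, so no rescaling is needed at test time:

import numpy
dropout = 0.3
mass = 1.0 / (1.0 - dropout)
keep = numpy.random.RandomState(1234).binomial(n=1, p=1 - dropout, size=1000000)
print(keep.mean() * mass)  # ~1.0, i.e. E[mass * mask * x] == x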
Example #6
  def __init__(self, loss, y, dtype=None, copy_input=None, copy_output=None, time_limit=0,
               grad_clip_z=None, grad_discard_out_of_bound_z=None,
               **kwargs):
    """
    :param theano.Variable index: index for batches
    :param str loss: e.g. 'ce'
    """
    super(OutputLayer, self).__init__(**kwargs)
    if dtype:
      self.set_attr('dtype', dtype)
    if copy_input:
      self.set_attr("copy_input", copy_input.name)
    if grad_clip_z is not None:
      self.set_attr("grad_clip_z", grad_clip_z)
    if grad_discard_out_of_bound_z is not None:
      self.set_attr("grad_discard_out_of_bound_z", grad_discard_out_of_bound_z)
    if not copy_input:
      self.z = self.b
      self.W_in = [self.add_param(self.create_forward_weights(source.attrs['n_out'], self.attrs['n_out'],
                                                              name="W_in_%s_%s" % (source.name, self.name)))
                   for source in self.sources]

      assert len(self.sources) == len(self.masks) == len(self.W_in)
      assert len(self.sources) > 0
      for source, m, W in zip(self.sources, self.masks, self.W_in):
        source_output = source.output
         # 4D input from TwoD Layers -> collapse height dimension
        if source_output.ndim == 4:
          source_output = source_output.sum(axis=0)
        if source.attrs['sparse']:
          if source.output.ndim == 3:
            input = source_output[:,:,0]  # old sparse format
          else:
            assert source_output.ndim == 2
            input = source.output
          self.z += W[T.cast(input, 'int32')]
        elif m is None:
          self.z += self.dot(source_output, W)
        else:
          self.z += self.dot(self.mass * m * source_output, W)
    else:
      self.z = copy_input.output
    assert self.z.ndim == 3
    if grad_clip_z is not None:
      grad_clip_z = numpy.float32(grad_clip_z)
      self.z = theano.gradient.grad_clip(self.z, -grad_clip_z, grad_clip_z)
    if grad_discard_out_of_bound_z is not None:
      grad_discard_out_of_bound_z = numpy.float32(grad_discard_out_of_bound_z)
      self.z = grad_discard_out_of_bound(self.z, -grad_discard_out_of_bound_z, grad_discard_out_of_bound_z)
    if not copy_output:
      self.y = y
    else:
      self.index = copy_output.index
      self.y = copy_output.y_out
    if isinstance(y, T.Variable):
      self.y_data_flat = time_batch_make_flat(y)
    else:
      assert self.attrs.get("target", "").endswith("[sparse:coo]")
      assert isinstance(self.y, tuple)
      assert len(self.y) == 3
      s0, s1, weight = self.y
      from NativeOp import max_and_argmax_sparse
      n_time = self.z.shape[0]
      n_batch = self.z.shape[1]
      mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
      out_arg = T.zeros((n_time, n_batch), dtype="float32")
      out_max = T.zeros((n_time, n_batch), dtype="float32") - numpy.float32(1e16)
      out_arg, out_max = max_and_argmax_sparse(s0, s1, weight, mask, out_arg, out_max)
      assert out_arg.ndim == 2
      self.y_data_flat = out_arg.astype("int32")

    self.norm = numpy.float32(1)
    self.target_index = self.index
    if time_limit == 'inf':
      #target_length = self.index.shape[0]
      #mass = T.cast(T.sum(self.index),'float32')
      #self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0],target_length),self.sources[0].index,self.index)
      #self.norm = mass / T.cast(T.sum(self.index),'float32')
      num = T.cast(T.sum(self.index), 'float32')
      if self.eval_flag:
        self.index = self.sources[0].index
      else:
        import theano.ifelse
        padx = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1], self.z.shape[2]), 'float32') + self.z[-1]
        pady = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int32') #+ y[-1]
        padi = T.ones((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int8')
        self.z = theano.ifelse.ifelse(T.lt(self.z.shape[0], self.index.shape[0]),
                                      T.concatenate([self.z,padx],axis=0), self.z)
        #self.z = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),self.z[:self.index.shape[0]], self.z)
        self.y_data_flat = time_batch_make_flat(theano.ifelse.ifelse(T.gt(self.z.shape[0],self.index.shape[0]),
                                                                     T.concatenate([y,pady], axis=0), y))
        #self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), T.concatenate([T.ones((self.z.shape[0] - self.index.shape[0],self.z.shape[1]),'int8'), self.index], axis=0), self.index)
        self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                          T.concatenate([padi,self.index],axis=0),self.index)
      self.norm = num / T.cast(T.sum(self.index),'float32')
    elif time_limit > 0:
      end = T.min([self.z.shape[0], T.constant(time_limit, 'int32')])
      nom = T.cast(T.sum(self.index),'float32')
      self.index = T.set_subtensor(self.index[end:], T.zeros_like(self.index[end:]))
      self.norm = nom / T.cast(T.sum(self.index),'float32')
      self.z = T.set_subtensor(self.z[end:], T.zeros_like(self.z[end:]))

    #xs = [s.output for s in self.sources]
    #self.z = AccumulatorOpInstance(*[self.b] + xs + self.W_in)
    #outputs_info = None #[ T.alloc(numpy.cast[theano.config.floatX](0), index.shape[1], self.attrs['n_out']) ]

    #self.z, _ = theano.scan(step,
    #                        sequences = [s.output for s in self.sources],
    #                        non_sequences = self.W_in + [self.b])

    self.set_attr('from', ",".join([s.name for s in self.sources]))
    self.i = (self.index.flatten() > 0).nonzero()
    self.j = ((1 - self.index.flatten()) > 0).nonzero()
    self.loss = loss.encode("utf8")
    self.attrs['loss'] = self.loss
    if self.loss == 'priori':
      self.priori = self.shared(value=numpy.ones((self.attrs['n_out'],), dtype=theano.config.floatX), borrow=True)
    #self.make_output(self.z, collapse = False)
    # Note that self.output is going to be overwritten in our derived classes.
    self.output = self.make_consensus(self.z) if self.depth > 1 else self.z
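A small numpy sketch of the i/j index construction above, assuming a (time, batch) index mask where 0 marks padded frames; i selects the flat positions of real frames and j the padded ones:

import numpy
index = numpy.array([[1, 1],
                     [1, 0]], dtype="int8")  # (time, batch) mask, 0 = padding
flat = index.flatten()                       # same ordering as self.index.flatten()
i = (flat > 0).nonzero()[0]                  # real frames   -> [0 1 2]
j = ((1 - flat) > 0).nonzero()[0]            # padded frames -> [3]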
Example #7
 def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None,
              sparse=False, cost_scale=1.0, input_scale=1.0,
              L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
              output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
              with_bias=True,
              mask="unity", dropout=0.0, batch_drop=False, batch_norm=False, bn_use_sample=False, layer_drop=0.0, residual=False,
              carry=False,
              sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None,
              dtype='float32',
              **kwargs):
   """
   :param list[NetworkBaseLayer.Layer] sources: list of source layers
   :param int n_out: output dim of W_in and dim of bias
   :param float L1: l1-param-norm regularization
   :param float L2: l2-param-norm regularization
   :param str mask: "unity" or "dropout"
   :type dropout: float
   """
   super(Layer, self).__init__(**kwargs)
   self.index = index
   self.sources = sources; ":type: list[Layer]"
   self.num_sources = len(sources)
   self.D = max([s.D for s in sources if isinstance(s,Layer)] + [0])
   if mask is None: mask = 'none'
   self.set_attr('mask', mask)
   self.set_attr('dropout', dropout)
   self.set_attr('sparse', sparse)
   self.set_attr('bn_use_sample', bn_use_sample)
   self.set_attr('sparse_filtering', sparse_filtering)
   if not trainable:
     self.set_attr('trainable', trainable)  # only store if not default
     self.gradient_scale = 0.0  # just to be sure
   else:
     self.gradient_scale = gradient_scale
   if gradient_scale != 1.0:
     self.set_attr('gradient_scale', gradient_scale)
   self.set_attr('layer_drop', layer_drop)
   assert not carry, "not supported anymore"
   self.set_attr('residual', residual)
   self.set_attr('n_out', n_out)
   self.set_attr('L1', L1)
   self.set_attr('L2', L2)
   if L2_eye:
     self.set_attr('L2_eye', L2_eye)
   self.device = device # if device else str(theano.config.device)
   for s in self.sources:
     s.transfer_output(self.device)
   self.set_attr('varreg', varreg)
   if output_L2_reg:
     self.set_attr('output_L2_reg', output_L2_reg)
   if output_entropy_reg:
     self.set_attr('output_entropy_reg', output_entropy_reg)
   if output_entropy_exp_reg:
     self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
   self.set_attr('batch_norm', batch_norm)
   self.set_attr('input_scale', input_scale)
   if y_in is not None:
     self.y_in = {}
     for k in y_in:
       if not isinstance(y_in[k], T.Variable): continue
       self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
       self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
   else:
     self.y_in = None
   self.constraints = T.constant(0)
   if target:
     self.set_attr('target', target)
   if target_index:
     self.set_attr('target_index', target_index)
     assert target_index in self.network.j
     self.index = index = self.network.j[target_index]
   if cost_scale != 1:
     self.set_attr("cost_scale", cost_scale)
   if with_bias:
     self.b = self.add_param(self.create_bias(n_out), 'b_%s'%self.name)
   else:
     self.set_attr('with_bias', False)
     self.b = numpy.float32(0)
   self.mass = T.constant(1., name = "mass_%s" % self.name, dtype='float32')
   self.masks = [None] * len(self.sources)
   assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
   if mask == "dropout" or (mask == 'none' and dropout > 0):
     assert 0.0 < dropout < 1.0
     # If we apply this mass during training then we don't need any mask or mass for testing.
     # The expected weight should be 1 in
     #   E[x] = mass * (1-dropout)
     # so mass has to be 1 / (1 - dropout).
     self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
     srng = RandomStreams(self.rng.randint(1234) + 1)
     if self.depth > 1:
       self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],self.depth)), theano.config.floatX) for s in self.sources]
     else:
       if batch_drop:
         self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=s.output.shape), theano.config.floatX) for s in self.sources]
       else:
         self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX) for s in self.sources]
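A rough shape comparison of the two dropout variants above, assuming numpy only; batch_drop=False samples one mask per unit which is broadcast over time and batch, while batch_drop=True samples an independent mask per element of the source output:

import numpy
rng = numpy.random.RandomState(1234)
n_out, n_time, n_batch = 4, 3, 2
unit_mask = rng.binomial(n=1, p=0.9, size=(n_out,))                  # batch_drop=False
elem_mask = rng.binomial(n=1, p=0.9, size=(n_time, n_batch, n_out))  # batch_drop=True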
Example #8
  def __init__(self, loss, y, dtype=None, copy_input=None, copy_output=None, time_limit=0,
               use_source_index=False,
               compute_priors=False, compute_priors_exp_average=0, compute_distortions=False,
               softmax_smoothing=1.0, grad_clip_z=None, grad_discard_out_of_bound_z=None, normalize_length=False,
               exclude_labels=[],
               apply_softmax=True,
               substract_prior_from_output=False,
               input_output_similarity=None,
               input_output_similarity_scale=1,
               **kwargs):
    """
    :param theano.Variable index: index for batches
    :param str loss: e.g. 'ce'
    """
    super(OutputLayer, self).__init__(**kwargs)
    self.set_attr("normalize_length", normalize_length)
    if dtype:
      self.set_attr('dtype', dtype)
    if copy_input:
      self.set_attr("copy_input", copy_input.name)
    if grad_clip_z is not None:
      self.set_attr("grad_clip_z", grad_clip_z)
    if compute_distortions:
      self.set_attr("compute_distortions", compute_distortions)
    if grad_discard_out_of_bound_z is not None:
      self.set_attr("grad_discard_out_of_bound_z", grad_discard_out_of_bound_z)
    if not apply_softmax:
      self.set_attr("apply_softmax", apply_softmax)
    if substract_prior_from_output:
      self.set_attr("substract_prior_from_output", substract_prior_from_output)
    if input_output_similarity:
      self.set_attr("input_output_similarity", input_output_similarity)
      self.set_attr("input_output_similarity_scale", input_output_similarity_scale)
    if use_source_index:
      self.set_attr("use_source_index", use_source_index)
      src_index = self.sources[0].index
      self.index = src_index
    if not copy_input:
      self.z = self.b
      self.W_in = [self.add_param(self.create_forward_weights(source.attrs['n_out'], self.attrs['n_out'],
                                                              name="W_in_%s_%s" % (source.name, self.name)))
                   for source in self.sources]

      assert len(self.sources) == len(self.masks) == len(self.W_in)
      assert len(self.sources) > 0
      for source, m, W in zip(self.sources, self.masks, self.W_in):
        source_output = source.output
        # 4D input from TwoD Layers -> collapse height dimension
        if source_output.ndim == 4:
          source_output = source_output.sum(axis=0)
        if source.attrs['sparse']:
          if source.output.ndim == 3:
            input = source_output[:, :, 0]  # old sparse format
          else:
            assert source_output.ndim == 2
            input = source.output
          self.z += W[T.cast(input, 'int32')]
        elif m is None:
          self.z += self.dot(source_output, W)
        else:
          self.z += self.dot(self.mass * m * source_output, W)
    else:
      self.z = copy_input.output
    assert self.z.ndim == 3
    if grad_clip_z is not None:
      grad_clip_z = numpy.float32(grad_clip_z)
      self.z = theano.gradient.grad_clip(self.z, -grad_clip_z, grad_clip_z)
    if grad_discard_out_of_bound_z is not None:
      grad_discard_out_of_bound_z = numpy.float32(grad_discard_out_of_bound_z)
      self.z = grad_discard_out_of_bound(self.z, -grad_discard_out_of_bound_z, grad_discard_out_of_bound_z)
    if not copy_output:
      self.y = y
    else:
      self.index = copy_output.index
      self.y = copy_output.y_out
    if y is None:
      self.y_data_flat = None
    elif isinstance(y, T.Variable):
      self.y_data_flat = time_batch_make_flat(y)
    else:
      assert self.attrs.get("target", "").endswith("[sparse:coo]")
      assert isinstance(self.y, tuple)
      assert len(self.y) == 3
      s0, s1, weight = self.y
      from NativeOp import max_and_argmax_sparse
      n_time = self.z.shape[0]
      n_batch = self.z.shape[1]
      mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
      out_arg = T.zeros((n_time, n_batch), dtype="float32")
      out_max = T.zeros((n_time, n_batch), dtype="float32") - numpy.float32(1e16)
      out_arg, out_max = max_and_argmax_sparse(s0, s1, weight, mask, out_arg, out_max)
      assert out_arg.ndim == 2
      self.y_data_flat = out_arg.astype("int32")

    self.norm = numpy.float32(1)
    self.target_index = self.index
    if time_limit == 'inf':
      # target_length = self.index.shape[0]
      # mass = T.cast(T.sum(self.index),'float32')
      # self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0],target_length),self.sources[0].index,self.index)
      # self.norm = mass / T.cast(T.sum(self.index),'float32')
      num = T.cast(T.sum(self.index), 'float32')
      if self.eval_flag:
        self.index = self.sources[0].index
      else:
        import theano.ifelse
        padx = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1], self.z.shape[2]),
                       'float32') + self.z[-1]
        pady = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int32')  # + y[-1]
        padi = T.ones((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int8')
        self.z = theano.ifelse.ifelse(T.lt(self.z.shape[0], self.index.shape[0]),
                                      T.concatenate([self.z, padx], axis=0), self.z)
        # self.z = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),self.z[:self.index.shape[0]], self.z)
        self.y_data_flat = time_batch_make_flat(theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                                                     T.concatenate([y, pady], axis=0), y))
        # self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), T.concatenate([T.ones((self.z.shape[0] - self.index.shape[0],self.z.shape[1]),'int8'), self.index], axis=0), self.index)
        self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                          T.concatenate([padi, self.index], axis=0), self.index)
      self.norm *= num / T.cast(T.sum(self.index), 'float32')
    elif time_limit > 0:
      end = T.min([self.z.shape[0], T.constant(time_limit, 'int32')])
      num = T.cast(T.sum(self.index), 'float32')
      self.index = T.set_subtensor(self.index[end:], T.zeros_like(self.index[end:]))
      self.norm = num / T.cast(T.sum(self.index), 'float32')
      self.z = T.set_subtensor(self.z[end:], T.zeros_like(self.z[end:]))

    # xs = [s.output for s in self.sources]
    # self.z = AccumulatorOpInstance(*[self.b] + xs + self.W_in)
    # outputs_info = None #[ T.alloc(numpy.cast[theano.config.floatX](0), index.shape[1], self.attrs['n_out']) ]

    # self.z, _ = theano.scan(step,
    #                        sequences = [s.output for s in self.sources],
    #                        non_sequences = self.W_in + [self.b])

    self.set_attr('from', ",".join([s.name for s in self.sources]))
    index_flat = self.index.flatten()
    for label in exclude_labels:
      index_flat = T.set_subtensor(index_flat[(T.eq(self.y_data_flat, label) > 0).nonzero()], numpy.int8(0))
    self.i = (index_flat > 0).nonzero()
    self.j = ((numpy.int32(1) - index_flat) > 0).nonzero()
    self.loss = as_str(loss.encode("utf8"))
    self.attrs['loss'] = self.loss
    if compute_priors:
      self.set_attr('compute_priors', compute_priors)
      if compute_priors_exp_average:
        self.set_attr('compute_priors_exp_average', compute_priors_exp_average)
    if softmax_smoothing != 1.0:
      self.attrs['softmax_smoothing'] = softmax_smoothing
      print >> log.v4, "Logits before the softmax scaled with factor ", softmax_smoothing
      self.z *= numpy.float32(softmax_smoothing)
    if self.loss == 'priori':
      self.priori = self.shared(value=numpy.ones((self.attrs['n_out'],), dtype=theano.config.floatX), borrow=True)

    if input_output_similarity:
      # First a self-similarity of input and output,
      # and then add -similarity or distance between those to the constraints,
      # so that the input and output correlate on a frame-by-frame basis.
      # Here some other similarities/distances we could try:
      # http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
      # https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/
      from TheanoUtil import self_similarity_cosine
      self_similarity = self_similarity_cosine  # maybe other
      data_layer = self.find_data_layer()
      assert data_layer
      assert data_layer.output.ndim == 3
      n_time = data_layer.output.shape[0]
      n_batch = data_layer.output.shape[1]
      findex = T.cast(self.output_index(), "float32")
      findex_bc = findex.reshape((n_time * n_batch,)).dimshuffle(0, 'x')
      findex_sum = T.sum(findex)
      data = data_layer.output.reshape((n_time * n_batch, data_layer.output.shape[2])) * findex_bc
      assert self.z.ndim == 3
      z = self.z.reshape((n_time * n_batch, self.z.shape[2])) * findex_bc
      data_self_sim = T.flatten(self_similarity(data))
      z_self_sim = T.flatten(self_similarity(z))
      assert data_self_sim.ndim == z_self_sim.ndim == 1
      sim = T.dot(data_self_sim, z_self_sim)  # maybe others make sense
      assert sim.ndim == 0
      # sim is ~ proportional to T * T, so divide by T.
      sim *= numpy.float32(input_output_similarity_scale) / findex_sum
      self.constraints -= sim

    # self.make_output(self.z, collapse = False)
    # Note that self.output is going to be overwritten in our derived classes.
    self.output = self.make_consensus(self.z) if self.depth > 1 else self.z
    self.y_m = None  # flat log(self.p_y_given_x)
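A tiny numpy illustration of the softmax_smoothing scaling applied above, assuming nothing beyond numpy; multiplying the logits by a factor below 1 before the softmax flattens the resulting posterior:

import numpy

def np_softmax(x):
  e = numpy.exp(x - x.max())
  return e / e.sum()

logits = numpy.array([2.0, 1.0, 0.0])
print(np_softmax(logits))        # unsmoothed posterior
print(np_softmax(0.5 * logits))  # softmax_smoothing = 0.5 -> flatter distribution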