Example #1
    def fprop(self,
              state_below,
              use_noise=True,
              no_noise_bias=False,
              first_only=False):
        """
        Constructs the computational graph of this layer.
        If the input is ints, we assume it is a set of indices; otherwise we
        assume it is a set of floats.
        """
        if self.weight_noise and use_noise and self.noise_params:
            W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)]
            if not no_noise_bias:
                b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)]
            else:
                b_ems = self.b_ems
        else:
            W_ems = self.W_ems
            b_ems = self.b_ems
        if self.rank_n_approx:
            if first_only:
                emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0]))
                self.out = emb_val
                return emb_val
            emb_val = TT.dot(
                self.rank_n_activ(utils.dot(state_below, W_ems[0])), W_ems[1])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 1
        else:
            emb_val = utils.dot(state_below, W_ems[0])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 0

        emb_val = self.activation[0](emb_val)

        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(
                    emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
        for dx in xrange(1, self.n_layers):
            emb_val = utils.dot(emb_val, W_ems[st_pos + dx])
            if b_ems:
                emb_val = self.activation[dx](emb_val + b_ems[dx])
            else:
                emb_val = self.activation[dx](emb_val)

            if self.dropout < 1.:
                if use_noise:
                    emb_val = emb_val * self.trng.binomial(emb_val.shape,
                                                           n=1,
                                                           p=self.dropout,
                                                           dtype=emb_val.dtype)
                else:
                    emb_val = emb_val * self.dropout
        self.out = emb_val
        return emb_val
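
Note the dropout convention used throughout these examples: `self.dropout` is the keep probability, not the drop probability. During training the activations are multiplied by a Bernoulli mask drawn with `p=self.dropout`; at decode time they are rescaled by the same factor so expected magnitudes match (classic, non-inverted dropout). A minimal NumPy sketch of that convention (the names `keep` and `dropout_layer` are illustrative, not from GroundHog):

import numpy

rng = numpy.random.RandomState(0)

def dropout_layer(x, keep, use_noise):
    # `keep` plays the role of self.dropout above (keep probability)
    if use_noise:
        # training: zero units with probability 1 - keep,
        # mirroring trng.binomial(shape, n=1, p=keep)
        mask = rng.binomial(n=1, p=keep, size=x.shape).astype(x.dtype)
        return x * mask
    # decoding: rescale so expected activations match the training regime
    return x * keep

x = numpy.ones((2, 4), dtype='float32')
train_out = dropout_layer(x, keep=0.8, use_noise=True)    # some entries zeroed
decode_out = dropout_layer(x, keep=0.8, use_noise=False)  # everything scaled by 0.8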
Example #2
    def fprop(self, state_below, use_noise=True, no_noise_bias=False,
              first_only=False):
        """
        Constructs the computational graph of this layer.
        If the input is ints, we assume it is a set of indices; otherwise we
        assume it is a set of floats.
        """
        print 'multilayer use noise:', use_noise
        if self.weight_noise and use_noise and self.noise_params:
            W_ems = [(x+y) for x, y in zip(self.W_ems, self.nW_ems)]
            if not no_noise_bias:
                b_ems = [(x+y) for x, y in zip(self.b_ems, self.nb_ems)]
            else:
                b_ems = self.b_ems
        else:
            W_ems = self.W_ems
            b_ems = self.b_ems
        if self.rank_n_approx:
            if first_only:
                emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0]))
                self.out = emb_val
                return emb_val
            emb_val = TT.dot(
                    self.rank_n_activ(utils.dot(state_below, W_ems[0])),
                    W_ems[1])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 1
        else:
            emb_val = utils.dot(state_below, W_ems[0])
            if b_ems:
                emb_val += b_ems[0]
            st_pos = 0

        emb_val = self.activation[0](emb_val)

        if self.dropout < 1.:
            if use_noise:
                print 'training: using noise'
                emb_val = emb_val * self.trng.binomial(emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                print 'decoding: not using noise'
                emb_val = emb_val * self.dropout
        for dx in xrange(1, self.n_layers):
            emb_val = utils.dot(emb_val, W_ems[st_pos+dx])
            if b_ems:
                emb_val = self.activation[dx](emb_val + b_ems[dx])
            else:
                emb_val = self.activation[dx](emb_val)

            if self.dropout < 1.:
                if use_noise:
                    emb_val = emb_val * self.trng.binomial(emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
                else:
                    emb_val = emb_val * self.dropout
        self.out = emb_val
        return emb_val
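
This variant differs from Example #1 only in its debug prints; the weight-noise logic is the same. When `weight_noise` is on, pre-sampled noise tensors (`nW_ems`, `nb_ems`) are added elementwise to the parameters before the projection. A rough NumPy sketch of the idea, assuming Gaussian noise of scale `sigma` (in GroundHog the noise lives in shared variables that the trainer resamples; drawing it inline here is a simplification):

import numpy

rng = numpy.random.RandomState(0)
sigma = 0.01                              # noise scale; illustrative value
W = rng.randn(3, 5).astype('float32')     # stands in for one matrix in self.W_ems

def noisy_weights(W, use_noise):
    if not use_noise:
        return W
    # plays the role of W_ems = [x + y for x, y in zip(self.W_ems, self.nW_ems)]
    nW = sigma * rng.randn(*W.shape).astype(W.dtype)
    return W + nW

x = rng.randn(2, 3).astype('float32')
out = x.dot(noisy_weights(W, use_noise=True))   # projection with perturbed weights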
Example #3
    def fprop(self, state_below, use_noise=True, no_noise_bias=False,
              first_only=False):
        """
        Constructs the computational graph of this layer.
        If the input is ints, we assume it is a set of indices; otherwise we
        assume it is a set of floats.
        """
        if self.weight_noise and use_noise and self.noise_params:
            W_ems = [(x+y) for x, y in zip(self.W_ems, self.nW_ems)]
            if not no_noise_bias:
                b_ems = [(x+y) for x, y in zip(self.b_ems, self.nb_ems)]
            else:
                b_ems = self.b_ems
        else:
            W_ems = self.W_ems
            b_ems = self.b_ems

        # FIXME: one bias for the whole layer, or do we need separate biases for each component?
        emb_val1 = utils.dot(state_below, W_ems[0])
        emb_val2 = utils.dot(state_below, W_ems[1])
        if b_ems:
            emb_val1 += b_ems[0]
            emb_val2 += b_ems[1]

        emb_val1 = self.activation[0](emb_val1)
        emb_val2 = self.activation[0](emb_val2)

        emb_val = emb_val1 * emb_val2

        # FIXME: check how dropout should work for tensor (multiplicative) networks
        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
        for dx in xrange(1, self.n_layers):
            emb_val1 = utils.dot(emb_val, W_ems[2*dx])
            emb_val2 = utils.dot(emb_val, W_ems[2*dx+1])
            if b_ems:
                emb_val1 = emb_val1 + b_ems[2*dx]
                emb_val2 = emb_val2 + b_ems[2*dx+1]
            
            emb_val1 = self.activation[dx](emb_val1)
            emb_val2 = self.activation[dx](emb_val2)

            emb_val = emb_val1 * emb_val2

            # FIXME: check how dropout should work for tensor (multiplicative) networks
            if self.dropout < 1.:
                if use_noise:
                    emb_val = emb_val * self.trng.binomial(emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
                else:
                    emb_val = emb_val * self.dropout
        self.out = emb_val
        return emb_val
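
Unlike the previous two examples, each layer here computes two affine branches from the same input and combines them multiplicatively, which is what the FIXME about tensor networks refers to. A minimal NumPy sketch of one such layer (the shapes and `tanh` as the activation are illustrative assumptions):

import numpy

rng = numpy.random.RandomState(0)
x = rng.randn(2, 4).astype('float32')
W1 = rng.randn(4, 3).astype('float32')   # plays the role of W_ems[0]
W2 = rng.randn(4, 3).astype('float32')   # plays the role of W_ems[1]
b1 = numpy.zeros(3, dtype='float32')
b2 = numpy.zeros(3, dtype='float32')

# two affine branches through the same input, combined elementwise
out = numpy.tanh(x.dot(W1) + b1) * numpy.tanh(x.dot(W2) + b2)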
Example #4
    def fprop(self,
              state_below,
              temp=numpy.float32(1),
              use_noise=True,
              additional_inputs=None,
              no_noise_bias=False):
        """
        Forward pass through the cost layer.

        :type state_below: tensor or layer
        :param state_below: The theano expression (or groundhog layer)
            representing the input of the cost layer

        :type temp: float or tensor scalar
        :param temp: scalar representing the temperature that should be used
            when sampling from the output distribution

        :type use_noise: bool
        :param use_noise: flag. If true, noise is used when computing the
            output of the model

        :type no_noise_bias: bool
        :param no_noise_bias: flag, stating if weight noise should be added
            to the bias as well, or only to the weights
        """

        if self.rank_n_approx:
            if use_noise and self.noise_params:
                emb_val = self.rank_n_activ(
                    utils.dot(state_below, self.W_em1 + self.nW_em1))
                emb_val = TT.dot(self.W_em2 + self.nW_em2, emb_val)
            else:
                emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
                emb_val = TT.dot(self.W_em2, emb_val)
        else:
            if use_noise and self.noise_params:
                emb_val = utils.dot(state_below, self.W_em + self.nW_em)
            else:
                emb_val = utils.dot(state_below, self.W_em)

        if additional_inputs:
            if use_noise and self.noise_params:
                for inp, weight, noise_weight in zip(
                        additional_inputs, self.additional_weights,
                        self.noise_additional_weights):
                    emb_val += utils.dot(inp, (noise_weight + weight))
            else:
                for inp, weight in zip(additional_inputs,
                                       self.additional_weights):
                    emb_val += utils.dot(inp, weight)
        self.preactiv = emb_val
        if use_noise and self.noise_params and not no_noise_bias:
            emb_val = TT.nnet.sigmoid(temp *
                                      (emb_val + self.b_em + self.nb_em))
        else:
            emb_val = TT.nnet.sigmoid(temp * (emb_val + self.b_em))
        self.out = emb_val
        self.state_below = state_below
        self.model_output = emb_val
        return emb_val
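
Because `temp` multiplies the pre-activation before the sigmoid, values of `temp` above 1 sharpen the outputs toward 0 and 1, while values below 1 flatten them toward 0.5 (note this is the reciprocal of the usual "divide logits by temperature" convention). A quick numeric check:

from __future__ import print_function
import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

preactiv = numpy.array([-2.0, -0.5, 0.5, 2.0])
for temp in (0.5, 1.0, 2.0):
    # temp < 1 flattens outputs toward 0.5; temp > 1 pushes them toward 0/1
    print(temp, sigmoid(temp * preactiv))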
Example #5
    def fprop(self,
              state_below,
              temp=numpy.float32(1),
              use_noise=True,
              additional_inputs=None,
              no_noise_bias=False):
        """
        Forward pass through the cost layer.

        :type state_below: tensor or layer
        :param state_below: The theano expression (or groundhog layer)
            representing the input of the cost layer

        :type temp: float or tensor scalar
        :param temp: scalar representing the temperature that should be used
            when sampling from the output distribution

        :type use_noise: bool
        :param use_noise: flag. If true, noise is used when computing the
            output of the model

        :type no_noise_bias: bool
        :param no_noise_bias: flag, stating if weight noise should be added
            to the bias as well, or only to the weights
        """

        if self.rank_n_approx:
            if use_noise and self.noise_params:
                emb_val = self.rank_n_activ(utils.dot(state_below,
                                                      self.W_em1 + self.nW_em1))
                emb_val = TT.dot(self.W_em2 + self.nW_em2, emb_val)
            else:
                emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
                emb_val = TT.dot(self.W_em2, emb_val)
        else:
            if use_noise and self.noise_params:
                emb_val = utils.dot(state_below, self.W_em + self.nW_em)
            else:
                emb_val = utils.dot(state_below, self.W_em)

        if additional_inputs:
            if use_noise and self.noise_params:
                for inp, weight, noise_weight in zip(
                        additional_inputs, self.additional_weights,
                        self.noise_additional_weights):
                    emb_val += utils.dot(inp, (noise_weight + weight))
            else:
                for inp, weight in zip(additional_inputs, self.additional_weights):
                    emb_val += utils.dot(inp, weight)
        self.preactiv = emb_val
        if use_noise and self.noise_params and not no_noise_bias:
            emb_val = TT.nnet.sigmoid(temp *
                                      (emb_val + self.b_em + self.nb_em))
        else:
            emb_val = TT.nnet.sigmoid(temp * (emb_val + self.b_em))
        self.out = emb_val
        self.state_below = state_below
        self.model_output = emb_val
        return emb_val
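
The `rank_n_approx` branch shared by these two examples factors the large projection into two smaller ones, `W_em1` followed by `W_em2`, with an activation (`rank_n_activ`) between them. Beyond regularization, this saves parameters whenever the rank is small relative to the input and output sizes. A sketch with illustrative sizes, using `tanh` in place of `rank_n_activ` and the `dot(emb_val, W_em2)` orientation that Example #6 uses:

import numpy

rng = numpy.random.RandomState(0)
n_in, n_out, rank = 500, 10000, 128    # illustrative sizes

W_em1 = rng.randn(n_in, rank).astype('float32')
W_em2 = rng.randn(rank, n_out).astype('float32')

x = rng.randn(2, n_in).astype('float32')
h = numpy.tanh(x.dot(W_em1))           # rank_n_activ(utils.dot(state_below, W_em1))
out = h.dot(W_em2)                     # second factor of the low-rank projection

# parameter counts: full matrix vs. rank-n factorization
full_params = n_in * n_out                    # 5,000,000
approx_params = n_in * rank + rank * n_out    # 1,344,000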
Example #6
    def fprop(self,
              state_below,
              temp=numpy.float32(1),
              use_noise=True,
              additional_inputs=None,
              no_noise_bias=False,
              target=None,
              full_softmax=True):
        """
        Forward pass through the cost layer.

        :type state_below: tensor or layer
        :param state_below: The theano expression (or groundhog layer)
            representing the input of the cost layer

        :type temp: float or tensor scalar
        :param temp: scalar representing the temperature that should be used
            when sampling from the output distribution

        :type use_noise: bool
        :param use_noise: flag. If true, noise is used when computing the
            output of the model

        :type no_noise_bias: bool
        :param no_noise_bias: flag, stating if weight noise should be added
            to the bias as well, or only to the weights
        """
        if not full_softmax:
            assert target is not None, 'target must be given'
        if self.rank_n_approx:
            if self.weight_noise and use_noise and self.noise_params:
                emb_val = self.rank_n_activ(utils.dot(state_below,
                                                      self.W_em1 + self.nW_em1))
                nW_em = self.nW_em2
            else:
                emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
            W_em = self.W_em2
        else:
            W_em = self.W_em
            if self.weight_noise:
                nW_em = self.nW_em
            emb_val = state_below

        if full_softmax:
            if self.weight_noise and use_noise and self.noise_params:
                emb_val = TT.dot(emb_val, W_em + nW_em)
            else:
                emb_val = TT.dot(emb_val, W_em)

            if additional_inputs:
                if use_noise and self.noise_params:
                    for inp, weight, noise_weight in zip(
                            additional_inputs, self.additional_weights,
                            self.noise_additional_weights):
                        emb_val += utils.dot(inp, (noise_weight + weight))
                else:
                    for inp, weight in zip(additional_inputs, self.additional_weights):
                        emb_val += utils.dot(inp, weight)
            if self.weight_noise and use_noise and self.noise_params and \
                    not no_noise_bias:
                emb_val = temp * (emb_val + self.b_em + self.nb_em)
            else:
                emb_val = temp * (emb_val + self.b_em)
        else:
            W_em = W_em[:, target]
            # include use_noise in the guard: nW_em is only defined (and noise
            # only wanted) when noise is actually active
            if self.weight_noise and use_noise and self.noise_params:
                nW_em = nW_em[:, target]
                W_em += nW_em
            if emb_val.ndim == 3:
                emb_val = emb_val.reshape(
                    [emb_val.shape[0] * emb_val.shape[1], emb_val.shape[2]])
            emb_val = (W_em.T * emb_val).sum(1) + self.b_em[target]
            if self.weight_noise and use_noise:
                emb_val += self.nb_em[target]
            emb_val = temp * emb_val

        self.preactiv = emb_val
        if full_softmax:
            emb_val = utils.softmax(emb_val)
        else:
            emb_val = TT.nnet.sigmoid(emb_val)
        self.out = emb_val
        self.state_below = state_below
        self.model_output = emb_val
        return emb_val
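
When `full_softmax` is False, the layer never materializes the full vocabulary logits: `W_em[:, target]` gathers one column per target word, and `(W_em.T * emb_val).sum(1)` is a batched row-wise dot product. The NumPy sketch below checks that this shortcut matches indexing into the full logit matrix (all names are stand-ins for the layer's attributes):

import numpy

rng = numpy.random.RandomState(0)
batch, dim, vocab = 4, 8, 10
emb = rng.randn(batch, dim).astype('float32')   # plays the role of emb_val
W = rng.randn(dim, vocab).astype('float32')     # plays the role of W_em
b = rng.randn(vocab).astype('float32')          # plays the role of self.b_em
target = numpy.array([3, 0, 7, 7])

# reference: compute all logits, then pick the target entries
full = emb.dot(W) + b
picked = full[numpy.arange(batch), target]

# the layer's shortcut: gather target columns, then batched row-wise dot
Wt = W[:, target]                               # shape (dim, batch)
shortcut = (Wt.T * emb).sum(1) + b[target]

assert numpy.allclose(picked, shortcut, atol=1e-5)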
Example #7
    def fprop(self,
              state_below,
              temp=numpy.float32(1),
              use_noise=True,
              additional_inputs=None,
              no_noise_bias=False,
              target=None,
              full_softmax=True):
        """
        Forward pass through the cost layer.

        :type state_below: tensor or layer
        :param state_below: The theano expression (or groundhog layer)
            representing the input of the cost layer

        :type temp: float or tensor scalar
        :param temp: scalar representing the temperature that should be used
            when sampling from the output distribution

        :type use_noise: bool
        :param use_noise: flag. If true, noise is used when computing the
            output of the model

        :type no_noise_bias: bool
        :param no_noise_bias: flag, stating if weight noise should be added
            to the bias as well, or only to the weights
        """
        if not full_softmax:
            assert target is not None, 'target must be given'
        if self.rank_n_approx:
            if self.weight_noise and use_noise and self.noise_params:
                emb_val = self.rank_n_activ(
                    utils.dot(state_below, self.W_em1 + self.nW_em1))
                nW_em = self.nW_em2
            else:
                emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
            W_em = self.W_em2
        else:
            W_em = self.W_em
            if self.weight_noise:
                nW_em = self.nW_em
            emb_val = state_below

        if full_softmax:
            if self.weight_noise and use_noise and self.noise_params:
                emb_val = TT.dot(emb_val, W_em + nW_em)
            else:
                emb_val = TT.dot(emb_val, W_em)

            if additional_inputs:
                if use_noise and self.noise_params:
                    for inp, weight, noise_weight in zip(
                            additional_inputs, self.additional_weights,
                            self.noise_additional_weights):
                        emb_val += utils.dot(inp, (noise_weight + weight))
                else:
                    for inp, weight in zip(additional_inputs,
                                           self.additional_weights):
                        emb_val += utils.dot(inp, weight)
            if self.weight_noise and use_noise and self.noise_params and \
               not no_noise_bias:
                emb_val = temp * (emb_val + self.b_em + self.nb_em)
            else:
                emb_val = temp * (emb_val + self.b_em)
        else:
            W_em = W_em[:, target]
            # include use_noise in the guard: nW_em is only defined (and noise
            # only wanted) when noise is actually active
            if self.weight_noise and use_noise and self.noise_params:
                nW_em = nW_em[:, target]
                W_em += nW_em
            if emb_val.ndim == 3:
                emb_val = emb_val.reshape(
                    [emb_val.shape[0] * emb_val.shape[1], emb_val.shape[2]])
            emb_val = (W_em.T * emb_val).sum(1) + self.b_em[target]
            if self.weight_noise and use_noise:
                emb_val += self.nb_em[target]
            emb_val = temp * emb_val

        self.preactiv = emb_val
        if full_softmax:
            emb_val = utils.softmax(emb_val)
        else:
            emb_val = TT.nnet.sigmoid(emb_val)
        self.out = emb_val
        self.state_below = state_below
        self.model_output = emb_val
        return emb_val
    def fprop(self,
              state_below,
              mask=None,
              init_state=None,
              gater_below=None,
              reseter_below=None,
              c=None,
              c_mask=None,
              nsteps=None,
              batch_size=None,
              use_noise=True,
              truncate_gradient=-1,
              no_noise_bias=False,
              return_alignment=False):
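        """
        Forward pass of this attention-based recurrent layer.

        Iterates step_fprop over state_below with theano.scan, attending
        over the context c (and its mask c_mask) at every step.
        """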

        updater_below = gater_below

        if theano.config.floatX == 'float32':
            floatX = numpy.float32
        else:
            floatX = numpy.float64
        if nsteps is None:
            nsteps = state_below.shape[0]
            if batch_size and batch_size != 1:
                nsteps = nsteps / batch_size
        if batch_size is None and state_below.ndim == 3:
            batch_size = state_below.shape[1]
        if state_below.ndim == 2 and \
           (not isinstance(batch_size, int) or batch_size > 1):
            state_below = state_below.reshape((nsteps, batch_size, self.n_in))
            if updater_below:
                updater_below = updater_below.reshape((nsteps, batch_size, self.n_in))
            if reseter_below:
                reseter_below = reseter_below.reshape((nsteps, batch_size, self.n_in))

        if init_state is None:
            if not isinstance(batch_size, int) or batch_size != 1:
                init_state = TT.alloc(floatX(0), batch_size, self.n_hids)
            else:
                init_state = TT.alloc(floatX(0), self.n_hids)

        # projection of the source annotations, used to compute the attention weights
        p_from_c = utils.dot(c, self.A_cp).reshape(
                (c.shape[0], c.shape[1], self.n_hids))
        
        if mask:
            sequences = [state_below, mask, updater_below, reseter_below]
            non_sequences = [c, c_mask, p_from_c] 
            #              seqs    | out |  non_seqs
            fn = lambda x, m, g, r,   h,   c1, cm, pc : self.step_fprop(x, h, mask=m,
                    gater_below=g, reseter_below=r,
                    c=c1, p_from_c=pc, c_mask=cm,
                    use_noise=use_noise, no_noise_bias=no_noise_bias,
                    return_alignment=return_alignment)
        else:
            sequences = [state_below, updater_below, reseter_below]
            non_sequences = [c, p_from_c]
            #            seqs   | out | non_seqs
            fn = lambda x, g, r,   h,    c1, pc : self.step_fprop(x, h,
                    gater_below=g, reseter_below=r,
                    c=c1, p_from_c=pc,
                    use_noise=use_noise, no_noise_bias=no_noise_bias,
                    return_alignment=return_alignment)

        outputs_info = [init_state, None]
        if return_alignment:
            outputs_info.append(None)

        # use scan to update the hidden state repeatedly over the input sequence
        rval, updates = theano.scan(fn,
                        sequences=sequences,
                        non_sequences=non_sequences,
                        outputs_info=outputs_info,
                        name='layer_%s'%self.name,
                        truncate_gradient=truncate_gradient,
                        n_steps=nsteps)
        self.out = rval
        self.rval = rval
        self.updates = updates

        return self.out   
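
The theano.scan call above iterates step_fprop (defined next) over the time axis, threading the hidden state back in through outputs_info. As a mental model, here is a plain-Python sketch of what scan does in this setting, with a hypothetical `step` returning `(new_state, output)`; the real `fn` also receives the non-sequences `c`, `c_mask`, and `p_from_c` at every step:

import numpy

def scan_like(step, sequences, init_state, non_sequences):
    # call `step` once per timestep, feeding the returned state back in,
    # with scan's argument order: sequence slices, outputs, non-sequences
    state, outputs = init_state, []
    for slices in zip(*sequences):
        state, out = step(*(slices + (state,) + tuple(non_sequences)))
        outputs.append(out)
    return state, outputs

# toy check: a running sum over one sequence
xs = numpy.arange(5, dtype='float32')
step = lambda x, h: (h + x, h + x)
final, outs = scan_like(step, [xs], numpy.float32(0), [])
# outs == [0.0, 1.0, 3.0, 6.0, 10.0]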
    def step_fprop(self,
                   state_below,
                   state_before,
                   gater_below=None,
                   reseter_below=None,
                   mask=None,
                   c=None,
                   c_mask=None,
                   p_from_c=None,
                   use_noise=True,
                   no_noise_bias=False,
                   step_num=None,
                   return_alignment=False):
        """
        Constructs the computational graph of this layer.

        :type state_below: theano variable
        :param state_below: the input to the layer

        :type mask: None or theano variable
        :param mask: mask describing the length of each sequence in a minibatch

        :type state_before: theano variable
        :param state_before: the previous value of the hidden state of the layer

        :type gater_below: theano variable
        :param gater_below: the input to the update gate

        :type reseter_below: theano variable
        :param reseter_below: the input to the reset gate

        :type use_noise: bool
        :param use_noise: flag saying if weight noise should be used in
            computing the output of this layer

        :type no_noise_bias: bool
        :param no_noise_bias: flag saying if weight noise should be added to
            the bias as well
        """

        updater_below = gater_below

        W_hh = self.W_hh
        G_hh = self.G_hh
        R_hh = self.R_hh
        A_cp = self.A_cp
        B_hp = self.B_hp
        D_pe = self.D_pe

        # The code works only with 3D tensors
        cndim = c.ndim
        if cndim == 2:
            c = c[:, None, :]

        # Warning: source_num and target_num should be equal, or one of them
        #          should be 1 (they have to broadcast) for the following
        #          code to make any sense.
        source_len = c.shape[0]   # source sequence length
        source_num = c.shape[1]   # number of sequences in the batch
        target_num = state_before.shape[0]
        dim = self.n_hids

        # Form projection to the tanh layer from the previous hidden state
        # Shape: (source_len, target_num, dim)
        p_from_h = ReplicateLayer(source_len)(utils.dot(state_before, B_hp)).out

        # Form projection to the tanh layer from the source annotation.
        if p_from_c is None:
            p_from_c = utils.dot(c, A_cp).reshape((source_len, source_num, dim))

        # Sum projections - broadcasting happens at the dimension 1.
        p = p_from_h + p_from_c

        # Apply non-linearity and project to energy.
        energy = TT.exp(utils.dot(TT.tanh(p), D_pe)).reshape((source_len, target_num))
        if c_mask:
            # This is used for batches only, that is target_num == source_num
            energy *= c_mask

        # Calculate energy sums.
        normalizer = energy.sum(axis=0)

        # Normalize to get the attention probabilities (together with the
        # exponential above, this is a softmax over the source positions).
        probs = energy / normalizer

        # Calculate weighted sums of source annotations.
        # If target_num == 1, c is broadcast along dimension 1; the
        # probabilities are broadcast along the feature dimension.
        ctx = (c * probs.dimshuffle(0, 1, 'x')).sum(axis=0)  # attention-weighted context

        state_below += self.c_inputer(ctx).out
        reseter_below += self.c_reseter(ctx).out
        updater_below += self.c_updater(ctx).out

        # Reset gate:
        # optionally reset the hidden state.
        reseter = self.reseter_activation(TT.dot(state_before, R_hh) + reseter_below)
        reseted_state_before = reseter * state_before

        # Feed the input to obtain potential new state.
        preactiv = TT.dot(reseted_state_before, W_hh) + state_below
        h = self.activation(preactiv)

        # Update gate:
        # interpolate between the candidate state and the previous state.
        updater = self.updater_activation(TT.dot(state_before, G_hh) + updater_below)
        h = updater * h + (1 - updater) * state_before  # h_t = z*h~_t + (1-z)*h_{t-1}

        if mask is not None:
            if h.ndim == 2 and mask.ndim == 1:
                mask = mask.dimshuffle(0, 'x')
            h = mask * h + (1 - mask) * state_before

        results = [h, ctx]
        if return_alignment:
            results += [probs]
        return results
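
Putting the pieces of step_fprop together: the attention weights come from an additive (MLP) scorer over the source annotations, and the resulting context feeds a GRU-style gated update. Below is a NumPy sketch of the attention computation alone, with illustrative shapes and randomly initialized projections standing in for `A_cp`, `B_hp`, and `D_pe`:

import numpy

rng = numpy.random.RandomState(0)
source_len, batch, dim = 6, 2, 4
c = rng.randn(source_len, batch, dim).astype('float32')   # source annotations
h_prev = rng.randn(batch, dim).astype('float32')          # previous hidden state
A_cp = rng.randn(dim, dim).astype('float32')
B_hp = rng.randn(dim, dim).astype('float32')
D_pe = rng.randn(dim, 1).astype('float32')

p_from_c = c.dot(A_cp)                      # project the annotations
p_from_h = h_prev.dot(B_hp)[None, :, :]     # ReplicateLayer: broadcast over time
energy = numpy.exp(numpy.tanh(p_from_c + p_from_h).dot(D_pe)[:, :, 0])
probs = energy / energy.sum(axis=0)         # softmax over source positions
ctx = (c * probs[:, :, None]).sum(axis=0)   # context: weighted sum of annotations

The gated update that follows in step_fprop is the standard GRU interpolation, h_t = z*h~_t + (1-z)*h_{t-1}, with the context injected into the input, reset, and update pre-activations through c_inputer, c_reseter, and c_updater.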