def fprop(self, state_below, use_noise=True, no_noise_bias=False,
          first_only=False):
    """
    Constructs the computational graph of this layer.
    If the input is integer-valued, we assume it is a set of indices;
    otherwise we assume it is a set of floats.
    """
    if self.weight_noise and use_noise and self.noise_params:
        W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)]
        if not no_noise_bias:
            b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)]
        else:
            b_ems = self.b_ems
    else:
        W_ems = self.W_ems
        b_ems = self.b_ems
    if self.rank_n_approx:
        if first_only:
            emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0]))
            self.out = emb_val
            return emb_val
        emb_val = TT.dot(
            self.rank_n_activ(utils.dot(state_below, W_ems[0])),
            W_ems[1])
        if b_ems:
            emb_val += b_ems[0]
        st_pos = 1
    else:
        emb_val = utils.dot(state_below, W_ems[0])
        if b_ems:
            emb_val += b_ems[0]
        st_pos = 0
    emb_val = self.activation[0](emb_val)
    if self.dropout < 1.:
        if use_noise:
            emb_val = emb_val * self.trng.binomial(
                emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
        else:
            emb_val = emb_val * self.dropout
    for dx in xrange(1, self.n_layers):
        emb_val = utils.dot(emb_val, W_ems[st_pos + dx])
        if b_ems:
            emb_val = self.activation[dx](emb_val + b_ems[dx])
        else:
            emb_val = self.activation[dx](emb_val)
        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(
                    emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
    self.out = emb_val
    return emb_val
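
# A minimal NumPy sketch (not part of the layer, names are illustrative) of the
# dropout convention used in fprop above: `self.dropout` is treated as the
# *keep* probability. With use_noise=True a binary keep-mask is sampled per
# activation; with use_noise=False the activations are instead rescaled by the
# keep probability, matching the expected value of the mask.
import numpy

def apply_dropout(emb_val, keep_prob, use_noise, rng=numpy.random):
    if keep_prob >= 1.:
        return emb_val
    if use_noise:
        # training: zero each unit with probability 1 - keep_prob
        mask = rng.binomial(n=1, p=keep_prob, size=emb_val.shape)
        return emb_val * mask
    # decoding: scale by the keep probability instead of sampling a mask
    return emb_val * keep_prob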

def fprop(self, state_below, use_noise=True, no_noise_bias=False,
          first_only=False):
    """
    Constructs the computational graph of this layer.
    If the input is integer-valued, we assume it is a set of indices;
    otherwise we assume it is a set of floats.
    """
    print 'multilayer use noise:', use_noise
    if self.weight_noise and use_noise and self.noise_params:
        W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)]
        if not no_noise_bias:
            b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)]
        else:
            b_ems = self.b_ems
    else:
        W_ems = self.W_ems
        b_ems = self.b_ems
    if self.rank_n_approx:
        if first_only:
            emb_val = self.rank_n_activ(utils.dot(state_below, W_ems[0]))
            self.out = emb_val
            return emb_val
        emb_val = TT.dot(
            self.rank_n_activ(utils.dot(state_below, W_ems[0])),
            W_ems[1])
        if b_ems:
            emb_val += b_ems[0]
        st_pos = 1
    else:
        emb_val = utils.dot(state_below, W_ems[0])
        if b_ems:
            emb_val += b_ems[0]
        st_pos = 0
    emb_val = self.activation[0](emb_val)
    if self.dropout < 1.:
        if use_noise:
            print 'training use noise'
            emb_val = emb_val * self.trng.binomial(
                emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
        else:
            print 'decoding not use noise'
            emb_val = emb_val * self.dropout
    for dx in xrange(1, self.n_layers):
        emb_val = utils.dot(emb_val, W_ems[st_pos + dx])
        if b_ems:
            emb_val = self.activation[dx](emb_val + b_ems[dx])
        else:
            emb_val = self.activation[dx](emb_val)
        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(
                    emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
    self.out = emb_val
    return emb_val

def fprop(self, state_below, use_noise=True, no_noise_bias=False,
          first_only=False):
    """
    Constructs the computational graph of this layer.
    If the input is integer-valued, we assume it is a set of indices;
    otherwise we assume it is a set of floats.
    """
    if self.weight_noise and use_noise and self.noise_params:
        W_ems = [(x + y) for x, y in zip(self.W_ems, self.nW_ems)]
        if not no_noise_bias:
            b_ems = [(x + y) for x, y in zip(self.b_ems, self.nb_ems)]
        else:
            b_ems = self.b_ems
    else:
        W_ems = self.W_ems
        b_ems = self.b_ems

    # FIXME: one bias for the whole layer, or a separate bias for each component?
    emb_val1 = utils.dot(state_below, W_ems[0])
    emb_val2 = utils.dot(state_below, W_ems[1])
    if b_ems:
        emb_val1 += b_ems[0]
        emb_val2 += b_ems[1]
    emb_val1 = self.activation[0](emb_val1)
    emb_val2 = self.activation[0](emb_val2)
    emb_val = emb_val1 * emb_val2

    # FIXME: check how dropout should work for tensor networks
    if self.dropout < 1.:
        if use_noise:
            emb_val = emb_val * self.trng.binomial(
                emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
        else:
            emb_val = emb_val * self.dropout

    for dx in xrange(1, self.n_layers):
        emb_val1 = utils.dot(emb_val, W_ems[2 * dx])
        emb_val2 = utils.dot(emb_val, W_ems[2 * dx + 1])
        if b_ems:
            emb_val1 = emb_val1 + b_ems[2 * dx]
            emb_val2 = emb_val2 + b_ems[2 * dx + 1]
        emb_val1 = self.activation[dx](emb_val1)
        emb_val2 = self.activation[dx](emb_val2)
        emb_val = emb_val1 * emb_val2

        # FIXME: check how dropout should work for tensor networks
        if self.dropout < 1.:
            if use_noise:
                emb_val = emb_val * self.trng.binomial(
                    emb_val.shape, n=1, p=self.dropout, dtype=emb_val.dtype)
            else:
                emb_val = emb_val * self.dropout
    self.out = emb_val
    return emb_val
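
# A minimal NumPy sketch (illustration only) of the per-layer computation in the
# tensor-style variant above: two affine projections of the same input are each
# passed through the activation and combined by elementwise multiplication.
# W1, b1, W2, b2 and `activ` are hypothetical stand-ins for W_ems/b_ems entries.
import numpy

def tensor_layer(x, W1, b1, W2, b2, activ=numpy.tanh):
    h1 = activ(numpy.dot(x, W1) + b1)
    h2 = activ(numpy.dot(x, W2) + b2)
    return h1 * h2  # elementwise product of the two half-projections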

def fprop(self, state_below, temp=numpy.float32(1), use_noise=True,
          additional_inputs=None, no_noise_bias=False):
    """
    Forward pass through the cost layer.

    :type state_below: tensor or layer
    :param state_below: The theano expression (or groundhog layer)
        representing the input of the cost layer

    :type temp: float or tensor scalar
    :param temp: scalar representing the temperature that should be used
        when sampling from the output distribution

    :type use_noise: bool
    :param use_noise: flag. If true, noise is used when computing the
        output of the model

    :type no_noise_bias: bool
    :param no_noise_bias: flag; if True, weight noise is added only to the
        weights and not to the bias
    """
    if self.rank_n_approx:
        if use_noise and self.noise_params:
            emb_val = self.rank_n_activ(
                utils.dot(state_below, self.W_em1 + self.nW_em1))
            emb_val = TT.dot(self.W_em2 + self.nW_em2, emb_val)
        else:
            emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
            emb_val = TT.dot(self.W_em2, emb_val)
    else:
        if use_noise and self.noise_params:
            emb_val = utils.dot(state_below, self.W_em + self.nW_em)
        else:
            emb_val = utils.dot(state_below, self.W_em)

    if additional_inputs:
        if use_noise and self.noise_params:
            for inp, weight, noise_weight in zip(
                    additional_inputs, self.additional_weights,
                    self.noise_additional_weights):
                emb_val += utils.dot(inp, (noise_weight + weight))
        else:
            for inp, weight in zip(additional_inputs,
                                   self.additional_weights):
                emb_val += utils.dot(inp, weight)

    self.preactiv = emb_val
    if use_noise and self.noise_params and not no_noise_bias:
        emb_val = TT.nnet.sigmoid(temp * (emb_val + self.b_em + self.nb_em))
    else:
        emb_val = TT.nnet.sigmoid(temp * (emb_val + self.b_em))
    self.out = emb_val
    self.state_below = state_below
    self.model_output = emb_val
    return emb_val
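
# A minimal NumPy sketch (illustration only) of the output non-linearity used
# above, TT.nnet.sigmoid(temp * (emb_val + b)): `temp` multiplies the
# pre-activation, so values above 1 sharpen the output probabilities and values
# below 1 flatten them. `preactiv` and `b` are hypothetical stand-ins.
import numpy

def sigmoid_output(preactiv, b, temp=numpy.float32(1)):
    return 1. / (1. + numpy.exp(-temp * (preactiv + b)))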

def fprop(self, state_below, temp=numpy.float32(1), use_noise=True,
          additional_inputs=None, no_noise_bias=False,
          target=None, full_softmax=True):
    """
    Forward pass through the cost layer.

    :type state_below: tensor or layer
    :param state_below: The theano expression (or groundhog layer)
        representing the input of the cost layer

    :type temp: float or tensor scalar
    :param temp: scalar representing the temperature that should be used
        when sampling from the output distribution

    :type use_noise: bool
    :param use_noise: flag. If true, noise is used when computing the
        output of the model

    :type no_noise_bias: bool
    :param no_noise_bias: flag; if True, weight noise is added only to the
        weights and not to the bias

    :type target: None or theano variable
    :param target: required when `full_softmax` is False; indices of the
        output units whose pre-activations should be computed

    :type full_softmax: bool
    :param full_softmax: if True, compute the full softmax over the output
        layer; otherwise compute only the entries selected by `target`
    """
    if not full_softmax:
        assert target is not None, 'target must be given'
    if self.rank_n_approx:
        if self.weight_noise and use_noise and self.noise_params:
            emb_val = self.rank_n_activ(
                utils.dot(state_below, self.W_em1 + self.nW_em1))
            nW_em = self.nW_em2
        else:
            emb_val = self.rank_n_activ(utils.dot(state_below, self.W_em1))
        W_em = self.W_em2
    else:
        W_em = self.W_em
        if self.weight_noise:
            nW_em = self.nW_em
        emb_val = state_below

    if full_softmax:
        if self.weight_noise and use_noise and self.noise_params:
            emb_val = TT.dot(emb_val, W_em + nW_em)
        else:
            emb_val = TT.dot(emb_val, W_em)

        if additional_inputs:
            if use_noise and self.noise_params:
                for inp, weight, noise_weight in zip(
                        additional_inputs, self.additional_weights,
                        self.noise_additional_weights):
                    emb_val += utils.dot(inp, (noise_weight + weight))
            else:
                for inp, weight in zip(additional_inputs,
                                       self.additional_weights):
                    emb_val += utils.dot(inp, weight)
        if self.weight_noise and use_noise and self.noise_params and \
                not no_noise_bias:
            emb_val = temp * (emb_val + self.b_em + self.nb_em)
        else:
            emb_val = temp * (emb_val + self.b_em)
    else:
        W_em = W_em[:, target]
        if self.weight_noise:
            nW_em = nW_em[:, target]
            W_em += nW_em
        if emb_val.ndim == 3:
            emb_val = emb_val.reshape(
                [emb_val.shape[0] * emb_val.shape[1], emb_val.shape[2]])
        emb_val = (W_em.T * emb_val).sum(1) + self.b_em[target]
        if self.weight_noise and use_noise:
            emb_val += self.nb_em[target]
        emb_val = temp * emb_val

    self.preactiv = emb_val
    if full_softmax:
        emb_val = utils.softmax(emb_val)
    else:
        emb_val = TT.nnet.sigmoid(emb_val)
    self.out = emb_val
    self.state_below = state_below
    self.model_output = emb_val
    return emb_val
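
# A minimal NumPy sketch (illustration only) of the `full_softmax=False` branch
# above: instead of computing the full output layer, only the pre-activation of
# each example's target unit is evaluated by selecting the matching columns of
# W and entries of b, mirroring `(W_em.T * emb_val).sum(1) + self.b_em[target]`.
# The argument names here are stand-ins, not the layer's attributes.
import numpy

def target_preactiv(h, W, b, target):
    # h: (batch, n_hid), W: (n_hid, n_out), b: (n_out,), target: (batch,) int indices
    W_t = W[:, target]                           # (n_hid, batch), one column per example
    return (W_t.T * h).sum(axis=1) + b[target]   # (batch,) target pre-activations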

def fprop(self,
          state_below,
          mask=None,
          init_state=None,
          gater_below=None,
          reseter_below=None,
          c=None,
          c_mask=None,
          nsteps=None,
          batch_size=None,
          use_noise=True,
          truncate_gradient=-1,
          no_noise_bias=False,
          return_alignment=False):

    updater_below = gater_below

    if theano.config.floatX == 'float32':
        floatX = numpy.float32
    else:
        floatX = numpy.float64

    if nsteps is None:
        nsteps = state_below.shape[0]
        if batch_size and batch_size != 1:
            nsteps = nsteps / batch_size
    if batch_size is None and state_below.ndim == 3:
        batch_size = state_below.shape[1]
    if state_below.ndim == 2 and \
            (not isinstance(batch_size, int) or batch_size > 1):
        state_below = state_below.reshape((nsteps, batch_size, self.n_in))
        if updater_below:
            updater_below = updater_below.reshape((nsteps, batch_size, self.n_in))
        if reseter_below:
            reseter_below = reseter_below.reshape((nsteps, batch_size, self.n_in))

    if not init_state:
        if not isinstance(batch_size, int) or batch_size != 1:
            init_state = TT.alloc(floatX(0), batch_size, self.n_hids)
        else:
            init_state = TT.alloc(floatX(0), self.n_hids)

    # Projection of the source annotations used by the attention mechanism,
    # precomputed once for all time steps.
    p_from_c = utils.dot(c, self.A_cp).reshape(
        (c.shape[0], c.shape[1], self.n_hids))

    if mask:
        sequences = [state_below, mask, updater_below, reseter_below]
        non_sequences = [c, c_mask, p_from_c]
        #              seqs       | outs | non_seqs
        fn = lambda x, m, g, r, h, c1, cm, pc: self.step_fprop(
            x, h, mask=m,
            gater_below=g,
            reseter_below=r,
            c=c1,
            p_from_c=pc,
            c_mask=cm,
            use_noise=use_noise,
            no_noise_bias=no_noise_bias,
            return_alignment=return_alignment)
    else:
        sequences = [state_below, updater_below, reseter_below]
        non_sequences = [c, p_from_c]
        #              seqs    | outs | non_seqs
        fn = lambda x, g, r, h, c1, pc: self.step_fprop(
            x, h,
            gater_below=g,
            reseter_below=r,
            c=c1,
            p_from_c=pc,
            use_noise=use_noise,
            no_noise_bias=no_noise_bias,
            return_alignment=return_alignment)

    outputs_info = [init_state, None]
    if return_alignment:
        outputs_info.append(None)

    # Use scan to repeatedly apply step_fprop and update the hidden state.
    rval, updates = theano.scan(fn,
                                sequences=sequences,
                                non_sequences=non_sequences,
                                outputs_info=outputs_info,
                                name='layer_%s' % self.name,
                                truncate_gradient=truncate_gradient,
                                n_steps=nsteps)
    self.out = rval
    self.rval = rval
    self.updates = updates

    return self.out
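
# A minimal standalone sketch (illustration only, not part of the layer) of the
# theano.scan pattern used in fprop above: the step function receives the
# current sequence elements first, then the recurrent outputs (outputs_info),
# then the non-sequences, and scan threads the hidden state through time.
# The variables x_seq, W, U, h0 and `step` are hypothetical.
import numpy
import theano
import theano.tensor as TT

x_seq = TT.matrix('x_seq')   # (nsteps, n_in) input sequence
W = TT.matrix('W')           # (n_in, n_hid) input-to-hidden weights
U = TT.matrix('U')           # (n_hid, n_hid) hidden-to-hidden weights
h0 = TT.vector('h0')         # (n_hid,) initial hidden state

def step(x_t, h_tm1, W, U):
    # seqs | outputs_info | non_sequences, in that order
    return TT.tanh(TT.dot(x_t, W) + TT.dot(h_tm1, U))

h_seq, updates = theano.scan(step,
                             sequences=[x_seq],
                             outputs_info=[h0],
                             non_sequences=[W, U],
                             n_steps=x_seq.shape[0])
run_rnn = theano.function([x_seq, h0, W, U], h_seq, updates=updates)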

def step_fprop(self,
               state_below,
               state_before,
               gater_below=None,
               reseter_below=None,
               mask=None,
               c=None,
               c_mask=None,
               p_from_c=None,
               use_noise=True,
               no_noise_bias=False,
               step_num=None,
               return_alignment=False):
    """
    Constructs the computational graph of this layer.

    :type state_below: theano variable
    :param state_below: the input to the layer

    :type mask: None or theano variable
    :param mask: mask describing the length of each sequence in a
        minibatch

    :type state_before: theano variable
    :param state_before: the previous value of the hidden state of the
        layer

    :type gater_below: theano variable
    :param gater_below: the input to the update gate

    :type reseter_below: theano variable
    :param reseter_below: the input to the reset gate

    :type use_noise: bool
    :param use_noise: flag saying if weight noise should be used in
        computing the output of this layer

    :type no_noise_bias: bool
    :param no_noise_bias: flag saying if weight noise should be added to
        the bias as well
    """
    updater_below = gater_below

    W_hh = self.W_hh
    G_hh = self.G_hh
    R_hh = self.R_hh
    A_cp = self.A_cp
    B_hp = self.B_hp
    D_pe = self.D_pe

    # The code works only with 3D tensors.
    cndim = c.ndim
    if cndim == 2:
        c = c[:, None, :]

    # Warning: source_num and target_num should either be equal, or one of
    # them should be 1 (so that they broadcast), for the following code to
    # make any sense.
    source_len = c.shape[0]              # length of the source sequence
    source_num = c.shape[1]              # number of sequences in a batch
    target_num = state_before.shape[0]
    dim = self.n_hids

    # Form projection to the tanh layer from the previous hidden state.
    # Shape: (source_len, target_num, dim)
    p_from_h = ReplicateLayer(source_len)(utils.dot(state_before, B_hp)).out

    # Form projection to the tanh layer from the source annotation.
    if not p_from_c:
        p_from_c = utils.dot(c, A_cp).reshape((source_len, source_num, dim))

    # Sum projections - broadcasting happens at dimension 1.
    p = p_from_h + p_from_c

    # Apply non-linearity and project to energy.
    energy = TT.exp(utils.dot(TT.tanh(p), D_pe)).reshape((source_len,
                                                          target_num))
    if c_mask:
        # This is used for batches only, that is target_num == source_num.
        energy *= c_mask

    # Calculate energy sums.
    normalizer = energy.sum(axis=0)

    # Normalize the energies to get the attention probabilities
    # (a softmax over source positions).
    probs = energy / normalizer

    # Calculate weighted sums of source annotations.
    # If target_num == 1, c should be broadcast at the 1st dimension.
    # Probabilities are broadcast at the 2nd dimension.
    ctx = (c * probs.dimshuffle(0, 1, 'x')).sum(axis=0)  # context: weighted sum of annotations

    state_below += self.c_inputer(ctx).out
    reseter_below += self.c_reseter(ctx).out
    updater_below += self.c_updater(ctx).out

    # Reset gate:
    # optionally reset the hidden state.
    reseter = self.reseter_activation(TT.dot(state_before, R_hh) +
                                      reseter_below)
    reseted_state_before = reseter * state_before

    # Feed the input to obtain the potential new state.
    preactiv = TT.dot(reseted_state_before, W_hh) + state_below
    h = self.activation(preactiv)

    # Update gate:
    # optionally reject the potential new state and keep the old one.
    updater = self.updater_activation(TT.dot(state_before, G_hh) +
                                      updater_below)

    # h_t = z_t * \tilde{h}_t + (1 - z_t) * h_{t-1}
    h = updater * h + (1 - updater) * state_before

    if mask is not None:
        if h.ndim == 2 and mask.ndim == 1:
            mask = mask.dimshuffle(0, 'x')
        h = mask * h + (1 - mask) * state_before

    results = [h, ctx]
    if return_alignment:
        results += [probs]
    return results
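
# A minimal NumPy sketch (illustration only) of the soft attention computed in
# step_fprop above: energies come from a tanh layer over the summed projections
# of the previous decoder state and of each source annotation, are normalized
# over source positions (a softmax), and the context is the probability-weighted
# sum of the annotations. Array names mirror the variables above but are
# stand-ins, not the layer's attributes.
import numpy

def soft_attention(c, p_from_c, h_prev, B_hp, D_pe):
    # c:        (source_len, batch, dim)  source annotations
    # p_from_c: (source_len, batch, dim)  precomputed projection of c
    # h_prev:   (batch, dim)              previous decoder hidden state
    # B_hp:     (dim, dim), D_pe: (dim, 1)
    p_from_h = numpy.dot(h_prev, B_hp)[None, :, :]           # broadcast over source_len
    energy = numpy.exp(numpy.dot(numpy.tanh(p_from_h + p_from_c), D_pe))[:, :, 0]
    probs = energy / energy.sum(axis=0)                      # softmax over source positions
    ctx = (c * probs[:, :, None]).sum(axis=0)                # weighted sum of annotations
    return ctx, probs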