def __call__(self, expr_seq):
    """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
        expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
        expression sequence
    """
    if isinstance(expr_seq, ExpressionSequence):
        expr_seq = [expr_seq]
    batch_size = expr_seq[0][0].dim()[1]
    seq_len = len(expr_seq[0])

    if self.dropout_rate > 0.0 and self.train:
        self.set_dropout_masks(batch_size=batch_size)

    cur_input = expr_seq
    self._final_states = []
    for layer_i in range(self.num_layers):
        h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        for pos_i in range(seq_len):
            x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
            if isinstance(x_t, dy.Expression):
                x_t = [x_t]
            elif type(x_t) != list:
                x_t = list(x_t)
            if self.dropout_rate > 0.0 and self.train:
                # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                gates_t = dy.vanilla_lstm_gates_dropout_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                    self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            else:
                gates_t = dy.vanilla_lstm_gates_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            c_t = dy.vanilla_lstm_c(c[-1], gates_t)
            h_t = dy.vanilla_lstm_h(c_t, gates_t)
            if expr_seq[0].mask is None or np.isclose(np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(expr_seq[0].mask.cmult_by_timestep_expr(c_t, pos_i, True)
                         + expr_seq[0].mask.cmult_by_timestep_expr(c[-1], pos_i, False))
                h.append(expr_seq[0].mask.cmult_by_timestep_expr(h_t, pos_i, True)
                         + expr_seq[0].mask.cmult_by_timestep_expr(h[-1], pos_i, False))
        self._final_states.append(FinalTransducerState(h[-1], c[-1]))
        cur_input = [h[1:]]

    return ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
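# Standalone sketch of the masked-update trick used above (illustrative, not part
# of the source): padded timesteps copy the previous state instead of updating it.
import dynet as dy
import numpy as np

dy.renew_cg()
h_prev = dy.zeroes((3,), batch_size=2)
h_t = dy.inputTensor(np.ones((3, 2)), batched=True)          # candidate new state
# keep-mask broadcast to the hidden dim: batch item 0 is valid, item 1 is padding
keep = dy.inputTensor(np.array([[1.0] * 3, [0.0] * 3]).T, batched=True)
h_new = dy.cmult(h_t, keep) + dy.cmult(h_prev, 1.0 - keep)   # copies h_prev where padded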
def translate(self, x, beam_size=1):
    """Translate a source sentence

    Translate a single source sentence by decoding using beam search

    Arguments:
        x (list): Source sentence (list of indices)

    Keyword Arguments:
        beam_size (int): Size of the beam for beam search. A value of 1 means greedy decoding (default: 1)

    Returns:
        list: generated translation (list of indices)
    """
    dy.renew_cg()
    input_len = len(x)
    encodings = self.encode([x], test=True)
    # Decode
    # Add parameters to the graph
    Wp, bp = self.Wp_p.expr(), self.bp_p.expr()
    Wo, bo = self.Wo_p.expr(), self.bo_p.expr()
    D, b = dy.transpose(dy.parameter(self.MT_p)), self.b_p.expr()
    # Initialize decoder with last encoding
    last_enc = dy.select_cols(encodings, [encodings.dim()[0][-1] - 1])
    init_state = dy.affine_transform([bp, Wp, last_enc])
    ds = self.dec.initial_state([init_state, dy.zeroes((self.dh,))])
    # Initialize context
    context = dy.zeroes((self.enc_dim,))
    # Initialize beam
    beam = [(ds, context, [self.trg_sos], 0.0)]
    # Loop
    for i in range(int(min(self.max_len, input_len * 1.5))):
        new_beam = []
        for ds, pc, pw, logprob in beam:
            embs = dy.lookup(self.MT_p, pw[-1])
            # Run LSTM
            ds = ds.add_input(dy.concatenate([embs, pc]))
            h = ds.output()
            # Compute next context
            context, _ = self.attend(encodings, h)
            # Compute output with residual connections
            output = dy.affine_transform([bo, Wo, dy.concatenate([h, context, embs])])
            # Score
            s = dy.affine_transform([b, D, output])
            # Probabilities
            p = dy.softmax(s).npvalue().flatten()
            # Careful of float error
            p = p / p.sum()
            kbest = np.argsort(p)
            for nw in kbest[-beam_size:]:
                new_beam.append((ds, context, pw + [nw], logprob + np.log(p[nw])))
        beam = sorted(new_beam, key=lambda x: x[-1])[-beam_size:]
        if beam[-1][2][-1] == self.trg_eos:
            break
    return beam[-1][2]
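# Hypothetical usage of translate() (assumes a trained model instance `model`, a
# source sentence already mapped to indices, and a target vocabulary; the names
# here are illustrative, not from the source):
hyp = model.translate(src_indices, beam_size=5)   # returns target word indices
words = [trg_vocab[i] for i in hyp]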
def transduce(self, embed_sent):
    src = embed_sent.as_tensor()

    sent_len = src.dim()[0][1]
    src_width = 1
    batch_size = src.dim()[1]
    pad_size = (self.window_receptor - 1) // 2  # TODO: adapt it also for even window size
    src = dy.concatenate([
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
        src,
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
    ], d=1)
    padded_sent_len = sent_len + 2 * pad_size

    conv1 = dy.parameter(self.pConv1)
    bias1 = dy.parameter(self.pBias1)
    src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
    cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

    hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    if self.non_linearity == 'linear':
        hidden_layer = hidden_layer
    elif self.non_linearity == 'tanh':
        hidden_layer = dy.tanh(hidden_layer)
    elif self.non_linearity == 'relu':
        hidden_layer = dy.rectify(hidden_layer)
    elif self.non_linearity == 'sigmoid':
        hidden_layer = dy.logistic(hidden_layer)

    for conv_hid, bias_hid in self.builder_layers:
        hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid), dy.parameter(bias_hid), stride=[1, 1])
        hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)

    last_conv = dy.parameter(self.last_conv)
    last_bias = dy.parameter(self.last_bias)
    output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
    output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
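# A more compact, behaviour-preserving form of the repeated activation dispatch
# above (sketch):
ACTIVATIONS = {'linear': lambda e: e, 'tanh': dy.tanh, 'relu': dy.rectify, 'sigmoid': dy.logistic}
hidden_layer = ACTIVATIONS[self.non_linearity](hidden_layer)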
def __call__(self, x, y):
    x = concatenate([x, zeroes((1, x.dim()[0][1],)) + 1.])
    y = concatenate([y, zeroes((1, y.dim()[0][1],)) + 1.])
    if self.spec[1] == 1:
        return self.U[0](x, y)
    else:
        return concatenate([u(x, y) for u in self.U], 2)
def __init__(self, network, prev=None, c=None, h=None):
    self._network = network
    if c is None:
        c = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
    if h is None:
        h = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
    self._c = tuple(c)
    self._h = tuple(h)
    self._prev = prev
def __call__(self, x, y):
    x = concatenate([x, zeroes((1, x.dim()[0][1],)) + 1.])
    y = concatenate([y, zeroes((1, y.dim()[0][1],)) + 1.])
    return concatenate([u(x, y) for u in self.U], 2)
def __init__(self,
             network: 'UniLSTMSeqTransducer',
             prev: Optional['UniLSTMState'] = None,
             c: Sequence[dy.Expression] = None,
             h: Sequence[dy.Expression] = None) -> None:
    self._network = network
    if c is None:
        c = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
    if h is None:
        h = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
    self._c = tuple(c)
    self._h = tuple(h)
    self._prev = prev
def decode_loss(self, encodings, trg, test=False):
    """Compute the negative conditional log likelihood of the target sentence
    given the encoding of the source sentence

    Arguments:
        encodings (dynet.Expression): Source sentence encodings obtained with self.encode
        trg (list): List of target sentences

    Keyword Arguments:
        test (bool): Switch used for things like dropout where the behaviour is different at test time (default: False)

    Returns:
        dynet.Expression: Expression of the loss averaged on the minibatch
    """
    y, masksy = self.prepare_batch(trg, self.trg_eos)
    slen, bsize = y.shape
    # Add parameters to the graph
    Wp, bp = self.Wp_p.expr(), self.bp_p.expr()
    Wo, bo = self.Wo_p.expr(), self.bo_p.expr()
    D, b = dy.transpose(dy.parameter(self.MT_p)), self.b_p.expr()
    # Initialize decoder with last encoding
    last_enc = dy.select_cols(encodings, [encodings.dim()[0][-1] - 1])
    init_state = dy.affine_transform([bp, Wp, last_enc])
    ds = self.dec.initial_state([init_state, dy.zeroes((self.dh,), batch_size=bsize)])
    # Initialize context
    context = dy.zeroes((self.enc_dim,), batch_size=bsize)
    # Start decoding
    errs = []
    for cw, nw, mask in zip(y, y[1:], masksy[1:]):
        embs = dy.lookup_batch(self.MT_p, cw)
        # Run LSTM
        ds = ds.add_input(dy.concatenate([embs, context]))
        h = ds.output()
        # Compute next context
        context, _ = self.attend(encodings, h)
        # Compute output with residual connections
        output = dy.affine_transform([bo, Wo, dy.concatenate([h, context, embs])])
        if not test:
            output = dy.dropout(output, self.dr)
        # Score
        s = dy.affine_transform([b, D, output])
        masksy_e = dy.inputTensor(mask, batched=True)
        # Loss
        err = dy.cmult(dy.pickneglogsoftmax_batch(s, nw), masksy_e)
        errs.append(err)
    # Add all losses together
    err = dy.sum_batches(dy.esum(errs)) / float(bsize)
    return err
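# Standalone sketch of the masked batch loss used above (batch of 2; the second
# item is a padded position, so its loss is zeroed out; sizes are illustrative):
import dynet as dy
import numpy as np

dy.renew_cg()
s = dy.inputTensor(np.random.rand(5, 2), batched=True)    # scores over 5 classes, batch size 2
gold = [1, 3]                                             # gold class per batch item
mask = dy.inputTensor(np.array([1.0, 0.0]), batched=True)
err = dy.cmult(dy.pickneglogsoftmax_batch(s, gold), mask)
loss = dy.sum_batches(err) / 2.0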
def transduce(self, es):
    es_expr = es.as_tensor()
    # e.g. es_expr.dim() == ((276, 240), 1)
    sent_len = es_expr.dim()[0][0]
    batch_size = es_expr.dim()[1]

    # convolutions won't work if sent length is too short; pad if necessary
    pad_size = 0
    while math.ceil(float(sent_len + pad_size - self.filter_size_time + 1) / float(self.stride[0])) < self.filter_size_time:
        pad_size += 1
    if pad_size > 0:
        es_expr = dy.concatenate([
            es_expr,
            dy.zeroes((pad_size, self.freq_dim * self.chn_dim), batch_size=es_expr.dim()[1])
        ])
        sent_len += pad_size

    # convolution layers
    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)  # ((276, 80, 3), 1)
    cnn_layer1 = dy.conv2d(es_chn, dy.parameter(self.filters1), stride=self.stride, is_valid=True)  # ((137, 39, 32), 1)
    cnn_layer2 = dy.conv2d(cnn_layer1, dy.parameter(self.filters2), stride=self.stride, is_valid=True)  # ((68, 19, 32), 1)
    cnn_out = dy.reshape(cnn_layer2,
                         (cnn_layer2.dim()[0][0], cnn_layer2.dim()[0][1] * cnn_layer2.dim()[0][2]),
                         batch_size=batch_size)  # ((68, 608), 1)
    es_list = [cnn_out[i] for i in range(cnn_out.dim()[0][0])]

    # RNN layers
    for (fb, bb) in self.builder_layers:
        fs = fb.initial_state().transduce(es_list)
        bs = bb.initial_state().transduce(reversed(es_list))
        es_list = [dy.concatenate([f, b]) for f, b in zip(fs, reversed(bs))]
    return es_list
def softmax(x):
    """
    Compute the softmax function in tensorflow.

    You might find the tensorflow functions tf.exp, tf.reduce_max, tf.reduce_sum,
    tf.expand_dims useful. (Many solutions are possible, so you may not need to
    use all of these functions). Recall also that many common tensorflow
    operations are sugared (e.g. x * y does a tensor multiplication if x and y
    are both tensors). Make sure to implement the numerical stability fixes as
    in the previous homework!

    Args:
        x: tf.Tensor with shape (n_samples, n_features). Note feature vectors are
           represented by row-vectors. (For simplicity, no need to handle 1-d
           input as in the previous homework)
    Returns:
        out: tf.Tensor with shape (n_sample, n_features). You need to construct
             this tensor in this problem.
    """
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    sum_exp = dy.colwise_add(dy.zeroes(x.dim()[0]), dy.sum_cols(x_exp))
    out = dy.cdiv(x_exp, sum_exp)
    ### END YOUR CODE
    return out
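# Runnable check for the softmax above (the implementation is DyNet despite the
# TensorFlow-flavoured docstring); every row of the output should sum to 1:
import dynet as dy
import numpy as np

dy.renew_cg()
x = dy.inputTensor(np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]]))
out = softmax(x).npvalue()
assert np.allclose(out.sum(axis=1), 1.0)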
def evaluate(self, inputs, train=False):
    """
    Apply all MLP layers to concatenated input
    :param inputs: (key, vector) per feature type
    :param train: are we training now?
    :return: output vector of size self.output_dim
    """
    input_keys, inputs = list(map(list, zip(*list(inputs))))
    if self.input_keys:
        assert input_keys == self.input_keys, "Got: %s\nBut expected input keys: %s" % (
            self.input_keys_str(self.input_keys), self.input_keys_str(input_keys))
    else:
        self.input_keys = input_keys
    if self.gated:
        gates = self.params.get("gates")
        if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
            gates = self.params["gates"] = self.model.add_parameters(
                (len(inputs), self.gated), init=dy.UniformInitializer(1))
        input_dims = [i.dim()[0][0] for i in inputs]
        max_dim = max(input_dims)
        x = dy.concatenate_cols([
            dy.concatenate([i, dy.zeroes(max_dim - d)])  # Pad with zeros to get uniform dim
            if d < max_dim else i
            for i, d in zip(inputs, input_dims)
        ]) * gates
        # Possibly multiple "attention heads" -- concatenate outputs to one vector
        inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
    x = dy.concatenate(inputs)
    assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
    dim = x.dim()[0][0]
    if self.input_dim:
        assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
    else:
        self.init_params(dim)
    self.config.print(self, level=4)
    if self.total_layers:
        if self.weights is None:
            self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                            for i in range(self.total_layers)]
            if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
        for i, (W, b) in enumerate(self.weights):
            self.config.print(lambda: x.npvalue().tolist(), level=4)
            try:
                if train and self.dropout:
                    x = dy.dropout(x, self.dropout)
                x = self.activation()(W * x + b)
            except ValueError as e:
                raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
    self.config.print(lambda: x.npvalue().tolist(), level=4)
    return x
def padding(src, min_size):
    """
    Pad the sequence input along the time dimension (for example speech), so
    that the output of the convolutional layer has the same size (time) as the
    input. Note that for padding an image (two-dimensional padding), please
    refer to dy.conv2d(..., is_valid=False)
    """
    # pad before putting into the convolutional layer
    src_dim = src.dim()
    if src_dim[0][1] >= min_size:
        return src
    pad_size = min_size - src_dim[0][1]
    channels = src_dim[0][2] if len(src_dim[0]) >= 3 else 1
    if pad_size == 1:
        return dy.concatenate([src, dy.zeroes((src_dim[0][0], 1, channels))], d=1)
    else:
        left_border = int(pad_size) // 2
        right_border = (int(pad_size) + 1) // 2
        # concatenate along columns
        return dy.concatenate([dy.zeroes((src_dim[0][0], left_border, channels)),
                               src,
                               dy.zeroes((src_dim[0][0], right_border, channels))], d=1)
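# Runnable check for padding() (illustrative sizes: 4 features x 5 timesteps x 1
# channel, padded up to 9 timesteps, 2 on the left and 2 on the right):
import dynet as dy

dy.renew_cg()
src = dy.zeroes((4, 5, 1))
padded = padding(src, 9)
assert padded.dim()[0][1] == 9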
def pad(self, expr, pad_size):
    assert pad_size >= 0
    if pad_size == 0:
        return expr
    # TODO: replicate last frame instead of padding zeros
    return dy.concatenate([
        expr,
        dy.zeroes((pad_size, self.freq_dim * self.chn_dim), batch_size=expr.dim()[1])
    ])
def softmax(x):
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    x_sum = dy.sum_cols(x_exp)
    x_tmp = dy.zeroes(x.dim()[0])
    x_tmp = dy.colwise_add(x_tmp, x_sum)
    out = dy.cdiv(x_exp, x_tmp)
    ### END YOUR CODE
    return out
def beam_decode(self, encodings, input_len=10, beam_size=1):
    # Add parameters to the graph
    self.dec.init(encodings, [[self.trg_sos]], self.usr.user_vector, test=self.test, update=self.update)
    # Initialize context
    context = dy.zeroes((self.enc.dim,))
    # Process user token if necessary
    if self.user_token:
        _, _, _ = self.dec.next(self.usr.user_vector, context, test=self.test)
    # Get conditional log probability of lengths
    llp = np.log(self.lex.p_L[input_len])
    # Initialize beam
    beams = [beam.Beam(self.dec.ds, context, [self.trg_sos], llp[0])]
    # Loop
    for i in range(int(min(self.max_len, input_len * 1.5))):
        new_beam = []
        for b in beams:
            if b.words[-1] == self.trg_eos:
                new_beam.append(beam.Beam(b.state, b.context, b.words, b.logprob, b.align))
                continue
            h, e, b.state = self.dec.next([b.words[-1]], b.context, state=b.state)
            # Compute next context
            b.context, att = self.attend(encodings, h)
            # Score
            s = self.dec.s(h, b.context, e, test=self.test)
            # Probabilities
            p = dy.softmax(s).npvalue()
            # Careful for floating errors
            p = p.flatten() / p.sum()
            # Store alignment for e.g. unk replacement
            align = np.argmax(att.npvalue())
            kbest = np.argsort(p)
            for nw in kbest[-beam_size:]:
                new_beam.append(beam.Beam(
                    b.state, b.context, b.words + [nw],
                    b.logprob + np.log(p[nw]) + llp[i + 1] - llp[i],
                    b.align + [align]))
        # Only keep the best
        beams = sorted(new_beam, key=lambda b: b.logprob)[-beam_size:]
        if beams[-1].words[-1] == self.trg_eos:
            break
    return beams[-1]
def run_IRNN(x):
    """
    Runs the IRNN over the input and returns the last layer before softmax
    """
    bsize, d = x.shape
    # Load parameters in computation graph
    Wh, Wx, bh = Wh_p.expr(), Wx_p.expr(), bh_p.expr()
    A, b = A_p.expr(), b_p.expr()
    x_list = [dy.inputTensor(x_t, batched=True) for x_t in x.T]
    # Initialize hidden state
    h = dy.zeroes((dh,), batch_size=bsize)
    for x_t in x_list:  # Iterate over timesteps
        a = Wh * h + Wx * x_t + bh  # Affine transform
        h = dy.rectify(a)  # Apply non-linearity
    return A * h + b
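# Hypothetical setup for the globals run_IRNN reads (sizes are illustrative).
# The IRNN trick (Le et al., 2015) is to initialise the recurrent matrix to the
# identity and use ReLU, which is what dy.rectify above implements:
import dynet as dy
import numpy as np

dh, dx, n_classes = 64, 32, 10
pc = dy.ParameterCollection()
Wh_p = pc.add_parameters((dh, dh), init=dy.NumpyInitializer(np.eye(dh)))
Wx_p = pc.add_parameters((dh, dx))
bh_p = pc.add_parameters((dh,))
A_p = pc.add_parameters((n_classes, dh))
b_p = pc.add_parameters((n_classes,))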
def init(self, x, usr, test=True, update=True):
    bs = len(x[0])
    if not test:
        self.lstm.set_dropout(self.dr)
    else:
        self.lstm.disable_dropout()
    # Add encoder to computation graph
    self.Th = self.Th_p.expr(update)
    init_state = self.Th * usr
    init_state = [init_state, dy.zeroes((self.dh,), batch_size=bs)]
    self.es = self.lstm.initial_state(init_state, update=update)
    if not test:
        self.lstm.set_dropout_masks(bs)
def encode_sequence(sequence, rnns, embedder, dropout_amount=0.):
    """
    Encodes a sequence given RNN cells and an embedding function.

    Inputs:
        sequence (list of str): The sequence to encode.
        rnns (list of dy._RNNBuilder): The RNNs to use.
        embedder (str -> dy.Expression): Function that embeds strings to word vectors.
        dropout_amount (float, optional): The amount of dropout to apply.

    Returns:
        (list of dy.Expression, list of dy.Expression), list of dy.Expression,
        where the first pair is the (final cell memories, final cell states) of
        all layers, and the second list is a list of the final layer's cell
        state for all tokens in the sequence.
    """
    layer_states = []
    for rnn in rnns:
        hidden_size = rnn.spec[2]
        layer_states.append(rnn.initial_state([dy.zeroes((hidden_size, 1)),
                                               dy.zeroes((hidden_size, 1))]))

    outputs = []
    for token in sequence:
        rnn_input = embedder(token)
        (cell_states, hidden_states), output, layer_states = \
            forward_one_multilayer(rnn_input, layer_states, dropout_amount)
        outputs.append(output)

    return (cell_states, hidden_states), outputs
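# Minimal sketch of the zero-initialised RNN state used above, on DyNet's
# VanillaLSTMBuilder (sizes are illustrative; with zero vectors the order of
# the two initial state expressions does not matter):
import dynet as dy

pc = dy.ParameterCollection()
rnn = dy.VanillaLSTMBuilder(1, 16, 32, pc)   # layers, input_dim, hidden_dim
dy.renew_cg()
state = rnn.initial_state([dy.zeroes((32,)), dy.zeroes((32,))])
state = state.add_input(dy.inputVector([0.0] * 16))
output = state.output()                      # hidden state after one step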
def zeroes(hidden_dim: numbers.Integral, batch_size: numbers.Integral = 1) -> Tensor:
    """
    Create a possibly batched zero vector.

    Args:
        hidden_dim: vector size
        batch_size: batch size

    Returns:
        DyNet expression of size ((hidden_dim,), batch_size) or PyTorch tensor
        of size (batch_size, hidden_dim)
    """
    if xnmt.backend_dynet:
        return dy.zeroes((hidden_dim,), batch_size=batch_size)
    else:
        return torch.zeros(size=(batch_size, hidden_dim), device=xnmt.device)
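# Quick shape check for the DyNet branch of the wrapper above (illustrative):
import dynet as dy

dy.renew_cg()
v = dy.zeroes((5,), batch_size=3)
assert v.dim() == ((5,), 3)   # ((hidden_dim,), batch_size)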
def compute_loss(gold_seq, scores, index_to_token_maps, gold_tok_to_id, noise=0.00000001):
    """
    Computes the loss of a gold sequence given scores.

    Inputs:
        gold_seq (list of str): A sequence of gold tokens.
        scores (list of dy.Expression): Expressions representing the scores of
            potential output tokens for each token in gold_seq.
        index_to_token_maps (list of dict str->list of int): Maps from index in
            the sequence to a dictionary mapping from a string to a set of integers.
        gold_tok_to_id (lambda (str, str)->list of int): Maps from the gold token
            and some lookup function to the indices in the probability
            distribution where the gold token occurs.
        noise (float, optional): The amount of noise to add to the loss.

    Returns:
        dy.Expression representing the sum of losses over the sequence.
    """
    assert len(gold_seq) == len(scores)
    assert len(index_to_token_maps) == len(scores)

    losses = []
    for i, gold_tok in enumerate(gold_seq):
        score = scores[i]
        token_map = index_to_token_maps[i]

        gold_indices = gold_tok_to_id(gold_tok, token_map)
        assert len(gold_indices) > 0
        if len(gold_indices) == 1:
            losses.append(dy.pickneglogsoftmax(score, gold_indices[0]))
        else:
            prob_of_tok = dy.zeroes(1)
            probdist = dy.softmax(score)
            for index in gold_indices:
                prob_of_tok += probdist[index]
            prob_of_tok += noise
            losses.append(-dy.log(prob_of_tok))

    return dy.esum(losses)
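# Standalone sketch of the two loss branches above: a single gold index uses
# pickneglogsoftmax directly; several acceptable indices sum their softmax
# probabilities before taking -log (illustrative scores):
import dynet as dy

dy.renew_cg()
score = dy.inputVector([1.0, 2.0, 0.5])
single = dy.pickneglogsoftmax(score, 1)
p = dy.softmax(score)
multi = -dy.log(p[1] + p[2] + 1e-8)   # the noise term guards against log(0)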
def decode_loss(self, encodings, trg):
    """Compute the negative conditional log likelihood of the target sentence
    given the encoding of the source sentence

    :param encodings: Source sentence encodings obtained with self.encode
    :param trg: List of target sentences

    :returns: Expression of the loss averaged on the minibatch
    """
    y, masksy = self.prepare_batch(trg, self.trg_eos)
    slen, bsize = y.shape
    # Init decoder
    self.dec.init(encodings, y, self.usr.user_vector, test=self.test, update=self.update)
    # Initialize context
    context = dy.zeroes((self.enc.dim,), batch_size=bsize)
    # Process user token if necessary
    if self.user_token:
        _, _, _ = self.dec.next(self.usr.user_vector, context, test=self.test)
    # Start decoding
    errs = []
    for cw, nw, mask in zip(y, y[1:], masksy[1:]):
        # Run LSTM
        h, e, _ = self.dec.next(cw, context, test=self.test)
        # Compute next context
        context, _ = self.attend(encodings, h)
        # Score
        s = self.dec.s(h, context, e, test=self.test)
        masksy_e = dy.inputTensor(mask, batched=True)
        # Loss
        loss = self.cross_entropy_loss(s, nw, cw)
        loss = dy.cmult(loss, masksy_e)
        errs.append(loss)
    # Add all losses together
    err = dy.mean_batches(dy.esum(errs))
    return err
def init(self, H, y, test=True, update=True):
    bs = len(y[0])
    if not test:
        self.lstm.set_dropout(self.dr)
    else:
        self.lstm.disable_dropout()
    # Initialize first state of the decoder with the last state of the encoder
    self.Wp = self.Wp_p.expr(update)
    self.bp = self.bp_p.expr(update)
    last_enc = dy.pick(H, index=H.dim()[0][-1] - 1, dim=1)
    init_state = dy.affine_transform([self.bp, self.Wp, last_enc])
    init_state = [init_state, dy.zeroes((self.dh,), batch_size=bs)]
    self.ds = self.lstm.initial_state(init_state, update=update)
    # Initialize dropout masks
    if not test:
        self.lstm.set_dropout_masks(bs)
    self.Wo = self.Wo_p.expr(update)
    self.bo = self.bo_p.expr(update)
    self.E = self.E_p.expr(update)
    self.b = self.b_p.expr(False)
def search(self, translation_model):
    beams = [self.Beam(translation_model.decoder.current_state,
                       [translation_model.tgt_vocab.sos], 0.0, 0.0, 0.0, 1)]
    next_candidate_id = 2
    num_pruned = 0
    fan_outs = []
    target_vocabulary_size = len(translation_model.tgt_vocab)
    for i in range(self.max_output_len):
        probabilities = []
        next_states = []
        for beam in beams:
            # if already at end of sentence, no work to be done
            if beam.words[-1] == translation_model.tgt_vocab.eos:
                probabilities.append(dy.zeroes((target_vocabulary_size,)) + 1)
                next_states.append(None)
                continue
            # calculate decoding scores
            scores = translation_model.decode([beam.words[-1]], beam.state)
            # then, keep track of next decoder state
            next_states.append(translation_model.decoder.current_state)
            probabilities.append(dy.log_softmax(scores))
        # run forward pass
        probabilities = dy.concatenate_to_batch(probabilities).npvalue().T.reshape(-1, target_vocabulary_size)

        new_beams = []
        for prob, beam, next_state in zip(probabilities, beams, next_states):
            if beam.words[-1] == translation_model.tgt_vocab.eos:
                # if we're already at the end of the sentence, keep it as is
                new_beams.append(beam)
            else:
                # otherwise, find the k best candidate words
                k_best = np.argsort(prob)  # best is last
                for next_word in k_best[-self.beam_size:]:
                    next_word_prob = prob[next_word]
                    new_prob = beam.log_prob + next_word_prob
                    if self.length_norm_alpha:
                        len_norm = (5 + len(beam.words) + 1) ** self.length_norm_alpha / (5 + 1) ** self.length_norm_alpha
                    else:
                        len_norm = 1
                    new_beams.append(self.Beam(next_state, beam.words + [next_word],
                                               new_prob, next_word_prob,
                                               new_prob / len_norm, next_candidate_id))
                    next_candidate_id += 1

        # Only keep the k best
        beams = sorted(new_beams, key=lambda beam: beam.len_norm_score)[-self.beam_size:]
        # if highest scoring candidate is a complete sentence, exit
        if beams[-1].words[-1] == translation_model.tgt_vocab.eos:
            break

        best_score = beams[-1].len_norm_score
        if self.pruning_strategy.relative:
            beams = [beam for beam in beams
                     if beam.len_norm_score - best_score > self.pruning_strategy.relative]
        if self.pruning_strategy.absolute:
            beams = [beam for beam in beams
                     if logsumexp(a=[best_score, beam.len_norm_score], b=[1, -1]) < self.pruning_strategy.absolute]
        if self.pruning_strategy.local:
            best_word_score = max(beam.last_word_prob for beam in beams)
            beams = [beam for beam in beams
                     if beam.last_word_prob - best_word_score > self.pruning_strategy.local]
        if self.pruning_strategy.candidate:
            pruned_beams = []
            candidate_counts = defaultdict(lambda: 0)
            for beam in reversed(beams):
                if candidate_counts[beam.candidate_id] < self.pruning_strategy.candidate:
                    pruned_beams.insert(0, beam)
                    candidate_counts[beam.candidate_id] += 1
            beams = pruned_beams
        num_pruned += self.beam_size - len(beams)

        fan_out = 0
        for beam in beams:
            if beam.words[-1] != translation_model.tgt_vocab.eos:
                fan_out += 1
        fan_outs.append(fan_out)

    total_fan_out = sum(fan_outs)
    avg_fan_out = total_fan_out / len(fan_outs) if len(fan_outs) != 0 else 0
    return beams[-1].words, avg_fan_out, total_fan_out, num_pruned
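# The length normalisation above is the GNMT-style penalty (Wu et al., 2016);
# as a plain function (sketch):
def gnmt_length_penalty(num_words, alpha):
    return (5 + num_words) ** alpha / (5 + 1) ** alpha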
def init(self, initial_state, batch_size=1):
    self.current_state = self.lstm.initial_state([
        initial_state,
        dy.zeroes((self.hidden_dim,), batch_size=batch_size)
    ])
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
        expr_seq: expression sequence (will be accessed via tensor_expr)
    Return:
        expression sequence
    """
    if isinstance(expr_seq, list):
        mask_out = expr_seq[0].mask
        seq_len = len(expr_seq[0])
        batch_size = expr_seq[0].dim()[1]
        tensors = [e.as_tensor() for e in expr_seq]
        input_tensor = dy.reshape(dy.concatenate(tensors), (seq_len, 1, self.input_dim), batch_size=batch_size)
    else:
        mask_out = expr_seq.mask
        seq_len = len(expr_seq)
        batch_size = expr_seq.dim()[1]
        input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()), (seq_len, 1, self.input_dim), batch_size=batch_size)

    if self.dropout > 0.0 and self.train:
        input_tensor = dy.dropout(input_tensor, self.dropout)

    proj_inp = dy.conv2d_bias(input_tensor, dy.parameter(self.p_f), dy.parameter(self.p_b),
                              stride=(self.stride, 1), is_valid=False)
    reduced_seq_len = proj_inp.dim()[0][0]
    proj_inp = dy.transpose(dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3), batch_size=batch_size))
    # proj_inp dims: (hidden, 1, seq_len), batch_size
    if self.stride > 1 and mask_out is not None:
        mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

    h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
    c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
    for t in range(reduced_seq_len):
        f_t = dy.logistic(dy.strided_select(proj_inp, [], [0, t], [self.hidden_dim, t + 1]))
        o_t = dy.logistic(dy.strided_select(proj_inp, [], [self.hidden_dim, t], [self.hidden_dim * 2, t + 1]))
        z_t = dy.tanh(dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t], [self.hidden_dim * 3, t + 1]))

        if self.dropout > 0.0 and self.train:
            retention_rate = 1.0 - self.dropout
            dropout_mask = dy.random_bernoulli((self.hidden_dim, 1), retention_rate, batch_size=batch_size)
            # TODO: would be easy to make a zoneout dynet operation to save memory
            f_t = 1.0 - dy.cmult(dropout_mask, 1.0 - f_t)

        i_t = 1.0 - f_t

        if t == 0:
            c_t = dy.cmult(i_t, z_t)
        else:
            c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
        h_t = dy.cmult(o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
        if mask_out is None or np.isclose(np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
            c.append(c_t)
            h.append(h_t)
        else:
            c.append(mask_out.cmult_by_timestep_expr(c_t, t, True)
                     + mask_out.cmult_by_timestep_expr(c[-1], t, False))
            h.append(mask_out.cmult_by_timestep_expr(h_t, t, True)
                     + mask_out.cmult_by_timestep_expr(h[-1], t, False))

    self._final_states = [transducers.FinalTransducerState(
        dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size),
        dy.reshape(c[-1], (self.hidden_dim,), batch_size=batch_size))]
    return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask_out)
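# Standalone sketch of the zoneout applied to the forget gate above: where the
# Bernoulli mask is 0, f_t is forced to 1 so the cell copies its previous value
# (illustrative sizes):
import dynet as dy

dy.renew_cg()
f_t = dy.logistic(dy.inputVector([0.2, -0.3, 1.0]))
keep = dy.random_bernoulli((3,), 0.7)            # retention rate = 1 - dropout
f_zoneout = 1.0 - dy.cmult(keep, 1.0 - f_t)      # equals 1 where keep == 0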
def transduce(self, xs: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
    batch_size = xs[0][0].dim()[1]

    h_bot = []
    h_mid = []
    h_top = []
    z_bot = []
    z_mid = []
    z_top = []

    self.top_layer.h = None
    self.top_layer.c = None
    self.top_layer.z = None
    self.mid_layer.h = None
    self.mid_layer.c = None
    self.mid_layer.z = None
    self.bottom_layer.h = None
    self.bottom_layer.c = None
    self.bottom_layer.z = None

    # ?? checkme. want to init z to ones? (cherry paper)
    z_one = dy.ones(1, batch_size=batch_size)
    h_bot.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))  # indices for timesteps are +1
    h_mid.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))
    h_top.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))

    for i, x_t in enumerate(xs):
        # each layer uses h from the layer above at the previous time step, plus its
        # own h and z from the previous time step (saved in the HMLSTM cell)
        h_t_bot, z_t_bot = self.bottom_layer.transduce(h_below=x_t, h_above=h_mid[i], z_below=z_one)
        h_t_mid, z_t_mid = self.mid_layer.transduce(h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot)
        h_t_top, z_t_top = self.top_layer.transduce(h_below=h_t_mid, h_above=None, z_below=z_t_mid)

        h_bot.append(h_t_bot)
        z_bot.append(z_t_bot)
        h_mid.append(h_t_mid)
        z_mid.append(z_t_mid)
        h_top.append(h_t_top)
        z_top.append(z_t_top)

    # #gated output module
    # #sigmoid
    # W_layer = dy.parameters(dim=(len(self.modules), hidden_dim))  # needs to be moved to init? num layers by hidden_dim
    # h_cat = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
    # dotted = dy.dot_product(e1, e2)
    # gates = dy.logistic(dotted)
    # #relu
    # # om = dy.relu()

    # final state is the last hidden state from the top layer
    self._final_states = [transducers.FinalTransducerState(h_top[-1])]
    # removes the init zeros to make it the same length as the input sequence
    fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
    return fin_xs
def __call__(self, final_encoder_state, encoder_states, max_generation_length,
             snippets=None, gold_sequence=None, input_sequence=None, dropout_amount=0.):
    """ Generates a sequence. """
    index = 0
    context_vector_size = self.token_predictor.attention_module.value_size

    # Decoder states: just the initialized decoder.
    # Current input to decoder: phi(start_token) ; zeros the size of the context vector
    predictions = []
    sequence = []
    probability = 1.

    decoder_states = self._initialize_decoder_lstm(final_encoder_state)
    decoder_input = dy.concatenate([self.start_token_embedding, dy.zeroes((context_vector_size,))])

    continue_generating = True
    while continue_generating:
        if len(sequence) == 0 or sequence[-1] != EOS_TOK:
            _, decoder_state, decoder_states = du.forward_one_multilayer(
                decoder_input, decoder_states, dropout_amount)
            prediction_input = PredictionInput(decoder_state=decoder_state,
                                               input_hidden_states=encoder_states,
                                               snippets=snippets,
                                               input_sequence=input_sequence)
            prediction = self.token_predictor(prediction_input, dropout_amount=dropout_amount)

            predictions.append(prediction)

            if gold_sequence:
                decoder_input = dy.concatenate([
                    self.output_embedder.bow_snippets(gold_sequence[index], snippets),
                    prediction.attention_results.vector
                ])
                sequence.append(gold_sequence[index])

                if index >= len(gold_sequence) - 1:
                    continue_generating = False
            else:
                probabilities = np.transpose(dy.softmax(prediction.scores).npvalue()).tolist()[0]
                distribution_map = prediction.aligned_tokens

                # Get a new probabilities and distribution_map consolidating duplicates
                distribution_map, probabilities = flatten_distribution(distribution_map, probabilities)

                # Modify the probability distribution so that the UNK token can never be produced
                probabilities[distribution_map.index(UNK_TOK)] = 0.
                argmax_index = int(np.argmax(probabilities))
                argmax_token = distribution_map[argmax_index]
                sequence.append(argmax_token)

                decoder_input = dy.concatenate([
                    self.output_embedder.bow_snippets(argmax_token, snippets),
                    prediction.attention_results.vector
                ])

                probability *= probabilities[argmax_index]

                continue_generating = False
                if index < max_generation_length and argmax_token != EOS_TOK:
                    continue_generating = True

        index += 1

    return SQLPrediction(predictions, sequence, probability)
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
    """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
        expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
        expression sequence
    """
    if isinstance(expr_seq, expression_seqs.ExpressionSequence):
        expr_seq = [expr_seq]
    batch_size = expr_seq[0].batch_size()
    seq_len = expr_seq[0].sent_len()

    if self.dropout_rate > 0.0 and self.train:
        self.set_dropout_masks(batch_size=batch_size)

    cur_input = expr_seq
    self._final_states = []
    for layer_i in range(self.num_layers):
        h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        for pos_i in range(seq_len):
            x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
            if isinstance(x_t, dy.Expression):
                x_t = [x_t]
            elif type(x_t) != list:
                x_t = list(x_t)
            if (layer_i == 0 and sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.total_input_dim) \
                    or (layer_i > 0 and sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.hidden_dim):
                found_dim = sum([x_t_i.dim()[0][0] for x_t_i in x_t])
                raise ValueError(f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, "
                                 f"expecting {self.total_input_dim if layer_i == 0 else self.hidden_dim}")
            if self.dropout_rate > 0.0 and self.train:
                # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                gates_t = dy.vanilla_lstm_gates_dropout_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                    self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            else:
                gates_t = dy.vanilla_lstm_gates_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                    self.weightnoise_std if self.train else 0.0)
            c_t = dy.vanilla_lstm_c(c[-1], gates_t)
            h_t = dy.vanilla_lstm_h(c_t, gates_t)
            if expr_seq[0].mask is None or np.isclose(np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(expr_seq[0].mask.cmult_by_timestep_expr(c_t, pos_i, True)
                         + expr_seq[0].mask.cmult_by_timestep_expr(c[-1], pos_i, False))
                h.append(expr_seq[0].mask.cmult_by_timestep_expr(h_t, pos_i, True)
                         + expr_seq[0].mask.cmult_by_timestep_expr(h[-1], pos_i, False))
        self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
        cur_input = [h[1:]]

    return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above, z_below) -> 'expression_seqs.ExpressionSequence':
    if self.c is None:
        self.c = dy.zeroes(dim=(self.hidden_dim,))  # ?? does (hidden,) take care of batch_size?
    if self.h is None:
        self.h = dy.zeroes(dim=(self.hidden_dim,))
    if self.z is None:
        self.z = dy.ones(dim=(1,))

    W_1l_r = dy.parameter(self.p_W_1l_r)
    bias = dy.parameter(self.p_bias)
    h = dy.parameter(self.h)

    s_recur = W_1l_r * h  # matrix multiply is *, element-wise is dy.cmult. CURRERROR: stale expression
    if not self.last_layer:
        W_2l_td = dy.parameter(self.p_W_2l_td)
        W_0l_bu = dy.parameter(self.p_W_0l_bu)
        s_bottomup = W_0l_bu * h_below  # ?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
        s_topdown = W_2l_td * h_above
    else:
        s_topdown = dy.zeroes((s_recur.dim()[0][0],))  # ?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
        s_bottomup = W_1l_r * h
    s_bottomup = dy.cmult(z_below, s_bottomup)  # to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
    s_topdown = dy.cmult(self.z, s_topdown)  # will be zeros if last_layer. is this right, or should z=1 in this case ??

    fslice = s_recur + s_topdown + s_bottomup + bias  # ?? checkme. bias has same shape as s_recur et al? [4*hidden+1, batch_size]?

    i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
    i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
    i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
    i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)

    f_t = dy.logistic(i_ft + 1.0)  # +1.0 bc a paper said it was better to init that way (matthias)
    i_t = dy.logistic(i_it)
    o_t = dy.logistic(i_ot)
    g_t = dy.tanh(i_gt)

    # z * normal_update + (1-z) * copy: i.e., when z_below is 0, z_new = z (copied from
    # the previous timestep); when z_below is 1, z_new = dy.round etc.
    # hier = True
    # z_tmp = dy.pick_range(fslice, self.hidden_dim*4, self.hidden_dim*4+1)
    # z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
    # z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

    # hier = False
    z_tmp = dy.pick_range(fslice, self.hidden_dim * 4, self.hidden_dim * 4 + 1)
    z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
    # use straight-through estimator for gradient: step fn forward, hard sigmoid backward
    z_new = dy.round(z_tilde, gradient_mode="straight_through_gradient")

    # z = z_l,t-1 ; z_below = z_l-1,t
    # if self.z.value() == 1:  # FLUSH
    #     c_new = dy.cmult(i_t, g_t)
    #     h_new = dy.cmult(o_t, dy.tanh(c_new))
    # elif z_below.value() == 0:  # COPY

    # if flush removed, only copy or normal update:
    # when z_below is 0, c_new and h_new are self.c and self.h;
    # when z_below is 1, c_new, h_new = normal update
    c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(z_below, (dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)))
    h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(z_below, dy.cmult(o_t, dy.tanh(c_new)))

    # if z_below.value() == 0:  # COPY
    #     c_new = self.c
    #     h_new = self.h
    # else:  # UPDATE
    #     c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
    #     h_new = dy.cmult(o_t, dy.tanh(c_new))

    self.c = c_new
    self.h = h_new
    self.z = z_new

    return h_new, z_new
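# Sketch of the straight-through boundary gate used above: the forward pass
# rounds to {0, 1}, while the backward pass uses the sigmoid's gradient
# (illustrative values):
import dynet as dy

dy.renew_cg()
z_tilde = dy.logistic(dy.inputVector([0.3, -1.2]))
z_hard = dy.round(z_tilde, gradient_mode="straight_through_gradient")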
def __init__(self, embedding_size, batch_size):
    self.vector_zero = dynet.zeroes((embedding_size,), batch_size)
    self.reading_depth = dynet.inputTensor([1.0] * batch_size, True)
    self.elements = []